#!/usr/bin/env python3
"""Validate data pipeline setup for v0.2.0.

Runs a sequence of smoke checks (imports, database, loaders, preprocessors,
validators, repositories, directory layout) and exits 0 only if all pass.
"""

import sys
from pathlib import Path

# Add project root to path so `src.*` imports resolve when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.core.enums import Timeframe  # noqa: E402
from src.data.database import get_engine, init_database  # noqa: E402
from src.data.loaders import CSVLoader, ParquetLoader  # noqa: E402
from src.data.preprocessors import (  # noqa: E402
    filter_session,
    handle_missing_data,
    remove_duplicates,
)
from src.data.repositories import OHLCVRepository  # noqa: E402
from src.data.validators import check_continuity, detect_outliers, validate_ohlcv  # noqa: E402
from src.logging import get_logger  # noqa: E402

logger = get_logger(__name__)


def validate_imports():
    """Validate that all data module imports work.

    Reaching this function at all proves the module-level imports above
    resolved, so there is nothing further to check here.
    """
    print("✓ Data module imports successful")


def validate_database():
    """Validate database setup: engine creation and table initialization.

    Raises:
        Exception: re-raised from the database layer after printing a
            failure message, so main() reports the error and exits nonzero.
    """
    try:
        engine = get_engine()
        assert engine is not None
        print("✓ Database engine created")

        # Test initialization (will create tables if needed)
        init_database(create_tables=True)
        print("✓ Database initialization successful")
    except Exception as e:
        print(f"✗ Database validation failed: {e}")
        raise


def validate_loaders():
    """Validate that the CSV and Parquet data loaders can be constructed.

    Raises:
        Exception: re-raised from loader construction after printing a message.
    """
    try:
        csv_loader = CSVLoader()
        parquet_loader = ParquetLoader()
        assert csv_loader is not None
        assert parquet_loader is not None
        print("✓ Data loaders initialized")
    except Exception as e:
        print(f"✗ Loader validation failed: {e}")
        raise


def validate_preprocessors():
    """Validate preprocessors on a small synthetic OHLC frame.

    Builds 10 one-minute bars starting at 03:00 America/New_York and runs
    them through the missing-data, de-duplication, and session-filter steps.
    """
    import pandas as pd
    import pytz  # type: ignore[import-untyped]

    # Create sample data with EST timezone (trading session is 3-4 AM EST)
    est = pytz.timezone("America/New_York")
    timestamps = pd.date_range("2024-01-01 03:00", periods=10, freq="1min", tz=est)
    df = pd.DataFrame(
        {
            "timestamp": timestamps,
            "open": [100.0] * 10,
            "high": [100.5] * 10,
            "low": [99.5] * 10,
            "close": [100.2] * 10,
        }
    )

    # Test preprocessors
    df_processed = handle_missing_data(df)
    df_processed = remove_duplicates(df_processed)
    df_filtered = filter_session(df_processed)

    # The sample data sits inside the trading session, so the filter must
    # keep at least one row.
    assert len(df_filtered) > 0
    print("✓ Preprocessors working")


def validate_validators():
    """Validate validators on a small synthetic OHLC frame.

    Runs OHLCV validation, continuity checking, and outlier detection; only
    checks that the calls complete and the validated frame keeps all rows.
    """
    import pandas as pd

    # Create valid data (timezone not required for validators)
    df = pd.DataFrame(
        {
            "timestamp": pd.date_range("2024-01-01 03:00", periods=10, freq="1min"),
            "open": [100.0] * 10,
            "high": [100.5] * 10,
            "low": [99.5] * 10,
            "close": [100.2] * 10,
        }
    )

    # Test validators — results are intentionally unused beyond "it runs".
    df_validated = validate_ohlcv(df)
    _is_continuous, _gaps = check_continuity(df_validated, Timeframe.M1)
    _ = detect_outliers(df_validated)  # Check it runs without error
    assert len(df_validated) == 10
    print("✓ Validators working")


def validate_repositories():
    """Validate that a repository can be constructed inside a DB session.

    Raises:
        Exception: re-raised after printing a failure message.
    """
    from src.data.database import get_db_session

    try:
        with get_db_session() as session:
            repo = OHLCVRepository(session=session)
            assert repo is not None
        print("✓ Repositories working")
    except Exception as e:
        print(f"✗ Repository validation failed: {e}")
        raise


def validate_directories():
    """Validate directory structure.

    Returns:
        bool: True if every required directory exists (relative to the
        current working directory), False otherwise.
    """
    required_dirs = [
        "data/raw/ohlcv/1min",
        "data/raw/ohlcv/5min",
        "data/raw/ohlcv/15min",
        "data/processed/features",
        "data/processed/patterns",
        "data/labels/individual_patterns",
    ]

    for dir_name in required_dirs:
        if not Path(dir_name).exists():
            print(f"✗ Missing directory: {dir_name}")
            return False

    print("✓ Directory structure valid")
    return True


def main():
    """Run all validation checks.

    Returns:
        int: 0 on success, 1 on any failure (suitable for sys.exit).
    """
    print("Validating ICT ML Trading System v0.2.0 Data Pipeline...")
    print("-" * 60)

    try:
        validate_imports()
        validate_database()
        validate_loaders()
        validate_preprocessors()
        validate_validators()
        validate_repositories()

        # BUG FIX: the original ignored this boolean, so a missing directory
        # still reported "All validations passed" and exited 0.
        if not validate_directories():
            print("-" * 60)
            print("✗ Validation failed: missing required directories")
            return 1

        print("-" * 60)
        print("✓ All validations passed!")
        return 0
    except Exception as e:
        print(f"✗ Validation failed: {e}")
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())