feat(v0.2.0): data pipeline

2026-01-05 11:34:18 +02:00
parent 2527938680
commit b5e7043df6
23 changed files with 2813 additions and 7 deletions
--- a/scripts/download_data.py
+++ b/scripts/download_data.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Download DAX OHLCV data from external sources."""
+
+import argparse
+import sys
+from pathlib import Path
+
+# Add project root to path
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from src.core.enums import Timeframe  # noqa: E402
+from src.logging import get_logger  # noqa: E402
+
+logger = get_logger(__name__)
+
+
+def download_from_csv(
+    input_file: str,
+    symbol: str,
+    timeframe: Timeframe,
+    output_dir: Path,
+) -> None:
+    """
+    Copy/convert CSV file to standard format.
+
+    Args:
+        input_file: Path to input CSV file
+        symbol: Trading symbol
+        timeframe: Timeframe enum
+        output_dir: Output directory
+    """
+    from src.data.loaders import CSVLoader
+
+    loader = CSVLoader()
+    df = loader.load(input_file, symbol=symbol, timeframe=timeframe)
+
+    # Ensure output directory exists
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save as CSV
+    output_file = output_dir / f"{symbol}_{timeframe.value}.csv"
+    df.to_csv(output_file, index=False)
+    logger.info(f"Saved {len(df)} rows to {output_file}")
+
+    # Also save as Parquet for faster loading
+    output_parquet = output_dir / f"{symbol}_{timeframe.value}.parquet"
+    df.to_parquet(output_parquet, index=False)
+    logger.info(f"Saved {len(df)} rows to {output_parquet}")
+
+
+def download_from_api(
+    symbol: str,
+    timeframe: Timeframe,
+    start_date: str,
+    end_date: str,
+    output_dir: Path,
+    api_provider: str = "manual",
+) -> None:
+    """
+    Download data from API (placeholder for future implementation).
+
+    Args:
+        symbol: Trading symbol
+        timeframe: Timeframe enum
+        start_date: Start date (YYYY-MM-DD)
+        end_date: End date (YYYY-MM-DD)
+        output_dir: Output directory
+        api_provider: API provider name
+    """
+    logger.warning(
+        "API download not yet implemented. " "Please provide CSV file using --input-file option."
+    )
+    logger.info(
+        f"Would download {symbol} {timeframe.value} data " f"from {start_date} to {end_date}"
+    )
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Download DAX OHLCV data",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Download from CSV file
+  python scripts/download_data.py --input-file data.csv \\
+      --symbol DAX --timeframe 1min \\
+      --output data/raw/ohlcv/1min/
+
+  # Download from API (when implemented)
+  python scripts/download_data.py --symbol DAX --timeframe 5min \\
+      --start 2024-01-01 --end 2024-01-31 \\
+      --output data/raw/ohlcv/5min/
+        """,
+    )
+
+    # Input options
+    input_group = parser.add_mutually_exclusive_group(required=True)
+    input_group.add_argument(
+        "--input-file",
+        type=str,
+        help="Path to input CSV file",
+    )
+    input_group.add_argument(
+        "--api",
+        action="store_true",
+        help="Download from API (not yet implemented)",
+    )
+
+    # Required arguments
+    parser.add_argument(
+        "--symbol",
+        type=str,
+        default="DAX",
+        help="Trading symbol (default: DAX)",
+    )
+    parser.add_argument(
+        "--timeframe",
+        type=str,
+        choices=["1min", "5min", "15min"],
+        required=True,
+        help="Timeframe",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        help="Output directory",
+    )
+
+    # Optional arguments for API download
+    parser.add_argument(
+        "--start",
+        type=str,
+        help="Start date (YYYY-MM-DD) for API download",
+    )
+    parser.add_argument(
+        "--end",
+        type=str,
+        help="End date (YYYY-MM-DD) for API download",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        # Convert timeframe string to enum
+        timeframe_map = {
+            "1min": Timeframe.M1,
+            "5min": Timeframe.M5,
+            "15min": Timeframe.M15,
+        }
+        timeframe = timeframe_map[args.timeframe]
+
+        # Create output directory
+        output_dir = Path(args.output)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Download data
+        if args.input_file:
+            logger.info(f"Downloading from CSV: {args.input_file}")
+            download_from_csv(args.input_file, args.symbol, timeframe, output_dir)
+        elif args.api:
+            if not args.start or not args.end:
+                parser.error("--start and --end are required for API download")
+            download_from_api(
+                args.symbol,
+                timeframe,
+                args.start,
+                args.end,
+                output_dir,
+            )
+
+        logger.info("Data download completed successfully")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Data download failed: {e}", exc_info=True)
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/scripts/process_data.py
+++ b/scripts/process_data.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+"""Batch process OHLCV data: clean, filter, and save."""
+
+import argparse
+import sys
+from pathlib import Path
+
+# Add project root to path
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from src.core.enums import Timeframe  # noqa: E402
+from src.data.database import get_db_session  # noqa: E402
+from src.data.loaders import load_and_preprocess  # noqa: E402
+from src.data.models import OHLCVData  # noqa: E402
+from src.data.repositories import OHLCVRepository  # noqa: E402
+from src.logging import get_logger  # noqa: E402
+
+logger = get_logger(__name__)
+
+
+def process_file(
+    input_file: Path,
+    symbol: str,
+    timeframe: Timeframe,
+    output_dir: Path,
+    save_to_db: bool = False,
+    filter_session_hours: bool = True,
+) -> None:
+    """
+    Process a single data file.
+
+    Args:
+        input_file: Path to input file
+        symbol: Trading symbol
+        timeframe: Timeframe enum
+        output_dir: Output directory
+        save_to_db: Whether to save to database
+        filter_session_hours: Whether to filter to trading session (3-4 AM EST)
+    """
+    logger.info(f"Processing file: {input_file}")
+
+    # Load and preprocess
+    df = load_and_preprocess(
+        str(input_file),
+        loader_type="auto",
+        validate=True,
+        preprocess=True,
+        filter_to_session=filter_session_hours,
+    )
+
+    # Ensure symbol and timeframe columns
+    df["symbol"] = symbol
+    df["timeframe"] = timeframe.value
+
+    # Save processed CSV
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_csv = output_dir / f"{symbol}_{timeframe.value}_processed.csv"
+    df.to_csv(output_csv, index=False)
+    logger.info(f"Saved processed CSV: {output_csv} ({len(df)} rows)")
+
+    # Save processed Parquet
+    output_parquet = output_dir / f"{symbol}_{timeframe.value}_processed.parquet"
+    df.to_parquet(output_parquet, index=False)
+    logger.info(f"Saved processed Parquet: {output_parquet} ({len(df)} rows)")
+
+    # Save to database if requested
+    if save_to_db:
+        logger.info("Saving to database...")
+        with get_db_session() as session:
+            repo = OHLCVRepository(session=session)
+
+            # Convert DataFrame to OHLCVData models
+            records = []
+            for _, row in df.iterrows():
+                # Check if record already exists
+                if repo.exists(symbol, timeframe, row["timestamp"]):
+                    continue
+
+                record = OHLCVData(
+                    symbol=symbol,
+                    timeframe=timeframe,
+                    timestamp=row["timestamp"],
+                    open=row["open"],
+                    high=row["high"],
+                    low=row["low"],
+                    close=row["close"],
+                    volume=row.get("volume"),
+                )
+                records.append(record)
+
+            if records:
+                repo.create_batch(records)
+                logger.info(f"Saved {len(records)} records to database")
+            else:
+                logger.info("No new records to save (all already exist)")
+
+
+def process_directory(
+    input_dir: Path,
+    output_dir: Path,
+    symbol: str = "DAX",
+    save_to_db: bool = False,
+    filter_session_hours: bool = True,
+) -> None:
+    """
+    Process all data files in a directory.
+
+    Args:
+        input_dir: Input directory
+        output_dir: Output directory
+        symbol: Trading symbol
+        save_to_db: Whether to save to database
+        filter_session_hours: Whether to filter to trading session
+    """
+    # Find all CSV and Parquet files
+    files = list(input_dir.glob("*.csv")) + list(input_dir.glob("*.parquet"))
+
+    if not files:
+        logger.warning(f"No data files found in {input_dir}")
+        return
+
+    # Detect timeframe from directory name or file
+    timeframe_map = {
+        "1min": Timeframe.M1,
+        "5min": Timeframe.M5,
+        "15min": Timeframe.M15,
+    }
+
+    timeframe = None
+    for tf_name, tf_enum in timeframe_map.items():
+        if tf_name in str(input_dir):
+            timeframe = tf_enum
+            break
+
+    if timeframe is None:
+        logger.error(f"Could not determine timeframe from directory: {input_dir}")
+        return
+
+    logger.info(f"Processing {len(files)} files from {input_dir}")
+
+    for file_path in files:
+        try:
+            process_file(
+                file_path,
+                symbol,
+                timeframe,
+                output_dir,
+                save_to_db,
+                filter_session_hours,
+            )
+        except Exception as e:
+            logger.error(f"Failed to process {file_path}: {e}", exc_info=True)
+            continue
+
+    logger.info("Batch processing completed")
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Batch process OHLCV data",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Process single file
+  python scripts/process_data.py --input data/raw/ohlcv/1min/m1.csv \\
+      --output data/processed/ --symbol DAX --timeframe 1min
+
+  # Process directory
+  python scripts/process_data.py --input data/raw/ohlcv/1min/ \\
+      --output data/processed/ --symbol DAX
+
+  # Process and save to database
+  python scripts/process_data.py --input data/raw/ohlcv/1min/ \\
+      --output data/processed/ --save-db
+        """,
+    )
+
+    parser.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help="Input file or directory",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        help="Output directory",
+    )
+    parser.add_argument(
+        "--symbol",
+        type=str,
+        default="DAX",
+        help="Trading symbol (default: DAX)",
+    )
+    parser.add_argument(
+        "--timeframe",
+        type=str,
+        choices=["1min", "5min", "15min"],
+        help="Timeframe (required if processing single file)",
+    )
+    parser.add_argument(
+        "--save-db",
+        action="store_true",
+        help="Save processed data to database",
+    )
+    parser.add_argument(
+        "--no-session-filter",
+        action="store_true",
+        help="Don't filter to trading session hours (3-4 AM EST)",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        input_path = Path(args.input)
+        output_dir = Path(args.output)
+
+        if not input_path.exists():
+            logger.error(f"Input path does not exist: {input_path}")
+            return 1
+
+        # Process single file or directory
+        if input_path.is_file():
+            if not args.timeframe:
+                parser.error("--timeframe is required when processing a single file")
+                return 1
+
+            timeframe_map = {
+                "1min": Timeframe.M1,
+                "5min": Timeframe.M5,
+                "15min": Timeframe.M15,
+            }
+            timeframe = timeframe_map[args.timeframe]
+
+            process_file(
+                input_path,
+                args.symbol,
+                timeframe,
+                output_dir,
+                save_to_db=args.save_db,
+                filter_session_hours=not args.no_session_filter,
+            )
+
+        elif input_path.is_dir():
+            process_directory(
+                input_path,
+                output_dir,
+                symbol=args.symbol,
+                save_to_db=args.save_db,
+                filter_session_hours=not args.no_session_filter,
+            )
+
+        else:
+            logger.error(f"Input path is neither file nor directory: {input_path}")
+            return 1
+
+        logger.info("Data processing completed successfully")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Data processing failed: {e}", exc_info=True)
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/scripts/setup_database.py
+++ b/scripts/setup_database.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""Initialize database and create tables."""
+
+import argparse
+import sys
+from pathlib import Path
+
+# Add project root to path
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from src.data.database import init_database  # noqa: E402
+from src.logging import get_logger  # noqa: E402
+
+logger = get_logger(__name__)
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(description="Initialize database and create tables")
+    parser.add_argument(
+        "--skip-tables",
+        action="store_true",
+        help="Skip table creation (useful for testing connection only)",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Enable verbose logging",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        logger.info("Initializing database...")
+        init_database(create_tables=not args.skip_tables)
+        logger.info("Database initialization completed successfully")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Database initialization failed: {e}", exc_info=True)
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/scripts/validate_data_pipeline.py
+++ b/scripts/validate_data_pipeline.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""Validate data pipeline setup for v0.2.0."""
+
+import sys
+from pathlib import Path
+
+# Add project root to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from src.core.enums import Timeframe  # noqa: E402
+from src.data.database import get_engine, init_database  # noqa: E402
+from src.data.loaders import CSVLoader, ParquetLoader  # noqa: E402
+from src.data.preprocessors import (  # noqa: E402
+    filter_session,
+    handle_missing_data,
+    remove_duplicates,
+)
+from src.data.repositories import OHLCVRepository  # noqa: E402
+from src.data.validators import check_continuity, detect_outliers, validate_ohlcv  # noqa: E402
+from src.logging import get_logger  # noqa: E402
+
+logger = get_logger(__name__)
+
+
+def validate_imports():
+    """Validate that all data module imports work."""
+    print("✓ Data module imports successful")
+
+
+def validate_database():
+    """Validate database setup."""
+    try:
+        engine = get_engine()
+        assert engine is not None
+        print("✓ Database engine created")
+
+        # Test initialization (will create tables if needed)
+        init_database(create_tables=True)
+        print("✓ Database initialization successful")
+    except Exception as e:
+        print(f"✗ Database validation failed: {e}")
+        raise
+
+
+def validate_loaders():
+    """Validate data loaders."""
+    try:
+        csv_loader = CSVLoader()
+        parquet_loader = ParquetLoader()
+        assert csv_loader is not None
+        assert parquet_loader is not None
+        print("✓ Data loaders initialized")
+    except Exception as e:
+        print(f"✗ Loader validation failed: {e}")
+        raise
+
+
+def validate_preprocessors():
+    """Validate preprocessors."""
+    import pandas as pd
+    import pytz  # type: ignore[import-untyped]
+
+    # Create sample data with EST timezone (trading session is 3-4 AM EST)
+    est = pytz.timezone("America/New_York")
+    timestamps = pd.date_range("2024-01-01 03:00", periods=10, freq="1min", tz=est)
+
+    df = pd.DataFrame(
+        {
+            "timestamp": timestamps,
+            "open": [100.0] * 10,
+            "high": [100.5] * 10,
+            "low": [99.5] * 10,
+            "close": [100.2] * 10,
+        }
+    )
+
+    # Test preprocessors
+    df_processed = handle_missing_data(df)
+    df_processed = remove_duplicates(df_processed)
+    df_filtered = filter_session(df_processed)
+
+    assert len(df_filtered) > 0
+    print("✓ Preprocessors working")
+
+
+def validate_validators():
+    """Validate validators."""
+    import pandas as pd
+
+    # Create valid data (timezone not required for validators)
+    df = pd.DataFrame(
+        {
+            "timestamp": pd.date_range("2024-01-01 03:00", periods=10, freq="1min"),
+            "open": [100.0] * 10,
+            "high": [100.5] * 10,
+            "low": [99.5] * 10,
+            "close": [100.2] * 10,
+        }
+    )
+
+    # Test validators
+    df_validated = validate_ohlcv(df)
+    is_continuous, gaps = check_continuity(df_validated, Timeframe.M1)
+    _ = detect_outliers(df_validated)  # Check it runs without error
+
+    assert len(df_validated) == 10
+    print("✓ Validators working")
+
+
+def validate_repositories():
+    """Validate repositories."""
+    from src.data.database import get_db_session
+
+    try:
+        with get_db_session() as session:
+            repo = OHLCVRepository(session=session)
+            assert repo is not None
+        print("✓ Repositories working")
+    except Exception as e:
+        print(f"✗ Repository validation failed: {e}")
+        raise
+
+
+def validate_directories():
+    """Validate directory structure."""
+    required_dirs = [
+        "data/raw/ohlcv/1min",
+        "data/raw/ohlcv/5min",
+        "data/raw/ohlcv/15min",
+        "data/processed/features",
+        "data/processed/patterns",
+        "data/labels/individual_patterns",
+    ]
+
+    for dir_name in required_dirs:
+        dir_path = Path(dir_name)
+        if not dir_path.exists():
+            print(f"✗ Missing directory: {dir_name}")
+            return False
+
+    print("✓ Directory structure valid")
+    return True
+
+
+def main():
+    """Run all validation checks."""
+    print("Validating ICT ML Trading System v0.2.0 Data Pipeline...")
+    print("-" * 60)
+
+    try:
+        validate_imports()
+        validate_database()
+        validate_loaders()
+        validate_preprocessors()
+        validate_validators()
+        validate_repositories()
+        validate_directories()
+
+        print("-" * 60)
+        print("✓ All validations passed!")
+        return 0
+
+    except Exception as e:
+        print(f"✗ Validation failed: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())