feat(v0.2.0): data pipeline
scripts/download_data.py (new executable file, 183 lines)
#!/usr/bin/env python3
"""Download DAX OHLCV data from external sources."""

import argparse
import sys
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from src.core.enums import Timeframe  # noqa: E402
from src.logging import get_logger  # noqa: E402

logger = get_logger(__name__)


def download_from_csv(
    input_file: str,
    symbol: str,
    timeframe: Timeframe,
    output_dir: Path,
) -> None:
    """
    Copy/convert CSV file to standard format.

    Args:
        input_file: Path to input CSV file
        symbol: Trading symbol
        timeframe: Timeframe enum
        output_dir: Output directory
    """
    from src.data.loaders import CSVLoader

    loader = CSVLoader()
    df = loader.load(input_file, symbol=symbol, timeframe=timeframe)

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save as CSV
    output_file = output_dir / f"{symbol}_{timeframe.value}.csv"
    df.to_csv(output_file, index=False)
    logger.info(f"Saved {len(df)} rows to {output_file}")

    # Also save as Parquet for faster loading
    output_parquet = output_dir / f"{symbol}_{timeframe.value}.parquet"
    df.to_parquet(output_parquet, index=False)
    logger.info(f"Saved {len(df)} rows to {output_parquet}")


def download_from_api(
    symbol: str,
    timeframe: Timeframe,
    start_date: str,
    end_date: str,
    output_dir: Path,
    api_provider: str = "manual",
) -> None:
    """
    Download data from API (placeholder for future implementation).

    Args:
        symbol: Trading symbol
        timeframe: Timeframe enum
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        output_dir: Output directory
        api_provider: API provider name
    """
    logger.warning(
        "API download not yet implemented. Please provide CSV file using --input-file option."
    )
    logger.info(
        f"Would download {symbol} {timeframe.value} data from {start_date} to {end_date}"
    )


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Download DAX OHLCV data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download from CSV file
  python scripts/download_data.py --input-file data.csv \\
      --symbol DAX --timeframe 1min \\
      --output data/raw/ohlcv/1min/

  # Download from API (when implemented)
  python scripts/download_data.py --symbol DAX --timeframe 5min \\
      --start 2024-01-01 --end 2024-01-31 \\
      --output data/raw/ohlcv/5min/
""",
    )

    # Input options
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--input-file",
        type=str,
        help="Path to input CSV file",
    )
    input_group.add_argument(
        "--api",
        action="store_true",
        help="Download from API (not yet implemented)",
    )

    # Required arguments
    parser.add_argument(
        "--symbol",
        type=str,
        default="DAX",
        help="Trading symbol (default: DAX)",
    )
    parser.add_argument(
        "--timeframe",
        type=str,
        choices=["1min", "5min", "15min"],
        required=True,
        help="Timeframe",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output directory",
    )

    # Optional arguments for API download
    parser.add_argument(
        "--start",
        type=str,
        help="Start date (YYYY-MM-DD) for API download",
    )
    parser.add_argument(
        "--end",
        type=str,
        help="End date (YYYY-MM-DD) for API download",
    )

    args = parser.parse_args()

    try:
        # Convert timeframe string to enum
        timeframe_map = {
            "1min": Timeframe.M1,
            "5min": Timeframe.M5,
            "15min": Timeframe.M15,
        }
        timeframe = timeframe_map[args.timeframe]

        # Create output directory
        output_dir = Path(args.output)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Download data
        if args.input_file:
            logger.info(f"Downloading from CSV: {args.input_file}")
            download_from_csv(args.input_file, args.symbol, timeframe, output_dir)
        elif args.api:
            if not args.start or not args.end:
                parser.error("--start and --end are required for API download")
            download_from_api(
                args.symbol,
                timeframe,
                args.start,
                args.end,
                output_dir,
            )

        logger.info("Data download completed successfully")
        return 0

    except Exception as e:
        logger.error(f"Data download failed: {e}", exc_info=True)
        return 1


if __name__ == "__main__":
    sys.exit(main())
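Note: download_from_api above is intentionally a stub that only logs a warning. A minimal sketch of how it might later be filled in, assuming a generic JSON-over-HTTP market-data provider, is shown below; the endpoint URL, query parameters, and response payload layout are hypothetical and not part of this commit, only the parameter list is taken from the script.

    # Sketch only -- provider endpoint, parameters, and response fields are hypothetical.
    import pandas as pd
    import requests

    def download_from_api_sketch(symbol, timeframe, start_date, end_date, output_dir):
        resp = requests.get(
            "https://data-provider.example/ohlcv",  # hypothetical endpoint
            params={"symbol": symbol, "interval": timeframe.value, "start": start_date, "end": end_date},
            timeout=30,
        )
        resp.raise_for_status()
        df = pd.DataFrame(resp.json()["bars"])  # hypothetical payload layout
        df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
        output_dir.mkdir(parents=True, exist_ok=True)
        df.to_parquet(output_dir / f"{symbol}_{timeframe.value}.parquet", index=False)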
scripts/process_data.py (new executable file, 269 lines)
#!/usr/bin/env python3
"""Batch process OHLCV data: clean, filter, and save."""

import argparse
import sys
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from src.core.enums import Timeframe  # noqa: E402
from src.data.database import get_db_session  # noqa: E402
from src.data.loaders import load_and_preprocess  # noqa: E402
from src.data.models import OHLCVData  # noqa: E402
from src.data.repositories import OHLCVRepository  # noqa: E402
from src.logging import get_logger  # noqa: E402

logger = get_logger(__name__)


def process_file(
    input_file: Path,
    symbol: str,
    timeframe: Timeframe,
    output_dir: Path,
    save_to_db: bool = False,
    filter_session_hours: bool = True,
) -> None:
    """
    Process a single data file.

    Args:
        input_file: Path to input file
        symbol: Trading symbol
        timeframe: Timeframe enum
        output_dir: Output directory
        save_to_db: Whether to save to database
        filter_session_hours: Whether to filter to trading session (3-4 AM EST)
    """
    logger.info(f"Processing file: {input_file}")

    # Load and preprocess
    df = load_and_preprocess(
        str(input_file),
        loader_type="auto",
        validate=True,
        preprocess=True,
        filter_to_session=filter_session_hours,
    )

    # Ensure symbol and timeframe columns
    df["symbol"] = symbol
    df["timeframe"] = timeframe.value

    # Save processed CSV
    output_dir.mkdir(parents=True, exist_ok=True)
    output_csv = output_dir / f"{symbol}_{timeframe.value}_processed.csv"
    df.to_csv(output_csv, index=False)
    logger.info(f"Saved processed CSV: {output_csv} ({len(df)} rows)")

    # Save processed Parquet
    output_parquet = output_dir / f"{symbol}_{timeframe.value}_processed.parquet"
    df.to_parquet(output_parquet, index=False)
    logger.info(f"Saved processed Parquet: {output_parquet} ({len(df)} rows)")

    # Save to database if requested
    if save_to_db:
        logger.info("Saving to database...")
        with get_db_session() as session:
            repo = OHLCVRepository(session=session)

            # Convert DataFrame to OHLCVData models
            records = []
            for _, row in df.iterrows():
                # Check if record already exists
                if repo.exists(symbol, timeframe, row["timestamp"]):
                    continue

                record = OHLCVData(
                    symbol=symbol,
                    timeframe=timeframe,
                    timestamp=row["timestamp"],
                    open=row["open"],
                    high=row["high"],
                    low=row["low"],
                    close=row["close"],
                    volume=row.get("volume"),
                )
                records.append(record)

            if records:
                repo.create_batch(records)
                logger.info(f"Saved {len(records)} records to database")
            else:
                logger.info("No new records to save (all already exist)")


def process_directory(
    input_dir: Path,
    output_dir: Path,
    symbol: str = "DAX",
    save_to_db: bool = False,
    filter_session_hours: bool = True,
) -> None:
    """
    Process all data files in a directory.

    Args:
        input_dir: Input directory
        output_dir: Output directory
        symbol: Trading symbol
        save_to_db: Whether to save to database
        filter_session_hours: Whether to filter to trading session
    """
    # Find all CSV and Parquet files
    files = list(input_dir.glob("*.csv")) + list(input_dir.glob("*.parquet"))

    if not files:
        logger.warning(f"No data files found in {input_dir}")
        return

    # Detect timeframe from directory name or file
    timeframe_map = {
        "1min": Timeframe.M1,
        "5min": Timeframe.M5,
        "15min": Timeframe.M15,
    }

    timeframe = None
    for tf_name, tf_enum in timeframe_map.items():
        if tf_name in str(input_dir):
            timeframe = tf_enum
            break

    if timeframe is None:
        logger.error(f"Could not determine timeframe from directory: {input_dir}")
        return

    logger.info(f"Processing {len(files)} files from {input_dir}")

    for file_path in files:
        try:
            process_file(
                file_path,
                symbol,
                timeframe,
                output_dir,
                save_to_db,
                filter_session_hours,
            )
        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}", exc_info=True)
            continue

    logger.info("Batch processing completed")


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Batch process OHLCV data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process single file
  python scripts/process_data.py --input data/raw/ohlcv/1min/m1.csv \\
      --output data/processed/ --symbol DAX --timeframe 1min

  # Process directory
  python scripts/process_data.py --input data/raw/ohlcv/1min/ \\
      --output data/processed/ --symbol DAX

  # Process and save to database
  python scripts/process_data.py --input data/raw/ohlcv/1min/ \\
      --output data/processed/ --save-db
""",
    )

    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input file or directory",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output directory",
    )
    parser.add_argument(
        "--symbol",
        type=str,
        default="DAX",
        help="Trading symbol (default: DAX)",
    )
    parser.add_argument(
        "--timeframe",
        type=str,
        choices=["1min", "5min", "15min"],
        help="Timeframe (required if processing single file)",
    )
    parser.add_argument(
        "--save-db",
        action="store_true",
        help="Save processed data to database",
    )
    parser.add_argument(
        "--no-session-filter",
        action="store_true",
        help="Don't filter to trading session hours (3-4 AM EST)",
    )

    args = parser.parse_args()

    try:
        input_path = Path(args.input)
        output_dir = Path(args.output)

        if not input_path.exists():
            logger.error(f"Input path does not exist: {input_path}")
            return 1

        # Process single file or directory
        if input_path.is_file():
            if not args.timeframe:
                parser.error("--timeframe is required when processing a single file")
                return 1

            timeframe_map = {
                "1min": Timeframe.M1,
                "5min": Timeframe.M5,
                "15min": Timeframe.M15,
            }
            timeframe = timeframe_map[args.timeframe]

            process_file(
                input_path,
                args.symbol,
                timeframe,
                output_dir,
                save_to_db=args.save_db,
                filter_session_hours=not args.no_session_filter,
            )

        elif input_path.is_dir():
            process_directory(
                input_path,
                output_dir,
                symbol=args.symbol,
                save_to_db=args.save_db,
                filter_session_hours=not args.no_session_filter,
            )

        else:
            logger.error(f"Input path is neither file nor directory: {input_path}")
            return 1

        logger.info("Data processing completed successfully")
        return 0

    except Exception as e:
        logger.error(f"Data processing failed: {e}", exc_info=True)
        return 1


if __name__ == "__main__":
    sys.exit(main())
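Note: the filter_session_hours flag in both entry points maps to the project's src.data.preprocessors.filter_session, which restricts bars to the 3-4 AM America/New_York trading window. A rough standalone equivalent in plain pandas (a sketch, not the project's actual implementation, assuming a timezone-aware "timestamp" column) would be:

    import pandas as pd

    def filter_session_sketch(df: pd.DataFrame) -> pd.DataFrame:
        # Convert to New York time and keep bars whose wall-clock hour is 3 (03:00-03:59).
        ts = pd.to_datetime(df["timestamp"], utc=True).dt.tz_convert("America/New_York")
        return df.loc[ts.dt.hour == 3].reset_index(drop=True)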
scripts/setup_database.py (new executable file, 47 lines)
#!/usr/bin/env python3
"""Initialize database and create tables."""

import argparse
import sys
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from src.data.database import init_database  # noqa: E402
from src.logging import get_logger  # noqa: E402

logger = get_logger(__name__)


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Initialize database and create tables")
    parser.add_argument(
        "--skip-tables",
        action="store_true",
        help="Skip table creation (useful for testing connection only)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    try:
        logger.info("Initializing database...")
        init_database(create_tables=not args.skip_tables)
        logger.info("Database initialization completed successfully")
        return 0

    except Exception as e:
        logger.error(f"Database initialization failed: {e}", exc_info=True)
        return 1


if __name__ == "__main__":
    sys.exit(main())
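Note: after running this script, table creation can be spot-checked straight from Python. The snippet below assumes the engine accessor src.data.database.get_engine that the validation script further down already uses, and otherwise relies only on SQLAlchemy's standard inspector:

    from sqlalchemy import inspect

    from src.data.database import get_engine

    # List the tables created during initialization (names depend on the project's models).
    print(inspect(get_engine()).get_table_names())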
scripts/validate_data_pipeline.py (new executable file, 172 lines)
#!/usr/bin/env python3
"""Validate data pipeline setup for v0.2.0."""

import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.core.enums import Timeframe  # noqa: E402
from src.data.database import get_engine, init_database  # noqa: E402
from src.data.loaders import CSVLoader, ParquetLoader  # noqa: E402
from src.data.preprocessors import (  # noqa: E402
    filter_session,
    handle_missing_data,
    remove_duplicates,
)
from src.data.repositories import OHLCVRepository  # noqa: E402
from src.data.validators import check_continuity, detect_outliers, validate_ohlcv  # noqa: E402
from src.logging import get_logger  # noqa: E402

logger = get_logger(__name__)


def validate_imports():
    """Validate that all data module imports work."""
    print("✓ Data module imports successful")


def validate_database():
    """Validate database setup."""
    try:
        engine = get_engine()
        assert engine is not None
        print("✓ Database engine created")

        # Test initialization (will create tables if needed)
        init_database(create_tables=True)
        print("✓ Database initialization successful")
    except Exception as e:
        print(f"✗ Database validation failed: {e}")
        raise


def validate_loaders():
    """Validate data loaders."""
    try:
        csv_loader = CSVLoader()
        parquet_loader = ParquetLoader()
        assert csv_loader is not None
        assert parquet_loader is not None
        print("✓ Data loaders initialized")
    except Exception as e:
        print(f"✗ Loader validation failed: {e}")
        raise


def validate_preprocessors():
    """Validate preprocessors."""
    import pandas as pd
    import pytz  # type: ignore[import-untyped]

    # Create sample data with EST timezone (trading session is 3-4 AM EST)
    est = pytz.timezone("America/New_York")
    timestamps = pd.date_range("2024-01-01 03:00", periods=10, freq="1min", tz=est)

    df = pd.DataFrame(
        {
            "timestamp": timestamps,
            "open": [100.0] * 10,
            "high": [100.5] * 10,
            "low": [99.5] * 10,
            "close": [100.2] * 10,
        }
    )

    # Test preprocessors
    df_processed = handle_missing_data(df)
    df_processed = remove_duplicates(df_processed)
    df_filtered = filter_session(df_processed)

    assert len(df_filtered) > 0
    print("✓ Preprocessors working")


def validate_validators():
    """Validate validators."""
    import pandas as pd

    # Create valid data (timezone not required for validators)
    df = pd.DataFrame(
        {
            "timestamp": pd.date_range("2024-01-01 03:00", periods=10, freq="1min"),
            "open": [100.0] * 10,
            "high": [100.5] * 10,
            "low": [99.5] * 10,
            "close": [100.2] * 10,
        }
    )

    # Test validators
    df_validated = validate_ohlcv(df)
    is_continuous, gaps = check_continuity(df_validated, Timeframe.M1)
    _ = detect_outliers(df_validated)  # Check it runs without error

    assert len(df_validated) == 10
    print("✓ Validators working")


def validate_repositories():
    """Validate repositories."""
    from src.data.database import get_db_session

    try:
        with get_db_session() as session:
            repo = OHLCVRepository(session=session)
            assert repo is not None
        print("✓ Repositories working")
    except Exception as e:
        print(f"✗ Repository validation failed: {e}")
        raise


def validate_directories():
    """Validate directory structure."""
    required_dirs = [
        "data/raw/ohlcv/1min",
        "data/raw/ohlcv/5min",
        "data/raw/ohlcv/15min",
        "data/processed/features",
        "data/processed/patterns",
        "data/labels/individual_patterns",
    ]

    for dir_name in required_dirs:
        dir_path = Path(dir_name)
        if not dir_path.exists():
            print(f"✗ Missing directory: {dir_name}")
            return False

    print("✓ Directory structure valid")
    return True

def main():
    """Run all validation checks."""
    print("Validating ICT ML Trading System v0.2.0 Data Pipeline...")
    print("-" * 60)

    try:
        validate_imports()
        validate_database()
        validate_loaders()
        validate_preprocessors()
        validate_validators()
        validate_repositories()
        if not validate_directories():
            # validate_directories() reports the missing path itself; fail the run here.
            print("-" * 60)
            print("✗ Validation failed: missing required directories")
            return 1

        print("-" * 60)
        print("✓ All validations passed!")
        return 0

    except Exception as e:
        print(f"✗ Validation failed: {e}")
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
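Note: taken together, the four scripts compose the same project APIs they import. A condensed in-process version of the whole flow, using only calls that appear in this commit (the path and symbol are taken from the CLI examples above), looks roughly like this:

    from src.core.enums import Timeframe
    from src.data.database import get_db_session, init_database
    from src.data.loaders import load_and_preprocess
    from src.data.models import OHLCVData
    from src.data.repositories import OHLCVRepository

    init_database(create_tables=True)  # scripts/setup_database.py

    df = load_and_preprocess(  # scripts/process_data.py
        "data/raw/ohlcv/1min/m1.csv",
        loader_type="auto",
        validate=True,
        preprocess=True,
        filter_to_session=True,
    )

    with get_db_session() as session:  # persistence step from process_file(..., save_to_db=True)
        repo = OHLCVRepository(session=session)
        records = [
            OHLCVData(
                symbol="DAX",
                timeframe=Timeframe.M1,
                timestamp=row["timestamp"],
                open=row["open"],
                high=row["high"],
                low=row["low"],
                close=row["close"],
                volume=row.get("volume"),
            )
            for _, row in df.iterrows()
            if not repo.exists("DAX", Timeframe.M1, row["timestamp"])
        ]
        if records:
            repo.create_batch(records)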