feat(v0.2.0): data pipeline
scripts/download_data.py | 183 additions (new executable file)
@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""Download DAX OHLCV data from external sources."""

import argparse
import sys
from pathlib import Path

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from src.core.enums import Timeframe  # noqa: E402
from src.logging import get_logger  # noqa: E402

logger = get_logger(__name__)


def download_from_csv(
    input_file: str,
    symbol: str,
    timeframe: Timeframe,
    output_dir: Path,
) -> None:
    """
    Copy/convert CSV file to standard format.

    Args:
        input_file: Path to input CSV file
        symbol: Trading symbol
        timeframe: Timeframe enum
        output_dir: Output directory
    """
    from src.data.loaders import CSVLoader

    loader = CSVLoader()
    df = loader.load(input_file, symbol=symbol, timeframe=timeframe)

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save as CSV
    output_file = output_dir / f"{symbol}_{timeframe.value}.csv"
    df.to_csv(output_file, index=False)
    logger.info(f"Saved {len(df)} rows to {output_file}")

    # Also save as Parquet for faster loading
    output_parquet = output_dir / f"{symbol}_{timeframe.value}.parquet"
    df.to_parquet(output_parquet, index=False)
    logger.info(f"Saved {len(df)} rows to {output_parquet}")


def download_from_api(
    symbol: str,
    timeframe: Timeframe,
    start_date: str,
    end_date: str,
    output_dir: Path,
    api_provider: str = "manual",
) -> None:
    """
    Download data from API (placeholder for future implementation).

    Args:
        symbol: Trading symbol
        timeframe: Timeframe enum
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        output_dir: Output directory
        api_provider: API provider name
    """
    logger.warning(
        "API download not yet implemented. Please provide a CSV file using the --input-file option."
    )
    logger.info(
        f"Would download {symbol} {timeframe.value} data from {start_date} to {end_date}"
    )
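    # A possible future shape for this function (an assumption, not implemented in
    # this commit): fetch OHLCV rows from the selected api_provider, build a
    # DataFrame, and persist it the same way download_from_csv does, e.g.:
    #
    #     df = fetch_ohlcv(api_provider, symbol, timeframe, start_date, end_date)  # hypothetical helper
    #     output_dir.mkdir(parents=True, exist_ok=True)
    #     df.to_csv(output_dir / f"{symbol}_{timeframe.value}.csv", index=False)
    #     df.to_parquet(output_dir / f"{symbol}_{timeframe.value}.parquet", index=False)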


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Download DAX OHLCV data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download from CSV file
  python scripts/download_data.py --input-file data.csv \\
      --symbol DAX --timeframe 1min \\
      --output data/raw/ohlcv/1min/

  # Download from API (when implemented)
  python scripts/download_data.py --symbol DAX --timeframe 5min \\
      --start 2024-01-01 --end 2024-01-31 \\
      --output data/raw/ohlcv/5min/
        """,
    )

    # Input options
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--input-file",
        type=str,
        help="Path to input CSV file",
    )
    input_group.add_argument(
        "--api",
        action="store_true",
        help="Download from API (not yet implemented)",
    )

    # Symbol, timeframe, and output arguments (--symbol has a default; the rest are required)
    parser.add_argument(
        "--symbol",
        type=str,
        default="DAX",
        help="Trading symbol (default: DAX)",
    )
    parser.add_argument(
        "--timeframe",
        type=str,
        choices=["1min", "5min", "15min"],
        required=True,
        help="Timeframe",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output directory",
    )

    # Optional arguments for API download
    parser.add_argument(
        "--start",
        type=str,
        help="Start date (YYYY-MM-DD) for API download",
    )
    parser.add_argument(
        "--end",
        type=str,
        help="End date (YYYY-MM-DD) for API download",
    )

    args = parser.parse_args()

    try:
        # Convert timeframe string to enum
        timeframe_map = {
            "1min": Timeframe.M1,
            "5min": Timeframe.M5,
            "15min": Timeframe.M15,
        }
        timeframe = timeframe_map[args.timeframe]

        # Create output directory
        output_dir = Path(args.output)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Download data
        if args.input_file:
            logger.info(f"Downloading from CSV: {args.input_file}")
            download_from_csv(args.input_file, args.symbol, timeframe, output_dir)
        elif args.api:
            if not args.start or not args.end:
                parser.error("--start and --end are required for API download")
            download_from_api(
                args.symbol,
                timeframe,
                args.start,
                args.end,
                output_dir,
            )

        logger.info("Data download completed successfully")
        return 0

    except Exception as e:
        logger.error(f"Data download failed: {e}", exc_info=True)
        return 1


if __name__ == "__main__":
    sys.exit(main())
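
The converter can also be called programmatically instead of through the CLI. A minimal sketch, assuming the script is importable from the project root (e.g. as scripts.download_data) and using a placeholder input path:

    from pathlib import Path

    from src.core.enums import Timeframe
    from scripts.download_data import download_from_csv

    # Convert a raw 1-minute DAX CSV into the standard CSV + Parquet layout,
    # mirroring the first CLI example in the epilog above.
    download_from_csv(
        input_file="data.csv",  # placeholder path
        symbol="DAX",
        timeframe=Timeframe.M1,
        output_dir=Path("data/raw/ohlcv/1min"),
    )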