#!/usr/bin/env python3 """ ╔══════════════════════════════════════════════════════════════════════╗ ║ fetch_data.py — MT5 XAUUSDc M15 Data Fetcher ║ ║ Fetches 1-year OHLCV + spread data from MetaTrader5, saves to CSV ║ ║ Run locally with MT5 terminal open and logged in. ║ ╚══════════════════════════════════════════════════════════════════════╝ """ import sys import time import numpy as np import pandas as pd from datetime import datetime, timedelta from pathlib import Path try: import MetaTrader5 as mt5 except ImportError: print("ERROR: MetaTrader5 package not installed. Run: pip install MetaTrader5") sys.exit(1) # ══════════════════════════════════════════════════════════════════════ # CONFIGURATION # ══════════════════════════════════════════════════════════════════════ SYMBOL = "XAUUSDc" TIMEFRAME = mt5.TIMEFRAME_M15 TF_LABEL = "M15" LOOKBACK_DAYS = 365 # 1 year OUTPUT_DIR = Path(__file__).parent OUTPUT_FILE = OUTPUT_DIR / f"{SYMBOL}_{TF_LABEL}_data.csv" # ══════════════════════════════════════════════════════════════════════ # MT5 CONNECTION # ══════════════════════════════════════════════════════════════════════ def init_mt5() -> None: """Initialize MT5 connection with retries.""" for attempt in range(3): if mt5.initialize(): info = mt5.terminal_info() print(f"✓ MT5 connected — Build {info.build}, Company: {info.company}") return print(f" Attempt {attempt+1}/3 failed, retrying in 2s...") time.sleep(2) print(f"✗ MT5 initialization failed: {mt5.last_error()}") sys.exit(1) def validate_symbol(symbol: str) -> dict: """Validate symbol exists and return its properties.""" info = mt5.symbol_info(symbol) if info is None: # Try to find similar symbols symbols = mt5.symbols_get() gold_syms = [s.name for s in symbols if "XAU" in s.name or "GOLD" in s.name.upper()] print(f"✗ Symbol '{symbol}' not found.") if gold_syms: print(f" Available gold symbols: {gold_syms}") else: print(f" No gold symbols found. Check your broker.") sys.exit(1) if not info.visible: mt5.symbol_select(symbol, True) time.sleep(0.5) props = { "name": info.name, "digits": info.digits, "point": info.point, "spread": info.spread, "trade_mode": info.trade_mode, "volume_min": info.volume_min, "volume_max": info.volume_max, "volume_step": info.volume_step, } print(f"✓ Symbol validated: {info.name}") print(f" Digits: {info.digits} | Point: {info.point} | " f"Current Spread: {info.spread} | Min Lot: {info.volume_min}") return props # ══════════════════════════════════════════════════════════════════════ # DATA FETCHING # ══════════════════════════════════════════════════════════════════════ def fetch_ohlcv(symbol: str, timeframe: int, days: int) -> pd.DataFrame: """Fetch OHLCV bars from MT5. Chunked to avoid API limits.""" utc_now = datetime.utcnow() date_from = utc_now - timedelta(days=days) print(f"\n→ Fetching {TF_LABEL} bars: {date_from.date()} to {utc_now.date()}...") # Fetch in chunks of 50000 (MT5 limit is ~100K per call) rates = mt5.copy_rates_range(symbol, timeframe, date_from, utc_now) if rates is None or len(rates) == 0: print(f"✗ No data returned: {mt5.last_error()}") sys.exit(1) df = pd.DataFrame(rates) df["time"] = pd.to_datetime(df["time"], unit="s", utc=True) df.rename(columns={"real_volume": "volume"}, inplace=True) # Ensure columns required = ["time", "open", "high", "low", "close", "tick_volume", "spread"] for col in required: if col not in df.columns: if col == "volume" and "tick_volume" in df.columns: df["volume"] = df["tick_volume"] elif col == "spread": df["spread"] = 0 # Will be filled from ticks else: print(f"✗ Missing column: {col}") sys.exit(1) print(f"✓ Fetched {len(df):,} bars") print(f" Date range: {df['time'].iloc[0]} → {df['time'].iloc[-1]}") return df def fetch_spread_from_ticks(symbol: str, days: int, bar_times: pd.Series) -> pd.Series: """Fetch tick data to compute average spread per bar. Falls back to bar spread.""" print(f"\n→ Computing spread from tick data (sampling last 30 days)...") utc_now = datetime.utcnow() # Only fetch recent ticks for spread estimation (full year would be enormous) tick_start = utc_now - timedelta(days=min(days, 30)) ticks = mt5.copy_ticks_range(symbol, tick_start, utc_now, mt5.COPY_TICKS_INFO) if ticks is None or len(ticks) == 0: print(f" ⚠ No tick data available, using bar spread column") return None tick_df = pd.DataFrame(ticks) tick_df["time"] = pd.to_datetime(tick_df["time"], unit="s", utc=True) tick_df["spread_pts"] = (tick_df["ask"] - tick_df["bid"]) / mt5.symbol_info(symbol).point avg_spread = tick_df["spread_pts"].mean() median_spread = tick_df["spread_pts"].median() max_spread = tick_df["spread_pts"].quantile(0.99) print(f"✓ Processed {len(tick_df):,} ticks") print(f" Avg spread: {avg_spread:.1f} pts | " f"Median: {median_spread:.1f} pts | " f"99th pctl: {max_spread:.1f} pts") return tick_df["spread_pts"] # ══════════════════════════════════════════════════════════════════════ # DATA VALIDATION & CLEANING # ══════════════════════════════════════════════════════════════════════ def validate_data(df: pd.DataFrame) -> pd.DataFrame: """Validate and clean OHLCV data.""" print(f"\n→ Validating data quality...") issues = [] # 1. Check for NaN nan_count = df[["open", "high", "low", "close"]].isnull().sum().sum() if nan_count > 0: issues.append(f" ⚠ {nan_count} NaN values in OHLCV — forward-filling") df[["open", "high", "low", "close"]] = df[["open", "high", "low", "close"]].ffill() # 2. Check OHLC integrity bad_hl = (df["high"] < df["low"]).sum() if bad_hl > 0: issues.append(f" ⚠ {bad_hl} bars where high < low — swapping") mask = df["high"] < df["low"] df.loc[mask, ["high", "low"]] = df.loc[mask, ["low", "high"]].values bad_range = ((df["open"] > df["high"]) | (df["open"] < df["low"]) | (df["close"] > df["high"]) | (df["close"] < df["low"])).sum() if bad_range > 0: issues.append(f" ⚠ {bad_range} bars where open/close outside high-low range — clamping") df["open"] = df["open"].clip(lower=df["low"], upper=df["high"]) df["close"] = df["close"].clip(lower=df["low"], upper=df["high"]) # 3. Check for duplicates dups = df["time"].duplicated().sum() if dups > 0: issues.append(f" ⚠ {dups} duplicate timestamps — keeping last") df = df.drop_duplicates(subset="time", keep="last") # 4. Check for large time gaps (> 2 days = likely holiday, OK; > 5 days = suspicious) time_diff = df["time"].diff() large_gaps = time_diff[time_diff > pd.Timedelta(days=5)] if len(large_gaps) > 0: for idx in large_gaps.index: gap = time_diff.loc[idx] issues.append(f" ⚠ Large gap: {df['time'].iloc[idx-1]} → {df['time'].iloc[idx]} ({gap})") # 5. Ensure data is sorted df = df.sort_values("time").reset_index(drop=True) # 6. Filter weekends (Saturday/Sunday bars are artifacts) weekend_mask = df["time"].dt.dayofweek.isin([5, 6]) weekend_count = weekend_mask.sum() if weekend_count > 0: issues.append(f" ℹ Removed {weekend_count} weekend bars") df = df[~weekend_mask].reset_index(drop=True) if issues: for issue in issues: print(issue) else: print(" ✓ Data quality: PASS (no issues found)") # Summary stats print(f"\n Final dataset: {len(df):,} bars") print(f" Price range: {df['close'].min():.2f} — {df['close'].max():.2f}") print(f" Avg spread: {df['spread'].mean():.1f} pts") print(f" Date range: {df['time'].iloc[0].date()} → {df['time'].iloc[-1].date()}") return df # ══════════════════════════════════════════════════════════════════════ # MAIN # ══════════════════════════════════════════════════════════════════════ def main(): print("=" * 68) print(" MT5 Data Fetcher — XAUUSDc M15 (1 Year)") print("=" * 68) # 1. Connect init_mt5() try: # 2. Validate symbol sym_props = validate_symbol(SYMBOL) # 3. Fetch OHLCV df = fetch_ohlcv(SYMBOL, TIMEFRAME, LOOKBACK_DAYS) # 4. Enhance spread data from ticks tick_spread = fetch_spread_from_ticks(SYMBOL, LOOKBACK_DAYS, df["time"]) if tick_spread is not None: # Use tick-derived average as fallback for bars with 0 spread avg_tick_spread = tick_spread.median() zero_spread_mask = df["spread"] == 0 if zero_spread_mask.sum() > 0: df.loc[zero_spread_mask, "spread"] = int(avg_tick_spread) print(f" Filled {zero_spread_mask.sum()} zero-spread bars with median: {avg_tick_spread:.0f}") # 5. Validate df = validate_data(df) # 6. Add metadata columns useful for training df["hour"] = df["time"].dt.hour df["dayofweek"] = df["time"].dt.dayofweek df["returns"] = np.log(df["close"] / df["close"].shift(1)) # 7. Save # Select and order columns for output output_cols = [ "time", "open", "high", "low", "close", "tick_volume", "spread", "hour", "dayofweek", "returns" ] # Add volume if it exists separately if "volume" in df.columns and "volume" not in output_cols: output_cols.insert(5, "volume") df_out = df[[c for c in output_cols if c in df.columns]] df_out.to_csv(OUTPUT_FILE, index=False) print(f"\n{'=' * 68}") print(f" ✓ SAVED: {OUTPUT_FILE}") print(f" ✓ Rows: {len(df_out):,} | Columns: {len(df_out.columns)}") print(f" ✓ File size: {OUTPUT_FILE.stat().st_size / 1024:.0f} KB") print(f"{'=' * 68}") # 8. Quick sanity print print(f"\nSample (first 3 rows):") print(df_out.head(3).to_string(index=False)) print(f"\nSample (last 3 rows):") print(df_out.tail(3).to_string(index=False)) finally: mt5.shutdown() print("\n✓ MT5 connection closed.") if __name__ == "__main__": main()