algorembrant's picture
Upload 3 files
3817007 verified
#!/usr/bin/env python3
"""
╔══════════════════════════════════════════════════════════════════════╗
β•‘ fetch_data.py β€” MT5 XAUUSDc M15 Data Fetcher β•‘
β•‘ Fetches 1-year OHLCV + spread data from MetaTrader5, saves to CSV β•‘
β•‘ Run locally with MT5 terminal open and logged in. β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
"""
import sys
import time
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
try:
import MetaTrader5 as mt5
except ImportError:
print("ERROR: MetaTrader5 package not installed. Run: pip install MetaTrader5")
sys.exit(1)
# ══════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ══════════════════════════════════════════════════════════════════════
SYMBOL = "XAUUSDc"
TIMEFRAME = mt5.TIMEFRAME_M15
TF_LABEL = "M15"
LOOKBACK_DAYS = 365 # 1 year
OUTPUT_DIR = Path(__file__).parent
OUTPUT_FILE = OUTPUT_DIR / f"{SYMBOL}_{TF_LABEL}_data.csv"
# ══════════════════════════════════════════════════════════════════════
# MT5 CONNECTION
# ══════════════════════════════════════════════════════════════════════
def init_mt5() -> None:
"""Initialize MT5 connection with retries."""
for attempt in range(3):
if mt5.initialize():
info = mt5.terminal_info()
print(f"βœ“ MT5 connected β€” Build {info.build}, Company: {info.company}")
return
print(f" Attempt {attempt+1}/3 failed, retrying in 2s...")
time.sleep(2)
print(f"βœ— MT5 initialization failed: {mt5.last_error()}")
sys.exit(1)
def validate_symbol(symbol: str) -> dict:
"""Validate symbol exists and return its properties."""
info = mt5.symbol_info(symbol)
if info is None:
# Try to find similar symbols
symbols = mt5.symbols_get()
gold_syms = [s.name for s in symbols if "XAU" in s.name or "GOLD" in s.name.upper()]
print(f"βœ— Symbol '{symbol}' not found.")
if gold_syms:
print(f" Available gold symbols: {gold_syms}")
else:
print(f" No gold symbols found. Check your broker.")
sys.exit(1)
if not info.visible:
mt5.symbol_select(symbol, True)
time.sleep(0.5)
props = {
"name": info.name,
"digits": info.digits,
"point": info.point,
"spread": info.spread,
"trade_mode": info.trade_mode,
"volume_min": info.volume_min,
"volume_max": info.volume_max,
"volume_step": info.volume_step,
}
print(f"βœ“ Symbol validated: {info.name}")
print(f" Digits: {info.digits} | Point: {info.point} | "
f"Current Spread: {info.spread} | Min Lot: {info.volume_min}")
return props
# ══════════════════════════════════════════════════════════════════════
# DATA FETCHING
# ══════════════════════════════════════════════════════════════════════
def fetch_ohlcv(symbol: str, timeframe: int, days: int) -> pd.DataFrame:
"""Fetch OHLCV bars from MT5. Chunked to avoid API limits."""
utc_now = datetime.utcnow()
date_from = utc_now - timedelta(days=days)
print(f"\n→ Fetching {TF_LABEL} bars: {date_from.date()} to {utc_now.date()}...")
# Fetch in chunks of 50000 (MT5 limit is ~100K per call)
rates = mt5.copy_rates_range(symbol, timeframe, date_from, utc_now)
if rates is None or len(rates) == 0:
print(f"βœ— No data returned: {mt5.last_error()}")
sys.exit(1)
df = pd.DataFrame(rates)
df["time"] = pd.to_datetime(df["time"], unit="s", utc=True)
df.rename(columns={"real_volume": "volume"}, inplace=True)
# Ensure columns
required = ["time", "open", "high", "low", "close", "tick_volume", "spread"]
for col in required:
if col not in df.columns:
if col == "volume" and "tick_volume" in df.columns:
df["volume"] = df["tick_volume"]
elif col == "spread":
df["spread"] = 0 # Will be filled from ticks
else:
print(f"βœ— Missing column: {col}")
sys.exit(1)
print(f"βœ“ Fetched {len(df):,} bars")
print(f" Date range: {df['time'].iloc[0]} β†’ {df['time'].iloc[-1]}")
return df
def fetch_spread_from_ticks(symbol: str, days: int, bar_times: pd.Series) -> pd.Series:
"""Fetch tick data to compute average spread per bar. Falls back to bar spread."""
print(f"\n→ Computing spread from tick data (sampling last 30 days)...")
utc_now = datetime.utcnow()
# Only fetch recent ticks for spread estimation (full year would be enormous)
tick_start = utc_now - timedelta(days=min(days, 30))
ticks = mt5.copy_ticks_range(symbol, tick_start, utc_now, mt5.COPY_TICKS_INFO)
if ticks is None or len(ticks) == 0:
print(f" ⚠ No tick data available, using bar spread column")
return None
tick_df = pd.DataFrame(ticks)
tick_df["time"] = pd.to_datetime(tick_df["time"], unit="s", utc=True)
tick_df["spread_pts"] = (tick_df["ask"] - tick_df["bid"]) / mt5.symbol_info(symbol).point
avg_spread = tick_df["spread_pts"].mean()
median_spread = tick_df["spread_pts"].median()
max_spread = tick_df["spread_pts"].quantile(0.99)
print(f"βœ“ Processed {len(tick_df):,} ticks")
print(f" Avg spread: {avg_spread:.1f} pts | "
f"Median: {median_spread:.1f} pts | "
f"99th pctl: {max_spread:.1f} pts")
return tick_df["spread_pts"]
# ══════════════════════════════════════════════════════════════════════
# DATA VALIDATION & CLEANING
# ══════════════════════════════════════════════════════════════════════
def validate_data(df: pd.DataFrame) -> pd.DataFrame:
"""Validate and clean OHLCV data."""
print(f"\n→ Validating data quality...")
issues = []
# 1. Check for NaN
nan_count = df[["open", "high", "low", "close"]].isnull().sum().sum()
if nan_count > 0:
issues.append(f" ⚠ {nan_count} NaN values in OHLCV β€” forward-filling")
df[["open", "high", "low", "close"]] = df[["open", "high", "low", "close"]].ffill()
# 2. Check OHLC integrity
bad_hl = (df["high"] < df["low"]).sum()
if bad_hl > 0:
issues.append(f" ⚠ {bad_hl} bars where high < low β€” swapping")
mask = df["high"] < df["low"]
df.loc[mask, ["high", "low"]] = df.loc[mask, ["low", "high"]].values
bad_range = ((df["open"] > df["high"]) | (df["open"] < df["low"]) |
(df["close"] > df["high"]) | (df["close"] < df["low"])).sum()
if bad_range > 0:
issues.append(f" ⚠ {bad_range} bars where open/close outside high-low range β€” clamping")
df["open"] = df["open"].clip(lower=df["low"], upper=df["high"])
df["close"] = df["close"].clip(lower=df["low"], upper=df["high"])
# 3. Check for duplicates
dups = df["time"].duplicated().sum()
if dups > 0:
issues.append(f" ⚠ {dups} duplicate timestamps β€” keeping last")
df = df.drop_duplicates(subset="time", keep="last")
# 4. Check for large time gaps (> 2 days = likely holiday, OK; > 5 days = suspicious)
time_diff = df["time"].diff()
large_gaps = time_diff[time_diff > pd.Timedelta(days=5)]
if len(large_gaps) > 0:
for idx in large_gaps.index:
gap = time_diff.loc[idx]
issues.append(f" ⚠ Large gap: {df['time'].iloc[idx-1]} β†’ {df['time'].iloc[idx]} ({gap})")
# 5. Ensure data is sorted
df = df.sort_values("time").reset_index(drop=True)
# 6. Filter weekends (Saturday/Sunday bars are artifacts)
weekend_mask = df["time"].dt.dayofweek.isin([5, 6])
weekend_count = weekend_mask.sum()
if weekend_count > 0:
issues.append(f" β„Ή Removed {weekend_count} weekend bars")
df = df[~weekend_mask].reset_index(drop=True)
if issues:
for issue in issues:
print(issue)
else:
print(" βœ“ Data quality: PASS (no issues found)")
# Summary stats
print(f"\n Final dataset: {len(df):,} bars")
print(f" Price range: {df['close'].min():.2f} β€” {df['close'].max():.2f}")
print(f" Avg spread: {df['spread'].mean():.1f} pts")
print(f" Date range: {df['time'].iloc[0].date()} β†’ {df['time'].iloc[-1].date()}")
return df
# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════
def main():
print("=" * 68)
print(" MT5 Data Fetcher β€” XAUUSDc M15 (1 Year)")
print("=" * 68)
# 1. Connect
init_mt5()
try:
# 2. Validate symbol
sym_props = validate_symbol(SYMBOL)
# 3. Fetch OHLCV
df = fetch_ohlcv(SYMBOL, TIMEFRAME, LOOKBACK_DAYS)
# 4. Enhance spread data from ticks
tick_spread = fetch_spread_from_ticks(SYMBOL, LOOKBACK_DAYS, df["time"])
if tick_spread is not None:
# Use tick-derived average as fallback for bars with 0 spread
avg_tick_spread = tick_spread.median()
zero_spread_mask = df["spread"] == 0
if zero_spread_mask.sum() > 0:
df.loc[zero_spread_mask, "spread"] = int(avg_tick_spread)
print(f" Filled {zero_spread_mask.sum()} zero-spread bars with median: {avg_tick_spread:.0f}")
# 5. Validate
df = validate_data(df)
# 6. Add metadata columns useful for training
df["hour"] = df["time"].dt.hour
df["dayofweek"] = df["time"].dt.dayofweek
df["returns"] = np.log(df["close"] / df["close"].shift(1))
# 7. Save
# Select and order columns for output
output_cols = [
"time", "open", "high", "low", "close",
"tick_volume", "spread", "hour", "dayofweek", "returns"
]
# Add volume if it exists separately
if "volume" in df.columns and "volume" not in output_cols:
output_cols.insert(5, "volume")
df_out = df[[c for c in output_cols if c in df.columns]]
df_out.to_csv(OUTPUT_FILE, index=False)
print(f"\n{'=' * 68}")
print(f" βœ“ SAVED: {OUTPUT_FILE}")
print(f" βœ“ Rows: {len(df_out):,} | Columns: {len(df_out.columns)}")
print(f" βœ“ File size: {OUTPUT_FILE.stat().st_size / 1024:.0f} KB")
print(f"{'=' * 68}")
# 8. Quick sanity print
print(f"\nSample (first 3 rows):")
print(df_out.head(3).to_string(index=False))
print(f"\nSample (last 3 rows):")
print(df_out.tail(3).to_string(index=False))
finally:
mt5.shutdown()
print("\nβœ“ MT5 connection closed.")
if __name__ == "__main__":
main()