Spaces:
Sleeping
Sleeping
| """ | |
| Data loader for the Quant Research Environment. | |
| Loads compressed NIFTY/BANKNIFTY CSV data, performs inner-join merge, | |
| and caches results in memory. | |
| """ | |
| import gzip | |
| import io | |
| from pathlib import Path | |
| from typing import Tuple | |
| import pandas as pd | |
# In-memory store of already-loaded datasets; the getters below key it by
# split name so each split is read from disk at most once per process.
_cache: dict = {}

# The CSV files are looked up in a "data" folder next to this module.
_DATA_DIR = Path(__file__).parent.joinpath("data")
| def _load_and_merge(nifty_path: Path, banknifty_path: Path) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: | |
| """Load two CSVs and inner-join merge on Date.""" | |
| if str(nifty_path).endswith(".gz"): | |
| with gzip.open(nifty_path, "rt") as f: | |
| nifty = pd.read_csv(f, parse_dates=["Date"]) | |
| with gzip.open(banknifty_path, "rt") as f: | |
| banknifty = pd.read_csv(f, parse_dates=["Date"]) | |
| else: | |
| nifty = pd.read_csv(nifty_path, parse_dates=["Date"]) | |
| banknifty = pd.read_csv(banknifty_path, parse_dates=["Date"]) | |
| merged = pd.merge( | |
| nifty, banknifty, | |
| on="Date", how="inner", suffixes=("_nifty", "_banknifty"), | |
| ).sort_values("Date").reset_index(drop=True) | |
| nifty_aligned = pd.DataFrame({ | |
| "Date": merged["Date"], | |
| "Open": merged["Open_nifty"], | |
| "High": merged["High_nifty"], | |
| "Low": merged["Low_nifty"], | |
| "Close": merged["Close_nifty"], | |
| }) | |
| banknifty_aligned = pd.DataFrame({ | |
| "Date": merged["Date"], | |
| "Open": merged["Open_banknifty"], | |
| "High": merged["High_banknifty"], | |
| "Low": merged["Low_banknifty"], | |
| "Close": merged["Close_banknifty"], | |
| }) | |
| return nifty_aligned, banknifty_aligned, merged | |
def get_train_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Return the training split (documented as 2015-2022), loading it on first use.

    The result is stored in the module cache under "train", so later calls
    hand back the same tuple without touching the disk again.

    Returns (nifty_aligned, banknifty_aligned, merged).
    """
    try:
        return _cache["train"]
    except KeyError:
        pass  # first request: fall through and load from disk

    frames = _load_and_merge(
        _DATA_DIR / "nifty_train.csv.gz",
        _DATA_DIR / "banknifty_train.csv.gz",
    )
    _cache["train"] = frames
    return frames
def get_test_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load and cache test data (documented as 2023-2026).

    The result is stored in the module cache under "test"; subsequent calls
    return the cached tuple.

    Returns (nifty_aligned, banknifty_aligned, merged).

    Raises:
        FileNotFoundError: if either test CSV is missing from the data
            directory.
    """
    if "test" not in _cache:
        nifty_path = _DATA_DIR / "nifty_test.csv.gz"
        banknifty_path = _DATA_DIR / "banknifty_test.csv.gz"
        # Both files are required for the merge; previously only the NIFTY
        # file was checked, so a missing BANKNIFTY file slipped past this
        # guard and failed later with a lower-level error.
        if not (nifty_path.exists() and banknifty_path.exists()):
            raise FileNotFoundError("Test data not available")
        _cache["test"] = _load_and_merge(nifty_path, banknifty_path)
    return _cache["test"]
def get_data_schema(merged: pd.DataFrame) -> str:
    """Return a human-readable schema description of the merged frame.

    Args:
        merged: Frame with a datetime ``Date`` column, as produced by the
            loader's inner-join merge.

    Returns:
        A multi-line summary: row count, number of distinct calendar dates,
        first-to-last date range, column list, suffix convention and index
        range. An empty frame is reported with zero days, an "n/a" period and
        an "empty" index range instead of raising.

    Raises:
        KeyError: if ``merged`` has no ``Date`` column.
    """
    n_rows = len(merged)
    dates = merged["Date"]
    cols = list(merged.columns)

    if n_rows:
        n_days = dates.dt.date.nunique()
        date_range = f"{dates.iloc[0].date()} to {dates.iloc[-1].date()}"
        index_range = f"0 to {n_rows - 1}"
    else:
        # An empty frame has no first/last row (iloc[0] would raise) and its
        # Date column may not even be datetime-typed, so skip the .dt accessor.
        n_days = 0
        date_range = "n/a"
        index_range = "empty"

    return (
        f"Pre-merged NIFTY 50 and BANKNIFTY minute-bar OHLC data.\n"
        f"Rows: {n_rows:,} | Trading days: {n_days:,} | Period: {date_range}\n"
        f"Columns: {cols}\n"
        f"Suffixes: _nifty for NIFTY 50, _banknifty for BANKNIFTY.\n"
        f"Index is numeric ({index_range}). 375 bars per trading day."
    )
def get_data_preview(merged: pd.DataFrame, n_rows: int = 10) -> str:
    """Render the leading ``n_rows`` rows of ``merged`` as plain text.

    The index is included and no column limit is applied, so every column
    appears in the output.
    """
    leading_rows = merged.head(n_rows)
    rendered = leading_rows.to_string(index=True, max_cols=None)
    return rendered