"""
Data loader for the Quant Research Environment.
Loads compressed NIFTY/BANKNIFTY CSV data, performs inner-join merge,
and caches results in memory.
"""
import gzip
import io
from pathlib import Path
from typing import Tuple
import pandas as pd
_DATA_DIR = Path(__file__).parent / "data"
# Module-level cache
_cache: dict = {}
def _load_and_merge(nifty_path: Path, banknifty_path: Path) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Load two CSVs and inner-join merge on Date."""
if str(nifty_path).endswith(".gz"):
with gzip.open(nifty_path, "rt") as f:
nifty = pd.read_csv(f, parse_dates=["Date"])
with gzip.open(banknifty_path, "rt") as f:
banknifty = pd.read_csv(f, parse_dates=["Date"])
else:
nifty = pd.read_csv(nifty_path, parse_dates=["Date"])
banknifty = pd.read_csv(banknifty_path, parse_dates=["Date"])
merged = pd.merge(
nifty, banknifty,
on="Date", how="inner", suffixes=("_nifty", "_banknifty"),
).sort_values("Date").reset_index(drop=True)
nifty_aligned = pd.DataFrame({
"Date": merged["Date"],
"Open": merged["Open_nifty"],
"High": merged["High_nifty"],
"Low": merged["Low_nifty"],
"Close": merged["Close_nifty"],
})
banknifty_aligned = pd.DataFrame({
"Date": merged["Date"],
"Open": merged["Open_banknifty"],
"High": merged["High_banknifty"],
"Low": merged["Low_banknifty"],
"Close": merged["Close_banknifty"],
})
return nifty_aligned, banknifty_aligned, merged
def get_train_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load and cache training data (2015-2022).
    Returns (nifty_aligned, banknifty_aligned, merged).

    Raises:
        FileNotFoundError: if either input CSV is missing, naming the file(s).
    """
    if "train" not in _cache:
        nifty_path = _DATA_DIR / "nifty_train.csv.gz"
        banknifty_path = _DATA_DIR / "banknifty_train.csv.gz"
        # Fail fast with a clear message, consistent with get_test_data();
        # previously a missing file surfaced as a raw open() error.
        missing = [p.name for p in (nifty_path, banknifty_path) if not p.exists()]
        if missing:
            raise FileNotFoundError(f"Train data not available: {', '.join(missing)}")
        _cache["train"] = _load_and_merge(nifty_path, banknifty_path)
    return _cache["train"]
def get_test_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load and cache test data (2023-2026).
    Returns (nifty_aligned, banknifty_aligned, merged).

    Raises:
        FileNotFoundError: if either input CSV is missing, naming the file(s).
    """
    if "test" not in _cache:
        nifty_path = _DATA_DIR / "nifty_test.csv.gz"
        banknifty_path = _DATA_DIR / "banknifty_test.csv.gz"
        # Check BOTH inputs up front; the original only checked the NIFTY
        # file, so a missing BANKNIFTY file surfaced as an opaque I/O error.
        missing = [p.name for p in (nifty_path, banknifty_path) if not p.exists()]
        if missing:
            raise FileNotFoundError(f"Test data not available: {', '.join(missing)}")
        _cache["test"] = _load_and_merge(nifty_path, banknifty_path)
    return _cache["test"]
def get_data_schema(merged: pd.DataFrame) -> str:
    """Return a human-readable schema description of the merged frame.

    Args:
        merged: Frame with a datetime ``Date`` column (as produced by
            ``_load_and_merge``).

    Returns:
        Multi-line summary: row count, distinct trading days, date range,
        column list, and the suffix convention. An empty frame yields a
        short "No rows available." message instead of raising IndexError.
    """
    n_rows = len(merged)
    if n_rows == 0:
        # Guard: .iloc[0] / .iloc[-1] below would raise on an empty frame
        # (possible when the inner join finds no overlapping dates).
        return (
            "Pre-merged NIFTY 50 and BANKNIFTY minute-bar OHLC data.\n"
            "No rows available."
        )
    n_days = merged["Date"].dt.date.nunique()
    date_range = f"{merged['Date'].iloc[0].date()} to {merged['Date'].iloc[-1].date()}"
    cols = list(merged.columns)
    return (
        f"Pre-merged NIFTY 50 and BANKNIFTY minute-bar OHLC data.\n"
        f"Rows: {n_rows:,} | Trading days: {n_days:,} | Period: {date_range}\n"
        f"Columns: {cols}\n"
        f"Suffixes: _nifty for NIFTY 50, _banknifty for BANKNIFTY.\n"
        f"Index is numeric (0 to {n_rows - 1}). 375 bars per trading day."
    )
def get_data_preview(merged: pd.DataFrame, n_rows: int = 10) -> str:
    """Render the first *n_rows* rows of *merged* as a plain-text table."""
    preview = merged.head(n_rows)
    return preview.to_string(index=True, max_cols=None)