Jitendra12421 committed on
Commit
a32ec2b
·
verified ·
1 Parent(s): 8103af4

Upload 7 files

Browse files
Files changed (7) hide show
  1. .dockerignore +18 -0
  2. .gitattributes +3 -33
  3. Dockerfile +23 -0
  4. README.md +123 -6
  5. app.py +560 -0
  6. requirements.txt +11 -0
  7. runtime_config.example.env +14 -0
.dockerignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ __pycache__/
4
+ *.py[cod]
5
+ .space_state/
6
+ .env
7
+ *.env
8
+ !*.example.env
9
+ research_runtime/Code/artifacts/
10
+ research_runtime/Code/docs/
11
+ research_runtime/Code/scripts/backtesting/
12
+ research_runtime/Code/scripts/tuning/
13
+ research_runtime/Code/models/**/outputs/*dataset*.csv
14
+ research_runtime/Code/models/**/outputs/test_predictions.csv
15
+ research_runtime/Code/models/**/outputs/*predictions.csv
16
+ research_runtime/Code/models/**/outputs/*.joblib
17
+ research_runtime/Data/
18
+ research_runtime/Alt Data/
.gitattributes CHANGED
@@ -1,35 +1,5 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
2
  *.joblib filter=lfs diff=lfs merge=lfs -text
3
+ *.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  *.zip filter=lfs diff=lfs merge=lfs -text
5
+ *.parquet filter=lfs diff=lfs merge=lfs -text
 
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1 \
6
+ PORT=7860 \
7
+ FORECASTING_PROJECT_ROOT=/app/research_runtime
8
+
9
+ WORKDIR /app
10
+
11
+ RUN apt-get update \
12
+ && apt-get install -y --no-install-recommends build-essential curl git libgomp1 \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ COPY requirements.txt .
16
+ RUN pip install --upgrade pip \
17
+ && pip install -r requirements.txt
18
+
19
+ COPY . .
20
+
21
+ EXPOSE 7860
22
+
23
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--ws", "none"]
README.md CHANGED
@@ -1,11 +1,128 @@
1
  ---
2
- title: Prediction Site
3
- emoji: 🐨
4
- colorFrom: pink
5
- colorTo: blue
6
  sdk: docker
 
7
  pinned: false
8
- short_description: backend
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Trading Forecasting Backend
3
+ colorFrom: blue
4
+ colorTo: green
 
5
  sdk: docker
6
+ app_port: 7860
7
  pinned: false
 
8
  ---
9
 
10
+ # Trading Forecasting Backend
11
+
12
+ This folder is now a standalone Hugging Face Docker Space backend. Upload the contents of this `backend` folder to a Hugging Face Space repository, upload the separate `dataset` folder to a Hugging Face Dataset repository, and deploy the separate `frontend` folder to Netlify.
13
+
14
+ The backend contains the quantitative model code, training scripts, model outputs, primary market data, and alternative data from the forecasting research workspace.
15
+
16
+ ## Hugging Face Space Setup
17
+
18
+ Create a new Hugging Face Space with Docker SDK, then upload this backend folder as the Space root.
19
+
20
+ Required Space variables/secrets:
21
+
22
+ - `FRONTEND_ORIGINS`: your Netlify URL, for example `https://your-site.netlify.app`.
23
+ - `CRON_SECRET`: a long shared secret. Use the same value in Netlify.
24
+ - `HF_DATASET_REPO_ID`: your Hugging Face Dataset repo id, for example `your-username/your-forecasting-dataset`.
25
+
26
+ Useful optional settings:
27
+
28
+ - `AUTO_UPDATE_ENABLED=true`
29
+ - `AUTO_RETRAIN_ENABLED=true`
30
+ - `AUTO_UPDATE_ON_START=false`
31
+ - `DATASET_SYNC_ON_START=true`
32
+ - `HF_DATASET_REVISION=main`
33
+ - `DAILY_UPDATE_TIME=17:30`
34
+ - `UPDATE_TIMEZONE=Asia/Kolkata`
35
+ - `MARKET_BUILD_WORKERS=2`
36
+
37
+ The app listens on port `7860` and exposes Swagger docs at `/docs`.
38
+
39
+ ## API Routes
40
+
41
+ - `GET /health` - Space health, file checks, latest data date, and update status.
42
+ - `GET /api/status` - same as health, for frontend polling.
43
+ - `GET /api/forecast/latest` - latest stock high/low, first-extrema, and Nifty forecasts.
44
+ - `GET /api/models/summaries` - model summary JSONs.
45
+ - `GET /api/data/catalog` - searchable data manifest.
46
+ - `GET /api/data/sample?category=bars&asset=nifty50&timeframe=1d` - small sample from a manifest dataset.
47
+ - `GET` or `POST /api/cron/tick` - Netlify scheduled ping endpoint; starts an update only when due. Send `x-cron-secret` if `CRON_SECRET` or `ADMIN_SECRET` is set.
48
+ - `POST /api/update/start` - manual update trigger. Send `x-admin-secret` if `CRON_SECRET` or `ADMIN_SECRET` is set.
49
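+ - `POST /api/dataset/sync` - manually sync the Hugging Face Dataset repo into the Space runtime. Send `x-admin-secret` if `CRON_SECRET` or `ADMIN_SECRET` is set; add `?force=true` to download again even when the data folders already exist.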
50
+
51
+ ## Netlify Keep-Awake Cron
52
+
53
+ The `frontend` folder now includes:
54
+
55
+ - `frontend/netlify.toml`
56
+ - `frontend/netlify/functions/keep-space-awake.mjs`
57
+
58
+ On Netlify, set these environment variables:
59
+
60
+ - `HUGGING_FACE_SPACE_URL=https://YOUR-HF-USERNAME-YOUR-SPACE.hf.space`
61
+ - `CRON_SECRET=<same value as the Space CRON_SECRET>`
62
+
63
+ The scheduled function runs every 10 minutes and calls `/api/cron/tick`. This keeps the Space warm and lets the backend start its daily update/retrain job after the configured market-close time.
64
+
65
+ ## Layout
66
+
67
+ - `app.py` - FastAPI backend app for Hugging Face Spaces.
68
+ - `Dockerfile` - Docker Space runtime setup.
69
+ - `requirements.txt` - Python dependencies.
70
+ - `research_runtime/Code/models/` - trainable model packages and the small latest forecast/summary outputs needed by the API.
71
+ - `research_runtime/Code/scripts/data_ingestion/` - data refresh scripts used by update jobs.
72
+ - `research_runtime/Code/scripts/data_preparation/` - research data rebuild scripts used by update jobs.
73
+
74
+ `research_runtime/Data/` and `research_runtime/Alt Data/` are intentionally not bundled in the Space repo anymore. They now live in the separate Hugging Face Dataset repo and are downloaded into `research_runtime/` by the backend when `HF_DATASET_REPO_ID` is set.
75
+
76
+ ## Main Model Outputs To Wire First
77
+
78
+ - Stock high/low forecasts: `research_runtime/Code/models/stock_high_low_forecaster/outputs/latest_forecasts.csv`
79
+ - Stock high/low metrics: `research_runtime/Code/models/stock_high_low_forecaster/outputs/metrics_by_symbol.csv`
80
+ - First-extrema forecasts: `research_runtime/Code/models/first_extrema_forecaster/outputs/latest_forecasts.csv`
81
+ - Nifty forecasts: `research_runtime/Code/models/nifty_forecaster/outputs/forecaster_latest_forecasts.csv`
82
+ - Nifty summary: `research_runtime/Code/models/nifty_forecaster/outputs/forecaster_summary.json`
83
+
84
+ ## Training Entrypoints
85
+
86
+ Run these from `backend/research_runtime` so project-relative paths resolve correctly:
87
+
88
+ ```powershell
89
+ python Code\models\stock_high_low_forecaster\train.py
90
+ python Code\models\first_extrema_forecaster\train.py
91
+ python Code\models\nifty_forecaster\train.py
92
+ ```
93
+
94
+ ## Data Labels
95
+
96
+ These live in the separate Dataset repo:
97
+
98
+ - Raw minute OHLCV: `Data/raw/minute/*_minute.csv`
99
+ - Processed bars: `Data/processed/bars/{1m,5m,1h,4h,1d}/*.csv`
100
+ - Processed features: `Data/processed/features/{1m,5m,1h,4h,1d}/*.csv`
101
+ - Market panels: `Data/processed/panels/*_market_panel.csv`
102
+ - Master daily panel: `Data/processed/panels/daily_master_panel.csv`
103
+ - Data manifest: `Data/metadata/manifest.csv`
104
+ - Feature dictionary: `Data/metadata/feature_dictionary.csv`
105
+ - Options features: `Alt Data/options/processed/*_options_daily_features.csv`
106
+ - Institutional panel: `Alt Data/institutional/processed/institutional_daily_panel.csv`
107
+ - External daily panel: `Alt Data/external/processed/external_daily_panel.csv`
108
+ - Corporate events: `Alt Data/corporate/processed/corporate_announcements.csv`
109
+
110
+ ## Frontend Wiring Notes
111
+
112
+ The current frontend renders static mock data defined in `frontend/index.html` and `frontend/script.js`.
113
+
114
+ - Forecast cards can call `/api/forecast/latest`.
115
+ - Model accuracy and version/date stats can call `/api/models/summaries`.
116
+ - Market Data can call `/api/data/catalog` and `/api/data/sample`.
117
+
118
+ ## Pruned From Backend
119
+
120
+ - Kotak credential/runtime files.
121
+ - Live-trading scripts and live broker artifacts.
122
+ - Kotak monitor artifacts and cached NSE temp folders.
123
+ - Python `__pycache__` folders.
124
+ - CatBoost generated training-log folder.
125
+ - One-off maintenance/backfill scripts.
126
+ - Backtest artifacts, chart images, old trade reports, test prediction dumps, generated training datasets, and saved model binaries.
127
+
128
+ `KOTAKBANK` CSV files remain because those are normal market datasets for Kotak Mahindra Bank, not broker-runtime files.
app.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import shutil
6
+ import subprocess
7
+ import sys
8
+ import threading
9
+ import time
10
+ from datetime import datetime, time as dt_time
11
+ from pathlib import Path
12
+ from typing import Any
13
+ from zoneinfo import ZoneInfo
14
+
15
+ import pandas as pd
16
+ from fastapi import BackgroundTasks, FastAPI, Header, HTTPException, Query, Request
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from fastapi.responses import JSONResponse, PlainTextResponse
19
+ from huggingface_hub import snapshot_download
20
+
21
+
22
# Directory containing this module; anchor for every default path below.
BASE_DIR = Path(__file__).resolve().parent

# Root of the research workspace (code, data, model outputs).
# FORECASTING_PROJECT_ROOT overrides the bundled "research_runtime" folder.
RESEARCH_ROOT = Path(
    os.environ.get("FORECASTING_PROJECT_ROOT", BASE_DIR / "research_runtime")
).resolve()

# Where the update status and dataset marker are stored. SPACE_STATE_DIR wins;
# otherwise use a folder under /data when that directory exists, else a
# folder next to this module.
STATE_DIR = Path(
    os.environ.get(
        "SPACE_STATE_DIR",
        "/data/forecasting-space-state" if Path("/data").exists() else BASE_DIR / ".space_state",
    )
)
STATUS_PATH = STATE_DIR / "update_status.json"
DATASET_READY_MARKER = STATE_DIR / "dataset_ready.json"

# Metadata shown by FastAPI and echoed by the health endpoint.
API_TITLE = "Trading Forecasting Space Backend"
API_VERSION = "1.0.0"

# Schedule settings for the daily update, read once at import time.
DEFAULT_TIMEZONE = os.environ.get("UPDATE_TIMEZONE", "Asia/Kolkata")
DEFAULT_UPDATE_TIME = os.environ.get("DAILY_UPDATE_TIME", "17:30")
32
+
33
# ASGI application object; all routes and middleware below attach to it.
app = FastAPI(
    title=API_TITLE,
    version=API_VERSION,
)
34
+
35
+
36
def cors_origins() -> list[str]:
    """Return the allowed CORS origins configured via FRONTEND_ORIGINS.

    The variable is a comma-separated list. A missing variable or a bare
    "*" yields ["*"]; blank entries are dropped, so an empty string yields
    an empty list.
    """
    configured = os.environ.get("FRONTEND_ORIGINS", "*").strip()
    if configured == "*":
        return ["*"]

    origins = []
    for entry in configured.split(","):
        cleaned = entry.strip()
        if cleaned:
            origins.append(cleaned)
    return origins
39
+
40
+
41
# CORS policy: origins come from FRONTEND_ORIGINS (evaluated once, here),
# credentials are never allowed, and only the methods the API serves are
# accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=cors_origins(),
    allow_credentials=False,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)
48
+
49
# Guards the "is an update already running?" check-and-set in run_update_job.
update_lock = threading.Lock()

# Most recently started update thread, or None if none was started yet.
worker_thread: threading.Thread | None = None

# Serialises dataset downloads in ensure_dataset_available.
dataset_lock = threading.Lock()
52
+
53
+
54
def now_utc() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Uses an aware clock instead of the deprecated ``datetime.utcnow()``.
    The tzinfo is stripped before formatting so the output keeps the
    "Z" suffix instead of "+00:00".
    """
    # Local import: the module header only imports `datetime` and `time`.
    from datetime import timezone

    current = datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None)
    return current.isoformat() + "Z"
56
+
57
+
58
def safe_json(value: Any) -> Any:
    """Recursively convert *value* into something ``json.dumps`` accepts.

    - dict keys are stringified and values converted recursively;
    - lists, tuples, sets and frozensets become lists with converted items
      (previously tuples and sets were returned untouched, so NaN inside a
      tuple leaked through and sets could not be serialised at all);
    - missing values recognised by ``pd.isna`` (NaN, NaT, None, ...) become
      None;
    - objects exposing ``.item()`` (e.g. NumPy scalars) are unwrapped;
    - ``Path`` becomes ``str`` and ``datetime`` becomes its ISO string.

    Anything else is returned unchanged.
    """
    if isinstance(value, dict):
        return {str(key): safe_json(item) for key, item in value.items()}
    if isinstance(value, (list, tuple, set, frozenset)):
        return [safe_json(item) for item in value]

    try:
        if pd.isna(value):
            return None
    except Exception:
        # Deliberate best effort: array-likes make the truth test ambiguous
        # and arbitrary objects may reject the check; fall through instead.
        pass

    if hasattr(value, "item"):
        try:
            return safe_json(value.item())
        except Exception:
            # e.g. multi-element arrays cannot be collapsed to one scalar.
            pass
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, datetime):
        return value.isoformat()
    return value
79
+
80
+
81
def read_json(path: Path, default: Any) -> Any:
    """Load UTF-8 JSON from *path*, returning *default* on any failure.

    A missing file, an unreadable file and invalid JSON are all treated the
    same way: the caller gets *default* back.
    """
    try:
        text = path.read_text(encoding="utf-8")
        parsed = json.loads(text)
    except Exception:
        return default
    return parsed
86
+
87
+
88
def write_json(path: Path, payload: Any) -> None:
    """Serialise *payload* as indented JSON and store it at *path* atomically.

    The document is written to a sibling temporary file and then moved into
    place with ``os.replace``. Readers such as ``read_json`` therefore never
    see a half-written file, which would otherwise make them fall back to
    their default and lose the stored state. Parent directories are created
    as needed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(safe_json(payload), indent=2)

    # Unique per process/thread so concurrent writers never share a temp file.
    scratch = path.with_name(f"{path.name}.{os.getpid()}.{threading.get_ident()}.tmp")
    try:
        scratch.write_text(serialized, encoding="utf-8")
        os.replace(scratch, path)
    finally:
        # No-op after a successful replace; cleans up after a failed write.
        scratch.unlink(missing_ok=True)
91
+
92
+
93
def read_status() -> dict[str, Any]:
    """Return the persisted update status, or an idle default record.

    The default is used whenever the status file cannot be read or parsed.
    """
    idle_default = {
        "state": "idle",
        "last_started_at": None,
        "last_finished_at": None,
        "last_success_at": None,
        "last_error": None,
        "last_exit_code": None,
        "last_log_tail": [],
    }
    return read_json(STATUS_PATH, idle_default)
106
+
107
+
108
def write_status(**updates: Any) -> None:
    """Merge *updates* into the stored status record and persist the result.

    Keys not mentioned in *updates* keep their current values.
    """
    current = read_status()
    current.update(updates)
    write_json(STATUS_PATH, current)
112
+
113
+
114
def require_secret(x_cron_secret: str | None = Header(default=None), x_admin_secret: str | None = Header(default=None)) -> None:
    """Reject the request unless it carries the configured shared secret.

    The expected value is ``CRON_SECRET`` or, if that is unset,
    ``ADMIN_SECRET``. When neither is set the check is disabled.

    Args:
        x_cron_secret: Value of the ``x-cron-secret`` header, if any.
        x_admin_secret: Value of the ``x-admin-secret`` header, if any.

    Raises:
        HTTPException: 401 when a secret is configured and the supplied
            value is missing or different.
    """
    import hmac

    expected = os.environ.get("CRON_SECRET") or os.environ.get("ADMIN_SECRET")
    if not expected:
        return

    # The route handlers call this function directly with one keyword only,
    # so the other parameter is the (truthy) ``Header(...)`` default object
    # rather than None. Only real, non-empty strings count as supplied;
    # otherwise that placeholder would shadow the actual header.
    supplied = next(
        (value for value in (x_cron_secret, x_admin_secret) if isinstance(value, str) and value),
        None,
    )

    # Constant-time comparison; bytes are used so non-ASCII secrets work too.
    if supplied is None or not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Missing or invalid cron/admin secret.")
121
+
122
+
123
def csv_rows(path: Path, *, limit: int | None = None, columns: list[str] | None = None) -> list[dict[str, Any]]:
    """Read a CSV file into a list of JSON-safe row dictionaries.

    Args:
        path: CSV file to read. A missing or completely empty file yields [].
        limit: Maximum number of leading rows to return; None means all rows.
        columns: Columns to keep. Requested columns the file does not have
            are skipped rather than treated as an error.

    Returns:
        One dict per row with missing values converted to None.
    """
    if not path.exists():
        return []

    # Let the parser stop early instead of loading the whole file just to
    # keep its head. Negative limits keep the DataFrame.head() semantics.
    nrows = limit if limit is not None and limit >= 0 else None
    try:
        frame = pd.read_csv(path, usecols=columns, nrows=nrows)
    except pd.errors.EmptyDataError:
        # Must precede ValueError (its base class): retrying cannot help.
        return []
    except ValueError:
        # Typically a requested column is absent: read everything and keep
        # only the requested columns that do exist.
        frame = pd.read_csv(path, nrows=nrows)
        if columns:
            frame = frame[[col for col in columns if col in frame.columns]]
    if limit is not None:
        frame = frame.head(limit)
    return safe_json(frame.where(pd.notna(frame), None).to_dict(orient="records"))
135
+
136
+
137
def model_output_path(*parts: str) -> Path:
    """Return the path ``<RESEARCH_ROOT>/Code/models/<parts...>``."""
    return RESEARCH_ROOT.joinpath("Code", "models", *parts)
139
+
140
+
141
def manifest_path() -> Path:
    """Return the location of the data manifest CSV under RESEARCH_ROOT."""
    return RESEARCH_ROOT.joinpath("Data", "metadata", "manifest.csv")
143
+
144
+
145
def dataset_dirs_present() -> bool:
    """Return True when both dataset folders exist under RESEARCH_ROOT."""
    return all((RESEARCH_ROOT / folder).is_dir() for folder in ("Data", "Alt Data"))
147
+
148
+
149
def dataset_status() -> dict[str, Any]:
    """Describe the dataset folders and the most recent sync.

    Returns a dict with the readiness flag, the configured dataset repo and
    revision, file metadata for both data folders, and the contents of the
    sync marker file ({} when there is none).
    """
    last_sync = read_json(DATASET_READY_MARKER, {})
    data_root = RESEARCH_ROOT / "Data"
    alt_data_root = RESEARCH_ROOT / "Alt Data"
    return {
        "ready": dataset_dirs_present(),
        "repo_id": os.environ.get("HF_DATASET_REPO_ID"),
        "revision": os.environ.get("HF_DATASET_REVISION", "main"),
        "data_dir": file_meta(data_root),
        "alt_data_dir": file_meta(alt_data_root),
        "last_sync": last_sync,
    }
159
+
160
+
161
def ensure_dataset_available(force: bool = False) -> bool:
    """Make sure the dataset folders exist, downloading them when possible.

    Args:
        force: Download again even if the folders are already present.

    Returns:
        True when both dataset folders are present afterwards. Without
        ``HF_DATASET_REPO_ID`` nothing is downloaded and the current state
        is reported as-is.
    """
    if not force and dataset_dirs_present():
        return True

    dataset_repo = os.environ.get("HF_DATASET_REPO_ID", "").strip()
    if not dataset_repo:
        return dataset_dirs_present()

    with dataset_lock:
        # Re-check under the lock: another thread may have finished the
        # download while this one was waiting.
        if not force and dataset_dirs_present():
            return True

        STATE_DIR.mkdir(parents=True, exist_ok=True)
        branch = os.environ.get("HF_DATASET_REVISION", "main")
        target_dir = Path(os.environ.get("HF_DATASET_LOCAL_DIR", str(RESEARCH_ROOT))).resolve()
        target_dir.mkdir(parents=True, exist_ok=True)

        snapshot_download(
            repo_id=dataset_repo,
            repo_type="dataset",
            revision=branch,
            local_dir=str(target_dir),
            local_dir_use_symlinks=False,
            allow_patterns=["Data/**", "Alt Data/**", "README.md"],
        )

        # Record what was synced so dataset_status can report it.
        sync_record = {
            "repo_id": dataset_repo,
            "revision": branch,
            "synced_at": now_utc(),
            "local_dir": str(target_dir),
        }
        write_json(DATASET_READY_MARKER, sync_record)
        return dataset_dirs_present()
197
+
198
+
199
def resolve_dataset_path(value: str) -> Path:
    """Map a path recorded in the manifest onto this deployment's filesystem.

    Resolution order:
      1. the path as written, if it exists;
      2. if it contains ``research_runtime/`` (after normalising backslashes),
         the part after that marker under ``RESEARCH_ROOT`` - the directory
         the dataset is downloaded into by default - with the bundled
         ``BASE_DIR/research_runtime`` folder as a fallback;
      3. for other relative paths, ``RESEARCH_ROOT/<path>`` when it exists,
         otherwise ``BASE_DIR/<path>``.

    The returned path is not guaranteed to exist; callers must check.
    """
    raw = str(value)
    candidate = Path(raw)
    if candidate.exists():
        return candidate

    normalized = raw.replace("\\", "/")
    marker = "research_runtime/"
    if marker in normalized:
        tail = Path(*normalized.split(marker, 1)[1].split("/"))
        # Honour FORECASTING_PROJECT_ROOT first; the hard-coded bundled
        # location is only a fallback for files that live there.
        preferred = RESEARCH_ROOT / tail
        bundled = BASE_DIR / "research_runtime" / tail
        if not preferred.exists() and bundled.exists():
            return bundled
        return preferred

    relative = Path(*normalized.split("/"))
    if not relative.is_absolute():
        under_research_root = RESEARCH_ROOT / relative
        if under_research_root.exists():
            return under_research_root
        return BASE_DIR / relative
    return candidate
215
+
216
+
217
def file_meta(path: Path) -> dict[str, Any]:
    """Return existence, size and modification time for *path*.

    Returns:
        ``{"exists": False, "path": ...}`` when the path cannot be stat'ed,
        otherwise a dict that also has ``bytes`` (``st_size``) and
        ``modified_at`` (UTC, second precision, "Z" suffix).
    """
    # Local import: the module header only imports `datetime` and `time`.
    from datetime import timezone

    try:
        # Stat directly instead of exists()+stat(): the file may disappear
        # between the two calls (e.g. while outputs are being pruned).
        stat = path.stat()
    except (OSError, ValueError):
        return {"exists": False, "path": str(path)}

    # Aware conversion replaces the deprecated utcfromtimestamp(); tzinfo is
    # stripped so the text keeps the "Z" suffix format.
    modified = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).replace(microsecond=0, tzinfo=None)
    return {
        "exists": True,
        "path": str(path),
        "bytes": stat.st_size,
        "modified_at": modified.isoformat() + "Z",
    }
227
+
228
+
229
def latest_manifest_end() -> str | None:
    """Return the latest parseable value of the manifest's "end" column.

    Returns None when the manifest is missing, cannot be read, or contains
    no parseable dates.
    """
    manifest = manifest_path()
    if not manifest.exists():
        return None
    try:
        end_column = pd.read_csv(manifest, usecols=["end"])["end"]
        parsed = pd.to_datetime(end_column, errors="coerce").dropna()
        if parsed.empty:
            return None
        return str(parsed.max())
    except Exception:
        return None
239
+
240
+
241
def parse_daily_update_time() -> dt_time:
    """Parse ``DEFAULT_UPDATE_TIME`` ("HH:MM") into a ``datetime.time``.

    A malformed or out-of-range value falls back to 17:30, the built-in
    default, instead of raising. An exception here would otherwise escape
    ``update_due`` and terminate the scheduler thread.
    """
    try:
        hour, minute = DEFAULT_UPDATE_TIME.strip().split(":", 1)
        return dt_time(int(hour), int(minute))
    except ValueError:
        # Covers a missing ":", non-numeric parts and out-of-range values.
        return dt_time(17, 30)
244
+
245
+
246
def update_due() -> bool:
    """Decide whether the daily update should start now.

    An update is due when automatic updates are enabled, no update is
    marked as running, the local time (in the configured timezone) has
    reached the daily update time, and the last successful update - if any
    - happened on an earlier local date.
    """
    enabled_values = {"1", "true", "yes", "on"}
    if os.environ.get("AUTO_UPDATE_ENABLED", "true").lower() not in enabled_values:
        return False

    status = read_status()
    if status.get("state") == "running":
        return False

    zone = ZoneInfo(DEFAULT_TIMEZONE)
    current = datetime.now(zone)
    if current.time() < parse_daily_update_time():
        return False

    previous_success = status.get("last_success_at")
    if not previous_success:
        return True
    try:
        # Stored timestamps end in "Z"; fromisoformat needs an explicit offset.
        succeeded_at = datetime.fromisoformat(previous_success.replace("Z", "+00:00"))
        succeeded_on = succeeded_at.astimezone(zone).date()
    except ValueError:
        return True
    return succeeded_on < current.date()
266
+
267
+
268
def build_update_commands(retrain: bool) -> list[list[str]]:
    """Build the argv lists executed by an update run, in order.

    The market-data refresh always comes first, with today's date in the
    configured timezone as ``--end-date``. The three training scripts are
    appended only when *retrain* is true.
    """
    python = sys.executable
    today = datetime.now(ZoneInfo(DEFAULT_TIMEZONE)).date().isoformat()
    refresh = [
        python,
        "Code/scripts/data_ingestion/refresh_market_data.py",
        "--end-date",
        today,
    ]
    if not retrain:
        return [refresh]

    training = [
        [python, "Code/models/stock_high_low_forecaster/train.py"],
        [python, "Code/models/first_extrema_forecaster/train.py", "--rebuild-cache"],
        [python, "Code/models/nifty_forecaster/train.py", "--no-progress"],
    ]
    return [refresh, *training]
286
+
287
+
288
def prune_generated_junk() -> None:
    """Delete bulky generated artifacts and bytecode caches under RESEARCH_ROOT.

    Removal is best effort: entries that cannot be deleted are skipped.
    """
    patterns = [
        "Code/artifacts",
        "Code/models/*/outputs/*dataset*.csv",
        "Code/models/*/outputs/test_predictions.csv",
        "Code/models/*/outputs/*_test_predictions.csv",
        "Code/models/*/outputs/*predictions.csv",
        "Code/models/*/outputs/*.joblib",
        "Code/models/*/outputs/report.md",
        "Code/models/*/outputs/*report.md",
        "Code/models/*/outputs/candidate*.csv",
        "Code/models/*/outputs/*candidate*.csv",
        "Code/models/first_extrema_forecaster/outputs/may7_forecasts.csv",
        "Code/models/nifty_forecaster/outputs/forecaster_latest.csv",
        "Code/models/nifty_forecaster/outputs/forecaster_blend_details.json",
    ]
    for pattern in patterns:
        # Snapshot the matches first: glob() walks the tree lazily, and
        # deleting entries mid-walk changes what the walk is reading.
        for path in list(RESEARCH_ROOT.glob(pattern)):
            try:
                if path.is_dir():
                    shutil.rmtree(path)
                elif path.exists():
                    path.unlink()
            except OSError:
                # Best effort: leave anything that cannot be removed.
                pass

    # Same snapshot rule for the recursive bytecode-cache sweep.
    for cache_dir in list(RESEARCH_ROOT.rglob("__pycache__")):
        try:
            shutil.rmtree(cache_dir)
        except OSError:
            pass
318
+
319
+
320
def run_update_job(trigger: str = "manual", retrain: bool | None = None) -> None:
    """Run the data refresh (and optional retraining) and record the outcome.

    Does nothing if the stored status already says an update is running.
    Otherwise the status is set to "running", each command from
    ``build_update_commands`` is executed inside RESEARCH_ROOT, and the
    status ends as "idle" on success or "failed" on any error. The last 80
    log lines are stored with the status either way.

    Args:
        trigger: Label stored in the status to identify what started the run.
        retrain: Whether to retrain models; None defers to the
            ``AUTO_RETRAIN_ENABLED`` environment variable (default on).
    """
    from collections import deque

    # Check-and-set under the lock so two threads cannot both claim the run.
    with update_lock:
        if read_status().get("state") == "running":
            return
        write_status(
            state="running",
            trigger=trigger,
            last_started_at=now_utc(),
            last_finished_at=None,
            last_error=None,
            last_exit_code=None,
            last_log_tail=[],
        )

    if retrain is None:
        retrain = os.environ.get("AUTO_RETRAIN_ENABLED", "true").lower() in {"1", "true", "yes", "on"}

    env = os.environ.copy()
    env["FORECASTING_PROJECT_ROOT"] = str(RESEARCH_ROOT)
    env.setdefault("PYTHONUNBUFFERED", "1")
    env.setdefault("MARKET_BUILD_WORKERS", "2")

    # Bounded buffer: keeps only the newest 80 lines without re-slicing.
    log_tail: deque[str] = deque(maxlen=80)
    exit_code = 0
    try:
        if not ensure_dataset_available():
            raise RuntimeError("Dataset folders are missing. Set HF_DATASET_REPO_ID to the Hugging Face Dataset repo.")
        for command in build_update_commands(retrain):
            log_tail.append("$ " + " ".join(command))
            # The context manager closes the pipe and reaps the child even
            # when reading its output fails.
            with subprocess.Popen(
                command,
                cwd=RESEARCH_ROOT,
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
            ) as process:
                try:
                    assert process.stdout is not None
                    for line in process.stdout:
                        line = line.rstrip()
                        if line:
                            log_tail.append(line)
                    exit_code = process.wait()
                except BaseException:
                    # Do not leave a child running (or block on it) if
                    # streaming its output was interrupted.
                    process.kill()
                    raise
            if exit_code != 0:
                raise RuntimeError(f"Command failed with exit code {exit_code}: {' '.join(command)}")
        prune_generated_junk()
        write_status(
            state="idle",
            last_finished_at=now_utc(),
            last_success_at=now_utc(),
            last_error=None,
            last_exit_code=exit_code,
            last_log_tail=list(log_tail),
        )
    except Exception as exc:
        # Top-level boundary of the worker thread: record the failure so it
        # is visible through the status endpoints.
        write_status(
            state="failed",
            last_finished_at=now_utc(),
            last_error=str(exc),
            last_exit_code=exit_code,
            last_log_tail=list(log_tail),
        )
386
+
387
+
388
def start_update(trigger: str, retrain: bool | None = None) -> bool:
    """Start ``run_update_job`` on a daemon thread unless one is running.

    Returns:
        False when the stored status says an update is already running,
        True once a new worker thread has been started.
    """
    global worker_thread
    if read_status().get("state") == "running":
        return False

    job = threading.Thread(
        target=run_update_job,
        kwargs={"trigger": trigger, "retrain": retrain},
        daemon=True,
    )
    worker_thread = job
    job.start()
    return True
396
+
397
+
398
def scheduler_loop() -> None:
    """Poll forever, starting an update whenever one is due.

    Checks ``update_due`` and then sleeps for five minutes, repeatedly.
    """
    poll_seconds = 300
    while True:
        if update_due():
            start_update("internal_scheduler")
        time.sleep(poll_seconds)
403
+
404
+
405
@app.on_event("startup")
def startup() -> None:
    """Prepare state, optionally sync the dataset, and start the scheduler.

    Steps, in order: create the state directory, prune generated artifacts,
    initialise or repair the status file, download the dataset when
    ``DATASET_SYNC_ON_START`` is enabled (failures are recorded in the
    status, not raised), start the scheduler thread, and trigger an
    immediate update when ``AUTO_UPDATE_ON_START`` is enabled.
    """
    truthy = {"1", "true", "yes", "on"}

    STATE_DIR.mkdir(parents=True, exist_ok=True)
    prune_generated_junk()
    if not STATUS_PATH.exists():
        write_status(state="idle", app_started_at=now_utc())
    elif read_status().get("state") == "running":
        # No worker thread exists yet in this process, so a persisted
        # "running" state is left over from a run that was killed. Clear it,
        # otherwise update_due() and start_update() refuse every later
        # update. NOTE(review): assumes one server process per state dir.
        write_status(
            state="failed",
            last_finished_at=now_utc(),
            last_error="Update was interrupted by an application restart.",
        )
    if os.environ.get("DATASET_SYNC_ON_START", "true").lower() in truthy:
        try:
            ensure_dataset_available()
        except Exception as exc:
            # Startup must not fail because the dataset is unreachable.
            write_status(dataset_sync_error=str(exc), dataset_sync_failed_at=now_utc())
    threading.Thread(target=scheduler_loop, daemon=True).start()
    if os.environ.get("AUTO_UPDATE_ON_START", "false").lower() in truthy:
        start_update("startup")
419
+
420
+
421
@app.get("/", response_class=PlainTextResponse)
def root() -> str:
    """Return a plain-text banner pointing to the API docs."""
    banner = "Trading Forecasting Hugging Face Space backend is running. See /docs for API routes."
    return banner
424
+
425
+
426
@app.get("/health")
def health() -> dict[str, Any]:
    """Report service health: required files, dataset and update status.

    ``ok`` is True only when the research root, the manifest and the three
    latest-forecast files all exist.
    """
    tracked_paths = {
        "research_root": RESEARCH_ROOT,
        "manifest": manifest_path(),
        "stock_latest": model_output_path("stock_high_low_forecaster", "outputs", "latest_forecasts.csv"),
        "extrema_latest": model_output_path("first_extrema_forecaster", "outputs", "latest_forecasts.csv"),
        "nifty_latest": model_output_path("nifty_forecaster", "outputs", "forecaster_latest_forecasts.csv"),
    }
    required = {label: file_meta(location) for label, location in tracked_paths.items()}
    everything_present = all(meta["exists"] for meta in required.values())
    return {
        "ok": everything_present,
        "service": API_TITLE,
        "version": API_VERSION,
        "checked_at": now_utc(),
        "latest_manifest_end": latest_manifest_end(),
        "dataset": dataset_status(),
        "update_status": read_status(),
        "files": required,
    }
446
+
447
+
448
@app.get("/api/status")
def api_status() -> dict[str, Any]:
    """Return the same payload as the health endpoint."""
    payload = health()
    return payload
451
+
452
+
453
@app.get("/api/forecast/latest")
def latest_forecasts() -> dict[str, Any]:
    """Return the rows of the latest forecast CSV of each model.

    A missing file contributes an empty list. The first-extrema rows are
    restricted to a fixed set of columns.
    """
    stock_csv = model_output_path("stock_high_low_forecaster", "outputs", "latest_forecasts.csv")
    extrema_csv = model_output_path("first_extrema_forecaster", "outputs", "latest_forecasts.csv")
    nifty_csv = model_output_path("nifty_forecaster", "outputs", "forecaster_latest_forecasts.csv")
    extrema_columns = ["date", "symbol", "target", "prob_high_first", "prediction"]
    return {
        "generated_at": now_utc(),
        "stock_high_low": csv_rows(stock_csv),
        "first_extrema": csv_rows(extrema_csv, columns=extrema_columns),
        "nifty_direction": csv_rows(nifty_csv),
    }
464
+
465
+
466
@app.get("/api/models/summaries")
def model_summaries() -> dict[str, Any]:
    """Return each model's summary JSON.

    Unreadable summaries fall back to {} (or [] for the Nifty summary).
    """
    stock_summary = read_json(model_output_path("stock_high_low_forecaster", "outputs", "summary.json"), {})
    extrema_summary = read_json(model_output_path("first_extrema_forecaster", "outputs", "summary.json"), {})
    nifty_summary = read_json(model_output_path("nifty_forecaster", "outputs", "forecaster_summary.json"), [])
    summaries = {
        "stock_high_low": stock_summary,
        "first_extrema": extrema_summary,
        "nifty_direction": nifty_summary,
    }
    return safe_json(summaries)
475
+
476
+
477
@app.get("/api/data/catalog")
def data_catalog(
    category: str | None = None,
    asset: str | None = None,
    timeframe: str | None = None,
    limit: int = Query(default=500, ge=1, le=5000),
) -> dict[str, Any]:
    """List manifest entries, optionally filtered case-insensitively.

    Returns:
        ``count`` - the number of matching entries before truncation - and
        ``items`` - at most *limit* of them. Both are empty when the
        manifest is unavailable even after trying to fetch the dataset.
    """
    path = manifest_path()
    if not path.exists():
        ensure_dataset_available()
    if not path.exists():
        return {"count": 0, "items": []}

    frame = pd.read_csv(path)
    requested = (("category", category), ("asset", asset), ("timeframe", timeframe))
    for column, wanted in requested:
        if wanted:
            frame = frame[frame[column].astype(str).str.lower() == wanted.lower()]

    page = frame.head(limit)
    records = page.where(pd.notna(page), None).to_dict(orient="records")
    return {"count": int(len(frame)), "items": safe_json(records)}
497
+
498
+
499
@app.get("/api/data/sample")
def data_sample(
    category: str,
    asset: str,
    timeframe: str,
    limit: int = Query(default=50, ge=1, le=1000),
) -> dict[str, Any]:
    """Return the first rows of the dataset matching the three selectors.

    The first manifest entry whose category, asset and timeframe match
    (case-insensitively) is used.

    Raises:
        HTTPException: 404 when the manifest, a matching entry, or the
            dataset file itself cannot be found.
    """
    path = manifest_path()
    if not path.exists():
        ensure_dataset_available()
    if not path.exists():
        raise HTTPException(status_code=404, detail="Data manifest not found.")

    manifest = pd.read_csv(path)
    same_category = manifest["category"].astype(str).str.lower() == category.lower()
    same_asset = manifest["asset"].astype(str).str.lower() == asset.lower()
    same_timeframe = manifest["timeframe"].astype(str).str.lower() == timeframe.lower()
    matches = manifest[same_category & same_asset & same_timeframe]
    if matches.empty:
        raise HTTPException(status_code=404, detail="No matching dataset in manifest.")

    entry = matches.iloc[0]
    dataset_path = resolve_dataset_path(str(entry["path"]))
    if not dataset_path.exists():
        raise HTTPException(status_code=404, detail=f"Dataset file not found: {dataset_path}")
    return {
        "dataset": safe_json(entry.to_dict()),
        "rows": csv_rows(dataset_path, limit=limit),
    }
526
+
527
+
528
@app.api_route("/api/cron/tick", methods=["GET", "POST"])
async def cron_tick(
    request: Request,
    background_tasks: BackgroundTasks,
    x_cron_secret: str | None = Header(default=None),
) -> JSONResponse:
    """Handle the scheduled ping: queue an update only when one is due.

    The update start is queued as a background task, so the response
    reports that it was queued rather than that it has started.
    """
    require_secret(x_cron_secret=x_cron_secret)

    due = update_due()
    if due:
        background_tasks.add_task(start_update, "netlify_cron")

    body = {
        "ok": True,
        "checked_at": now_utc(),
        "update_due": due,
        # A start is queued exactly when an update is due.
        "update_start_queued": due,
        "status": read_status(),
    }
    return JSONResponse(body)
541
+
542
+
543
@app.post("/api/update/start")
def manual_update(
    retrain: bool | None = None,
    x_admin_secret: str | None = Header(default=None),
) -> dict[str, Any]:
    """Start an update on demand.

    Args:
        retrain: Force retraining on/off; None uses the configured default.
        x_admin_secret: Shared secret from the ``x-admin-secret`` header.

    Returns:
        Whether a new update thread was started, plus the current status.
    """
    # Pass both arguments explicitly: an omitted parameter would take its
    # truthy ``Header(...)`` default object and be read as the supplied
    # secret, rejecting every request once a secret is configured.
    require_secret(x_cron_secret=None, x_admin_secret=x_admin_secret)
    started = start_update("manual_api", retrain=retrain)
    return {"ok": True, "started": started, "status": read_status()}
551
+
552
+
553
@app.post("/api/dataset/sync")
def sync_dataset(
    force: bool = False,
    x_admin_secret: str | None = Header(default=None),
) -> dict[str, Any]:
    """Download the dataset repo into the runtime on demand.

    Args:
        force: Download again even when the data folders already exist.
        x_admin_secret: Shared secret from the ``x-admin-secret`` header.

    Returns:
        Whether the dataset folders are present afterwards, plus the
        dataset status.
    """
    # Pass both arguments explicitly: an omitted parameter would take its
    # truthy ``Header(...)`` default object and be read as the supplied
    # secret, rejecting every request once a secret is configured.
    require_secret(x_cron_secret=None, x_admin_secret=x_admin_secret)
    ok = ensure_dataset_available(force=force)
    return {"ok": ok, "dataset": dataset_status()}
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.12
2
+ uvicorn[standard]==0.34.2
3
+ pandas==2.2.3
4
+ numpy==2.2.6
5
+ requests==2.32.3
6
+ scikit-learn==1.6.1
7
+ joblib==1.4.2
8
+ xgboost==3.0.1
9
+ catboost==1.2.8
10
+ lightgbm==4.6.0
11
+ huggingface_hub==0.31.4
runtime_config.example.env ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Space backend settings
2
+ FORECASTING_PROJECT_ROOT=/app/research_runtime
3
+ FRONTEND_ORIGINS=https://your-netlify-site.netlify.app
4
+ CRON_SECRET=replace-with-a-long-shared-secret
5
+ HF_DATASET_REPO_ID=your-hf-username/your-forecasting-dataset
6
+ HF_DATASET_REVISION=main
7
+
8
+ # Automatic update settings
9
+ AUTO_UPDATE_ENABLED=true
10
+ AUTO_RETRAIN_ENABLED=true
11
+ AUTO_UPDATE_ON_START=false
12
+ DAILY_UPDATE_TIME=17:30
13
+ UPDATE_TIMEZONE=Asia/Kolkata
14
+ MARKET_BUILD_WORKERS=2