Spaces · Running

Commit aac542c ("code") · Parent(s): a365e2e
Browse files

- .gitattributes +0 -35
- README.md +0 -83
- __pycache__/main.cpython-314.pyc +0 -0
- app/__init__.py +0 -0
- app/auth.py +0 -22
- app/core/__init__.py +0 -0
- app/core/auth.py +40 -0
- app/core/cache.py +117 -0
- app/core/exceptions.py +104 -0
- app/core/logging_config.py +34 -0
- app/database.py +0 -51
- app/models.py +0 -38
- app/models/__init__.py +0 -0
- app/models/common.py +78 -0
- app/models/indicators.py +29 -0
- app/models/market_data.py +46 -0
- app/models/reports.py +41 -0
- app/models/symbols.py +43 -0
- app/routers/market_data.py +0 -96
- app/routers/raw_query.py +0 -122
- app/routers/system.py +0 -58
- app/services/__init__.py +0 -0
- app/services/dataset.py +0 -114
- docs/DATASET_CONTEXT.md +350 -0
- docs/Plan.md +374 -0
- hf_dataset_structure_report.csv +0 -0
- scripts/__init__.py +0 -0
- scripts/sync_data.py +38 -0
- testdata.py +52 -0
.gitattributes
DELETED
@@ -1,35 +0,0 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
DELETED
@@ -1,83 +0,0 @@
---
title: Cloud Data Lake API
emoji: 📈
colorFrom: blue
colorTo: indigo
sdk: docker
pinned: false
app_port: 7860
---

# Cloud Data Lake API

A production-ready Financial Data API built with **FastAPI**, **DuckDB**, and **Hugging Face**.

This API allows serverless SQL querying of Parquet datasets stored on Hugging Face, with specialized endpoints for financial market data (OHLC).

## Features

- 🚀 **Serverless Architecture**: No database to manage; queries remote Parquet files directly.
- ⚡ **High Performance**: Powered by DuckDB's vectorized engine and `httpfs`.
- 🔒 **Secure**: Bearer Token authentication and read-only access.
- 📈 **Financial Data**: Dedicated endpoints for OHLC (Open-High-Low-Close) data.
- 🌐 **Hugging Face Integrated**: Seamless access to private Datasets.

## Installation

1. **Clone the repository**
   ```bash
   git clone https://huggingface.co/spaces/OMCHOKSI108/forexdatalake
   cd forexdatalake
   ```

2. **Install Dependencies**
   ```bash
   pip install -r requirements.txt
   ```

3. **Configure Environment**
   Create a `.env` file:
   ```env
   # Secrets (Best kept in repository secrets if deploying to Spaces)
   HF_TOKEN=hf_your_huggingface_token
   API_KEY=your-secret-key

   # Configuration (Can be standard variables)
   DATASET_URL=hf://datasets/OMCHOKSI108/my-cloud-data-lake/ALL_TIME_DATA/**/*.parquet
   # Optional Tuning
   DUCKDB_MEMORY_LIMIT=1GB
   DUCKDB_THREADS=2
   ```

### Environment Variables

| Variable | Type | Description | Required | Default |
|----------|------|-------------|----------|---------|
| `HF_TOKEN` | **Secret** | Hugging Face Access Token with read permissions. | Yes | - |
| `API_KEY` | **Secret** | Secret key for authenticating API requests. | Yes | `sk-dev-key-123` |
| `DATASET_URL` | Config | `hf://` URL pattern to your parquet files. | No | *See config.py* |
| `DUCKDB_MEMORY_LIMIT` | Config | Max memory for DuckDB (e.g. '1GB'). | No | `1GB` |
| `DUCKDB_THREADS` | Config | Number of Threads. | No | `2` |

## Usage

### Run the Server
```bash
uvicorn app.main:app --host 0.0.0.0 --port 7860
```

### API Endpoints

| Method | Endpoint | Description | Auth |
|--------|----------|-------------|------|
| `GET` | `/system/health` | Health check | No |
| `GET` | `/v1/symbols` | List available pairs | Yes |
| `GET` | `/v1/ohlc/{symbol}` | Get historical OHLC data | Yes |
| `POST` | `/query` | Execute raw SQL (Admin) | Yes |

### Example Request
```bash
curl -H "Authorization: Bearer your-secret-key" \
     "http://localhost:7860/v1/ohlc/EURUSD?interval=15min&limit=10"
```
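The deleted README's curl example translates directly to Python. The sketch below is illustrative only: `requests` and `pandas` are assumed client-side dependencies, not part of this Space, and the API key is a placeholder that must match the server's `.env`.

```python
# Minimal client sketch for the /v1/ohlc endpoint documented above.
import requests
import pandas as pd

BASE_URL = "http://localhost:7860"
headers = {"Authorization": "Bearer your-secret-key"}  # placeholder key

resp = requests.get(
    f"{BASE_URL}/v1/ohlc/EURUSD",
    params={"interval": "15min", "limit": 10},
    headers=headers,
    timeout=30,
)
resp.raise_for_status()
payload = resp.json()

# The body mirrors OHLCResponse: symbol, interval, data, count
df = pd.DataFrame(payload["data"])
print(payload["symbol"], payload["interval"], len(df))
```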
__pycache__/main.cpython-314.pyc
DELETED
Binary file (15.2 kB)
app/__init__.py
ADDED
File without changes
app/auth.py
DELETED
@@ -1,22 +0,0 @@
from fastapi import Security, HTTPException, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from app.config import get_settings

security = HTTPBearer()

def get_current_user(credentials: HTTPAuthorizationCredentials = Security(security)):
    """
    Validate the Bearer token.
    In a real production app, this would check against a database of users/keys.
    """
    settings = get_settings()
    token = credentials.credentials

    # Check against the configured API key (simple auth for now)
    if token != settings.API_KEY:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authentication credentials",
            headers={"WWW-Authenticate": "Bearer"},
        )
    return token
app/core/__init__.py
ADDED
File without changes
app/core/auth.py
ADDED
@@ -0,0 +1,40 @@
"""API key authentication dependency."""

import secrets

from fastapi import Security
from fastapi.security import APIKeyHeader

from app.config import settings
from app.core.exceptions import AuthenticationError

_api_key_header = APIKeyHeader(name=settings.API_KEY_HEADER, auto_error=False)

# Paths that bypass authentication
PUBLIC_PATHS: set[str] = {
    "/api/v1/health",
    "/docs",
    "/redoc",
    "/openapi.json",
}


async def require_api_key(
    api_key: str | None = Security(_api_key_header),
) -> str:
    """FastAPI dependency that validates the X-API-Key header.

    Usage:
        @router.get("/protected", dependencies=[Depends(require_api_key)])
    """
    if not settings.API_KEY:
        # No API key configured — auth is disabled
        return "no-auth"

    if api_key is None:
        raise AuthenticationError("Missing API key — provide via X-API-Key header")

    if not secrets.compare_digest(api_key, settings.API_KEY):
        raise AuthenticationError("Invalid API key")

    return api_key
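For reference, the dependency above is meant to be attached per-route, as its docstring describes. A minimal sketch (the router, path, and response body here are hypothetical):

```python
# Hypothetical router showing the dependency pattern from the docstring above.
from fastapi import APIRouter, Depends

from app.core.auth import require_api_key

router = APIRouter(prefix="/api/v1", tags=["symbols"])


@router.get("/symbols", dependencies=[Depends(require_api_key)])
async def list_symbols() -> dict:
    # Reaching this point means the X-API-Key header was present and valid,
    # or auth is disabled because settings.API_KEY is empty.
    return {"symbols": [], "total": 0}
```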
app/core/cache.py
ADDED
@@ -0,0 +1,117 @@
"""TTL-based in-memory cache with thread-safety.

A lightweight alternative to Redis for single-process deployments.
Supports decorator-based caching and manual get/set/invalidate.
"""

import hashlib
import json
import logging
import threading
import time
from collections import OrderedDict
from typing import Any, Callable

from app.config import settings

logger = logging.getLogger(__name__)


class TTLCache:
    """Thread-safe LRU cache with per-entry TTL expiration."""

    def __init__(
        self,
        max_size: int = settings.CACHE_MAX_SIZE,
        default_ttl: int = settings.CACHE_TTL,
    ) -> None:
        self._store: OrderedDict[str, tuple[Any, float]] = OrderedDict()
        self._lock = threading.Lock()
        self._max_size = max_size
        self._default_ttl = default_ttl
        self._hits = 0
        self._misses = 0

    # -- core operations -----------------------------------------------------

    def get(self, key: str) -> Any | None:
        with self._lock:
            entry = self._store.get(key)
            if entry is None:
                self._misses += 1
                return None
            value, expires_at = entry
            if time.time() > expires_at:
                del self._store[key]
                self._misses += 1
                return None
            # Move to end (most-recently-used)
            self._store.move_to_end(key)
            self._hits += 1
            return value

    def set(self, key: str, value: Any, ttl: int | None = None) -> None:
        ttl = ttl if ttl is not None else self._default_ttl
        expires_at = time.time() + ttl
        with self._lock:
            if key in self._store:
                self._store.move_to_end(key)
            self._store[key] = (value, expires_at)
            # Evict oldest if over capacity
            while len(self._store) > self._max_size:
                evicted_key, _ = self._store.popitem(last=False)
                logger.debug("Cache evicted key: %s", evicted_key)

    def invalidate(self, key: str) -> bool:
        with self._lock:
            if key in self._store:
                del self._store[key]
                return True
            return False

    def invalidate_pattern(self, prefix: str) -> int:
        """Remove all keys starting with the given prefix."""
        with self._lock:
            keys_to_remove = [k for k in self._store if k.startswith(prefix)]
            for k in keys_to_remove:
                del self._store[k]
            return len(keys_to_remove)

    def clear(self) -> None:
        with self._lock:
            self._store.clear()
            self._hits = 0
            self._misses = 0

    @property
    def stats(self) -> dict[str, int]:
        with self._lock:
            return {
                "size": len(self._store),
                "max_size": self._max_size,
                "hits": self._hits,
                "misses": self._misses,
            }

    # -- cleanup -------------------------------------------------------------

    def evict_expired(self) -> int:
        """Remove all expired entries. Returns count of evicted entries."""
        now = time.time()
        with self._lock:
            expired = [k for k, (_, exp) in self._store.items() if now > exp]
            for k in expired:
                del self._store[k]
            return len(expired)


def make_cache_key(*args: Any, **kwargs: Any) -> str:
    """Generate a deterministic cache key from arguments."""
    raw = json.dumps({"args": args, "kwargs": kwargs}, sort_keys=True, default=str)
    return hashlib.md5(raw.encode()).hexdigest()


# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
cache = TTLCache()
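The manual get/set/invalidate flow mentioned in the module docstring looks roughly like this. In the sketch below, `cache` and `make_cache_key` are the module-level objects defined above, while `load_symbols_from_hub` is a hypothetical expensive call used only for illustration:

```python
# Sketch: manual cache usage around an expensive lookup.
from app.core.cache import cache, make_cache_key


def get_symbols_cached(source: str = "ALL_TIME_DATA") -> list[str]:
    key = "symbols:" + make_cache_key(source)
    cached = cache.get(key)
    if cached is not None:
        return cached

    symbols = load_symbols_from_hub(source)  # hypothetical remote call
    cache.set(key, symbols, ttl=300)         # keep for 5 minutes
    return symbols


# After a data sync, drop everything under the "symbols:" prefix:
cache.invalidate_pattern("symbols:")
```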
app/core/exceptions.py
ADDED
@@ -0,0 +1,104 @@
"""Custom exception hierarchy for the ForexDataLake API.

All application exceptions inherit from AppException and carry
a status_code, error_code, and detail message. The global exception
handler in main.py catches these and returns structured JSON responses.
"""

from fastapi import Request
from fastapi.responses import JSONResponse


class AppException(Exception):
    """Base exception for all application errors."""

    status_code: int = 500
    error_code: str = "INTERNAL_ERROR"
    detail: str = "An unexpected error occurred"

    def __init__(self, detail: str | None = None, **kwargs):
        self.detail = detail or self.__class__.detail
        self.extra = kwargs
        super().__init__(self.detail)


class SymbolNotFoundError(AppException):
    status_code = 404
    error_code = "SYMBOL_NOT_FOUND"
    detail = "The requested symbol was not found"


class TimeframeNotFoundError(AppException):
    status_code = 404
    error_code = "TIMEFRAME_NOT_FOUND"
    detail = "The requested timeframe is not available"


class InvalidDateRangeError(AppException):
    status_code = 400
    error_code = "INVALID_DATE_RANGE"
    detail = "The provided date range is invalid"


class DataNotAvailableError(AppException):
    status_code = 404
    error_code = "DATA_NOT_AVAILABLE"
    detail = "No data available for the requested parameters"


class AuthenticationError(AppException):
    status_code = 401
    error_code = "AUTHENTICATION_FAILED"
    detail = "Invalid or missing API key"


class RateLimitExceededError(AppException):
    status_code = 429
    error_code = "RATE_LIMIT_EXCEEDED"
    detail = "Too many requests — please slow down"


class DataSyncError(AppException):
    status_code = 503
    error_code = "DATA_SYNC_ERROR"
    detail = "Failed to synchronize data from remote source"


class ValidationError(AppException):
    status_code = 422
    error_code = "VALIDATION_ERROR"
    detail = "Request validation failed"


class DatabaseError(AppException):
    status_code = 500
    error_code = "DATABASE_ERROR"
    detail = "A database error occurred"


# ---------------------------------------------------------------------------
# Global exception handlers (registered in main.py via app.add_exception_handler)
# ---------------------------------------------------------------------------

async def app_exception_handler(request: Request, exc: AppException) -> JSONResponse:
    """Handle all AppException subclasses with structured JSON."""
    return JSONResponse(
        status_code=exc.status_code,
        content={
            "error": exc.error_code,
            "detail": exc.detail,
            "path": str(request.url),
        },
    )


async def generic_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Catch-all for unhandled exceptions — never leak stack traces."""
    return JSONResponse(
        status_code=500,
        content={
            "error": "INTERNAL_ERROR",
            "detail": "An unexpected internal error occurred",
            "path": str(request.url),
        },
    )
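The module docstring says these handlers are registered in `main.py`; that file is not part of this diff, so the wiring below is a sketch of the standard FastAPI registration pattern rather than the project's actual code:

```python
# Sketch of the registration the module docstring describes.
from fastapi import FastAPI

from app.core.exceptions import (
    AppException,
    app_exception_handler,
    generic_exception_handler,
)

app = FastAPI(title="ForexDataLake API")
app.add_exception_handler(AppException, app_exception_handler)
app.add_exception_handler(Exception, generic_exception_handler)

# A route can then simply raise SymbolNotFoundError("EURUSD not found") and the
# client receives {"error": "SYMBOL_NOT_FOUND", "detail": ..., "path": ...}.
```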
app/core/logging_config.py
ADDED
@@ -0,0 +1,34 @@
"""Structured logging configuration for the application."""

import logging
import sys

from app.config import settings


def setup_logging() -> None:
    """Configure root logger with structured format and appropriate level."""
    log_level = getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO)

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(name)-30s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    handler.setLevel(log_level)

    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    root_logger.handlers.clear()
    root_logger.addHandler(handler)

    # Silence noisy third-party loggers
    for name in ("httpcore", "httpx", "hpack", "urllib3", "fsspec"):
        logging.getLogger(name).setLevel(logging.WARNING)


def get_logger(name: str) -> logging.Logger:
    """Return a named logger instance."""
    return logging.getLogger(name)
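Typical startup wiring for this module is a one-time `setup_logging()` call followed by per-module loggers; a minimal sketch, assuming it is invoked early in application startup:

```python
from app.core.logging_config import get_logger, setup_logging

setup_logging()
logger = get_logger(__name__)
logger.info("Logging configured")
```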
app/database.py
DELETED
@@ -1,51 +0,0 @@
import duckdb
import logging
from contextlib import contextmanager
from huggingface_hub import HfFileSystem
from app.config import get_settings

logger = logging.getLogger(__name__)

def get_duckdb_connection():
    """Get a new DuckDB connection with httpfs extension and registered HF filesystem."""
    settings = get_settings()
    conn = duckdb.connect()

    # Configure DuckDB for performance
    conn.execute(f"""
        SET memory_limit='{settings.DUCKDB_MEMORY_LIMIT}';
        SET threads={settings.DUCKDB_THREADS};
        SET enable_progress_bar=false;
        SET enable_object_cache=true;
    """)

    # Register Hugging Face FileSystem
    if settings.HF_TOKEN:
        try:
            fs = HfFileSystem(token=settings.HF_TOKEN)
            conn.register_filesystem(fs)
        except Exception as e:
            logger.error(f"Failed to register HfFileSystem: {e}")
    else:
        logger.warning("HF_TOKEN is not set. Access to private datasets will fail.")

    # Install and load httpfs extension for remote file access
    try:
        conn.execute("INSTALL httpfs")
        conn.execute("LOAD httpfs")
    except Exception as e:
        logger.error(f"Failed to load httpfs extension: {e}")
        # Re-raise or handle appropriately depending on app needs
        raise

    return conn

@contextmanager
def duckdb_transaction():
    """Context manager for DuckDB transactions."""
    db_conn = get_duckdb_connection()
    try:
        yield db_conn
    except Exception as e:
        logger.error(f"Database error: {e}")
        raise
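Outside the app, the same setup the deleted helper performed can be reproduced in a few lines: register an `HfFileSystem` on a DuckDB connection, load `httpfs`, and query `hf://` Parquet paths directly. A sketch, with the token and glob path as placeholders:

```python
# Standalone sketch of what the deleted get_duckdb_connection() wired together.
import duckdb
from huggingface_hub import HfFileSystem

conn = duckdb.connect()
conn.execute("INSTALL httpfs")
conn.execute("LOAD httpfs")
conn.register_filesystem(HfFileSystem(token="hf_your_huggingface_token"))  # placeholder token

rows = conn.execute(
    "SELECT * "
    "FROM read_parquet('hf://datasets/OMCHOKSI108/my-cloud-data-lake/ALL_TIME_DATA/15min_time/*.parquet') "
    "LIMIT 5"
).fetchall()
print(rows)
```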
app/models.py
DELETED
@@ -1,38 +0,0 @@
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field

# --- Shared Models ---

class ErrorResponse(BaseModel):
    error: str
    detail: Optional[str] = None

# --- Query Models ---

class QueryRequest(BaseModel):
    sql_query: str = Field(..., description="SQL query to execute against the dataset")
    limit: Optional[int] = Field(1000, description="Maximum number of rows to return", ge=1, le=10000)

class QueryResponse(BaseModel):
    data: List[Dict[str, Any]]
    columns: List[str]
    row_count: int
    execution_time_ms: float

# --- Schema Models ---

class SchemaResponse(BaseModel):
    schema: Dict[str, str]
    total_files: int

# --- Financial Data Models ---

class OHLCResponse(BaseModel):
    symbol: str
    interval: str
    data: List[Dict[str, Any]]  # format: {time, open, high, low, close}
    count: int

class SymbolResponse(BaseModel):
    symbols: List[str]
    count: int
app/models/__init__.py
ADDED
File without changes
app/models/common.py
ADDED
@@ -0,0 +1,78 @@
"""Shared enums, pagination, and base response schemas."""

from datetime import datetime
from enum import Enum
from typing import Any, Generic, TypeVar

from pydantic import BaseModel, Field

from app.config import settings


class TimeframeEnum(str, Enum):
    """Supported candlestick timeframes."""
    MIN_1 = "1min"
    MIN_5 = "5min"
    MIN_15 = "15min"
    MIN_30 = "30min"
    HR_1 = "1hr"
    HR_4 = "4hr"
    DAY_1 = "1day"


class AssetClassEnum(str, Enum):
    """Asset class categories."""
    FOREX_MAJOR = "forex_major"
    FOREX_CROSS = "forex_cross"
    CRYPTO_MAJOR = "crypto_major"
    CRYPTO_ALT = "crypto_alt"
    CRYPTO_CROSS = "crypto_cross"
    COMMODITIES = "commodities"
    STOCKS = "stocks"


class SourceEnum(str, Enum):
    """Data source partition."""
    ALL_TIME = "ALL_TIME_DATA"
    RECENT = "recent"


class PaginationParams(BaseModel):
    """Common pagination parameters."""
    limit: int = Field(
        default=settings.DEFAULT_LIMIT,
        ge=1,
        le=settings.MAX_ROWS_PER_REQUEST,
        description="Number of rows to return",
    )
    offset: int = Field(default=0, ge=0, description="Number of rows to skip")


T = TypeVar("T")


class PaginatedResponse(BaseModel, Generic[T]):
    """Generic paginated response wrapper."""
    total: int = Field(description="Total available rows")
    returned: int = Field(description="Rows returned in this response")
    limit: int
    offset: int
    data: list[T]


class ErrorResponse(BaseModel):
    """Standard error response body."""
    error: str = Field(description="Machine-readable error code")
    detail: str = Field(description="Human-readable error message")
    path: str = Field(description="Request path that triggered the error")


class HealthResponse(BaseModel):
    """Health check response."""
    status: str = "ok"
    version: str = "1.0.0"
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    total_files: int = 0
    total_symbols: int = 0
    total_timeframes: int = 0
    cache_entries: int = 0
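The generic `PaginatedResponse[T]` wrapper above can be instantiated with any item model; a minimal sketch, where `SymbolRow` and the example values are purely illustrative (the total of 111 echoes the instrument count documented later in this commit):

```python
from pydantic import BaseModel

from app.models.common import PaginatedResponse, PaginationParams


class SymbolRow(BaseModel):
    symbol: str


page = PaginationParams(limit=50, offset=0)
response = PaginatedResponse[SymbolRow](
    total=111,
    returned=2,
    limit=page.limit,
    offset=page.offset,
    data=[SymbolRow(symbol="EURUSD"), SymbolRow(symbol="BTCUSD")],
)
print(response.model_dump())
```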
app/models/indicators.py
ADDED
@@ -0,0 +1,29 @@
"""Pydantic schemas for technical indicator endpoints."""

from typing import Any

from pydantic import BaseModel, Field


class IndicatorValue(BaseModel):
    """Single indicator data point alongside OHLCV."""
    ts: Any
    open: float
    high: float
    low: float
    close: float
    volume: float
    indicator_value: float | None = Field(
        default=None, description="Computed indicator value"
    )


class IndicatorResponse(BaseModel):
    """Response for indicator calculation."""
    symbol: str
    timeframe: str
    indicator: str
    period: int
    total_rows: int
    returned: int
    data: list[IndicatorValue]
app/models/market_data.py
ADDED
@@ -0,0 +1,46 @@
"""Pydantic schemas for market data (OHLCV) endpoints."""

from datetime import datetime
from typing import Any

from pydantic import BaseModel, Field


class OHLCVBar(BaseModel):
    """Single OHLCV candlestick bar."""
    ts: Any = Field(description="Candle timestamp")
    open: float = Field(description="Opening price")
    high: float = Field(description="Highest price")
    low: float = Field(description="Lowest price")
    close: float = Field(description="Closing price")
    volume: float = Field(description="Trade volume")


class OHLCVResponse(BaseModel):
    """Paginated OHLCV data response."""
    symbol: str
    timeframe: str
    source: str = "ALL_TIME_DATA"
    total_rows: int = Field(description="Total available rows for this query")
    returned: int = Field(description="Rows in this response")
    limit: int
    offset: int
    data: list[OHLCVBar]


class DateRangeInfo(BaseModel):
    """Available date range for a symbol/timeframe."""
    symbol: str
    timeframe: str
    source: str = "ALL_TIME_DATA"
    start_date: Any = Field(description="Earliest timestamp in dataset")
    end_date: Any = Field(description="Latest timestamp in dataset")
    total_rows: int = Field(description="Total row count")


class LatestBar(BaseModel):
    """Latest N bars response."""
    symbol: str
    timeframe: str
    count: int
    data: list[OHLCVBar]
app/models/reports.py
ADDED
@@ -0,0 +1,41 @@
"""Pydantic schemas for portfolio report endpoints."""

from typing import Any

from pydantic import BaseModel, Field


class EquityPoint(BaseModel):
    """Single equity curve data point."""
    step: int = Field(description="Simulation step index")
    equity: float = Field(description="Portfolio equity value")


class EquityResponse(BaseModel):
    """Paginated equity curve response."""
    total: int
    returned: int
    limit: int
    offset: int
    data: list[EquityPoint]


class TradeRecord(BaseModel):
    """Single trade record from backtest."""
    pair: str = Field(description="Trading pair")
    type: str = Field(description="Order type")
    side: str = Field(description="Buy or Sell")
    price: float = Field(description="Execution price")
    size: float = Field(description="Position size")
    time: Any = Field(description="Trade timestamp")
    score: float = Field(description="Strategy signal score")
    pnl: float = Field(description="Profit & Loss")


class TradesResponse(BaseModel):
    """Paginated trades response."""
    total: int
    returned: int
    limit: int
    offset: int
    data: list[TradeRecord]
app/models/symbols.py
ADDED
@@ -0,0 +1,43 @@
"""Pydantic schemas for symbol-related endpoints."""

from pydantic import BaseModel, Field

from app.models.common import AssetClassEnum, TimeframeEnum


class SymbolInfo(BaseModel):
    """Summary info for a single instrument."""
    symbol: str = Field(description="Instrument symbol identifier")
    asset_class: AssetClassEnum | None = Field(
        default=None, description="Asset class category"
    )
    available_timeframes: list[str] = Field(
        default_factory=list, description="Timeframes with data"
    )
    description: str = Field(default="", description="Human-readable name")


class SymbolDetail(BaseModel):
    """Full detail for a single instrument including per-timeframe stats."""
    symbol: str
    asset_class: AssetClassEnum | None = None
    description: str = ""
    timeframes: list["TimeframeDetail"] = Field(default_factory=list)


class TimeframeDetail(BaseModel):
    """Stats for one timeframe of a symbol."""
    timeframe: str
    source: str
    row_count: int = 0
    file_path: str = ""


class SymbolSearchResult(BaseModel):
    """Search result for symbol lookup."""
    total: int
    results: list[SymbolInfo]


# Rebuild forward refs
SymbolDetail.model_rebuild()
app/routers/market_data.py
DELETED
@@ -1,96 +0,0 @@
from datetime import datetime, date
from typing import Optional, List, Dict
from fastapi import APIRouter, HTTPException, Depends, Query
from app.database import duckdb_transaction
from app.models import OHLCResponse, SymbolResponse
from app.config import get_settings
from app.auth import get_current_user
from app.services.dataset import get_cached_symbols, get_file_url

router = APIRouter(prefix="/v1", tags=["Market Data"])

@router.get("/symbols", response_model=SymbolResponse)
async def list_symbols(user: str = Depends(get_current_user)):
    """
    List all available trading pairs/symbols.
    Fetches file list from Hugging Face API (cached).
    """
    try:
        symbols = await get_cached_symbols()
        return SymbolResponse(
            symbols=symbols,
            count=len(symbols)
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to list symbols: {str(e)}")

@router.get("/ohlc/{symbol}", response_model=OHLCResponse)
async def get_ohlc_data(
    symbol: str,
    interval: str = Query("15min", description="Time interval: 1min, 5min, 15min, 30min, 1hr, 4hr, 1day"),
    start_date: Optional[date] = Query(None, description="Start date (YYYY-MM-DD)"),
    end_date: Optional[date] = Query(None, description="End date (YYYY-MM-DD)"),
    limit: int = Query(100, ge=1, le=10000, description="Max records to return"),
    user: str = Depends(get_current_user)
):
    """
    Get OHLC (Open, High, Low, Close) data.
    """
    try:
        # Normalize interval
        interval_map = {
            "1m": "1min", "1min": "1min",
            "5m": "5min", "5min": "5min",
            "15m": "15min", "15min": "15min",
            "30m": "30min", "30min": "30min",
            "1h": "1hr", "1hr": "1hr",
            "4h": "4hr", "4hr": "4hr",
            "1d": "1day", "1day": "1day"
        }
        target_interval = interval_map.get(interval, "15min")

        # Get file URL from cache
        file_url = await get_file_url(symbol, target_interval)

        if not file_url:
            raise HTTPException(status_code=404, detail=f"Symbol '{symbol}' not found for interval '{interval}'")

        with duckdb_transaction() as db_conn:
            where_clauses = []
            if start_date:
                where_clauses.append(f"time >= '{start_date}'")
            if end_date:
                where_clauses.append(f"time <= '{end_date}'")

            where_stmt = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""

            query = f"""
                SELECT *
                FROM read_parquet('{file_url}')
                {where_stmt}
                ORDER BY time DESC
                LIMIT {limit}
            """

            result = db_conn.execute(query)

            columns = [desc[0] for desc in result.description]
            rows = result.fetchall()

            data = []
            for row in rows:
                item = dict(zip(columns, row))
                data.append(item)

        return OHLCResponse(
            symbol=symbol,
            interval=target_interval,
            data=data,
            count=len(data)
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to fetch OHLC data: {str(e)}")
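The deleted handler interpolates the date filters directly into the SQL text. DuckDB's Python API also accepts bound parameters, so the same filter could be expressed with placeholders instead; a standalone sketch, where the file URL is a guessed example (encoding the `#` in the filename as `%23`, as the services layer below does) and the connection setup from `app/database.py` is assumed:

```python
# Sketch: the same OHLC window query with bound parameters for the date filters.
import duckdb
from huggingface_hub import HfFileSystem

file_url = "hf://datasets/OMCHOKSI108/my-cloud-data-lake/ALL_TIME_DATA/15min_time/EURUSD%23_15min.parquet"  # illustrative path

conn = duckdb.connect()
conn.execute("INSTALL httpfs")
conn.execute("LOAD httpfs")
conn.register_filesystem(HfFileSystem(token="hf_your_huggingface_token"))  # placeholder token

rows = conn.execute(
    f"""
    SELECT *
    FROM read_parquet('{file_url}')
    WHERE time >= ? AND time <= ?
    ORDER BY time DESC
    LIMIT 100
    """,
    ["2024-01-01", "2024-12-31"],
).fetchall()
```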
app/routers/raw_query.py
DELETED
@@ -1,122 +0,0 @@
from fastapi import APIRouter, HTTPException, Depends
from app.database import duckdb_transaction
from app.models import QueryResponse, QueryRequest
from app.config import get_settings
from app.auth import get_current_user
import re
import time

router = APIRouter(prefix="/query", tags=["Raw SQL"])

def validate_sql_query(query: str) -> bool:
    """
    Validate SQL query for security and allowed operations.
    Only allows SELECT queries and prevents malicious operations.
    """
    query_upper = query.upper().strip()

    # Only allow SELECT statements
    if not query_upper.startswith('SELECT'):
        return False

    # Block dangerous keywords
    dangerous_keywords = [
        'DROP', 'DELETE', 'UPDATE', 'INSERT', 'CREATE', 'ALTER',
        'EXEC', 'EXECUTE', 'UNION', 'SCRIPT', 'COPY', 'EXPORT',
        'PRAGMA', 'VACUUM', 'CHECKPOINT', 'CALL', 'LOAD', 'INSTALL'
    ]

    for keyword in dangerous_keywords:
        if keyword in query_upper:
            return False

    # Block file access functions
    file_access_functions = [
        'READ_CSV', 'READ_PARQUET', 'READ_JSON', 'READ_BLOB', 'READ_TEXT',
        'SCAN_PARQUET', 'SCAN_CSV', 'SCAN_JSON'
    ]

    for func in file_access_functions:
        if func in query_upper:
            return False

    # Check for suspicious patterns
    suspicious_patterns = [
        r';\s*SELECT',  # Multiple statements
        r'--',          # SQL comments
        r'/\*',         # Multi-line comments
        r'xp_',         # Extended procedures
        r'sp_',         # Stored procedures
    ]

    for pattern in suspicious_patterns:
        if re.search(pattern, query, re.IGNORECASE):
            return False

    return True

def sanitize_query(query: str, dataset_url: str) -> str:
    """
    Sanitize and prepare query for execution against remote Parquet files.
    Ensures query targets the correct dataset URL.
    """
    # Simple replacement if FROM clause exists
    if "FROM" in query.upper():
        # Replace user table with read_parquet
        query = re.sub(r'FROM\s+([a-zA-Z0-9_]+)', f'FROM read_parquet(\'{dataset_url}\', filename=true)', query, flags=re.IGNORECASE)

    # Ensure the query ends with a semicolon
    if not query.strip().endswith(';'):
        query += ';'

    return query

@router.post("/", response_model=QueryResponse)
async def execute_query(request: QueryRequest, user: str = Depends(get_current_user)):
    """
    Execute SQL query against the remote Parquet dataset.
    **Requires Authentication**
    """
    settings = get_settings()
    start_time = time.time()

    # Validate query
    if not validate_sql_query(request.sql_query):
        raise HTTPException(
            status_code=400,
            detail="Invalid SQL query. Only SELECT statements are allowed."
        )

    # Prepare query
    sanitized_query = sanitize_query(request.sql_query, settings.DATASET_URL)

    # Add limit if not specified in query
    if 'LIMIT' not in sanitized_query.upper():
        sanitized_query = sanitized_query.rstrip(';') + f' LIMIT {request.limit};'

    try:
        with duckdb_transaction() as db_conn:
            # Execute query
            result = db_conn.execute(sanitized_query)

            # Fetch results
            rows = result.fetchall()
            columns = [desc[0] for desc in result.description]

            # Convert to list of dictionaries
            data = []
            for row in rows:
                item = dict(zip(columns, row))
                data.append(item)

        execution_time = (time.time() - start_time) * 1000

        return QueryResponse(
            data=data,
            columns=columns,
            row_count=len(data),
            execution_time_ms=execution_time
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Query execution failed: {str(e)}")
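For clarity on how this deleted endpoint was called: the request body mirrors `QueryRequest` (`sql_query`, `limit`), and `sanitize_query()` rewrites the bare table name after `FROM` into a `read_parquet()` call over `DATASET_URL`. A client-side sketch, with the base URL, key, and table alias purely illustrative:

```python
# Sketch of a call to the old POST /query endpoint.
import requests

resp = requests.post(
    "http://localhost:7860/query/",
    headers={"Authorization": "Bearer your-secret-key"},  # placeholder key
    json={
        "sql_query": "SELECT time, open, close FROM dataset WHERE close > open",
        "limit": 50,
    },
    timeout=60,
)
body = resp.json()
print(body["row_count"], body["execution_time_ms"])
```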
app/routers/system.py
DELETED
@@ -1,58 +0,0 @@
from fastapi import APIRouter, HTTPException, Depends
from app.database import duckdb_transaction
from app.models import SchemaResponse, QueryResponse, QueryRequest
from app.config import get_settings
from app.auth import get_current_user

router = APIRouter(prefix="/system", tags=["System"])

@router.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "service": "cloud-data-lake-api"}

@router.get("/describe", response_model=SchemaResponse)
async def get_dataset_schema(user: str = Depends(get_current_user)):
    """
    Get the schema of the dataset.
    Requires authentication.
    """
    settings = get_settings()
    try:
        # Get a safe file to describe schema from
        from app.services.dataset import get_one_safe_file_url, get_total_file_count

        safe_file_url = await get_one_safe_file_url()
        if not safe_file_url:
            raise HTTPException(status_code=404, detail="No parquet files found in dataset to describe.")

        total_files = await get_total_file_count()

        with duckdb_transaction() as db_conn:
            # Get schema from the single safe file
            # We use filename=true to ensure the filename column is available if requested
            schema_query = f"""
                SELECT
                    column_name,
                    column_type
                FROM (
                    DESCRIBE SELECT * FROM read_parquet('{safe_file_url}', filename=true)
                )
            """

            result = db_conn.execute(schema_query)
            schema_rows = result.fetchall()

            # Build schema dictionary
            schema_dict = {}
            for row in schema_rows:
                column_name, column_type = row[:2]
                schema_dict[column_name] = column_type

        return SchemaResponse(
            schema=schema_dict,
            total_files=total_files
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to retrieve schema: {str(e)}")
app/services/__init__.py
ADDED
File without changes
app/services/dataset.py
DELETED
@@ -1,114 +0,0 @@
from huggingface_hub import HfApi
import re
import time
import urllib.parse
from typing import List, Dict, Optional
from fastapi.concurrency import run_in_threadpool
from app.config import get_settings

# Global Cache
# Map: symbol -> {interval: safe_url}
SYMBOL_CACHE: Dict[str, Dict[str, str]] = {}
CACHE_TIMESTAMP = 0
CACHE_DURATION_SECONDS = 300  # 5 minutes
ALL_FILES_CACHE: List[str] = []

def fetch_file_list_sync(repo_id: str, token: str) -> List[str]:
    """
    Sync function to list files using HfApi.
    """
    api = HfApi(token=token)
    return api.list_repo_files(repo_id=repo_id, repo_type="dataset")

async def refresh_cache_if_needed():
    """
    Refresh the global symbol cache if expired.
    Uses HfApi to list files and constructs safe encoded URLs.
    """
    global SYMBOL_CACHE, CACHE_TIMESTAMP, ALL_FILES_CACHE

    settings = get_settings()

    if time.time() - CACHE_TIMESTAMP < CACHE_DURATION_SECONDS and SYMBOL_CACHE:
        return

    # Extract Repo ID
    if "hf://datasets/" in settings.DATASET_URL:
        # e.g. hf://datasets/OMCHOKSI108/my-cloud-data-lake/...
        repo_parts = settings.DATASET_URL.replace("hf://datasets/", "").split("/")
        repo_id = f"{repo_parts[0]}/{repo_parts[1]}"
    else:
        repo_id = "OMCHOKSI108/my-cloud-data-lake"

    try:
        # Run sync API call in threadpool to avoid blocking async loop
        files = await run_in_threadpool(fetch_file_list_sync, repo_id, settings.HF_TOKEN)

        # Prepare new cache
        new_cache = {}
        valid_files = []

        for file_path in files:
            if not file_path.endswith('.parquet'):
                continue

            # Construct Safe URL for DuckDB
            # Encode path components, specifically to handle '#' -> '%23'
            # file_path is like: ALL_TIME_DATA/15min_time/AAVEUSD#_15min.parquet
            encoded_path = urllib.parse.quote(file_path, safe='/')
            full_url = f"hf://datasets/{repo_id}/{encoded_path}"

            valid_files.append(full_url)

            # Parse Symbol and Interval
            filename = file_path.split("/")[-1]

            # Regex to match: SYMBOL#_INTERVAL.parquet OR SYMBOL_INTERVAL.parquet
            # Handling both cases (with and without #)
            match = re.search(r"([A-Z0-9\.\s]+)[#_]+([0-9a-z]+)\.parquet", filename)

            if match:
                symbol = match.group(1)
                interval = match.group(2)  # e.g. "15min"

                if symbol not in new_cache:
                    new_cache[symbol] = {}

                new_cache[symbol][interval] = full_url

        # Update Globals
        ALL_FILES_CACHE = valid_files
        SYMBOL_CACHE = new_cache
        CACHE_TIMESTAMP = time.time()

    except Exception as e:
        print(f"Error refreshing dataset cache: {e}")
        # If cache exists, keep using it on error
        if not SYMBOL_CACHE:
            # If no cache, we might want to let it be empty or re-raise
            pass

async def get_cached_symbols() -> List[str]:
    await refresh_cache_if_needed()
    return sorted(list(SYMBOL_CACHE.keys()))

async def get_file_url(symbol: str, interval: str) -> Optional[str]:
    await refresh_cache_if_needed()
    if symbol in SYMBOL_CACHE and interval in SYMBOL_CACHE[symbol]:
        return SYMBOL_CACHE[symbol][interval]
    return None

async def get_total_file_count() -> int:
    await refresh_cache_if_needed()
    return len(ALL_FILES_CACHE)

async def get_one_safe_file_url() -> Optional[str]:
    """Return one safe file URL for schema inference."""
    await refresh_cache_if_needed()
    if ALL_FILES_CACHE:
        # Prefer a file without special characters if possible
        for f in ALL_FILES_CACHE:
            if "%23" not in f:  # Encoded #
                return f
        return ALL_FILES_CACHE[0]
    return None
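A quick standalone check of the filename-parsing regex above, run against the example path mentioned in the comments, makes the two capture groups concrete:

```python
# Sketch: what the symbol/interval regex extracts from a dataset filename.
import re

filename = "AAVEUSD#_15min.parquet"
match = re.search(r"([A-Z0-9\.\s]+)[#_]+([0-9a-z]+)\.parquet", filename)
if match:
    print(match.group(1), match.group(2))  # -> AAVEUSD 15min
```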
docs/DATASET_CONTEXT.md
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OMCHOKSI108/my-cloud-data-lake — Dataset Context & API Reference
|
| 2 |
+
|
| 3 |
+
> **Source:** `https://huggingface.co/datasets/OMCHOKSI108/my-cloud-data-lake`
|
| 4 |
+
> **Format:** Apache Parquet
|
| 5 |
+
> **Total Size:** ~4.65 GB
|
| 6 |
+
> **Generated:** February 2026
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## 1. High-Level Summary
|
| 11 |
+
|
| 12 |
+
| Metric | Value |
|
| 13 |
+
|---|---|
|
| 14 |
+
| Total Parquet Files | **793** |
|
| 15 |
+
| Total Data Rows | **276,427,113** (~276 M) |
|
| 16 |
+
| Unique Instruments | **111** |
|
| 17 |
+
| Timeframes | **7** (1min, 5min, 15min, 30min, 1hr, 4hr, 1day) |
|
| 18 |
+
| Top-Level Folders | **3** (`ALL_TIME_DATA`, `data`, `reports`) |
|
| 19 |
+
| Column Schema (OHLCV) | `ts`, `open`, `high`, `low`, `close`, `volume` |
|
| 20 |
+
| Row Count Range | 106 — 9,823,126 per file |
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## 2. Folder Structure

```
my-cloud-data-lake/  (4.65 GB)
├── ALL_TIME_DATA/                  756 files │ 275,232,406 rows
│   ├── 1min_time/                  108 files │ 182,411,731 rows
│   ├── 5min_time/                  108 files │  51,658,834 rows
│   ├── 15min_time/                 108 files │  22,286,150 rows
│   ├── 30min_time/                 108 files │  11,251,532 rows
│   ├── 1hr_time/                   108 files │   5,739,404 rows
│   ├── 4hr_time/                   108 files │   1,526,167 rows
│   └── 1day_time/                  108 files │     358,588 rows
├── data/                            35 files │   1,185,910 rows
│   ├── 1min_time/                    5 files │     500,093 rows
│   ├── 5min_time/                    5 files │     422,691 rows
│   ├── 15min_time/                   5 files │     144,273 rows
│   ├── 30min_time/                   5 files │      72,193 rows
│   ├── 1hr_time/                     5 files │      36,099 rows
│   ├── 4hr_time/                     5 files │       9,039 rows
│   └── 1day_time/                    5 files │       1,522 rows
├── reports/                          2 files │       8,797 rows
│   ├── portfolio_equity.parquet     1 file  │       5,000 rows
│   └── portfolio_trades.parquet     1 file  │       3,797 rows
├── .gitattributes
└── README.md
```

---

## 3. Data Schema

### 3.1 OHLCV Files (791 files — ALL_TIME_DATA + data)

Every OHLCV parquet file has **6 columns** with a uniform schema:

| Column | Type | Description |
|---|---|---|
| `ts` | datetime/string | Timestamp of the candle bar |
| `open` | float | Opening price |
| `high` | float | Highest price in the period |
| `low` | float | Lowest price in the period |
| `close` | float | Closing price |
| `volume` | int/float | Trade volume during the period |

> **Note:** Some files in `data/` have 7 columns — the first column appears to be a malformed header row baked into the schema (data from a raw TSV conversion). The core data columns remain the same 6 OHLCV fields.

### 3.2 Reports Files

**`reports/portfolio_equity.parquet`** — 5,000 rows, 2 columns:

| Column | Description |
|---|---|
| `step` | Simulation/backtest step index |
| `equity` | Portfolio equity value at that step |

**`reports/portfolio_trades.parquet`** — 3,797 rows, 8 columns:

| Column | Description |
|---|---|
| `pair` | Trading pair/instrument |
| `type` | Order type |
| `side` | Buy or Sell |
| `price` | Execution price |
| `size` | Position size |
| `time` | Trade timestamp |
| `score` | Signal/strategy score |
| `pnl` | Profit & Loss |

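Given the note in §3.1 about the 7-column files in `data/`, a loader that coerces every file to the canonical 6-column shape keeps downstream code simple. The sketch below is illustrative only: the function name is ours, and treating anything outside the six named columns as droppable is an assumption based on that note.

```python
import pandas as pd

OHLCV_COLS = ["ts", "open", "high", "low", "close", "volume"]

def load_ohlcv(path: str) -> pd.DataFrame:
    """Read one OHLCV parquet file and normalize it to the 6-column schema."""
    df = pd.read_parquet(path)

    # Keep only the canonical columns; this drops the stray leading column
    # seen in some `data/` files (assumed to carry no information).
    missing = [c for c in OHLCV_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path}: missing expected columns {missing}")
    df = df[OHLCV_COLS]

    # `ts` may be stored as a string in some files — coerce to datetime.
    df["ts"] = pd.to_datetime(df["ts"], errors="coerce", utc=True)
    return df.dropna(subset=["ts"]).sort_values("ts").reset_index(drop=True)
```

---
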
## 4. Instruments Catalog (111 Unique)

### 4.1 Forex Pairs (28)

Major, minor, and cross pairs:

| Pair | Pair | Pair | Pair |
|---|---|---|---|
| AUDCAD# | AUDCHF# | AUDJPY# | AUDNZD# |
| AUDUSD# | CADCHF# | CADJPY# | CHFJPY# |
| EURAUD# | EURCAD# | EURCHF# | EURGBP# |
| EURJPY# | EURNZD# | EURUSD# | GBPAUD# |
| GBPCAD# | GBPCHF# | GBPJPY# | GBPNZD# |
| GBPUSD# | NZDCAD# | NZDCHF# | NZDJPY# |
| NZDUSD# | USDCAD# | USDCHF# | USDJPY# |

> Additional forex in the `data/` folder without the `#` suffix: EURUSD, GBPUSD, USDJPY

### 4.2 Cryptocurrencies (58)

| Symbol | Symbol | Symbol | Symbol |
|---|---|---|---|
| 1INCHUSD# | AAVEUSD# | ADAUSD# | ALGOUSD# |
| APEUSD# | APTUSD# | ARBUSD# | ATOMUSD# |
| AVAXUSD# | AXSUSD# | BATUSD# | BCHUSD# |
| BTCEUR# | BTCGBP# | BTCJPY# | BTCUSD# |
| BTGUSD# | CHZUSD# | COMPUSD# | CRVUSD# |
| DASHUSD# | DOGEUSD# | DOTUSD# | EGLDUSD# |
| ENJUSD# | ETCUSD# | ETHBTC# | ETHEUR# |
| ETHGBP# | ETHUSD# | FILUSD# | FLOWUSD# |
| GRTUSD# | ICPUSD# | IMXUSD# | LDOUSD# |
| LINKUSD# | LRCUSD# | LTCUSD# | MANAUSD# |
| MATICUSD# | NEARUSD# | OPUSD# | SANDUSD# |
| SHIBUSD# | SNXUSD# | SOLUSD# | STORJUSD# |
| STXUSD# | SUSHIUSD# | UMAUSD# | UNIUSD# |
| VAULTAUSD# | XLMUSD# | XRPUSD# | XTZUSD# |
| ZECUSD# | ZRXUSD# | | |

### 4.3 Commodities / Precious Metals (9)

| Symbol | Description |
|---|---|
| GOLD.i# | Gold (USD) |
| SILVER.i# | Silver (USD) |
| XAUCNH.i# | Gold / Chinese Yuan |
| XAUEUR.i# | Gold / Euro |
| XAUJPY.i# | Gold / Japanese Yen |
| GAUCNH.i# | Gold (alternate CNH) |
| GAUUSD.i# | Gold (alternate USD) |
| XPDUSD.i# | Palladium / USD |
| XPTUSD.i# | Platinum / USD |

### 4.4 Stocks / Equities (13)

| Symbol | Company |
|---|---|
| Amazon | Amazon.com Inc. |
| BancoBradesco | Banco Bradesco S.A. |
| DraftKings | DraftKings Inc. |
| Ford | Ford Motor Company |
| Gerdau | Gerdau S.A. |
| Intel | Intel Corporation |
| Nu Holdings | Nu Holdings Ltd. |
| Nvidia | NVIDIA Corporation |
| Pinterest | Pinterest Inc. |
| PlugPower | Plug Power Inc. |
| Rivian | Rivian Automotive Inc. |
| Tesla | Tesla Inc. |
| Transocean | Transocean Ltd. |

---

## 5. Timeframes

| Timeframe | Label in Path | Files (ALL_TIME_DATA) | Rows (ALL_TIME_DATA) | Typical Row Count per Instrument |
|---|---|---|---|---|
| 1 Minute | `1min_time` | 108 | 182,411,731 | 42K — 9.8M |
| 5 Minutes | `5min_time` | 108 | 51,658,834 | 8K — 2.0M |
| 15 Minutes | `15min_time` | 108 | 22,286,150 | 9K — 679K |
| 30 Minutes | `30min_time` | 108 | 11,251,532 | 4K — 343K |
| 1 Hour | `1hr_time` | 108 | 5,739,404 | 2K — 175K |
| 4 Hours | `4hr_time` | 108 | 1,526,167 | 628 — 49K |
| 1 Day | `1day_time` | 108 | 358,588 | 106 — 14K |

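Since every coarser timeframe is an aggregation of the 1-minute bars, candles can also be derived on the fly instead of read from a separate folder. A minimal pandas sketch; the aggregation map is the standard OHLCV convention, and rules such as `"5min"` or `"1D"` are pandas resample syntax, not dataset labels.

```python
import pandas as pd

def resample_ohlcv(df: pd.DataFrame, rule: str = "5min") -> pd.DataFrame:
    """Aggregate 1-minute OHLCV bars (ts, open, high, low, close, volume)
    into a coarser timeframe such as 5min, 1h, or 1D."""
    out = (
        df.set_index(pd.to_datetime(df["ts"], utc=True))
          .resample(rule)
          .agg({"open": "first", "high": "max", "low": "min",
                "close": "last", "volume": "sum"})
          .dropna(subset=["open"])   # drop empty periods (weekends, gaps)
          .reset_index(names="ts")
    )
    return out
```

---
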
## 6. Data Partitioning: `ALL_TIME_DATA` vs `data`

| Property | `ALL_TIME_DATA/` | `data/` |
|---|---|---|
| **Purpose** | Full historical archive | Recent/sampled subset |
| **File Count** | 756 | 35 |
| **Row Count** | 275,232,406 | 1,185,910 |
| **Instruments** | 108 (all) | 5 (BTCUSD#, ETHUSD#, EURUSD, GBPUSD, USDJPY) |
| **Timeframes** | All 7 | All 7 |
| **Schema Notes** | Clean 6-col OHLCV | Some files have 7 cols (legacy header artifact) |

---

## 7. File Naming Convention

```
{InstrumentSymbol}_{Timeframe}.parquet
```

**Examples:**

- `BTCUSD#_1min.parquet` → Bitcoin/USD, 1-minute bars
- `EURUSD#_1day.parquet` → EUR/USD, daily bars
- `Tesla_15min.parquet` → Tesla stock, 15-minute bars
- `GOLD.i#_4hr.parquet` → Gold, 4-hour bars

**Symbol suffix meanings:**

- `#` → CFD/derivative instrument
- `.i#` → Index/commodity CFD
- No suffix → Spot or direct instrument

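Because the timeframe label comes from a fixed set while symbols never contain underscores, the two parts can be recovered by splitting on the last underscore. A small parsing sketch; the helper name is ours, not part of the repo.

```python
from pathlib import Path

TIMEFRAMES = {"1min", "5min", "15min", "30min", "1hr", "4hr", "1day"}

def parse_filename(path: str) -> tuple[str, str]:
    """Split '{InstrumentSymbol}_{Timeframe}.parquet' into (symbol, timeframe)."""
    stem = Path(path).stem                       # e.g. 'GOLD.i#_4hr'
    symbol, _, timeframe = stem.rpartition("_")  # split on the last underscore
    if timeframe not in TIMEFRAMES or not symbol:
        raise ValueError(f"Unexpected file name: {path}")
    return symbol, timeframe

# parse_filename("ALL_TIME_DATA/4hr_time/GOLD.i#_4hr.parquet") -> ("GOLD.i#", "4hr")
```

---
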
## 8. API Design Reference

### 8.1 Recommended API Endpoints

```
GET /api/v1/instruments
    → List all 111 available instruments with metadata

GET /api/v1/instruments/{symbol}
    → Instrument details (asset class, available timeframes, row counts)

GET /api/v1/ohlcv/{symbol}
    ?timeframe=1min|5min|15min|30min|1hr|4hr|1day
    &start=2024-01-01T00:00:00Z
    &end=2025-12-31T23:59:59Z
    &limit=1000
    &offset=0
    → OHLCV candle data with pagination

GET /api/v1/reports/equity
    → Portfolio equity curve (5,000 steps)

GET /api/v1/reports/trades
    ?pair=BTCUSD
    &side=buy|sell
    → Portfolio trade history (3,797 trades)

GET /api/v1/metadata
    → Dataset-level metadata (total files, rows, timeframes, etc.)

GET /api/v1/search
    ?q=BTC&asset_class=crypto
    → Search instruments by name/class
```

### 8.2 Data Access Pattern (HuggingFace)

```python
# Direct parquet read from HuggingFace
from huggingface_hub import hf_hub_url
import pandas as pd

url = hf_hub_url(
    repo_id="OMCHOKSI108/my-cloud-data-lake",
    filename="ALL_TIME_DATA/1hr_time/BTCUSD#_1hr.parquet",
    repo_type="dataset"
)
df = pd.read_parquet(url)
```

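Since the Space pairs this dataset with DuckDB, the same file can also be filtered remotely without materializing it in pandas first. A hedged sketch using DuckDB's `httpfs` extension against the standard HF resolve URL; note that the `#` in symbol names must be percent-encoded, as `testdata.py` also does.

```python
import duckdb
from urllib.parse import quote

repo = "OMCHOKSI108/my-cloud-data-lake"
path = quote("ALL_TIME_DATA/1hr_time/BTCUSD#_1hr.parquet")
url = f"https://huggingface.co/datasets/{repo}/resolve/main/{path}"

con = duckdb.connect()
con.execute("INSTALL httpfs; LOAD httpfs;")  # enables reading parquet over HTTPS
df = con.execute(
    f"SELECT ts, close FROM read_parquet('{url}') "
    "WHERE ts >= '2024-01-01' LIMIT 1000"
).df()
```
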
### 8.3 Query Parameters for API

| Parameter | Type | Description |
|---|---|---|
| `symbol` | string | Instrument symbol (e.g., `BTCUSD#`, `Tesla`) |
| `timeframe` | enum | `1min`, `5min`, `15min`, `30min`, `1hr`, `4hr`, `1day` |
| `start` | ISO datetime | Start of date range filter |
| `end` | ISO datetime | End of date range filter |
| `limit` | int | Max rows returned (default: 1000, max: 10000) |
| `offset` | int | Pagination offset |
| `source` | enum | `all_time` or `recent` (maps to folder) |
| `format` | enum | `json`, `csv`, `parquet` |

### 8.4 Response Schema

```json
{
  "symbol": "BTCUSD#",
  "timeframe": "1hr",
  "total_rows": 66829,
  "returned": 1000,
  "data": [
    {
      "ts": "2024-01-01T00:00:00Z",
      "open": 42150.50,
      "high": 42280.00,
      "low": 42100.00,
      "close": 42230.75,
      "volume": 1234
    }
  ]
}
```

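In a FastAPI service this response shape maps naturally onto Pydantic models. A minimal sketch of what such models could look like; the field names follow the schema above, while the class names are illustrative and not necessarily those used in `app/models/market_data.py`.

```python
from datetime import datetime
from pydantic import BaseModel

class Candle(BaseModel):
    ts: datetime
    open: float
    high: float
    low: float
    close: float
    volume: float

class OHLCVResponse(BaseModel):
    symbol: str
    timeframe: str
    total_rows: int
    returned: int
    data: list[Candle]
```

---
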
## 9. Asset Classification Map

Use this mapping to categorize instruments in the API:

```json
{
  "forex_major": ["EURUSD#", "GBPUSD#", "USDJPY#", "USDCHF#", "AUDUSD#", "NZDUSD#", "USDCAD#"],
  "forex_cross": ["EURJPY#", "GBPJPY#", "EURGBP#", "AUDCAD#", "AUDCHF#", "AUDJPY#", "AUDNZD#", "CADCHF#", "CADJPY#", "CHFJPY#", "EURAUD#", "EURCAD#", "EURCHF#", "EURNZD#", "GBPAUD#", "GBPCAD#", "GBPCHF#", "GBPNZD#", "NZDCAD#", "NZDCHF#", "NZDJPY#"],
  "crypto_major": ["BTCUSD#", "ETHUSD#", "LTCUSD#", "XRPUSD#", "BCHUSD#"],
  "crypto_alt": ["ADAUSD#", "SOLUSD#", "DOTUSD#", "LINKUSD#", "AVAXUSD#", "DOGEUSD#", "SHIBUSD#", "MATICUSD#", "UNIUSD#", "AAVEUSD#", "...and 40+ more"],
  "crypto_cross": ["BTCEUR#", "BTCGBP#", "BTCJPY#", "ETHBTC#", "ETHEUR#", "ETHGBP#"],
  "commodities": ["GOLD.i#", "SILVER.i#", "XPDUSD.i#", "XPTUSD.i#", "XAUEUR.i#", "XAUJPY.i#", "XAUCNH.i#", "GAUCNH.i#", "GAUUSD.i#"],
  "stocks": ["Amazon", "Tesla", "Nvidia", "Intel", "Ford", "Rivian", "Pinterest", "PlugPower", "DraftKings", "Nu Holdings", "Gerdau", "BancoBradesco", "Transocean"]
}
```

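A lookup helper can invert this map once at startup so each symbol resolves to its asset class in O(1). Sketch only: it assumes the JSON above is saved as `asset_classes.json`, and both the file name and function name are illustrative.

```python
import json

def build_symbol_index(path: str = "asset_classes.json") -> dict[str, str]:
    """Invert {asset_class: [symbols]} into {symbol: asset_class}."""
    with open(path, encoding="utf-8") as fh:
        class_map = json.load(fh)
    return {
        symbol: asset_class
        for asset_class, symbols in class_map.items()
        for symbol in symbols
        if not symbol.startswith("...")   # skip the "...and 40+ more" placeholder
    }

# build_symbol_index()["GOLD.i#"] -> "commodities"
```

---
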
## 10. Data Volume by Asset Class (ALL_TIME_DATA)

| Asset Class | Instruments | Files | Est. Rows |
|---|---|---|---|
| Forex | ~28 | 196 | ~80M+ |
| Crypto | ~52 | 364 | ~150M+ |
| Commodities | ~9 | 63 | ~20M+ |
| Stocks | ~13 | 91 | ~25M+ |
| **Reports** | 2 | 2 | 8,797 |

---

## 11. Key Observations & Notes

1. **Uniform OHLCV schema** across all market data files — no schema conflicts between asset classes
2. **Highest granularity data** is in 1-minute bars, accounting for 66% of all rows (182M rows)
3. **Longest history instruments**: Major forex pairs (EURUSD, GBPUSD, USDJPY, USDCHF) have up to ~14K daily bars (~55+ years of data) and 9.8M 1-minute bars
4. **Shortest history instruments**: GAUCNH.i#, GAUUSD.i#, XAUCNH.i#, XAUJPY.i# have only ~106 daily bars
5. **`data/` folder** contains a focused subset of 5 key instruments (BTCUSD#, ETHUSD#, EURUSD, GBPUSD, USDJPY) — likely used for development/testing
6. **`reports/` folder** contains backtesting/simulation results — equity curve and trade log
7. **Data format** is Apache Parquet — columnar, compressed, ideal for analytical queries
8. **No bid/ask spread data** — only mid-price OHLCV
9. **No fundamental data** — purely technical/price data

---

## 12. Potential Use Cases

- **Trading Strategy Backtesting** — multi-asset, multi-timeframe
- **ML/AI Price Prediction Models** — 276M-row training dataset
- **Technical Analysis API** — serve OHLCV with on-the-fly indicator calculation
- **Cross-Asset Correlation Analysis** — forex, crypto, commodities, stocks in one lake
- **Portfolio Simulation** — reports data already includes equity curves and trade logs
- **Real-Time Dashboard** — serve historical + stream live data via WebSocket
- **Market Data Microservice** — HuggingFace as cold storage, API serves hot queries via Redis/DuckDB cache
docs/Plan.md
ADDED
|
@@ -0,0 +1,374 @@
|
Now we’re talking like a systems architect 😌🔥

You’re not building “an API”.
You’re building a **market data engine**.

And yes — this is not easy.
If it were easy, Bloomberg wouldn’t cost $25,000 per year.

Let’s design this properly.

---

# 🧠 Core Vision

You want:

* Dynamic symbol discovery
* Fast search
* Timeframe awareness
* Date range introspection
* Range-based data extraction
* Scalable architecture
* Low latency
* Production readiness

That’s basically a mini Bloomberg / TradingView backend.

Good. Let’s build it in layers.

---

# 🔥 1️⃣ Expanded Real-World Use Cases (Beyond Your 5–10%)

You mentioned symbol list + range query.

That’s only surface-level. Here’s the real ecosystem:

---

## 📊 Market Data Retrieval

1. List all available symbols
2. Search symbols by prefix or fuzzy match
3. Get available timeframes per symbol
4. Get min/max available date for symbol/timeframe
5. Fetch OHLCV data by:
   * symbol
   * timeframe
   * date range
6. Fetch latest N candles
7. Fetch last price snapshot
8. Multi-symbol batch query

---

## 📈 Analytics & Indicators

9. Compute SMA/EMA on the fly
10. RSI/MACD calculation
11. VWAP calculation
12. ATR/Volatility stats
13. Rolling window queries
14. Correlation between symbols
15. Spread analysis
16. Multi-timeframe aggregation

---

## 📦 Metadata Layer

17. Symbol classification (Forex, Crypto, Equity)
18. Exchange mapping
19. Trading session hours
20. Symbol status (active/delisted)
21. Liquidity score

---

## 🚀 Performance / Pro Features

22. Precomputed aggregates
23. Candle compression (1m → 5m)
24. Server-side pagination
25. Async streaming responses
26. Real-time websocket updates
27. Snapshot caching
28. Heatmap endpoint

---

## 🤖 ML & Research Layer

29. Return normalized data
30. Return feature-engineered datasets
31. Backtesting query endpoint
32. Walk-forward validation data splits
33. Dataset slicing for training

---

## 🛡 Operational

34. Rate limiting
35. Access tiers
36. API key auth
37. Usage metrics
38. Request logging
39. Error monitoring

---

# 🏗 2️⃣ Architecture Design (High-Performance Version)

Now let’s talk real engineering.

Your biggest enemies:

👉 Time-series query latency
👉 File scanning
👉 Memory overload

---

## 🔥 Recommended Architecture

### Layer 1: Storage

Do NOT serve directly from raw HF HTTP on every request.

Instead:

Option A (simple):

* Sync parquet files into the container on startup

Option B (properly scalable):

* Store in object storage (S3-compatible)
* Use a partitioned parquet structure

Example ideal structure:

```
/data
  /symbol=BTCUSD
    /timeframe=1m
      /year=2024
        part-001.parquet
```

Partitioning by:

* symbol
* timeframe
* year

This enables very fast filtering.

---

### Layer 2: Metadata Index (CRITICAL)

Never scan files per request.

Instead, build a metadata table (see the sketch below):

| symbol | timeframe | start_date | end_date | file_path |
|---|---|---|---|---|

Store it in:

* SQLite (simple)
* PostgreSQL (better)
* DuckDB (excellent for time-series)
* Redis (for fast lookups)

This allows instant symbol listing and range discovery.

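A minimal sketch of how that index could be built with DuckDB. It assumes the file list comes from somewhere like `huggingface_hub.list_repo_files` and that each file is scanned once for its min/max timestamp; the table and column names are illustrative, not project conventions.

```python
import duckdb

con = duckdb.connect("metadata.db")
con.execute("""
    CREATE TABLE IF NOT EXISTS file_index (
        symbol     VARCHAR,
        timeframe  VARCHAR,
        start_date TIMESTAMP,
        end_date   TIMESTAMP,
        file_path  VARCHAR
    )
""")

def register_file(symbol: str, timeframe: str, local_path: str) -> None:
    """Record one parquet file's coverage so range queries never touch raw files."""
    start, end = con.execute(
        f"SELECT min(ts), max(ts) FROM read_parquet('{local_path}')"
    ).fetchone()
    con.execute(
        "INSERT INTO file_index VALUES (?, ?, ?, ?, ?)",
        [symbol, timeframe, start, end, local_path],
    )
```

---
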
### Layer 3: Query Engine

When a request comes in:

1. Validate symbol
2. Validate timeframe
3. Look up metadata
4. Determine which partitions to read
5. Use PyArrow Dataset filtering
6. Return JSON

PyArrow filtering example:

```python
import pyarrow.dataset as ds
from datetime import datetime

start = datetime(2024, 1, 1)   # request parameters
end = datetime(2024, 2, 1)

# 'symbol' is exposed by the hive-style partition folders (/symbol=BTCUSD/...)
dataset = ds.dataset("data/", format="parquet", partitioning="hive")
table = dataset.to_table(
    filter=(
        (ds.field("symbol") == "BTCUSD") &
        (ds.field("timestamp") >= start) &
        (ds.field("timestamp") <= end)
    )
)
```

This reads only the relevant row groups.

That’s how you keep it fast.

---

### Layer 4: Caching Layer

Add Redis:

* Cache symbol list
* Cache latest candle
* Cache hot queries
* Cache range metadata

Time-based invalidation.

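A hedged sketch of what that Redis layer could look like with `redis-py`, caching the symbol list under a TTL. The key name and TTL are arbitrary choices, not project conventions.

```python
import json
import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

SYMBOLS_KEY = "cache:symbols"
SYMBOLS_TTL = 300  # seconds; time-based invalidation

def get_symbols(load_from_metadata) -> list[str]:
    """Return the symbol list from Redis, falling back to the metadata index."""
    cached = r.get(SYMBOLS_KEY)
    if cached is not None:
        return json.loads(cached)
    symbols = load_from_metadata()  # e.g. SELECT DISTINCT symbol FROM file_index
    r.setex(SYMBOLS_KEY, SYMBOLS_TTL, json.dumps(symbols))
    return symbols
```

---
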
# ⚡ 3️⃣ API Design Best Practices

## REST Design (Recommended)

```
GET /symbols
GET /symbols/search?q=btc
GET /symbols/{symbol}/timeframes
GET /symbols/{symbol}/{timeframe}/range
GET /data?symbol=BTCUSD&tf=1m&start=2024-01-01&end=2024-02-01
GET /data/latest?symbol=BTCUSD&tf=1m
```

Use:

* Query parameters for filters
* Pagination
* Limit max rows

---

## Response Optimization

* Gzip compression
* Option for CSV or JSON
* Option for binary Arrow format
* Pagination for large ranges

---

## GraphQL?

Only if you want flexible multi-field selection.
For time-series APIs, REST is simpler and faster.

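To make the REST sketch concrete, a minimal FastAPI version of the `/data` endpoint above could look like the following. The `query_ohlcv` helper is a stub standing in for the Layer 3 query engine, not a real module in this repo.

```python
from datetime import date
from fastapi import FastAPI, HTTPException, Query

app = FastAPI(title="Market Data Engine")

TIMEFRAMES = {"1m", "5m", "15m", "30m", "1h", "4h", "1d"}

def query_ohlcv(symbol: str, tf: str, start, end, limit: int) -> list[dict]:
    # Stub: replace with the DuckDB / PyArrow partition query from Layer 3.
    return []

@app.get("/data")
async def get_data(
    symbol: str,
    tf: str = "1m",
    start: date | None = None,
    end: date | None = None,
    limit: int = Query(1000, le=10_000),  # hard cap on rows per response
):
    if tf not in TIMEFRAMES:
        raise HTTPException(status_code=422, detail=f"Unknown timeframe: {tf}")
    candles = query_ohlcv(symbol, tf, start, end, limit)
    return {"symbol": symbol, "timeframe": tf, "returned": len(candles), "data": candles}
```

---
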
# ⚡ 4️⃣ Performance Strategy

Here’s where people fail.

### ❌ Bad Design

* Load the full parquet file
* Filter in pandas
* Return giant JSON

### ✅ Good Design

* Partitioned storage
* Arrow dataset filtering
* Limit rows
* Cache hot symbols
* Async FastAPI
* Use uvicorn with workers

---

# ⚠ 5️⃣ Major Challenges

### 1. Large Range Queries

Solution:

* Hard row limits
* Pagination
* Stream responses

---

### 2. Cold Start

Solution:

* Preload metadata
* Warm-up query

---

### 3. Concurrency

Solution:

* Async endpoints
* Gunicorn workers
* Avoid global pandas DataFrames

---

### 4. Memory Pressure

Solution:

* Avoid pandas
* Use PyArrow
* Stream results

---

### 5. Scaling

Solution:

* Stateless API
* External object storage
* Horizontal scaling

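Challenges 1 and 4 share one remedy: never materialize a huge range in memory. A hedged sketch that streams a large result as CSV, batch by batch, from a PyArrow scanner; the path and column names follow the partitioned layout assumed earlier.

```python
import pyarrow.dataset as ds
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.get("/data/stream")
async def stream_data(symbol: str):
    dataset = ds.dataset("data/", format="parquet", partitioning="hive")
    scanner = dataset.scanner(filter=ds.field("symbol") == symbol, batch_size=10_000)

    def iter_csv():
        yield "ts,open,high,low,close,volume\n"
        for batch in scanner.to_batches():  # one RecordBatch at a time
            for row in batch.to_pylist():
                yield (f"{row['ts']},{row['open']},{row['high']},"
                       f"{row['low']},{row['close']},{row['volume']}\n")

    return StreamingResponse(iter_csv(), media_type="text/csv")
```

---
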
# 🧠 Advanced Optimization

If you want serious performance:

* Use DuckDB as the query engine
* It reads parquet directly
* Very fast filtering
* SQL-based

Example:

```python
import duckdb

con = duckdb.connect()
df = con.execute("""
    SELECT * FROM 'data/*.parquet'
    WHERE symbol = 'BTCUSD'
      AND timestamp BETWEEN '2024-01-01' AND '2024-02-01'
""").df()
```

DuckDB is insanely fast for this use case.

Honestly? For your architecture — DuckDB is a better backend engine than pandas.

---

# 🏁 Final Architecture Recommendation

If I were building your system:

Storage → Partitioned Parquet
Metadata → SQLite / Postgres
Query Engine → DuckDB
API → FastAPI (async)
Cache → Redis
Deployment → Docker
Hosting → HF Space (demo) or Cloud VM (production)

---
hf_dataset_structure_report.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/__init__.py
ADDED
|
File without changes
|
scripts/sync_data.py
ADDED
|
@@ -0,0 +1,38 @@
|
#!/usr/bin/env python3
"""Standalone script to list remote parquet files and verify connectivity.

Usage:
    python scripts/sync_data.py
"""

import sys
from pathlib import Path

# Allow running as a script from the project root
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from app.config import settings
from app.core.logging_config import setup_logging
from app.core.database import db_manager


def main() -> None:
    setup_logging()

    print("Connecting to DuckDB (remote mode) …")
    db_manager.connect()

    print(f"Listing files from HuggingFace repo: {settings.HF_REPO_ID}")
    db_manager.build_metadata_index()

    print("\n=== Remote Metadata ===")
    print(f"  Total files : {len(db_manager.metadata)}")
    print(f"  Symbols     : {len(db_manager.symbols)}")
    if db_manager.symbols:
        print(f"  First 10    : {db_manager.symbols[:10]}")

    db_manager.close()


if __name__ == "__main__":
    main()
|
testdata.py
ADDED
|
@@ -0,0 +1,52 @@
|
import pandas as pd
import pyarrow.parquet as pq
import fsspec
from huggingface_hub import list_repo_files
from urllib.parse import quote

# ==================================
REPO_ID = "OMCHOKSI108/my-cloud-data-lake"
OUTPUT_FILE = "hf_dataset_structure_report.csv"
summary = []

print(f"\nConnecting to HuggingFace dataset: {REPO_ID}\n")

files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
parquet_files = [f for f in files if f.endswith(".parquet")]

print(f"Total parquet files found: {len(parquet_files)}\n")

for file_path in parquet_files:
    print(f"Inspecting: {file_path}")

    try:
        # Encode special characters (e.g. '#') so the resolve URL is valid
        encoded_path = quote(file_path)

        hf_url = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/{encoded_path}"

        # Read only the parquet footer/metadata, not the full file
        with fsspec.open(hf_url, "rb") as f:
            parquet_file = pq.ParquetFile(f)
            schema = parquet_file.schema
            num_rows = parquet_file.metadata.num_rows

        summary.append({
            "file_path": file_path,
            "folder": file_path.split("/")[0],
            "file_name": file_path.split("/")[-1],
            "num_columns": len(schema.names),
            "num_rows": num_rows,
            "columns": schema.names,
        })

    except Exception as e:
        print("Error:", e)

df = pd.DataFrame(summary)

print("\n===== DATASET STRUCTURE =====\n")
print(df[["file_name", "folder", "num_columns", "num_rows"]])

df.to_csv(OUTPUT_FILE, index=False)
print(f"\nReport saved as: {OUTPUT_FILE}")
|