OMCHOKSI108 committed on
Commit aac542c · 1 Parent(s): a365e2e
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md DELETED
@@ -1,83 +0,0 @@
1
- ---
2
- title: Cloud Data Lake API
3
- emoji: 📈
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- app_port: 7860
9
- ---
10
-
11
- # Cloud Data Lake API
12
-
13
- A production-ready Financial Data API built with **FastAPI**, **DuckDB**, and **Hugging Face**.
14
-
15
- This API allows serverless SQL querying of Parquet datasets stored on Hugging Face, with specialized endpoints for financial market data (OHLC).
16
-
17
- ## Features
18
-
19
- - 🚀 **Serverless Architecture**: No database to manage; queries remote Parquet files directly.
20
- - ⚡ **High Performance**: Powered by DuckDB's vectorized engine and `httpfs`.
21
- - 🔒 **Secure**: Bearer Token authentication and read-only access.
22
- - 📈 **Financial Data**: Dedicated endpoints for OHLC (Open-High-Low-Close) data.
23
- - 🌐 **Hugging Face Integrated**: Seamless access to private Datasets.
24
-
25
- ## Installation
26
-
27
- 1. **Clone the repository**
28
- ```bash
29
- git clone https://huggingface.co/spaces/OMCHOKSI108/forexdatalake
30
- cd forexdatalake
31
- ```
32
-
33
- 2. **Install Dependencies**
34
- ```bash
35
- pip install -r requirements.txt
36
- ```
37
-
38
- 3. **Configure Environment**
39
- Create a `.env` file:
40
- ```env
41
- # Secrets (Best kept in repository secrets if deploying to Spaces)
42
- HF_TOKEN=hf_your_huggingface_token
43
- API_KEY=your-secret-key
44
-
45
- # Configuration (Can be standard variables)
46
- DATASET_URL=hf://datasets/OMCHOKSI108/my-cloud-data-lake/ALL_TIME_DATA/**/*.parquet
47
- # Optional Tuning
48
- DUCKDB_MEMORY_LIMIT=1GB
49
- DUCKDB_THREADS=2
50
- ```
51
-
52
- ### Environment Variables
53
-
54
- | Variable | Type | Description | Required | Default |
55
- |----------|------|-------------|----------|---------|
56
- | `HF_TOKEN` | **Secret** | Hugging Face Access Token with read permissions. | Yes | - |
57
- | `API_KEY` | **Secret** | Secret key for authenticating API requests. | Yes | `sk-dev-key-123` |
58
- | `DATASET_URL` | Config | `hf://` URL pattern to your parquet files. | No | *See config.py* |
59
- | `DUCKDB_MEMORY_LIMIT` | Config | Max memory for DuckDB (e.g. '1GB'). | No | `1GB` |
60
- | `DUCKDB_THREADS` | Config | Number of Threads. | No | `2` |
61
-
62
- ## Usage
63
-
64
- ### Run the Server
65
- ```bash
66
- uvicorn app.main:app --host 0.0.0.0 --port 7860
67
- ```
68
-
69
- ### API Endpoints
70
-
71
- | Method | Endpoint | Description | Auth |
72
- |--------|----------|-------------|------|
73
- | `GET` | `/system/health` | Health check | No |
74
- | `GET` | `/v1/symbols` | List available pairs | Yes |
75
- | `GET` | `/v1/ohlc/{symbol}` | Get historical OHLC data | Yes |
76
- | `POST` | `/query` | Execute raw SQL (Admin) | Yes |
77
-
78
- ### Example Request
79
- ```bash
80
- curl -H "Authorization: Bearer your-secret-key" \
81
- "http://localhost:7860/v1/ohlc/EURUSD?interval=15min&limit=10"
82
- ```
83
-
__pycache__/main.cpython-314.pyc DELETED
Binary file (15.2 kB)
 
app/__init__.py ADDED
File without changes
app/auth.py DELETED
@@ -1,22 +0,0 @@
1
- from fastapi import Security, HTTPException, status
2
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
3
- from app.config import get_settings
4
-
5
- security = HTTPBearer()
6
-
7
- def get_current_user(credentials: HTTPAuthorizationCredentials = Security(security)):
8
- """
9
- Validate the Bearer token.
10
- In a real production app, this would check against a database of users/keys.
11
- """
12
- settings = get_settings()
13
- token = credentials.credentials
14
-
15
- # Check against the configured API key (simple auth for now)
16
- if token != settings.API_KEY:
17
- raise HTTPException(
18
- status_code=status.HTTP_401_UNAUTHORIZED,
19
- detail="Invalid authentication credentials",
20
- headers={"WWW-Authenticate": "Bearer"},
21
- )
22
- return token
app/core/__init__.py ADDED
File without changes
app/core/auth.py ADDED
@@ -0,0 +1,40 @@
1
+ """API key authentication dependency."""
2
+
3
+ import secrets
4
+
5
+ from fastapi import Security
6
+ from fastapi.security import APIKeyHeader
7
+
8
+ from app.config import settings
9
+ from app.core.exceptions import AuthenticationError
10
+
11
+ _api_key_header = APIKeyHeader(name=settings.API_KEY_HEADER, auto_error=False)
12
+
13
+ # Paths that bypass authentication
14
+ PUBLIC_PATHS: set[str] = {
15
+ "/api/v1/health",
16
+ "/docs",
17
+ "/redoc",
18
+ "/openapi.json",
19
+ }
20
+
21
+
22
+ async def require_api_key(
23
+ api_key: str | None = Security(_api_key_header),
24
+ ) -> str:
25
+ """FastAPI dependency that validates the X-API-Key header.
26
+
27
+ Usage:
28
+ @router.get("/protected", dependencies=[Depends(require_api_key)])
29
+ """
30
+ if not settings.API_KEY:
31
+ # No API key configured — auth is disabled
32
+ return "no-auth"
33
+
34
+ if api_key is None:
35
+ raise AuthenticationError("Missing API key — provide via X-API-Key header")
36
+
37
+ if not secrets.compare_digest(api_key, settings.API_KEY):
38
+ raise AuthenticationError("Invalid API key")
39
+
40
+ return api_key
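The docstring above shows the intended wiring; here is a minimal, self-contained sketch of a router protected by the new dependency (the `/api/v1/symbols` path and the response body are illustrative assumptions, not taken from this commit):

```python
# Sketch: protecting a route with the require_api_key dependency.
# Router prefix, path, and response shape are illustrative assumptions.
from fastapi import APIRouter, Depends

from app.core.auth import require_api_key

router = APIRouter(prefix="/api/v1", tags=["Symbols"])


@router.get("/symbols", dependencies=[Depends(require_api_key)])
async def list_symbols() -> dict:
    # Requests must carry a valid key in the configured header (X-API-Key by
    # default), unless settings.API_KEY is unset, in which case auth is disabled.
    return {"symbols": [], "count": 0}
```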
app/core/cache.py ADDED
@@ -0,0 +1,117 @@
1
+ """TTL-based in-memory cache with thread-safety.
2
+
3
+ A lightweight alternative to Redis for single-process deployments.
4
+ Supports decorator-based caching and manual get/set/invalidate.
5
+ """
6
+
7
+ import hashlib
8
+ import json
9
+ import logging
10
+ import threading
11
+ import time
12
+ from collections import OrderedDict
13
+ from typing import Any, Callable
14
+
15
+ from app.config import settings
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class TTLCache:
21
+ """Thread-safe LRU cache with per-entry TTL expiration."""
22
+
23
+ def __init__(
24
+ self,
25
+ max_size: int = settings.CACHE_MAX_SIZE,
26
+ default_ttl: int = settings.CACHE_TTL,
27
+ ) -> None:
28
+ self._store: OrderedDict[str, tuple[Any, float]] = OrderedDict()
29
+ self._lock = threading.Lock()
30
+ self._max_size = max_size
31
+ self._default_ttl = default_ttl
32
+ self._hits = 0
33
+ self._misses = 0
34
+
35
+ # -- core operations -----------------------------------------------------
36
+
37
+ def get(self, key: str) -> Any | None:
38
+ with self._lock:
39
+ entry = self._store.get(key)
40
+ if entry is None:
41
+ self._misses += 1
42
+ return None
43
+ value, expires_at = entry
44
+ if time.time() > expires_at:
45
+ del self._store[key]
46
+ self._misses += 1
47
+ return None
48
+ # Move to end (most-recently-used)
49
+ self._store.move_to_end(key)
50
+ self._hits += 1
51
+ return value
52
+
53
+ def set(self, key: str, value: Any, ttl: int | None = None) -> None:
54
+ ttl = ttl if ttl is not None else self._default_ttl
55
+ expires_at = time.time() + ttl
56
+ with self._lock:
57
+ if key in self._store:
58
+ self._store.move_to_end(key)
59
+ self._store[key] = (value, expires_at)
60
+ # Evict oldest if over capacity
61
+ while len(self._store) > self._max_size:
62
+ evicted_key, _ = self._store.popitem(last=False)
63
+ logger.debug("Cache evicted key: %s", evicted_key)
64
+
65
+ def invalidate(self, key: str) -> bool:
66
+ with self._lock:
67
+ if key in self._store:
68
+ del self._store[key]
69
+ return True
70
+ return False
71
+
72
+ def invalidate_pattern(self, prefix: str) -> int:
73
+ """Remove all keys starting with the given prefix."""
74
+ with self._lock:
75
+ keys_to_remove = [k for k in self._store if k.startswith(prefix)]
76
+ for k in keys_to_remove:
77
+ del self._store[k]
78
+ return len(keys_to_remove)
79
+
80
+ def clear(self) -> None:
81
+ with self._lock:
82
+ self._store.clear()
83
+ self._hits = 0
84
+ self._misses = 0
85
+
86
+ @property
87
+ def stats(self) -> dict[str, int]:
88
+ with self._lock:
89
+ return {
90
+ "size": len(self._store),
91
+ "max_size": self._max_size,
92
+ "hits": self._hits,
93
+ "misses": self._misses,
94
+ }
95
+
96
+ # -- cleanup -------------------------------------------------------------
97
+
98
+ def evict_expired(self) -> int:
99
+ """Remove all expired entries. Returns count of evicted entries."""
100
+ now = time.time()
101
+ with self._lock:
102
+ expired = [k for k, (_, exp) in self._store.items() if now > exp]
103
+ for k in expired:
104
+ del self._store[k]
105
+ return len(expired)
106
+
107
+
108
+ def make_cache_key(*args: Any, **kwargs: Any) -> str:
109
+ """Generate a deterministic cache key from arguments."""
110
+ raw = json.dumps({"args": args, "kwargs": kwargs}, sort_keys=True, default=str)
111
+ return hashlib.md5(raw.encode()).hexdigest()
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Module-level singleton
116
+ # ---------------------------------------------------------------------------
117
+ cache = TTLCache()
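The module exposes a single `cache` instance plus `make_cache_key` for building deterministic keys; a brief usage sketch of that manual get/set pattern (the `load_symbols` helper below is a hypothetical stand-in for an expensive lookup):

```python
# Sketch: manual get/set against the module-level TTL cache.
from app.core.cache import cache, make_cache_key


def load_symbols(source: str) -> list[str]:
    """Hypothetical stand-in for an expensive lookup (e.g. listing Hub files)."""
    return ["BTCUSD#", "ETHUSD#", "EURUSD"]


def get_symbols_cached(source: str = "ALL_TIME_DATA") -> list[str]:
    key = make_cache_key("symbols", source=source)   # deterministic md5 key
    hit = cache.get(key)
    if hit is not None:
        return hit                                   # served from cache
    symbols = load_symbols(source)                   # cache miss: do the work
    cache.set(key, symbols, ttl=300)                 # keep for 5 minutes
    return symbols
```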
app/core/exceptions.py ADDED
@@ -0,0 +1,104 @@
1
+ """Custom exception hierarchy for the ForexDataLake API.
2
+
3
+ All application exceptions inherit from AppException and carry
4
+ a status_code, error_code, and detail message. The global exception
5
+ handler in main.py catches these and returns structured JSON responses.
6
+ """
7
+
8
+ from fastapi import Request
9
+ from fastapi.responses import JSONResponse
10
+
11
+
12
+ class AppException(Exception):
13
+ """Base exception for all application errors."""
14
+
15
+ status_code: int = 500
16
+ error_code: str = "INTERNAL_ERROR"
17
+ detail: str = "An unexpected error occurred"
18
+
19
+ def __init__(self, detail: str | None = None, **kwargs):
20
+ self.detail = detail or self.__class__.detail
21
+ self.extra = kwargs
22
+ super().__init__(self.detail)
23
+
24
+
25
+ class SymbolNotFoundError(AppException):
26
+ status_code = 404
27
+ error_code = "SYMBOL_NOT_FOUND"
28
+ detail = "The requested symbol was not found"
29
+
30
+
31
+ class TimeframeNotFoundError(AppException):
32
+ status_code = 404
33
+ error_code = "TIMEFRAME_NOT_FOUND"
34
+ detail = "The requested timeframe is not available"
35
+
36
+
37
+ class InvalidDateRangeError(AppException):
38
+ status_code = 400
39
+ error_code = "INVALID_DATE_RANGE"
40
+ detail = "The provided date range is invalid"
41
+
42
+
43
+ class DataNotAvailableError(AppException):
44
+ status_code = 404
45
+ error_code = "DATA_NOT_AVAILABLE"
46
+ detail = "No data available for the requested parameters"
47
+
48
+
49
+ class AuthenticationError(AppException):
50
+ status_code = 401
51
+ error_code = "AUTHENTICATION_FAILED"
52
+ detail = "Invalid or missing API key"
53
+
54
+
55
+ class RateLimitExceededError(AppException):
56
+ status_code = 429
57
+ error_code = "RATE_LIMIT_EXCEEDED"
58
+ detail = "Too many requests — please slow down"
59
+
60
+
61
+ class DataSyncError(AppException):
62
+ status_code = 503
63
+ error_code = "DATA_SYNC_ERROR"
64
+ detail = "Failed to synchronize data from remote source"
65
+
66
+
67
+ class ValidationError(AppException):
68
+ status_code = 422
69
+ error_code = "VALIDATION_ERROR"
70
+ detail = "Request validation failed"
71
+
72
+
73
+ class DatabaseError(AppException):
74
+ status_code = 500
75
+ error_code = "DATABASE_ERROR"
76
+ detail = "A database error occurred"
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Global exception handlers (registered in main.py via app.add_exception_handler)
81
+ # ---------------------------------------------------------------------------
82
+
83
+ async def app_exception_handler(request: Request, exc: AppException) -> JSONResponse:
84
+ """Handle all AppException subclasses with structured JSON."""
85
+ return JSONResponse(
86
+ status_code=exc.status_code,
87
+ content={
88
+ "error": exc.error_code,
89
+ "detail": exc.detail,
90
+ "path": str(request.url),
91
+ },
92
+ )
93
+
94
+
95
+ async def generic_exception_handler(request: Request, exc: Exception) -> JSONResponse:
96
+ """Catch-all for unhandled exceptions — never leak stack traces."""
97
+ return JSONResponse(
98
+ status_code=500,
99
+ content={
100
+ "error": "INTERNAL_ERROR",
101
+ "detail": "An unexpected internal error occurred",
102
+ "path": str(request.url),
103
+ },
104
+ )
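Per the module docstring, these handlers are registered in `main.py`; since `main.py` is not part of this diff, the wiring below is only a sketch of what that registration might look like:

```python
# Sketch: registering the new handlers on the FastAPI app (main.py is not shown
# in this commit, so the app object and call site are assumptions).
from fastapi import FastAPI

from app.core.exceptions import (
    AppException,
    app_exception_handler,
    generic_exception_handler,
)

app = FastAPI(title="ForexDataLake API")

# Any AppException subclass -> structured JSON with its status/error code.
app.add_exception_handler(AppException, app_exception_handler)
# Everything else -> opaque 500 so stack traces never reach clients.
app.add_exception_handler(Exception, generic_exception_handler)
```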
app/core/logging_config.py ADDED
@@ -0,0 +1,34 @@
1
+ """Structured logging configuration for the application."""
2
+
3
+ import logging
4
+ import sys
5
+
6
+ from app.config import settings
7
+
8
+
9
+ def setup_logging() -> None:
10
+ """Configure root logger with structured format and appropriate level."""
11
+ log_level = getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO)
12
+
13
+ formatter = logging.Formatter(
14
+ fmt="%(asctime)s | %(levelname)-8s | %(name)-30s | %(message)s",
15
+ datefmt="%Y-%m-%d %H:%M:%S",
16
+ )
17
+
18
+ handler = logging.StreamHandler(sys.stdout)
19
+ handler.setFormatter(formatter)
20
+ handler.setLevel(log_level)
21
+
22
+ root_logger = logging.getLogger()
23
+ root_logger.setLevel(log_level)
24
+ root_logger.handlers.clear()
25
+ root_logger.addHandler(handler)
26
+
27
+ # Silence noisy third-party loggers
28
+ for name in ("httpcore", "httpx", "hpack", "urllib3", "fsspec"):
29
+ logging.getLogger(name).setLevel(logging.WARNING)
30
+
31
+
32
+ def get_logger(name: str) -> logging.Logger:
33
+ """Return a named logger instance."""
34
+ return logging.getLogger(name)
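A minimal sketch of the expected start-up wiring (the call site is an assumption, as the application entry point is not included in this diff):

```python
# Sketch: configure logging once at process start-up, then grab named loggers.
from app.core.logging_config import setup_logging, get_logger

setup_logging()                 # install the shared formatter/handler on the root logger
logger = get_logger(__name__)   # per-module logger
logger.info("logging configured")
```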
app/database.py DELETED
@@ -1,51 +0,0 @@
1
- import duckdb
2
- import logging
3
- from contextlib import contextmanager
4
- from huggingface_hub import HfFileSystem
5
- from app.config import get_settings
6
-
7
- logger = logging.getLogger(__name__)
8
-
9
- def get_duckdb_connection():
10
- """Get a new DuckDB connection with httpfs extension and registered HF filesystem."""
11
- settings = get_settings()
12
- conn = duckdb.connect()
13
-
14
- # Configure DuckDB for performance
15
- conn.execute(f"""
16
- SET memory_limit='{settings.DUCKDB_MEMORY_LIMIT}';
17
- SET threads={settings.DUCKDB_THREADS};
18
- SET enable_progress_bar=false;
19
- SET enable_object_cache=true;
20
- """)
21
-
22
- # Register Hugging Face FileSystem
23
- if settings.HF_TOKEN:
24
- try:
25
- fs = HfFileSystem(token=settings.HF_TOKEN)
26
- conn.register_filesystem(fs)
27
- except Exception as e:
28
- logger.error(f"Failed to register HfFileSystem: {e}")
29
- else:
30
- logger.warning("HF_TOKEN is not set. Access to private datasets will fail.")
31
-
32
- # Install and load httpfs extension for remote file access
33
- try:
34
- conn.execute("INSTALL httpfs")
35
- conn.execute("LOAD httpfs")
36
- except Exception as e:
37
- logger.error(f"Failed to load httpfs extension: {e}")
38
- # Re-raise or handle appropriately depending on app needs
39
- raise
40
-
41
- return conn
42
-
43
- @contextmanager
44
- def duckdb_transaction():
45
- """Context manager for DuckDB transactions."""
46
- db_conn = get_duckdb_connection()
47
- try:
48
- yield db_conn
49
- except Exception as e:
50
- logger.error(f"Database error: {e}")
51
- raise
app/models.py DELETED
@@ -1,38 +0,0 @@
1
- from typing import List, Dict, Any, Optional
2
- from pydantic import BaseModel, Field
3
-
4
- # --- Shared Models ---
5
-
6
- class ErrorResponse(BaseModel):
7
- error: str
8
- detail: Optional[str] = None
9
-
10
- # --- Query Models ---
11
-
12
- class QueryRequest(BaseModel):
13
- sql_query: str = Field(..., description="SQL query to execute against the dataset")
14
- limit: Optional[int] = Field(1000, description="Maximum number of rows to return", ge=1, le=10000)
15
-
16
- class QueryResponse(BaseModel):
17
- data: List[Dict[str, Any]]
18
- columns: List[str]
19
- row_count: int
20
- execution_time_ms: float
21
-
22
- # --- Schema Models ---
23
-
24
- class SchemaResponse(BaseModel):
25
- schema: Dict[str, str]
26
- total_files: int
27
-
28
- # --- Financial Data Models ---
29
-
30
- class OHLCResponse(BaseModel):
31
- symbol: str
32
- interval: str
33
- data: List[Dict[str, Any]] # format: {time, open, high, low, close}
34
- count: int
35
-
36
- class SymbolResponse(BaseModel):
37
- symbols: List[str]
38
- count: int
app/models/__init__.py ADDED
File without changes
app/models/common.py ADDED
@@ -0,0 +1,78 @@
1
+ """Shared enums, pagination, and base response schemas."""
2
+
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Any, Generic, TypeVar
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ from app.config import settings
10
+
11
+
12
+ class TimeframeEnum(str, Enum):
13
+ """Supported candlestick timeframes."""
14
+ MIN_1 = "1min"
15
+ MIN_5 = "5min"
16
+ MIN_15 = "15min"
17
+ MIN_30 = "30min"
18
+ HR_1 = "1hr"
19
+ HR_4 = "4hr"
20
+ DAY_1 = "1day"
21
+
22
+
23
+ class AssetClassEnum(str, Enum):
24
+ """Asset class categories."""
25
+ FOREX_MAJOR = "forex_major"
26
+ FOREX_CROSS = "forex_cross"
27
+ CRYPTO_MAJOR = "crypto_major"
28
+ CRYPTO_ALT = "crypto_alt"
29
+ CRYPTO_CROSS = "crypto_cross"
30
+ COMMODITIES = "commodities"
31
+ STOCKS = "stocks"
32
+
33
+
34
+ class SourceEnum(str, Enum):
35
+ """Data source partition."""
36
+ ALL_TIME = "ALL_TIME_DATA"
37
+ RECENT = "recent"
38
+
39
+
40
+ class PaginationParams(BaseModel):
41
+ """Common pagination parameters."""
42
+ limit: int = Field(
43
+ default=settings.DEFAULT_LIMIT,
44
+ ge=1,
45
+ le=settings.MAX_ROWS_PER_REQUEST,
46
+ description="Number of rows to return",
47
+ )
48
+ offset: int = Field(default=0, ge=0, description="Number of rows to skip")
49
+
50
+
51
+ T = TypeVar("T")
52
+
53
+
54
+ class PaginatedResponse(BaseModel, Generic[T]):
55
+ """Generic paginated response wrapper."""
56
+ total: int = Field(description="Total available rows")
57
+ returned: int = Field(description="Rows returned in this response")
58
+ limit: int
59
+ offset: int
60
+ data: list[T]
61
+
62
+
63
+ class ErrorResponse(BaseModel):
64
+ """Standard error response body."""
65
+ error: str = Field(description="Machine-readable error code")
66
+ detail: str = Field(description="Human-readable error message")
67
+ path: str = Field(description="Request path that triggered the error")
68
+
69
+
70
+ class HealthResponse(BaseModel):
71
+ """Health check response."""
72
+ status: str = "ok"
73
+ version: str = "1.0.0"
74
+ timestamp: datetime = Field(default_factory=datetime.utcnow)
75
+ total_files: int = 0
76
+ total_symbols: int = 0
77
+ total_timeframes: int = 0
78
+ cache_entries: int = 0
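`PaginatedResponse` above is declared generic, so each router can parameterize it with its own payload model; a short illustrative sketch (the `SymbolInfo` parameterization and the values are assumptions):

```python
# Sketch: parameterizing the generic pagination wrapper (values are illustrative).
from app.models.common import PaginatedResponse
from app.models.symbols import SymbolInfo

page = PaginatedResponse[SymbolInfo](
    total=111,
    returned=2,
    limit=2,
    offset=0,
    data=[
        SymbolInfo(symbol="BTCUSD#", available_timeframes=["1min", "1day"]),
        SymbolInfo(symbol="EURUSD#", available_timeframes=["1min", "1day"]),
    ],
)
print(page.model_dump_json(indent=2))
```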
app/models/indicators.py ADDED
@@ -0,0 +1,29 @@
1
+ """Pydantic schemas for technical indicator endpoints."""
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class IndicatorValue(BaseModel):
9
+ """Single indicator data point alongside OHLCV."""
10
+ ts: Any
11
+ open: float
12
+ high: float
13
+ low: float
14
+ close: float
15
+ volume: float
16
+ indicator_value: float | None = Field(
17
+ default=None, description="Computed indicator value"
18
+ )
19
+
20
+
21
+ class IndicatorResponse(BaseModel):
22
+ """Response for indicator calculation."""
23
+ symbol: str
24
+ timeframe: str
25
+ indicator: str
26
+ period: int
27
+ total_rows: int
28
+ returned: int
29
+ data: list[IndicatorValue]
app/models/market_data.py ADDED
@@ -0,0 +1,46 @@
1
+ """Pydantic schemas for market data (OHLCV) endpoints."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
+ class OHLCVBar(BaseModel):
10
+ """Single OHLCV candlestick bar."""
11
+ ts: Any = Field(description="Candle timestamp")
12
+ open: float = Field(description="Opening price")
13
+ high: float = Field(description="Highest price")
14
+ low: float = Field(description="Lowest price")
15
+ close: float = Field(description="Closing price")
16
+ volume: float = Field(description="Trade volume")
17
+
18
+
19
+ class OHLCVResponse(BaseModel):
20
+ """Paginated OHLCV data response."""
21
+ symbol: str
22
+ timeframe: str
23
+ source: str = "ALL_TIME_DATA"
24
+ total_rows: int = Field(description="Total available rows for this query")
25
+ returned: int = Field(description="Rows in this response")
26
+ limit: int
27
+ offset: int
28
+ data: list[OHLCVBar]
29
+
30
+
31
+ class DateRangeInfo(BaseModel):
32
+ """Available date range for a symbol/timeframe."""
33
+ symbol: str
34
+ timeframe: str
35
+ source: str = "ALL_TIME_DATA"
36
+ start_date: Any = Field(description="Earliest timestamp in dataset")
37
+ end_date: Any = Field(description="Latest timestamp in dataset")
38
+ total_rows: int = Field(description="Total row count")
39
+
40
+
41
+ class LatestBar(BaseModel):
42
+ """Latest N bars response."""
43
+ symbol: str
44
+ timeframe: str
45
+ count: int
46
+ data: list[OHLCVBar]
app/models/reports.py ADDED
@@ -0,0 +1,41 @@
1
+ """Pydantic schemas for portfolio report endpoints."""
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class EquityPoint(BaseModel):
9
+ """Single equity curve data point."""
10
+ step: int = Field(description="Simulation step index")
11
+ equity: float = Field(description="Portfolio equity value")
12
+
13
+
14
+ class EquityResponse(BaseModel):
15
+ """Paginated equity curve response."""
16
+ total: int
17
+ returned: int
18
+ limit: int
19
+ offset: int
20
+ data: list[EquityPoint]
21
+
22
+
23
+ class TradeRecord(BaseModel):
24
+ """Single trade record from backtest."""
25
+ pair: str = Field(description="Trading pair")
26
+ type: str = Field(description="Order type")
27
+ side: str = Field(description="Buy or Sell")
28
+ price: float = Field(description="Execution price")
29
+ size: float = Field(description="Position size")
30
+ time: Any = Field(description="Trade timestamp")
31
+ score: float = Field(description="Strategy signal score")
32
+ pnl: float = Field(description="Profit & Loss")
33
+
34
+
35
+ class TradesResponse(BaseModel):
36
+ """Paginated trades response."""
37
+ total: int
38
+ returned: int
39
+ limit: int
40
+ offset: int
41
+ data: list[TradeRecord]
app/models/symbols.py ADDED
@@ -0,0 +1,43 @@
1
+ """Pydantic schemas for symbol-related endpoints."""
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from app.models.common import AssetClassEnum, TimeframeEnum
6
+
7
+
8
+ class SymbolInfo(BaseModel):
9
+ """Summary info for a single instrument."""
10
+ symbol: str = Field(description="Instrument symbol identifier")
11
+ asset_class: AssetClassEnum | None = Field(
12
+ default=None, description="Asset class category"
13
+ )
14
+ available_timeframes: list[str] = Field(
15
+ default_factory=list, description="Timeframes with data"
16
+ )
17
+ description: str = Field(default="", description="Human-readable name")
18
+
19
+
20
+ class SymbolDetail(BaseModel):
21
+ """Full detail for a single instrument including per-timeframe stats."""
22
+ symbol: str
23
+ asset_class: AssetClassEnum | None = None
24
+ description: str = ""
25
+ timeframes: list["TimeframeDetail"] = Field(default_factory=list)
26
+
27
+
28
+ class TimeframeDetail(BaseModel):
29
+ """Stats for one timeframe of a symbol."""
30
+ timeframe: str
31
+ source: str
32
+ row_count: int = 0
33
+ file_path: str = ""
34
+
35
+
36
+ class SymbolSearchResult(BaseModel):
37
+ """Search result for symbol lookup."""
38
+ total: int
39
+ results: list[SymbolInfo]
40
+
41
+
42
+ # Rebuild forward refs
43
+ SymbolDetail.model_rebuild()
app/routers/market_data.py DELETED
@@ -1,96 +0,0 @@
1
- from datetime import datetime, date
2
- from typing import Optional, List, Dict
3
- from fastapi import APIRouter, HTTPException, Depends, Query
4
- from app.database import duckdb_transaction
5
- from app.models import OHLCResponse, SymbolResponse
6
- from app.config import get_settings
7
- from app.auth import get_current_user
8
- from app.services.dataset import get_cached_symbols, get_file_url
9
-
10
- router = APIRouter(prefix="/v1", tags=["Market Data"])
11
-
12
- @router.get("/symbols", response_model=SymbolResponse)
13
- async def list_symbols(user: str = Depends(get_current_user)):
14
- """
15
- List all available trading pairs/symbols.
16
- Fetches file list from Hugging Face API (cached).
17
- """
18
- try:
19
- symbols = await get_cached_symbols()
20
- return SymbolResponse(
21
- symbols=symbols,
22
- count=len(symbols)
23
- )
24
-
25
- except Exception as e:
26
- raise HTTPException(status_code=500, detail=f"Failed to list symbols: {str(e)}")
27
-
28
- @router.get("/ohlc/{symbol}", response_model=OHLCResponse)
29
- async def get_ohlc_data(
30
- symbol: str,
31
- interval: str = Query("15min", description="Time interval: 1min, 5min, 15min, 30min, 1hr, 4hr, 1day"),
32
- start_date: Optional[date] = Query(None, description="Start date (YYYY-MM-DD)"),
33
- end_date: Optional[date] = Query(None, description="End date (YYYY-MM-DD)"),
34
- limit: int = Query(100, ge=1, le=10000, description="Max records to return"),
35
- user: str = Depends(get_current_user)
36
- ):
37
- """
38
- Get OHLC (Open, High, Low, Close) data.
39
- """
40
- try:
41
- # Normalize interval
42
- interval_map = {
43
- "1m": "1min", "1min": "1min",
44
- "5m": "5min", "5min": "5min",
45
- "15m": "15min", "15min": "15min",
46
- "30m": "30min", "30min": "30min",
47
- "1h": "1hr", "1hr": "1hr",
48
- "4h": "4hr", "4hr": "4hr",
49
- "1d": "1day", "1day": "1day"
50
- }
51
- target_interval = interval_map.get(interval, "15min")
52
-
53
- # Get file URL from cache
54
- file_url = await get_file_url(symbol, target_interval)
55
-
56
- if not file_url:
57
- raise HTTPException(status_code=404, detail=f"Symbol '{symbol}' not found for interval '{interval}'")
58
-
59
- with duckdb_transaction() as db_conn:
60
- where_clauses = []
61
- if start_date:
62
- where_clauses.append(f"time >= '{start_date}'")
63
- if end_date:
64
- where_clauses.append(f"time <= '{end_date}'")
65
-
66
- where_stmt = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
67
-
68
- query = f"""
69
- SELECT *
70
- FROM read_parquet('{file_url}')
71
- {where_stmt}
72
- ORDER BY time DESC
73
- LIMIT {limit}
74
- """
75
-
76
- result = db_conn.execute(query)
77
-
78
- columns = [desc[0] for desc in result.description]
79
- rows = result.fetchall()
80
-
81
- data = []
82
- for row in rows:
83
- item = dict(zip(columns, row))
84
- data.append(item)
85
-
86
- return OHLCResponse(
87
- symbol=symbol,
88
- interval=target_interval,
89
- data=data,
90
- count=len(data)
91
- )
92
-
93
- except HTTPException:
94
- raise
95
- except Exception as e:
96
- raise HTTPException(status_code=500, detail=f"Failed to fetch OHLC data: {str(e)}")
app/routers/raw_query.py DELETED
@@ -1,122 +0,0 @@
1
- from fastapi import APIRouter, HTTPException, Depends
2
- from app.database import duckdb_transaction
3
- from app.models import QueryResponse, QueryRequest
4
- from app.config import get_settings
5
- from app.auth import get_current_user
6
- import re
7
- import time
8
-
9
- router = APIRouter(prefix="/query", tags=["Raw SQL"])
10
-
11
- def validate_sql_query(query: str) -> bool:
12
- """
13
- Validate SQL query for security and allowed operations.
14
- Only allows SELECT queries and prevents malicious operations.
15
- """
16
- query_upper = query.upper().strip()
17
-
18
- # Only allow SELECT statements
19
- if not query_upper.startswith('SELECT'):
20
- return False
21
-
22
- # Block dangerous keywords
23
- dangerous_keywords = [
24
- 'DROP', 'DELETE', 'UPDATE', 'INSERT', 'CREATE', 'ALTER',
25
- 'EXEC', 'EXECUTE', 'UNION', 'SCRIPT', 'COPY', 'EXPORT',
26
- 'PRAGMA', 'VACUUM', 'CHECKPOINT', 'CALL', 'LOAD', 'INSTALL'
27
- ]
28
-
29
- for keyword in dangerous_keywords:
30
- if keyword in query_upper:
31
- return False
32
-
33
- # Block file access functions
34
- file_access_functions = [
35
- 'READ_CSV', 'READ_PARQUET', 'READ_JSON', 'READ_BLOB', 'READ_TEXT',
36
- 'SCAN_PARQUET', 'SCAN_CSV', 'SCAN_JSON'
37
- ]
38
-
39
- for func in file_access_functions:
40
- if func in query_upper:
41
- return False
42
-
43
- # Check for suspicious patterns
44
- suspicious_patterns = [
45
- r';\s*SELECT', # Multiple statements
46
- r'--', # SQL comments
47
- r'/\*', # Multi-line comments
48
- r'xp_', # Extended procedures
49
- r'sp_', # Stored procedures
50
- ]
51
-
52
- for pattern in suspicious_patterns:
53
- if re.search(pattern, query, re.IGNORECASE):
54
- return False
55
-
56
- return True
57
-
58
- def sanitize_query(query: str, dataset_url: str) -> str:
59
- """
60
- Sanitize and prepare query for execution against remote Parquet files.
61
- Ensures query targets the correct dataset URL.
62
- """
63
- # Simple replacement if FROM clause exists
64
- if "FROM" in query.upper():
65
- # Replace user table with read_parquet
66
- query = re.sub(r'FROM\s+([a-zA-Z0-9_]+)', f'FROM read_parquet(\'{dataset_url}\', filename=true)', query, flags=re.IGNORECASE)
67
-
68
- # Ensure the query ends with a semicolon
69
- if not query.strip().endswith(';'):
70
- query += ';'
71
-
72
- return query
73
-
74
- @router.post("/", response_model=QueryResponse)
75
- async def execute_query(request: QueryRequest, user: str = Depends(get_current_user)):
76
- """
77
- Execute SQL query against the remote Parquet dataset.
78
- **Requires Authentication**
79
- """
80
- settings = get_settings()
81
- start_time = time.time()
82
-
83
- # Validate query
84
- if not validate_sql_query(request.sql_query):
85
- raise HTTPException(
86
- status_code=400,
87
- detail="Invalid SQL query. Only SELECT statements are allowed."
88
- )
89
-
90
- # Prepare query
91
- sanitized_query = sanitize_query(request.sql_query, settings.DATASET_URL)
92
-
93
- # Add limit if not specified in query
94
- if 'LIMIT' not in sanitized_query.upper():
95
- sanitized_query = sanitized_query.rstrip(';') + f' LIMIT {request.limit};'
96
-
97
- try:
98
- with duckdb_transaction() as db_conn:
99
- # Execute query
100
- result = db_conn.execute(sanitized_query)
101
-
102
- # Fetch results
103
- rows = result.fetchall()
104
- columns = [desc[0] for desc in result.description]
105
-
106
- # Convert to list of dictionaries
107
- data = []
108
- for row in rows:
109
- item = dict(zip(columns, row))
110
- data.append(item)
111
-
112
- execution_time = (time.time() - start_time) * 1000
113
-
114
- return QueryResponse(
115
- data=data,
116
- columns=columns,
117
- row_count=len(data),
118
- execution_time_ms=execution_time
119
- )
120
-
121
- except Exception as e:
122
- raise HTTPException(status_code=500, detail=f"Query execution failed: {str(e)}")
app/routers/system.py DELETED
@@ -1,58 +0,0 @@
1
- from fastapi import APIRouter, HTTPException, Depends
2
- from app.database import duckdb_transaction
3
- from app.models import SchemaResponse, QueryResponse, QueryRequest
4
- from app.config import get_settings
5
- from app.auth import get_current_user
6
-
7
- router = APIRouter(prefix="/system", tags=["System"])
8
-
9
- @router.get("/health")
10
- async def health_check():
11
- """Health check endpoint."""
12
- return {"status": "healthy", "service": "cloud-data-lake-api"}
13
-
14
- @router.get("/describe", response_model=SchemaResponse)
15
- async def get_dataset_schema(user: str = Depends(get_current_user)):
16
- """
17
- Get the schema of the dataset.
18
- Requires authentication.
19
- """
20
- settings = get_settings()
21
- try:
22
- # Get a safe file to describe schema from
23
- from app.services.dataset import get_one_safe_file_url, get_total_file_count
24
-
25
- safe_file_url = await get_one_safe_file_url()
26
- if not safe_file_url:
27
- raise HTTPException(status_code=404, detail="No parquet files found in dataset to describe.")
28
-
29
- total_files = await get_total_file_count()
30
-
31
- with duckdb_transaction() as db_conn:
32
- # Get schema from the single safe file
33
- # We use filename=true to ensure the filename column is available if requested
34
- schema_query = f"""
35
- SELECT
36
- column_name,
37
- column_type
38
- FROM (
39
- DESCRIBE SELECT * FROM read_parquet('{safe_file_url}', filename=true)
40
- )
41
- """
42
-
43
- result = db_conn.execute(schema_query)
44
- schema_rows = result.fetchall()
45
-
46
- # Build schema dictionary
47
- schema_dict = {}
48
- for row in schema_rows:
49
- column_name, column_type = row[:2]
50
- schema_dict[column_name] = column_type
51
-
52
- return SchemaResponse(
53
- schema=schema_dict,
54
- total_files=total_files
55
- )
56
-
57
- except Exception as e:
58
- raise HTTPException(status_code=500, detail=f"Failed to retrieve schema: {str(e)}")
app/services/__init__.py ADDED
File without changes
app/services/dataset.py DELETED
@@ -1,114 +0,0 @@
1
- from huggingface_hub import HfApi
2
- import re
3
- import time
4
- import urllib.parse
5
- from typing import List, Dict, Optional
6
- from fastapi.concurrency import run_in_threadpool
7
- from app.config import get_settings
8
-
9
- # Global Cache
10
- # Map: symbol -> {interval: safe_url}
11
- SYMBOL_CACHE: Dict[str, Dict[str, str]] = {}
12
- CACHE_TIMESTAMP = 0
13
- CACHE_DURATION_SECONDS = 300 # 5 minutes
14
- ALL_FILES_CACHE: List[str] = []
15
-
16
- def fetch_file_list_sync(repo_id: str, token: str) -> List[str]:
17
- """
18
- Sync function to list files using HfApi.
19
- """
20
- api = HfApi(token=token)
21
- return api.list_repo_files(repo_id=repo_id, repo_type="dataset")
22
-
23
- async def refresh_cache_if_needed():
24
- """
25
- Refresh the global symbol cache if expired.
26
- Uses HfApi to list files and constructs safe encoded URLs.
27
- """
28
- global SYMBOL_CACHE, CACHE_TIMESTAMP, ALL_FILES_CACHE
29
-
30
- settings = get_settings()
31
-
32
- if time.time() - CACHE_TIMESTAMP < CACHE_DURATION_SECONDS and SYMBOL_CACHE:
33
- return
34
-
35
- # Extract Repo ID
36
- if "hf://datasets/" in settings.DATASET_URL:
37
- # e.g. hf://datasets/OMCHOKSI108/my-cloud-data-lake/...
38
- repo_parts = settings.DATASET_URL.replace("hf://datasets/", "").split("/")
39
- repo_id = f"{repo_parts[0]}/{repo_parts[1]}"
40
- else:
41
- repo_id = "OMCHOKSI108/my-cloud-data-lake"
42
-
43
- try:
44
- # Run sync API call in threadpool to avoid blocking async loop
45
- files = await run_in_threadpool(fetch_file_list_sync, repo_id, settings.HF_TOKEN)
46
-
47
- # Prepare new cache
48
- new_cache = {}
49
- valid_files = []
50
-
51
- for file_path in files:
52
- if not file_path.endswith('.parquet'):
53
- continue
54
-
55
- # Construct Safe URL for DuckDB
56
- # Encode path components, specifically to handle '#' -> '%23'
57
- # file_path is like: ALL_TIME_DATA/15min_time/AAVEUSD#_15min.parquet
58
- encoded_path = urllib.parse.quote(file_path, safe='/')
59
- full_url = f"hf://datasets/{repo_id}/{encoded_path}"
60
-
61
- valid_files.append(full_url)
62
-
63
- # Parse Symbol and Interval
64
- filename = file_path.split("/")[-1]
65
-
66
- # Regex to match: SYMBOL#_INTERVAL.parquet OR SYMBOL_INTERVAL.parquet
67
- # Handling both cases (with and without #)
68
- match = re.search(r"([A-Z0-9\.\s]+)[#_]+([0-9a-z]+)\.parquet", filename)
69
-
70
- if match:
71
- symbol = match.group(1)
72
- interval = match.group(2) # e.g. "15min"
73
-
74
- if symbol not in new_cache:
75
- new_cache[symbol] = {}
76
-
77
- new_cache[symbol][interval] = full_url
78
-
79
- # Update Globals
80
- ALL_FILES_CACHE = valid_files
81
- SYMBOL_CACHE = new_cache
82
- CACHE_TIMESTAMP = time.time()
83
-
84
- except Exception as e:
85
- print(f"Error refreshing dataset cache: {e}")
86
- # If cache exists, keep using it on error
87
- if not SYMBOL_CACHE:
88
- # If no cache, we might want to let it be empty or re-raise
89
- pass
90
-
91
- async def get_cached_symbols() -> List[str]:
92
- await refresh_cache_if_needed()
93
- return sorted(list(SYMBOL_CACHE.keys()))
94
-
95
- async def get_file_url(symbol: str, interval: str) -> Optional[str]:
96
- await refresh_cache_if_needed()
97
- if symbol in SYMBOL_CACHE and interval in SYMBOL_CACHE[symbol]:
98
- return SYMBOL_CACHE[symbol][interval]
99
- return None
100
-
101
- async def get_total_file_count() -> int:
102
- await refresh_cache_if_needed()
103
- return len(ALL_FILES_CACHE)
104
-
105
- async def get_one_safe_file_url() -> Optional[str]:
106
- """Return one safe file URL for schema inference."""
107
- await refresh_cache_if_needed()
108
- if ALL_FILES_CACHE:
109
- # Prefer a file without special characters if possible
110
- for f in ALL_FILES_CACHE:
111
- if "%23" not in f: # Encoded #
112
- return f
113
- return ALL_FILES_CACHE[0]
114
- return None
docs/DATASET_CONTEXT.md ADDED
@@ -0,0 +1,350 @@
1
+ # OMCHOKSI108/my-cloud-data-lake — Dataset Context & API Reference
2
+
3
+ > **Source:** `https://huggingface.co/datasets/OMCHOKSI108/my-cloud-data-lake`
4
+ > **Format:** Apache Parquet
5
+ > **Total Size:** ~4.65 GB
6
+ > **Generated:** February 2026
7
+
8
+ ---
9
+
10
+ ## 1. High-Level Summary
11
+
12
+ | Metric | Value |
13
+ |---|---|
14
+ | Total Parquet Files | **793** |
15
+ | Total Data Rows | **276,427,113** (~276 M) |
16
+ | Unique Instruments | **111** |
17
+ | Timeframes | **7** (1min, 5min, 15min, 30min, 1hr, 4hr, 1day) |
18
+ | Top-Level Folders | **3** (`ALL_TIME_DATA`, `data`, `reports`) |
19
+ | Column Schema (OHLCV) | `ts`, `open`, `high`, `low`, `close`, `volume` |
20
+ | Row Count Range | 106 — 9,823,126 per file |
21
+
22
+ ---
23
+
24
+ ## 2. Folder Structure
25
+
26
+ ```
27
+ my-cloud-data-lake/ (4.65 GB)
28
+ ├── ALL_TIME_DATA/ 756 files │ 275,232,406 rows
29
+ │ ├── 1min_time/ 108 files │ 182,411,731 rows
30
+ │ ├── 5min_time/ 108 files │ 51,658,834 rows
31
+ │ ├── 15min_time/ 108 files │ 22,286,150 rows
32
+ │ ├── 30min_time/ 108 files │ 11,251,532 rows
33
+ │ ├── 1hr_time/ 108 files │ 5,739,404 rows
34
+ │ ├── 4hr_time/ 108 files │ 1,526,167 rows
35
+ │ └── 1day_time/ 108 files │ 358,588 rows
36
+ ├── data/ 35 files │ 1,185,910 rows
37
+ │ ├── 1min_time/ 5 files │ 500,093 rows
38
+ │ ├── 5min_time/ 5 files │ 422,691 rows
39
+ │ ├── 15min_time/ 5 files │ 144,273 rows
40
+ │ ├── 30min_time/ 5 files │ 72,193 rows
41
+ │ ├── 1hr_time/ 5 files │ 36,099 rows
42
+ │ ├── 4hr_time/ 5 files │ 9,039 rows
43
+ │ └── 1day_time/ 5 files │ 1,522 rows
44
+ ├── reports/ 2 files │ 8,797 rows
45
+ │ ├── portfolio_equity.parquet 1 file │ 5,000 rows
46
+ │ └── portfolio_trades.parquet 1 file │ 3,797 rows
47
+ ├── .gitattributes
48
+ └── README.md
49
+ ```
50
+
51
+ ---
52
+
53
+ ## 3. Data Schema
54
+
55
+ ### 3.1 OHLCV Files (791 files — ALL_TIME_DATA + data)
56
+
57
+ Every OHLCV parquet file has **6 columns** with a uniform schema:
58
+
59
+ | Column | Type | Description |
60
+ |---|---|---|
61
+ | `ts` | datetime/string | Timestamp of the candle bar |
62
+ | `open` | float | Opening price |
63
+ | `high` | float | Highest price in the period |
64
+ | `low` | float | Lowest price in the period |
65
+ | `close` | float | Closing price |
66
+ | `volume` | int/float | Trade volume during the period |
67
+
68
+ > **Note:** Some files in `data/` have 7 columns — the first column appears to be a malformed header row baked into the schema (data from a raw TSV conversion). The core data columns remain the same 6 OHLCV fields.
69
+
70
+ ### 3.2 Reports Files
71
+
72
+ **`reports/portfolio_equity.parquet`** — 5,000 rows, 2 columns:
73
+
74
+ | Column | Description |
75
+ |---|---|
76
+ | `step` | Simulation/backtest step index |
77
+ | `equity` | Portfolio equity value at that step |
78
+
79
+ **`reports/portfolio_trades.parquet`** — 3,797 rows, 8 columns:
80
+
81
+ | Column | Description |
82
+ |---|---|
83
+ | `pair` | Trading pair/instrument |
84
+ | `type` | Order type |
85
+ | `side` | Buy or Sell |
86
+ | `price` | Execution price |
87
+ | `size` | Position size |
88
+ | `time` | Trade timestamp |
89
+ | `score` | Signal/strategy score |
90
+ | `pnl` | Profit & Loss |
91
+
92
+ ---
93
+
94
+ ## 4. Instruments Catalog (111 Unique)
95
+
96
+ ### 4.1 Forex Pairs (28)
97
+
98
+ Major, minor, and cross pairs:
99
+
100
+ | Pair | Pair | Pair | Pair |
101
+ |---|---|---|---|
102
+ | AUDCAD# | AUDCHF# | AUDJPY# | AUDNZD# |
103
+ | AUDUSD# | CADCHF# | CADJPY# | CHFJPY# |
104
+ | EURAUD# | EURCAD# | EURCHF# | EURGBP# |
105
+ | EURJPY# | EURNZD# | EURUSD# | GBPAUD# |
106
+ | GBPCAD# | GBPCHF# | GBPJPY# | GBPNZD# |
107
+ | GBPUSD# | NZDCAD# | NZDCHF# | NZDJPY# |
108
+ | NZDUSD# | USDCAD# | USDCHF# | USDJPY# |
109
+
110
+ > Additional forex in `data/` folder without `#` suffix: EURUSD, GBPUSD, USDJPY
111
+
112
+ ### 4.2 Cryptocurrencies (52)
113
+
114
+ | Symbol | Symbol | Symbol | Symbol |
115
+ |---|---|---|---|
116
+ | 1INCHUSD# | AAVEUSD# | ADAUSD# | ALGOUSD# |
117
+ | APEUSD# | APTUSD# | ARBUSD# | ATOMUSD# |
118
+ | AVAXUSD# | AXSUSD# | BATUSD# | BCHUSD# |
119
+ | BTCEUR# | BTCGBP# | BTCJPY# | BTCUSD# |
120
+ | BTGUSD# | CHZUSD# | COMPUSD# | CRVUSD# |
121
+ | DASHUSD# | DOGEUSD# | DOTUSD# | EGLDUSD# |
122
+ | ENJUSD# | ETCUSD# | ETHBTC# | ETHEUR# |
123
+ | ETHGBP# | ETHUSD# | FILUSD# | FLOWUSD# |
124
+ | GRTUSD# | ICPUSD# | IMXUSD# | LDOUSD# |
125
+ | LINKUSD# | LRCUSD# | LTCUSD# | MANAUSD# |
126
+ | MATICUSD# | NEARUSD# | OPUSD# | SANDUSD# |
127
+ | SHIBUSD# | SNXUSD# | SOLUSD# | STORJUSD# |
128
+ | STXUSD# | SUSHIUSD# | UMAUSD# | UNIUSD# |
129
+ | VAULTAUSD# | XLMUSD# | XRPUSD# | XTZUSD# |
130
+ | ZECUSD# | ZRXUSD# | | |
131
+
132
+ ### 4.3 Commodities / Precious Metals (8)
133
+
134
+ | Symbol | Description |
135
+ |---|---|
136
+ | GOLD.i# | Gold (USD) |
137
+ | SILVER.i# | Silver (USD) |
138
+ | XAUCNH.i# | Gold / Chinese Yuan |
139
+ | XAUEUR.i# | Gold / Euro |
140
+ | XAUJPY.i# | Gold / Japanese Yen |
141
+ | GAUCNH.i# | Gold (alternate CNH) |
142
+ | GAUUSD.i# | Gold (alternate USD) |
143
+ | XPDUSD.i# | Palladium / USD |
144
+ | XPTUSD.i# | Platinum / USD |
145
+
146
+ ### 4.4 Stocks / Equities (12)
147
+
148
+ | Symbol | Company |
149
+ |---|---|
150
+ | Amazon | Amazon.com Inc. |
151
+ | BancoBradesco | Banco Bradesco S.A. |
152
+ | DraftKings | DraftKings Inc. |
153
+ | Ford | Ford Motor Company |
154
+ | Gerdau | Gerdau S.A. |
155
+ | Intel | Intel Corporation |
156
+ | Nu Holdings | Nu Holdings Ltd. |
157
+ | Nvidia | NVIDIA Corporation |
158
+ | Pinterest | Pinterest Inc. |
159
+ | PlugPower | Plug Power Inc. |
160
+ | Rivian | Rivian Automotive Inc. |
161
+ | Tesla | Tesla Inc. |
162
+ | Transocean | Transocean Ltd. |
163
+
164
+ ---
165
+
166
+ ## 5. Timeframes
167
+
168
+ | Timeframe | Label in Path | Files (ALL_TIME_DATA) | Rows (ALL_TIME_DATA) | Typical Row Count per Instrument |
169
+ |---|---|---|---|---|
170
+ | 1 Minute | `1min_time` | 108 | 182,411,731 | 42K — 9.8M |
171
+ | 5 Minutes | `5min_time` | 108 | 51,658,834 | 8K — 2.0M |
172
+ | 15 Minutes | `15min_time` | 108 | 22,286,150 | 9K — 679K |
173
+ | 30 Minutes | `30min_time` | 108 | 11,251,532 | 4K — 343K |
174
+ | 1 Hour | `1hr_time` | 108 | 5,739,404 | 2K — 175K |
175
+ | 4 Hours | `4hr_time` | 108 | 1,526,167 | 628 — 49K |
176
+ | 1 Day | `1day_time` | 108 | 358,588 | 106 — 14K |
177
+
178
+ ---
179
+
180
+ ## 6. Data Partitioning: `ALL_TIME_DATA` vs `data`
181
+
182
+ | Property | `ALL_TIME_DATA/` | `data/` |
183
+ |---|---|---|
184
+ | **Purpose** | Full historical archive | Recent/sampled subset |
185
+ | **File Count** | 756 | 35 |
186
+ | **Row Count** | 275,232,406 | 1,185,910 |
187
+ | **Instruments** | 108 (all) | 5 (BTCUSD#, ETHUSD#, EURUSD, GBPUSD, USDJPY) |
188
+ | **Timeframes** | All 7 | All 7 |
189
+ | **Schema Notes** | Clean 6-col OHLCV | Some files have 7 cols (legacy header artifact) |
190
+
191
+ ---
192
+
193
+ ## 7. File Naming Convention
194
+
195
+ ```
196
+ {InstrumentSymbol}_{Timeframe}.parquet
197
+ ```
198
+
199
+ **Examples:**
200
+ - `BTCUSD#_1min.parquet` → Bitcoin/USD, 1-minute bars
201
+ - `EURUSD#_1day.parquet` → EUR/USD, daily bars
202
+ - `Tesla_15min.parquet` → Tesla stock, 15-minute bars
203
+ - `GOLD.i#_4hr.parquet` → Gold, 4-hour bars
204
+
205
+ **Symbol suffix meanings:**
206
+ - `#` → CFD/derivative instrument
207
+ - `.i#` → Index/commodity CFD
208
+ - No suffix → Spot or direct instrument
209
+
210
+ ---
211
+
212
+ ## 8. API Design Reference
213
+
214
+ ### 8.1 Recommended API Endpoints
215
+
216
+ ```
217
+ GET /api/v1/instruments
218
+ → List all 111 available instruments with metadata
219
+
220
+ GET /api/v1/instruments/{symbol}
221
+ → Instrument details (asset class, available timeframes, row counts)
222
+
223
+ GET /api/v1/ohlcv/{symbol}
224
+ ?timeframe=1min|5min|15min|30min|1hr|4hr|1day
225
+ &start=2024-01-01T00:00:00Z
226
+ &end=2025-12-31T23:59:59Z
227
+ &limit=1000
228
+ &offset=0
229
+ → OHLCV candle data with pagination
230
+
231
+ GET /api/v1/reports/equity
232
+ → Portfolio equity curve (5,000 steps)
233
+
234
+ GET /api/v1/reports/trades
235
+ ?pair=BTCUSD
236
+ &side=buy|sell
237
+ → Portfolio trade history (3,797 trades)
238
+
239
+ GET /api/v1/metadata
240
+ → Dataset-level metadata (total files, rows, timeframes, etc.)
241
+
242
+ GET /api/v1/search
243
+ ?q=BTC&asset_class=crypto
244
+ → Search instruments by name/class
245
+ ```
246
+
247
+ ### 8.2 Data Access Pattern (HuggingFace)
248
+
249
+ ```python
250
+ # Direct parquet read from HuggingFace
251
+ from huggingface_hub import hf_hub_url
252
+ import pandas as pd
253
+
254
+ url = hf_hub_url(
255
+ repo_id="OMCHOKSI108/my-cloud-data-lake",
256
+ filename="ALL_TIME_DATA/1hr_time/BTCUSD#_1hr.parquet",
257
+ repo_type="dataset"
258
+ )
259
+ df = pd.read_parquet(url)
260
+ ```
261
+
262
+ ### 8.3 Query Parameters for API
263
+
264
+ | Parameter | Type | Description |
265
+ |---|---|---|
266
+ | `symbol` | string | Instrument symbol (e.g., `BTCUSD#`, `Tesla`) |
267
+ | `timeframe` | enum | `1min`, `5min`, `15min`, `30min`, `1hr`, `4hr`, `1day` |
268
+ | `start` | ISO datetime | Start of date range filter |
269
+ | `end` | ISO datetime | End of date range filter |
270
+ | `limit` | int | Max rows returned (default: 1000, max: 10000) |
271
+ | `offset` | int | Pagination offset |
272
+ | `source` | enum | `all_time` or `recent` (maps to folder) |
273
+ | `format` | enum | `json`, `csv`, `parquet` |
274
+
275
+ ### 8.4 Response Schema
276
+
277
+ ```json
278
+ {
279
+ "symbol": "BTCUSD#",
280
+ "timeframe": "1hr",
281
+ "total_rows": 66829,
282
+ "returned": 1000,
283
+ "data": [
284
+ {
285
+ "ts": "2024-01-01T00:00:00Z",
286
+ "open": 42150.50,
287
+ "high": 42280.00,
288
+ "low": 42100.00,
289
+ "close": 42230.75,
290
+ "volume": 1234
291
+ }
292
+ ]
293
+ }
294
+ ```
295
+
296
+ ---
297
+
298
+ ## 9. Asset Classification Map
299
+
300
+ Use this mapping to categorize instruments in the API:
301
+
302
+ ```json
303
+ {
304
+ "forex_major": ["EURUSD#", "GBPUSD#", "USDJPY#", "USDCHF#", "AUDUSD#", "NZDUSD#", "USDCAD#"],
305
+ "forex_cross": ["EURJPY#", "GBPJPY#", "EURGBP#", "AUDCAD#", "AUDCHF#", "AUDJPY#", "AUDNZD#", "CADCHF#", "CADJPY#", "CHFJPY#", "EURAUD#", "EURCAD#", "EURCHF#", "EURNZD#", "GBPAUD#", "GBPCAD#", "GBPCHF#", "GBPNZD#", "NZDCAD#", "NZDCHF#", "NZDJPY#"],
306
+ "crypto_major": ["BTCUSD#", "ETHUSD#", "LTCUSD#", "XRPUSD#", "BCHUSD#"],
307
+ "crypto_alt": ["ADAUSD#", "SOLUSD#", "DOTUSD#", "LINKUSD#", "AVAXUSD#", "DOGEUSD#", "SHIBUSD#", "MATICUSD#", "UNIUSD#", "AAVEUSD#", "...and 40+ more"],
308
+ "crypto_cross": ["BTCEUR#", "BTCGBP#", "BTCJPY#", "ETHBTC#", "ETHEUR#", "ETHGBP#"],
309
+ "commodities": ["GOLD.i#", "SILVER.i#", "XPDUSD.i#", "XPTUSD.i#", "XAUEUR.i#", "XAUJPY.i#", "XAUCNH.i#", "GAUCNH.i#", "GAUUSD.i#"],
310
+ "stocks": ["Amazon", "Tesla", "Nvidia", "Intel", "Ford", "Rivian", "Pinterest", "PlugPower", "DraftKings", "Nu Holdings", "Gerdau", "BancoBradesco", "Transocean"]
311
+ }
312
+ ```
313
+
314
+ ---
315
+
316
+ ## 10. Data Volume by Asset Class (ALL_TIME_DATA)
317
+
318
+ | Asset Class | Instruments | Files | Est. Rows |
319
+ |---|---|---|---|
320
+ | Forex | ~28 | 196 | ~80M+ |
321
+ | Crypto | ~52 | 364 | ~150M+ |
322
+ | Commodities | ~9 | 63 | ~20M+ |
323
+ | Stocks | ~13 | 91 | ~25M+ |
324
+ | **Reports** | 2 | 2 | 8,797 |
325
+
326
+ ---
327
+
328
+ ## 11. Key Observations & Notes
329
+
330
+ 1. **Uniform OHLCV schema** across all market data files — no schema conflicts between asset classes
331
+ 2. **Highest granularity data** is in 1-minute bars, accounting for 66% of all rows (182M rows)
332
+ 3. **Longest history instruments**: Major forex pairs (EURUSD, GBPUSD, USDJPY, USDCHF) have up to ~14K daily bars (~55+ years of data) and 9.8M 1-minute bars
333
+ 4. **Shortest history instruments**: GAUCNH.i#, GAUUSD.i#, XAUCNH.i#, XAUJPY.i# have only ~106 daily bars
334
+ 5. **`data/` folder** contains a focused subset of 5 key instruments (BTCUSD#, ETHUSD#, EURUSD, GBPUSD, USDJPY) — likely used for development/testing
335
+ 6. **`reports/` folder** contains backtesting/simulation results — equity curve and trade log
336
+ 7. **Data format** is Apache Parquet — columnar, compressed, ideal for analytical queries
337
+ 8. **No bid/ask spread data** — only mid-price OHLCV
338
+ 9. **No fundamental data** — purely technical/price data
339
+
340
+ ---
341
+
342
+ ## 12. Potential Use Cases
343
+
344
+ - **Trading Strategy Backtesting** — multi-asset, multi-timeframe
345
+ - **ML/AI Price Prediction Models** — 276M row training dataset
346
+ - **Technical Analysis API** — serve OHLCV with on-the-fly indicator calculation
347
+ - **Cross-Asset Correlation Analysis** — forex, crypto, commodities, stocks in one lake
348
+ - **Portfolio Simulation** — reports data already includes equity curves and trade logs
349
+ - **Real-Time Dashboard** — serve historical + stream live data via WebSocket
350
+ - **Market Data Microservice** — HuggingFace as cold storage, API serves hot queries via Redis/DuckDB cache
docs/Plan.md ADDED
@@ -0,0 +1,374 @@
1
+ Now we’re talking like a systems architect 😌🔥
2
+
3
+ You’re not building “an API”.
4
+ You’re building a **market data engine**.
5
+
6
+ And yes — this is not easy.
7
+ If it were easy, Bloomberg wouldn’t cost $25,000 per year.
8
+
9
+ Let’s design this properly.
10
+
11
+ ---
12
+
13
+ # 🧠 Core Vision
14
+
15
+ You want:
16
+
17
+ * Dynamic symbol discovery
18
+ * Fast search
19
+ * Timeframe awareness
20
+ * Date range introspection
21
+ * Range-based data extraction
22
+ * Scalable architecture
23
+ * Low latency
24
+ * Production ready
25
+
26
+ That’s basically a mini Bloomberg / TradingView backend.
27
+
28
+ Good. Let’s build it in layers.
29
+
30
+ ---
31
+
32
+ # 🔥 1️⃣ Expanded Real-World Use Cases (Beyond Your 5–10%)
33
+
34
+ You mentioned symbol list + range query.
35
+
36
+ That’s only surface-level. Here’s the real ecosystem:
37
+
38
+ ---
39
+
40
+ ## 📊 Market Data Retrieval
41
+
42
+ 1. List all available symbols
43
+ 2. Search symbols by prefix or fuzzy match
44
+ 3. Get available timeframes per symbol
45
+ 4. Get min/max available date for symbol/timeframe
46
+ 5. Fetch OHLCV data by:
47
+
48
+ * symbol
49
+ * timeframe
50
+ * date range
51
+ 6. Fetch latest N candles
52
+ 7. Fetch last price snapshot
53
+ 8. Multi-symbol batch query
54
+
55
+ ---
56
+
57
+ ## 📈 Analytics & Indicators
58
+
59
+ 9. Compute SMA/EMA on the fly
60
+ 10. RSI/MACD calculation
61
+ 11. VWAP calculation
62
+ 12. ATR/Volatility stats
63
+ 13. Rolling window queries
64
+ 14. Correlation between symbols
65
+ 15. Spread analysis
66
+ 16. Multi-timeframe aggregation
67
+
68
+ ---
69
+
70
+ ## 📦 Metadata Layer
71
+
72
+ 17. Symbol classification (Forex, Crypto, Equity)
73
+ 18. Exchange mapping
74
+ 19. Trading session hours
75
+ 20. Symbol status (active/delisted)
76
+ 21. Liquidity score
77
+
78
+ ---
79
+
80
+ ## 🚀 Performance / Pro Features
81
+
82
+ 22. Precomputed aggregates
83
+ 23. Candle compression (1m → 5m)
84
+ 24. Server-side pagination
85
+ 25. Async streaming responses
86
+ 26. Real-time websocket updates
87
+ 27. Snapshot caching
88
+ 28. Heatmap endpoint
89
+
90
+ ---
91
+
92
+ ## 🤖 ML & Research Layer
93
+
94
+ 29. Return normalized data
95
+ 30. Return feature-engineered dataset
96
+ 31. Backtesting query endpoint
97
+ 32. Walk-forward validation data splits
98
+ 33. Dataset slicing for training
99
+
100
+ ---
101
+
102
+ ## 🛡 Operational
103
+
104
+ 34. Rate limiting
105
+ 35. Access tiers
106
+ 36. API key auth
107
+ 37. Usage metrics
108
+ 38. Request logging
109
+ 39. Error monitoring
110
+
111
+ ---
112
+
113
+ # 🏗 2️⃣ Architecture Design (High Performance Version)
114
+
115
+ Now let’s talk real engineering.
116
+
117
+ Your biggest enemy:
118
+
119
+ 👉 Time-series query latency
120
+ 👉 File scanning
121
+ 👉 Memory overload
122
+
123
+ ---
124
+
125
+ ## 🔥 Recommended Architecture
126
+
127
+ ### Layer 1: Storage
128
+
129
+ Do NOT serve every request directly from the raw HF HTTP endpoint.
130
+
131
+ Instead:
132
+
133
+ Option A (simple):
134
+
135
+ * Sync Parquet files into the container on startup
136
+
137
+ Option B (proper scalable):
138
+
139
+ * Store in object storage (S3-compatible)
140
+ * Use partitioned parquet structure
141
+
142
+ Example ideal structure:
143
+
144
+ ```
145
+ /data
146
+   /symbol=BTCUSD
147
+     /timeframe=1m
148
+       /year=2024
149
+         part-001.parquet
150
+ ```
151
+
152
+ Partitioning by:
153
+
154
+ * symbol
155
+ * timeframe
156
+ * year
157
+
158
+ This enables very fast filtering.
159
+
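+ A sketch of how that layout could be materialized from a flat OHLCV table with PyArrow's partitioned writer (the column names and sample values here are assumptions, not the real schema):
+
+ ```python
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ # Hypothetical flat OHLCV frame; in practice this comes from the raw source files
+ df = pd.DataFrame({
+     "symbol": ["BTCUSD", "BTCUSD"],
+     "timeframe": ["1m", "1m"],
+     "year": [2024, 2024],
+     "timestamp": pd.to_datetime(["2024-01-01 00:00", "2024-01-01 00:01"]),
+     "open": [42000.0, 42010.0], "high": [42020.0, 42030.0],
+     "low": [41990.0, 42000.0], "close": [42010.0, 42025.0], "volume": [12.5, 9.8],
+ })
+
+ # Produces data/symbol=BTCUSD/timeframe=1m/year=2024/<part>.parquet
+ pq.write_to_dataset(
+     pa.Table.from_pandas(df),
+     root_path="data",
+     partition_cols=["symbol", "timeframe", "year"],
+ )
+ ```
+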
160
+ ---
161
+
162
+ ### Layer 2: Metadata Index (CRITICAL)
163
+
164
+ Never scan files per request.
165
+
166
+ Instead, build a metadata table:
167
+
168
+ | symbol | timeframe | start_date | end_date | file_path |
+ |--------|-----------|------------|----------|-----------|
169
+
170
+ Store this in:
171
+
172
+ * SQLite (simple)
173
+ * PostgreSQL (better)
174
+ * DuckDB (excellent for time-series)
175
+ * Redis (for fast lookups)
176
+
177
+ This allows instant symbol listing and range discovery.
178
+
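+ A sketch of building that index with DuckDB straight from the partitioned files (the layout, the column names, and the `metadata.duckdb` catalog file are assumptions):
+
+ ```python
+ import duckdb
+
+ con = duckdb.connect("metadata.duckdb")
+ con.execute("""
+     CREATE OR REPLACE TABLE file_index AS
+     SELECT symbol,
+            timeframe,
+            MIN(timestamp) AS start_date,
+            MAX(timestamp) AS end_date,
+            filename      AS file_path
+     FROM read_parquet('data/**/*.parquet', hive_partitioning=true, filename=true)
+     GROUP BY symbol, timeframe, filename
+ """)
+
+ # Range discovery is now a table lookup, not a file scan
+ print(con.execute("SELECT * FROM file_index WHERE symbol = 'BTCUSD'").df())
+ ```
+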
179
+ ---
180
+
181
+ ### Layer 3: Query Engine
182
+
183
+ When a request comes in:
184
+
185
+ 1. Validate symbol
186
+ 2. Validate timeframe
187
+ 3. Lookup metadata
188
+ 4. Determine which partitions to read
189
+ 5. Use PyArrow Dataset filtering
190
+ 6. Return JSON
191
+
192
+ PyArrow filtering example:
193
+
194
+ ```python
195
+ import pyarrow.dataset as ds
+
+ # "start" and "end" are the requested datetime bounds
+ dataset = ds.dataset("data/", format="parquet")
196
+ table = dataset.to_table(
197
+     filter=(
198
+         (ds.field("symbol") == "BTCUSD") &
199
+         (ds.field("timestamp") >= start) &
200
+         (ds.field("timestamp") <= end)
201
+     )
202
+ )
203
+ ```
204
+
205
+ This reads only relevant row groups.
206
+
207
+ That’s how you keep it fast.
208
+
209
+ ---
210
+
211
+ ### Layer 4: Caching Layer
212
+
213
+ Add Redis:
214
+
215
+ * Cache symbol list
216
+ * Cache latest candle
217
+ * Cache hot queries
218
+ * Cache range metadata
219
+
220
+ Time-based invalidation.
221
+
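+ A minimal sketch with the `redis` client; the key name, TTL, and the `fetch_symbols` callable are placeholders:
+
+ ```python
+ import json
+
+ import redis
+
+ r = redis.Redis(host="localhost", port=6379, decode_responses=True)
+
+ def get_symbols_cached(fetch_symbols, ttl_seconds=300):
+     """Return the symbol list from Redis if cached, otherwise compute and cache it."""
+     cached = r.get("symbols")
+     if cached is not None:
+         return json.loads(cached)
+     symbols = fetch_symbols()  # e.g. a DuckDB query against the metadata index
+     r.setex("symbols", ttl_seconds, json.dumps(symbols))  # TTL = time-based invalidation
+     return symbols
+ ```
+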
222
+ ---
223
+
224
+ # ⚡ 3️⃣ API Design Best Practices
225
+
226
+ ## REST Design (Recommended)
227
+
228
+ ```
229
+ GET /symbols
230
+ GET /symbols/search?q=btc
231
+ GET /symbols/{symbol}/timeframes
232
+ GET /symbols/{symbol}/{timeframe}/range
233
+ GET /data?symbol=BTCUSD&tf=1m&start=2024-01-01&end=2024-02-01
234
+ GET /data/latest?symbol=BTCUSD&tf=1m
235
+ ```
236
+
237
+ Use:
238
+
239
+ * Query parameters for filters
240
+ * Pagination
241
+ * Limit max rows
242
+
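+ A hedged FastAPI sketch of the `/data` route with those constraints; `known_symbols` and `query_candles` are stubs standing in for the metadata index and the DuckDB query layer:
+
+ ```python
+ from datetime import datetime
+ from fastapi import FastAPI, HTTPException, Query
+
+ app = FastAPI()
+ MAX_ROWS = 10_000  # hard cap per response
+
+ def known_symbols() -> set[str]:
+     # Placeholder; the real service would read the metadata index
+     return {"BTCUSD", "EURUSD"}
+
+ def query_candles(symbol, tf, start, end, limit, offset) -> list[dict]:
+     # Placeholder; the real service would run a partition-pruned DuckDB query
+     return []
+
+ @app.get("/data")
+ async def get_data(
+     symbol: str,
+     tf: str = Query("1m"),
+     start: datetime = Query(...),
+     end: datetime = Query(...),
+     limit: int = Query(5_000, le=MAX_ROWS),
+     offset: int = Query(0, ge=0),
+ ):
+     if symbol not in known_symbols():
+         raise HTTPException(status_code=404, detail="Unknown symbol")
+     rows = query_candles(symbol, tf, start, end, limit, offset)
+     return {"symbol": symbol, "timeframe": tf, "count": len(rows), "data": rows}
+ ```
+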
243
+ ---
244
+
245
+ ## Response Optimization
246
+
247
+ * Gzip compression
248
+ * Option for CSV or JSON
249
+ * Option for binary Arrow format
250
+ * Pagination for large ranges
251
+
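+ Gzip and the binary Arrow option are both cheap to add; a sketch (the size threshold and media type are choices, not requirements):
+
+ ```python
+ import pyarrow as pa
+ from fastapi import FastAPI, Response
+ from fastapi.middleware.gzip import GZipMiddleware
+
+ app = FastAPI()
+ app.add_middleware(GZipMiddleware, minimum_size=1024)  # gzip JSON/CSV bodies above ~1 KB
+
+ def arrow_response(table: pa.Table) -> Response:
+     """Serialize an Arrow table to the IPC stream format so clients skip JSON parsing."""
+     sink = pa.BufferOutputStream()
+     with pa.ipc.new_stream(sink, table.schema) as writer:
+         writer.write_table(table)
+     return Response(
+         content=sink.getvalue().to_pybytes(),
+         media_type="application/vnd.apache.arrow.stream",
+     )
+ ```
+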
252
+ ---
253
+
254
+ ## GraphQL?
255
+
256
+ Only if you want flexible multi-field selection.
257
+ For time-series APIs, REST is simpler and faster.
258
+
259
+ ---
260
+
261
+ # ⚡ 4️⃣ Performance Strategy
262
+
263
+ Here’s where people fail.
264
+
265
+ ### ❌ Bad Design
266
+
267
+ * Load full parquet file
268
+ * Filter in pandas
269
+ * Return giant JSON
270
+
271
+ ### ✅ Good Design
272
+
273
+ * Partitioned storage
274
+ * Arrow dataset filtering
275
+ * Limit rows
276
+ * Cache hot symbols
277
+ * Async FastAPI
278
+ * Use uvicorn with workers
279
+
280
+ ---
281
+
282
+ # ⚠ 5️⃣ Major Challenges
283
+
284
+ ### 1. Large Range Queries
285
+
286
+ Solution:
287
+
288
+ * Hard row limits
289
+ * Pagination
290
+ * Stream responses
291
+
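+ A sketch of streaming a large range as CSV so it never sits in memory as one giant response (layout, columns and batch size are assumptions):
+
+ ```python
+ import duckdb
+ from fastapi import FastAPI
+ from fastapi.responses import StreamingResponse
+
+ app = FastAPI()
+ con = duckdb.connect()  # one connection per worker in a real deployment
+
+ @app.get("/data/stream")
+ def stream_candles(symbol: str, tf: str, start: str, end: str):
+     def csv_chunks():
+         yield "timestamp,open,high,low,close,volume\n"
+         cur = con.execute(
+             """
+             SELECT timestamp, open, high, low, close, volume
+             FROM read_parquet('data/**/*.parquet', hive_partitioning=true)
+             WHERE symbol = ? AND timeframe = ? AND timestamp BETWEEN ? AND ?
+             ORDER BY timestamp
+             """,
+             [symbol, tf, start, end],
+         )
+         while True:
+             batch = cur.fetchmany(10_000)  # fetch in chunks, not all at once
+             if not batch:
+                 break
+             yield "".join(f"{r[0]},{r[1]},{r[2]},{r[3]},{r[4]},{r[5]}\n" for r in batch)
+     return StreamingResponse(csv_chunks(), media_type="text/csv")
+ ```
+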
292
+ ---
293
+
294
+ ### 2. Cold Start
295
+
296
+ Solution:
297
+
298
+ * Preload metadata
299
+ * Warm-up query
300
+
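+ A sketch of preloading on startup with FastAPI's lifespan hook (`load_metadata_index` is a placeholder for reading the metadata table):
+
+ ```python
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI
+
+ def load_metadata_index() -> dict:
+     # Placeholder; the real version would query the file_index / metadata table
+     return {"BTCUSD": {"start": "2024-01-01", "end": "2024-12-31"}}
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Runs once before the first request: preload metadata and warm the caches
+     app.state.metadata = load_metadata_index()
+     app.state.symbols = sorted(app.state.metadata)
+     yield  # nothing to tear down in this sketch
+
+ app = FastAPI(lifespan=lifespan)
+ ```
+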
301
+ ---
302
+
303
+ ### 3. Concurrency
304
+
305
+ Solution:
306
+
307
+ * Async endpoints
308
+ * Gunicorn workers
309
+ * Avoid global pandas DataFrames
310
+
311
+ ---
312
+
313
+ ### 4. Memory Pressure
314
+
315
+ Solution:
316
+
317
+ * Avoid pandas
318
+ * Use PyArrow
319
+ * Stream results
320
+
321
+ ---
322
+
323
+ ### 5. Scaling
324
+
325
+ Solution:
326
+
327
+ * Stateless API
328
+ * External object storage
329
+ * Horizontal scaling
330
+
331
+ ---
332
+
333
+ # 🧠 Advanced Optimization
334
+
335
+ If you want serious performance:
336
+
337
+ * Use DuckDB as query engine
338
+ * It reads parquet directly
339
+ * Very fast filtering
340
+ * SQL-based
341
+
342
+ Example:
343
+
344
+ ```python
345
+ import duckdb
346
+
347
+ con = duckdb.connect()
348
+ df = con.execute("""
349
+ SELECT * FROM 'data/*.parquet'
350
+ WHERE symbol='BTCUSD'
351
+ AND timestamp BETWEEN '2024-01-01' AND '2024-02-01'
352
+ """).df()
353
+ ```
354
+
355
+ DuckDB is insanely fast for this use case.
356
+
357
+ Honestly? For your architecture — DuckDB is a better backend engine than pandas.
358
+
359
+ ---
360
+
361
+ # 🏁 Final Architecture Recommendation
362
+
363
+ If I were building your system:
364
+
365
+ * Storage → Partitioned Parquet
366
+ * Metadata → SQLite / Postgres
367
+ * Query Engine → DuckDB
368
+ * API → FastAPI (async)
369
+ * Cache → Redis
370
+ * Deployment → Docker
371
+ * Hosting → HF Space (demo) or Cloud VM (production)
372
+
373
+ ---
374
+
hf_dataset_structure_report.csv ADDED
The diff for this file is too large to render. See raw diff
 
scripts/__init__.py ADDED
File without changes
scripts/sync_data.py ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env python3
2
+ """Standalone script to list remote parquet files and verify connectivity.
3
+
4
+ Usage:
5
+     python scripts/sync_data.py
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ # Allow running as a script from project root
12
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
13
+
14
+ from app.config import settings
15
+ from app.core.logging_config import setup_logging
16
+ from app.core.database import db_manager
17
+
18
+
19
+ def main() -> None:
20
+     setup_logging()
21
+
22
+     print("Connecting to DuckDB (remote mode) …")
23
+     db_manager.connect()
24
+
25
+     print(f"Listing files from HuggingFace repo: {settings.HF_REPO_ID}")
26
+     db_manager.build_metadata_index()
27
+
28
+     print("\n=== Remote Metadata ===")
29
+     print(f" Total files : {len(db_manager.metadata)}")
30
+     print(f" Symbols : {len(db_manager.symbols)}")
31
+     if db_manager.symbols:
32
+         print(f" First 10 : {db_manager.symbols[:10]}")
33
+
34
+     db_manager.close()
35
+
36
+
37
+ if __name__ == "__main__":
38
+     main()
testdata.py ADDED
@@ -0,0 +1,52 @@
1
+ """Inspect every parquet file in the HF dataset and save a structure report CSV."""
+
+ import pandas as pd
2
+ import pyarrow.parquet as pq
3
+ import fsspec
4
+ from huggingface_hub import list_repo_files
5
+ from urllib.parse import quote
6
+
7
+ # ==================================
8
+ REPO_ID = "OMCHOKSI108/my-cloud-data-lake"
9
+ OUTPUT_FILE = "hf_dataset_structure_report.csv"
10
+ summary = []
11
+
12
+ print(f"\nConnecting to HuggingFace dataset: {REPO_ID}\n")
13
+
14
+ files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
15
+ parquet_files = [f for f in files if f.endswith(".parquet")]
16
+
17
+ print(f"Total parquet files found: {len(parquet_files)}\n")
18
+
19
+ for file_path in parquet_files:
20
+     print(f"Inspecting: {file_path}")
21
+
22
+     try:
23
+         # 🔥 ENCODE SPECIAL CHARACTERS
24
+         encoded_path = quote(file_path)
25
+
26
+         hf_url = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/{encoded_path}"
27
+
28
+         with fsspec.open(hf_url, "rb") as f:
29
+             parquet_file = pq.ParquetFile(f)
30
+
31
+             schema = parquet_file.schema
32
+             num_rows = parquet_file.metadata.num_rows
33
+
34
+             summary.append({
35
+                 "file_path": file_path,
36
+                 "folder": file_path.split("/")[0],
37
+                 "file_name": file_path.split("/")[-1],
38
+                 "num_columns": len(schema.names),
39
+                 "num_rows": num_rows,
40
+                 "columns": schema.names
41
+             })
42
+
43
+     except Exception as e:
44
+         print("Error:", e)
45
+
46
+ df = pd.DataFrame(summary)
47
+
48
+ print("\n===== DATASET STRUCTURE =====\n")
49
+ # Guard against an empty report (no parquet files found or every read failed)
+ if not df.empty:
+     print(df[["file_name", "folder", "num_columns", "num_rows"]])
50
+
51
+ df.to_csv(OUTPUT_FILE, index=False)
52
+ print(f"\nReport saved as: {OUTPUT_FILE}")