ifieryarrows committed on
Commit
b3b36f7
·
verified ·
1 Parent(s): 0b7d144

Sync from GitHub (tests passed)

Browse files
Dockerfile CHANGED
@@ -2,19 +2,38 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /code
4
 
5
- RUN apt-get update && apt-get install -y \
 
6
  gcc \
7
  libpq-dev \
 
 
 
8
  && rm -rf /var/lib/apt/lists/*
9
 
 
10
  COPY ./requirements.txt /code/requirements.txt
11
-
12
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
13
 
 
14
  COPY ./app /code/app
15
  COPY ./config /code/config
 
 
16
 
17
  # Copy pre-trained model files (from Kaggle)
18
  COPY ./data/models /data/models
19
 
20
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  WORKDIR /code
4
 
5
+ # OS deps: redis-server + supervisor + build tools
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
  gcc \
8
  libpq-dev \
9
+ redis-server \
10
+ supervisor \
11
+ curl \
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
+ # Python deps
15
  COPY ./requirements.txt /code/requirements.txt
 
16
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
17
 
18
+ # App code
19
  COPY ./app /code/app
20
  COPY ./config /code/config
21
+ COPY ./adapters /code/adapters
22
+ COPY ./worker /code/worker
23
 
24
  # Copy pre-trained model files (from Kaggle)
25
  COPY ./data/models /data/models
26
 
27
+ # Supervisor config
28
+ COPY ./supervisord.conf /etc/supervisor/conf.d/supervisord.conf
29
+
30
+ # HF Spaces default port: 7860
31
+ EXPOSE 7860
32
+
33
+ # Environment
34
+ ENV PYTHONUNBUFFERED=1 \
35
+ PYTHONPATH=/code \
36
+ REDIS_URL=redis://127.0.0.1:6379/0
37
+
38
+ # Run supervisord (manages redis + api + worker)
39
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
adapters/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Adapters: External service integrations.
3
+ """
adapters/db/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Database adapters.
3
+ """
adapters/db/lock.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Distributed lock using PostgreSQL advisory locks.
3
+
4
+ Advisory locks are:
5
+ - Session-based: automatically released when connection closes
6
+ - Non-blocking: can check without waiting
7
+ - Reliable: no stale locks after crash
8
+
9
+ This is the AUTHORITY for pipeline locking.
10
+ `pipeline_locks` table is for VISIBILITY only (best-effort).
11
+ """
12
+
13
+ import hashlib
14
+ import logging
15
+ from contextlib import contextmanager
16
+ from datetime import datetime, timezone
17
+ from typing import Optional
18
+
19
+ from sqlalchemy import text
20
+ from sqlalchemy.orm import Session
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def _lock_key_to_id(lock_key: str) -> int:
26
+ """
27
+ Convert string lock key to bigint for pg_advisory_lock.
28
+ Uses first 15 hex chars of SHA-256 to fit in signed bigint.
29
+ """
30
+ hash_hex = hashlib.sha256(lock_key.encode()).hexdigest()[:15]
31
+ return int(hash_hex, 16)
32
+
33
+
34
def try_acquire_lock(session: Session, lock_key: str) -> bool:
    """Attempt to take the advisory lock without blocking.

    Args:
        session: SQLAlchemy session (the lock is bound to its connection)
        lock_key: String identifier for the lock (e.g., "pipeline:daily")

    Returns:
        True if the lock was acquired, False if another session holds it.

    IMPORTANT: Lock is held until session.close() or explicit release.
    Keep the same session alive for the entire pipeline run.
    """
    lock_id = _lock_key_to_id(lock_key)

    acquired = bool(
        session.execute(
            text("SELECT pg_try_advisory_lock(:lock_id)"),
            {"lock_id": lock_id},
        ).scalar()
    )

    if acquired:
        logger.info(f"Advisory lock acquired: {lock_key} (id={lock_id})")
    else:
        logger.warning(f"Advisory lock NOT acquired (held by another): {lock_key}")

    return acquired
61
+
62
+
63
def release_lock(session: Session, lock_key: str) -> bool:
    """Explicitly release the advisory lock for *lock_key*.

    Usually unnecessary: the lock auto-releases when the session's
    connection closes. Use this for early release when the pipeline
    finishes before the session does.

    Returns:
        True if PostgreSQL reported the lock as released, False otherwise
        (e.g., this session never held it).
    """
    lock_id = _lock_key_to_id(lock_key)

    released = bool(
        session.execute(
            text("SELECT pg_advisory_unlock(:lock_id)"),
            {"lock_id": lock_id},
        ).scalar()
    )

    if released:
        logger.info(f"Advisory lock released: {lock_key}")
    else:
        logger.warning(f"Advisory lock release failed (not held?): {lock_key}")

    return released
83
+
84
+
85
def is_lock_held(session: Session, lock_key: str) -> bool:
    """Report whether ANY session currently holds the lock.

    Implemented as probe-then-undo: try to take the lock; if that
    succeeds the lock was free, so give it straight back.

    This is a weak check — another session may grab the lock between this
    call and any subsequent use. Use try_acquire_lock for real locking.
    """
    lock_id = _lock_key_to_id(lock_key)

    probe = session.execute(
        text("SELECT pg_try_advisory_lock(:lock_id)"),
        {"lock_id": lock_id},
    ).scalar()

    if not probe:
        # Someone else holds it.
        return True

    # We grabbed it, so it was free — undo the probe immediately.
    session.execute(
        text("SELECT pg_advisory_unlock(:lock_id)"),
        {"lock_id": lock_id},
    )
    return False
108
+
109
+
110
@contextmanager
def advisory_lock(session: Session, lock_key: str, raise_on_fail: bool = True):
    """Context manager pairing try_acquire_lock with release_lock.

    Usage:
        with advisory_lock(session, "pipeline:daily"):
            ...  # work while holding the lock
        # lock released here

    Args:
        session: SQLAlchemy session
        lock_key: Lock identifier
        raise_on_fail: If True, raise RuntimeError if lock not acquired

    Yields:
        True when the lock was acquired; False when it was not and
        raise_on_fail is False (the body then runs WITHOUT the lock).

    Raises:
        RuntimeError: If lock not acquired and raise_on_fail=True
    """
    if not try_acquire_lock(session, lock_key):
        if raise_on_fail:
            raise RuntimeError(f"Could not acquire lock: {lock_key}")
        # Caller opted out of raising: signal failure, hold nothing.
        yield False
        return

    try:
        yield True
    finally:
        # Always release, even if the body raised.
        release_lock(session, lock_key)
142
+
143
+
144
+ # Lock key constants
145
+ PIPELINE_LOCK_KEY = "pipeline:daily"
146
+
147
+
148
def write_lock_visibility(
    session: Session,
    lock_key: str,
    run_id: str,
    holder_id: Optional[str] = None
) -> None:
    """Record lock ownership in the pipeline_locks table for operators.

    BEST-EFFORT only — the advisory lock is the authority. Any failure
    here is logged at debug level and swallowed so the pipeline keeps
    running.
    """
    params = {
        "lock_key": lock_key,
        "holder_id": holder_id,
        "run_id": run_id,
        "acquired_at": datetime.now(timezone.utc),
    }
    upsert = text("""
        INSERT INTO pipeline_locks (lock_key, holder_id, run_id, acquired_at)
        VALUES (:lock_key, :holder_id, :run_id, :acquired_at)
        ON CONFLICT (lock_key) DO UPDATE SET
            holder_id = EXCLUDED.holder_id,
            run_id = EXCLUDED.run_id,
            acquired_at = EXCLUDED.acquired_at
    """)
    try:
        session.execute(upsert, params)
        session.commit()
    except Exception as e:
        logger.debug(f"Failed to write lock visibility (best-effort): {e}")
        session.rollback()
182
+
183
+
184
def clear_lock_visibility(session: Session, lock_key: str) -> None:
    """Delete the pipeline_locks row for *lock_key*.

    BEST-EFFORT only: failures are logged at debug level and swallowed.
    """
    delete_stmt = text("DELETE FROM pipeline_locks WHERE lock_key = :lock_key")
    try:
        session.execute(delete_stmt, {"lock_key": lock_key})
        session.commit()
    except Exception as e:
        logger.debug(f"Failed to clear lock visibility (best-effort): {e}")
        session.rollback()
adapters/queue/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Queue adapters for Redis/arq.
3
+ """
4
+ from adapters.queue.redis import get_redis_pool, RedisSettings
5
+ from adapters.queue.jobs import enqueue_pipeline_job
6
+
7
+ __all__ = ["get_redis_pool", "RedisSettings", "enqueue_pipeline_job"]
adapters/queue/jobs.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Job enqueue/dequeue functions for pipeline tasks.
3
+ """
4
+
5
+ import logging
6
+ from datetime import datetime, timezone
7
+ from typing import Optional
8
+ from uuid import uuid4
9
+
10
+ from arq import create_pool
11
+
12
+ from adapters.queue.redis import get_redis_settings
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
async def enqueue_pipeline_job(
    train_model: bool = False,
    trigger_source: str = "manual",
    run_id: Optional[str] = None,
) -> dict:
    """
    Enqueue a pipeline job to Redis.

    Args:
        train_model: Whether to train/retrain the XGBoost model
        trigger_source: Source of trigger (manual, cron, api)
        run_id: Optional run ID, generated if not provided

    Returns:
        dict with run_id, job_id and an "enqueued" flag. job_id is None
        (and enqueued False) when arq refuses the job as a duplicate.

    Raises:
        Exception: re-raises any Redis/enqueue failure after logging it.
    """
    if run_id is None:
        run_id = str(uuid4())

    redis = None
    try:
        redis = await create_pool(get_redis_settings())

        job = await redis.enqueue_job(
            "run_pipeline",
            run_id=run_id,
            train_model=train_model,
            trigger_source=trigger_source,
            enqueued_at=datetime.now(timezone.utc).isoformat(),
        )

        # arq returns None when a job with the same id already exists;
        # the original code would have crashed on job.job_id here.
        if job is None:
            logger.warning(f"Pipeline job not enqueued (duplicate job id?): run_id={run_id}")
            return {
                "run_id": run_id,
                "job_id": None,
                "enqueued": False,
                "trigger_source": trigger_source,
                "train_model": train_model,
            }

        logger.info(f"Pipeline job enqueued: run_id={run_id}, job_id={job.job_id}")

        return {
            "run_id": run_id,
            "job_id": job.job_id,
            "enqueued": True,
            "trigger_source": trigger_source,
            "train_model": train_model,
        }

    except Exception as e:
        logger.error(f"Failed to enqueue pipeline job: {e}")
        raise
    finally:
        # Close the pool on ALL paths — the original leaked the connection
        # whenever enqueue_job raised.
        if redis is not None:
            await redis.close()
62
+
63
+
64
async def get_job_status(job_id: str) -> Optional[dict]:
    """
    Get status of a queued job.

    Returns:
        dict with job_id and its arq JobStatus, or None if the job is
        unknown or Redis is unreachable.
    """
    # Local import: arq's Job API is only needed here, and ArqRedis has no
    # .job() method — the original `redis.job(job_id)` raised AttributeError,
    # and `job.status` is an async method, not an attribute.
    from arq.jobs import Job, JobStatus

    redis = None
    try:
        redis = await create_pool(get_redis_settings())
        job = Job(job_id, redis)
        status = await job.status()

        if status == JobStatus.not_found:
            return None

        return {
            "job_id": job_id,
            "status": status,
        }

    except Exception as e:
        logger.error(f"Failed to get job status: {e}")
        return None
    finally:
        # Release the pool on all paths (the original only closed it on success).
        if redis is not None:
            await redis.close()
adapters/queue/redis.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Redis connection and settings for arq queue.
3
+ """
4
+
5
+ import logging
6
+ from typing import Optional
7
+
8
+ from arq.connections import RedisSettings as ArqRedisSettings
9
+ from redis.asyncio import Redis
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Module-level pool cache
14
+ _redis_pool: Optional[Redis] = None
15
+
16
+
17
def get_redis_settings() -> ArqRedisSettings:
    """
    Build arq RedisSettings from the environment.

    Reads REDIS_URL (format: redis://[user:password@]host:port/db) and
    falls back to redis://localhost:6379/0 for development.
    """
    import os
    from urllib.parse import urlparse

    parsed = urlparse(os.getenv("REDIS_URL", "redis://localhost:6379/0"))

    # The db index is the URL path with the leading slash stripped.
    db_segment = parsed.path.lstrip("/")

    return ArqRedisSettings(
        host=parsed.hostname or "localhost",
        port=parsed.port or 6379,
        database=int(db_segment) if db_segment else 0,
        password=parsed.password,
    )
41
+
42
+
43
async def get_redis_pool(max_retries: int = 5, retry_delay: float = 1.0) -> Redis:
    """
    Get async Redis connection pool.
    Lazy initialization, cached at module level.

    Includes retry logic for HF Spaces where Redis might start
    slightly after API/Worker due to supervisord startup order.

    Args:
        max_retries: How many connection attempts before giving up.
        retry_delay: Seconds to sleep between attempts.

    Raises:
        Exception: the last connection error when all attempts fail.
    """
    global _redis_pool

    if _redis_pool is None:
        import os
        import asyncio

        redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0")

        for attempt in range(max_retries):
            pool = None
            try:
                pool = Redis.from_url(
                    redis_url,
                    decode_responses=True,
                    socket_connect_timeout=5.0,
                    socket_timeout=5.0,
                )
                # Test connection before caching it.
                await pool.ping()
                _redis_pool = pool
                # split('@')[-1] keeps credentials out of the log line.
                logger.info(f"Redis pool created: {redis_url.split('@')[-1]}")
                break
            except Exception as e:
                # Dispose of the half-initialized client before retrying/raising —
                # the original abandoned it without close(), leaking connections.
                if pool is not None:
                    try:
                        await pool.close()
                    except Exception:
                        pass
                if attempt < max_retries - 1:
                    logger.warning(f"Redis connection attempt {attempt + 1} failed, retrying in {retry_delay}s: {e}")
                    await asyncio.sleep(retry_delay)
                else:
                    logger.error(f"Redis connection failed after {max_retries} attempts: {e}")
                    raise

    return _redis_pool
81
+
82
+
83
async def close_redis_pool():
    """Close the cached Redis pool on shutdown (no-op if never created)."""
    global _redis_pool
    if _redis_pool is None:
        return
    await _redis_pool.close()
    _redis_pool = None
    logger.info("Redis pool closed")
90
+
91
+
92
async def redis_healthcheck() -> dict:
    """
    Check Redis connectivity with a timed PING.

    Returns:
        {"ok": True, "latency_ms": float} on success,
        {"ok": False, "error": str} on any failure.
    """
    import time

    try:
        pool = await get_redis_pool()
        started = time.monotonic()
        await pool.ping()
        elapsed_ms = (time.monotonic() - started) * 1000
    except Exception as e:
        logger.warning(f"Redis healthcheck failed: {e}")
        return {"ok": False, "error": str(e)}

    return {"ok": True, "latency_ms": round(elapsed_ms, 2)}
111
+
112
+
113
+ # Re-export for convenience
114
+ RedisSettings = ArqRedisSettings
app/main.py CHANGED
@@ -22,15 +22,11 @@ from fastapi.middleware.cors import CORSMiddleware
22
  from sqlalchemy import func
23
 
24
  from app.db import init_db, SessionLocal, get_db_type
25
- from app.models import NewsArticle, PriceBar, DailySentiment
26
  from app.settings import get_settings
27
  from app.lock import is_pipeline_locked
28
- from app.inference import (
29
- generate_analysis_report,
30
- save_analysis_snapshot,
31
- get_latest_snapshot,
32
- get_any_snapshot,
33
- )
34
  from app.schemas import (
35
  AnalysisReport,
36
  HistoryResponse,
@@ -59,21 +55,21 @@ async def lifespan(app: FastAPI):
59
  init_db()
60
  logger.info("Database initialized")
61
 
62
- # Start scheduler if enabled
63
- settings = get_settings()
64
- if settings.scheduler_enabled:
65
- from app.scheduler import start_scheduler
66
- start_scheduler()
67
- logger.info("Scheduler started")
68
 
69
  yield
70
 
71
  # Shutdown
72
  logger.info("Shutting down CopperMind API...")
73
- if settings.scheduler_enabled:
74
- from app.scheduler import stop_scheduler
75
- stop_scheduler()
76
- logger.info("Scheduler stopped")
 
 
 
77
 
78
 
79
  # =============================================================================
@@ -108,11 +104,11 @@ app.add_middleware(
108
  "/api/analysis",
109
  response_model=AnalysisReport,
110
  responses={
111
- 404: {"model": ErrorResponse, "description": "Model or data not found"},
112
- 503: {"model": ErrorResponse, "description": "Pipeline locked, snapshot unavailable"},
113
  },
114
- summary="Get current analysis report",
115
- description="Returns the latest analysis report with predictions, sentiment, and influencers."
116
  )
117
  async def get_analysis(
118
  symbol: str = Query(default="HG=F", description="Trading symbol")
@@ -120,165 +116,88 @@ async def get_analysis(
120
  """
121
  Get current analysis report.
122
 
123
- Behavior:
124
- - If fresh snapshot exists (within TTL), return it
125
- - If pipeline is not locked, generate fresh report
126
- - If pipeline is locked, return stale snapshot or 503
 
 
 
 
 
 
 
127
  """
128
- settings = get_settings()
129
 
130
  with SessionLocal() as session:
131
- # Check for fresh snapshot first
132
- cached = get_latest_snapshot(
133
- session,
134
- symbol,
135
- max_age_minutes=settings.analysis_ttl_minutes
136
- )
137
 
138
- if cached:
139
- logger.debug(f"Cached snapshot exists, but running live prediction for accuracy")
140
- import yfinance as yf
141
- import xgboost as xgb
142
-
143
- try:
144
- # Get live price from yfinance
145
- ticker = yf.Ticker(symbol)
146
- info = ticker.info
147
- live_price = info.get('regularMarketPrice') or info.get('currentPrice')
148
-
149
- if live_price is not None:
150
- cached['current_price'] = round(float(live_price), 4)
151
-
152
- # Get latest DB close price for prediction base
153
- # Model predicts based on historical closes, not intraday prices
154
- latest_bar = session.query(PriceBar).filter(
155
- PriceBar.symbol == symbol
156
- ).order_by(PriceBar.date.desc()).first()
157
- if live_price is not None:
158
- # Prioritize live price for prediction base
159
- prediction_base = float(live_price)
160
- elif latest_bar:
161
- # Fallback to DB close
162
- prediction_base = latest_bar.close
163
- else:
164
- prediction_base = 0.0
165
-
166
- # Run LIVE model prediction
167
- from app.ai_engine import load_model, load_model_metadata
168
- from app.inference import build_features_for_prediction
169
-
170
- model = load_model(symbol)
171
- metadata = load_model_metadata(symbol)
172
- features = metadata.get("features", [])
173
-
174
- if model and features:
175
- # Build features and predict
176
- X = build_features_for_prediction(session, symbol, features)
177
- if X is not None and not X.empty:
178
- dmatrix = xgb.DMatrix(X, feature_names=features)
179
- predicted_return = float(model.predict(dmatrix)[0])
180
-
181
- # Update with live prediction
182
- # Apply futures-spot adjustment (HG=F is ~1.5% higher than XCU/USD)
183
- adjustment = settings.futures_spot_adjustment
184
- adjusted_base = float(prediction_base) * adjustment
185
-
186
- cached['predicted_return'] = round(predicted_return, 6)
187
- cached['predicted_price'] = round(
188
- adjusted_base * (1 + predicted_return),
189
- 4
190
- )
191
-
192
- # Also adjust current_price for consistency
193
- cached['current_price'] = round(adjusted_base, 4)
194
-
195
- # Update confidence bounds (based on adjusted base)
196
- std_mult = 1.0 # 1 standard deviation
197
- cached['confidence_lower'] = round(adjusted_base * (1 - std_mult * abs(predicted_return)), 4)
198
- cached['confidence_upper'] = round(adjusted_base * (1 + std_mult * abs(predicted_return) * 2), 4)
199
-
200
- logger.info(f"LIVE prediction: HG=F=${prediction_base:.4f} -> XCU/USD≈${adjusted_base:.4f}, predicted=${cached['predicted_price']:.4f} ({predicted_return*100:.2f}%)")
201
-
202
- except Exception as e:
203
- logger.error(f"Live prediction failed, using cached: {e}")
204
-
205
- # Update top_influencers from current model metadata
206
- try:
207
- from app.ai_engine import load_model_metadata
208
- from app.features import get_feature_descriptions
209
-
210
- metadata = load_model_metadata(symbol)
211
- importance = metadata.get("importance", [])
212
-
213
- if importance:
214
- descriptions = get_feature_descriptions()
215
- top_influencers = []
216
-
217
- for item in importance[:10]:
218
- feat = item["feature"]
219
- desc = None
220
- for key, value in descriptions.items():
221
- if key in feat:
222
- desc = value
223
- break
224
- if desc is None:
225
- desc = feat.replace("_", " ").replace(" ", " ").title()
226
-
227
- top_influencers.append({
228
- "feature": feat,
229
- "importance": item["importance"],
230
- "description": desc,
231
- })
232
-
233
- cached['top_influencers'] = top_influencers
234
- logger.info(f"Updated cached snapshot with fresh influencers from model")
235
- except Exception as e:
236
- logger.debug(f"Could not update influencers in cached snapshot: {e}")
237
-
238
- return cached
239
 
240
- # Check if pipeline is locked
241
- if is_pipeline_locked():
242
- # Try to return stale snapshot
243
- stale = get_any_snapshot(session, symbol)
244
- if stale:
245
- logger.info(f"Pipeline locked, returning stale snapshot for {symbol}")
246
- return stale
247
-
248
- raise HTTPException(
249
- status_code=503,
250
- detail="Pipeline is currently running. No cached snapshot available. Please try again later."
251
- )
252
 
253
- # Generate fresh report
254
- try:
255
- report = generate_analysis_report(session, symbol)
256
-
257
- if report is None:
258
- raise HTTPException(
259
- status_code=404,
260
- detail=f"Could not generate analysis for {symbol}. "
261
- "Please ensure data has been fetched (make seed) and model trained (make train)."
262
- )
263
-
264
- # Save as snapshot
265
- save_analysis_snapshot(session, report, symbol)
266
-
267
- return report
268
-
269
- except Exception as e:
270
- logger.error(f"Error generating analysis: {e}")
271
-
272
- # Try stale snapshot as fallback
273
- stale = get_any_snapshot(session, symbol)
274
- if stale:
275
- logger.info(f"Error in fresh generation, returning stale snapshot")
276
- return stale
277
-
278
- raise HTTPException(
279
- status_code=500,
280
- detail=f"Error generating analysis: {str(e)}"
281
- )
 
 
 
282
 
283
 
284
  @app.get(
@@ -366,13 +285,14 @@ async def get_history(
366
  "/api/health",
367
  response_model=HealthResponse,
368
  summary="System health check",
369
- description="Returns system status including database, models, and pipeline lock state."
370
  )
371
  async def health_check():
372
  """
373
  Perform system health check.
374
 
375
  Returns status information useful for monitoring and debugging.
 
376
  """
377
  settings = get_settings()
378
  model_dir = Path(settings.model_dir)
@@ -382,17 +302,42 @@ async def health_check():
382
  if model_dir.exists():
383
  models_found = len(list(model_dir.glob("xgb_*_latest.json")))
384
 
385
- # Get counts
386
  news_count = None
387
  price_count = None
 
388
 
389
  try:
390
  with SessionLocal() as session:
391
  news_count = session.query(func.count(NewsArticle.id)).scalar()
392
  price_count = session.query(func.count(PriceBar.id)).scalar()
 
 
 
 
 
 
 
 
 
 
 
393
  except Exception as e:
394
  logger.error(f"Error getting counts: {e}")
395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  # Determine status
397
  pipeline_locked = is_pipeline_locked()
398
 
@@ -400,6 +345,8 @@ async def health_check():
400
  status = "degraded"
401
  elif pipeline_locked:
402
  status = "degraded"
 
 
403
  else:
404
  status = "healthy"
405
 
@@ -410,7 +357,9 @@ async def health_check():
410
  pipeline_locked=pipeline_locked,
411
  timestamp=datetime.now(timezone.utc).isoformat(),
412
  news_count=news_count,
413
- price_bars_count=price_count
 
 
414
  )
415
 
416
 
@@ -716,148 +665,61 @@ def verify_pipeline_secret(authorization: Optional[str] = Header(None)) -> None:
716
 
717
  @app.post(
718
  "/api/pipeline/trigger",
719
- summary="Trigger data pipeline (requires authentication)",
720
- description="Manually trigger data fetch and AI pipeline. Requires Authorization: Bearer <PIPELINE_TRIGGER_SECRET> header.",
721
  responses={
722
- 200: {"description": "Pipeline triggered successfully"},
723
  401: {"description": "Unauthorized - missing or invalid token"},
724
  409: {"description": "Pipeline already running"},
 
725
  },
726
  )
727
  async def trigger_pipeline(
728
- fetch_data: bool = Query(default=True, description="Fetch new data from sources"),
729
- train_model: bool = Query(default=True, description="Train/retrain XGBoost model"),
730
  _auth: None = Depends(verify_pipeline_secret),
731
  ):
732
  """
733
- Manually trigger the pipeline.
734
 
735
- This will:
736
- 1. Fetch new news and price data (if fetch_data=True)
737
- 2. Run sentiment scoring
738
- 3. Train XGBoost model (if train_model=True)
739
- """
740
- from threading import Thread
741
 
 
 
 
 
 
 
742
  if is_pipeline_locked():
743
  raise HTTPException(
744
  status_code=409,
745
  detail="Pipeline is already running. Please wait for it to complete."
746
  )
747
 
748
- def run_pipeline():
749
- try:
750
- from app.lock import PipelineLock
751
- from app.inference import generate_analysis_report, save_analysis_snapshot
752
- from app.db import SessionLocal
753
-
754
- lock = PipelineLock(timeout=0)
755
- if not lock.acquire():
756
- logger.error("Could not acquire pipeline lock")
757
- return
758
-
759
- try:
760
- settings = get_settings()
761
-
762
- if fetch_data:
763
- logger.info("Step 1: Fetching data...")
764
- from app.data_manager import fetch_all
765
- fetch_all(news=True, prices=True)
766
- logger.info("Data fetch complete")
767
-
768
- logger.info(f"Step 2: Running AI pipeline (train_model={train_model})...")
769
- from app.ai_engine import run_full_pipeline
770
- ai_result = run_full_pipeline(
771
- target_symbol="HG=F",
772
- score_sentiment=True,
773
- aggregate_sentiment=True,
774
- train_model=train_model
775
- )
776
- logger.info(f"AI pipeline complete: scored={ai_result.get('scored_articles', 0)}, aggregated={ai_result.get('aggregated_days', 0)}")
777
-
778
- # Log model training result specifically
779
- if train_model:
780
- model_result = ai_result.get('model_result')
781
- if model_result:
782
- logger.info(f"Model training SUCCESS: {model_result.get('model_path')}")
783
- logger.info(f"Top influencers updated: {[i['feature'] for i in model_result.get('top_influencers', [])[:3]]}")
784
- else:
785
- logger.warning("Model training returned None - check for errors above")
786
-
787
- # Step 3: Generate snapshot
788
- logger.info("Step 3: Generating analysis snapshot...")
789
- with SessionLocal() as session:
790
- # Clear old snapshots for this symbol to ensure fresh data
791
- from app.models import AnalysisSnapshot
792
- deleted = session.query(AnalysisSnapshot).filter(
793
- AnalysisSnapshot.symbol == settings.target_symbol
794
- ).delete()
795
- if deleted:
796
- session.commit()
797
- logger.info(f"Cleared {deleted} old snapshot(s) for {settings.target_symbol}")
798
-
799
- report = generate_analysis_report(session, settings.target_symbol)
800
- if report:
801
- save_analysis_snapshot(session, report, settings.target_symbol)
802
- logger.info(f"Snapshot generated")
803
-
804
- # Step 4: Generate AI Commentary
805
- logger.info("Step 4: Generating AI commentary...")
806
- try:
807
- import asyncio
808
- from app.commentary import generate_and_save_commentary
809
- from sqlalchemy import func
810
- from app.models import NewsArticle
811
- from datetime import timedelta
812
-
813
- # Get news count for last 7 days
814
- week_ago = datetime.now() - timedelta(days=7)
815
- news_count = session.query(func.count(NewsArticle.id)).filter(
816
- NewsArticle.published_at >= week_ago
817
- ).scalar() or 0
818
-
819
- # Run async function in sync context
820
- loop = asyncio.new_event_loop()
821
- asyncio.set_event_loop(loop)
822
- try:
823
- commentary = loop.run_until_complete(
824
- generate_and_save_commentary(
825
- session=session,
826
- symbol=settings.target_symbol,
827
- current_price=report.get('current_price', 0),
828
- predicted_price=report.get('predicted_price', 0),
829
- predicted_return=report.get('predicted_return', 0),
830
- sentiment_index=report.get('sentiment_index', 0),
831
- sentiment_label=report.get('sentiment_label', 'Neutral'),
832
- top_influencers=report.get('top_influencers', []),
833
- news_count=news_count,
834
- )
835
- )
836
- if commentary:
837
- logger.info("AI commentary generated and saved")
838
- else:
839
- logger.warning("AI commentary skipped (API key not configured or failed)")
840
- finally:
841
- loop.close()
842
- except Exception as ce:
843
- logger.error(f"AI commentary generation failed: {ce}")
844
- else:
845
- logger.warning("Could not generate analysis snapshot")
846
-
847
- finally:
848
- lock.release()
849
-
850
- except Exception as e:
851
- logger.error(f"Pipeline error: {e}", exc_info=True)
852
-
853
- # Run in background thread
854
- thread = Thread(target=run_pipeline, daemon=True)
855
- thread.start()
856
-
857
- return {
858
- "status": "triggered",
859
- "message": "Pipeline started in background. Check /api/health for status.",
860
- "fetch_data": fetch_data,
861
- "train_model": train_model
862
- }
863
 
 
22
  from sqlalchemy import func
23
 
24
  from app.db import init_db, SessionLocal, get_db_type
25
+ from app.models import NewsArticle, PriceBar, DailySentiment, AnalysisSnapshot
26
  from app.settings import get_settings
27
  from app.lock import is_pipeline_locked
28
+ # NOTE: Faz 1 - API is snapshot-only, no report generation
29
+ # generate_analysis_report and save_analysis_snapshot are now worker-only
 
 
 
 
30
  from app.schemas import (
31
  AnalysisReport,
32
  HistoryResponse,
 
55
  init_db()
56
  logger.info("Database initialized")
57
 
58
+ # NOTE: Scheduler is NO LONGER started here.
59
+ # Pipeline scheduling is now external (GitHub Actions cron).
60
+ # This API only reads data and enqueues jobs.
 
 
 
61
 
62
  yield
63
 
64
  # Shutdown
65
  logger.info("Shutting down CopperMind API...")
66
+ # Close Redis pool if initialized
67
+ try:
68
+ from adapters.queue.redis import close_redis_pool
69
+ import asyncio
70
+ asyncio.create_task(close_redis_pool())
71
+ except ImportError:
72
+ pass
73
 
74
 
75
  # =============================================================================
 
104
  "/api/analysis",
105
  response_model=AnalysisReport,
106
  responses={
107
+ 200: {"description": "Analysis report (may include quality_state for degraded modes)"},
108
+ 404: {"model": ErrorResponse, "description": "No snapshot available"},
109
  },
110
+ summary="Get current analysis report (snapshot-only)",
111
+ description="Returns the latest cached analysis snapshot. No live computation - all heavy work is done by the worker."
112
  )
113
async def get_analysis(
    symbol: str = Query(default="HG=F", description="Trading symbol")
):
    """
    Get current analysis report.

    SNAPSHOT-ONLY MODE (Faz 1):
    - Reads the latest snapshot from the database
    - NO yfinance calls
    - NO model loading
    - NO feature building
    - All heavy computation is done by the worker pipeline

    Response includes quality_state:
    - "ok": Fresh snapshot available
    - "stale": Snapshot older than 36 hours
    - "missing": No snapshot found
    """
    STALE_THRESHOLD_HOURS = 36

    with SessionLocal() as session:
        # Most recent snapshot for this symbol, regardless of age.
        latest = (
            session.query(AnalysisSnapshot)
            .filter(AnalysisSnapshot.symbol == symbol)
            .order_by(AnalysisSnapshot.generated_at.desc())
            .first()
        )

        if latest is None:
            # Nothing cached yet - emit a neutral, UI-compatible payload.
            logger.warning(f"No snapshot found for {symbol}")
            return {
                "symbol": symbol,
                "quality_state": "missing",
                "model_state": "offline",
                "current_price": 0.0,
                "predicted_return": 0.0,
                "predicted_price": 0.0,
                "confidence_lower": 0.0,
                "confidence_upper": 0.0,
                "sentiment_index": 0.0,
                "sentiment_label": "Neutral",
                "top_influencers": [],
                "data_quality": {
                    "news_count_7d": 0,
                    "missing_days": 0,
                    "coverage_pct": 0,
                },
                "generated_at": None,
                "message": "No analysis available. Pipeline may not have run yet.",
            }

        # Normalize to an aware UTC timestamp before computing the age.
        generated_at = latest.generated_at
        if generated_at.tzinfo is None:
            generated_at = generated_at.replace(tzinfo=timezone.utc)

        age_hours = (datetime.now(timezone.utc) - generated_at).total_seconds() / 3600

        # Anything older than the threshold is served, but flagged stale.
        quality_state = "stale" if age_hours > STALE_THRESHOLD_HOURS else "ok"

        # Start from the stored report and layer freshness metadata on top.
        report = latest.report_json.copy() if latest.report_json else {}
        report["quality_state"] = quality_state
        report["model_state"] = "ok" if quality_state == "ok" else "degraded"
        report["snapshot_age_hours"] = round(age_hours, 1)
        report["generated_at"] = generated_at.isoformat()

        # Backward compatibility: guarantee fields that older snapshots lack.
        report.setdefault("symbol", symbol)
        report.setdefault("data_quality", {
            "news_count_7d": 0,
            "missing_days": 0,
            "coverage_pct": 0,
        })
        report.setdefault("top_influencers", [])

        logger.info(f"Returning snapshot for {symbol}: age={age_hours:.1f}h, state={quality_state}")

        return report
201
 
202
 
203
  @app.get(
 
285
  "/api/health",
286
  response_model=HealthResponse,
287
  summary="System health check",
288
+ description="Returns system status including database, Redis queue, models, and pipeline lock state."
289
  )
290
  async def health_check():
291
  """
292
  Perform system health check.
293
 
294
  Returns status information useful for monitoring and debugging.
295
+ Includes Redis queue status and snapshot age for Faz 1 observability.
296
  """
297
  settings = get_settings()
298
  model_dir = Path(settings.model_dir)
 
302
  if model_dir.exists():
303
  models_found = len(list(model_dir.glob("xgb_*_latest.json")))
304
 
305
+ # Get counts and snapshot age
306
  news_count = None
307
  price_count = None
308
+ last_snapshot_age = None
309
 
310
  try:
311
  with SessionLocal() as session:
312
  news_count = session.query(func.count(NewsArticle.id)).scalar()
313
  price_count = session.query(func.count(PriceBar.id)).scalar()
314
+
315
+ # Get latest snapshot age
316
+ from app.models import AnalysisSnapshot
317
+ latest_snapshot = session.query(AnalysisSnapshot).order_by(
318
+ AnalysisSnapshot.generated_at.desc()
319
+ ).first()
320
+
321
+ if latest_snapshot and latest_snapshot.generated_at:
322
+ age = datetime.now(timezone.utc) - latest_snapshot.generated_at.replace(tzinfo=timezone.utc)
323
+ last_snapshot_age = int(age.total_seconds())
324
+
325
  except Exception as e:
326
  logger.error(f"Error getting counts: {e}")
327
 
328
+ # Check Redis connectivity
329
+ redis_ok = None
330
+ try:
331
+ from adapters.queue.redis import redis_healthcheck
332
+ redis_result = await redis_healthcheck()
333
+ redis_ok = redis_result.get("ok", False)
334
+ except ImportError:
335
+ # Redis adapter not available yet
336
+ redis_ok = None
337
+ except Exception as e:
338
+ logger.warning(f"Redis healthcheck failed: {e}")
339
+ redis_ok = False
340
+
341
  # Determine status
342
  pipeline_locked = is_pipeline_locked()
343
 
 
345
  status = "degraded"
346
  elif pipeline_locked:
347
  status = "degraded"
348
+ elif redis_ok is False:
349
+ status = "degraded"
350
  else:
351
  status = "healthy"
352
 
 
357
  pipeline_locked=pipeline_locked,
358
  timestamp=datetime.now(timezone.utc).isoformat(),
359
  news_count=news_count,
360
+ price_bars_count=price_count,
361
+ redis_ok=redis_ok,
362
+ last_snapshot_age_seconds=last_snapshot_age,
363
  )
364
 
365
 
 
665
 
666
@app.post(
    "/api/pipeline/trigger",
    summary="Enqueue pipeline job (requires authentication)",
    description="Enqueue a pipeline job to Redis queue. Worker executes the job. Requires Authorization: Bearer <PIPELINE_TRIGGER_SECRET> header.",
    responses={
        200: {"description": "Pipeline job enqueued successfully"},
        401: {"description": "Unauthorized - missing or invalid token"},
        409: {"description": "Pipeline already running"},
        503: {"description": "Redis queue unavailable"},
    },
)
async def trigger_pipeline(
    train_model: bool = Query(default=False, description="Train/retrain XGBoost model"),
    trigger_source: str = Query(default="api", description="Source of trigger (api, cron, manual)"),
    _auth: None = Depends(verify_pipeline_secret),
):
    """
    Enqueue a pipeline job to the Redis queue.

    This endpoint does NOT run the pipeline - it only enqueues a job.
    The worker service consumes and executes the job.

    Returns:
        run_id: UUID for tracking this pipeline run
        enqueued: True if job was enqueued successfully
    """
    # Advisory-only check; the worker performs the authoritative lock check.
    if is_pipeline_locked():
        raise HTTPException(
            status_code=409,
            detail="Pipeline is already running. Please wait for it to complete."
        )

    try:
        # Imported lazily so the API can start even if the queue adapter
        # is unavailable; failures surface as a 503 below.
        from adapters.queue.jobs import enqueue_pipeline_job

        outcome = await enqueue_pipeline_job(
            train_model=train_model,
            trigger_source=trigger_source,
        )

        logger.info(f"Pipeline job enqueued: run_id={outcome['run_id']}, trigger={trigger_source}")

        return {
            "status": "enqueued",
            "message": "Pipeline job enqueued. Worker will execute. Check /api/health for status.",
            "run_id": outcome["run_id"],
            "job_id": outcome["job_id"],
            "train_model": train_model,
            "trigger_source": trigger_source,
        }

    except Exception as e:
        logger.error(f"Failed to enqueue pipeline job: {e}")
        raise HTTPException(
            status_code=503,
            detail=f"Failed to enqueue job. Redis may be unavailable: {str(e)}"
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
725
 
app/models.py CHANGED
@@ -15,6 +15,7 @@ from typing import Optional
15
  from sqlalchemy import (
16
  Column,
17
  Integer,
 
18
  String,
19
  Float,
20
  DateTime,
@@ -24,7 +25,9 @@ from sqlalchemy import (
24
  Index,
25
  UniqueConstraint,
26
  JSON,
 
27
  )
 
28
  from sqlalchemy.orm import relationship
29
 
30
  from app.db import Base
@@ -302,19 +305,125 @@ class PipelineRunMetrics(Base):
302
  train_samples = Column(Integer, nullable=True)
303
  val_samples = Column(Integer, nullable=True)
304
 
305
- # Data quality
306
  news_imported = Column(Integer, nullable=True)
307
  news_duplicates = Column(Integer, nullable=True)
308
  price_bars_updated = Column(Integer, nullable=True)
309
  missing_price_days = Column(Integer, nullable=True)
310
 
 
 
 
 
 
 
311
  # Snapshot info
312
  snapshot_generated = Column(Boolean, default=False)
313
  commentary_generated = Column(Boolean, default=False)
314
 
 
 
 
 
 
 
315
  # Status
316
  status = Column(String(20), nullable=False, default="running") # running/success/failed
317
  error_message = Column(Text, nullable=True)
318
 
319
  def __repr__(self):
320
  return f"<PipelineRunMetrics(run_id={self.run_id}, status={self.status})>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  from sqlalchemy import (
16
  Column,
17
  Integer,
18
+ BigInteger,
19
  String,
20
  Float,
21
  DateTime,
 
25
  Index,
26
  UniqueConstraint,
27
  JSON,
28
+ func,
29
  )
30
+ from sqlalchemy.dialects.postgresql import UUID, JSONB
31
  from sqlalchemy.orm import relationship
32
 
33
  from app.db import Base
 
305
  train_samples = Column(Integer, nullable=True)
306
  val_samples = Column(Integer, nullable=True)
307
 
308
+ # Data quality (legacy - news_articles table)
309
  news_imported = Column(Integer, nullable=True)
310
  news_duplicates = Column(Integer, nullable=True)
311
  price_bars_updated = Column(Integer, nullable=True)
312
  missing_price_days = Column(Integer, nullable=True)
313
 
314
+ # Faz 2: Reproducible news pipeline stats
315
+ news_raw_inserted = Column(Integer, nullable=True)
316
+ news_raw_duplicates = Column(Integer, nullable=True)
317
+ news_processed_inserted = Column(Integer, nullable=True)
318
+ news_processed_duplicates = Column(Integer, nullable=True)
319
+
320
  # Snapshot info
321
  snapshot_generated = Column(Boolean, default=False)
322
  commentary_generated = Column(Boolean, default=False)
323
 
324
+ # Faz 2: News cut-off time
325
+ news_cutoff_time = Column(DateTime(timezone=True), nullable=True)
326
+
327
+ # Quality state for degraded runs
328
+ quality_state = Column(String(20), nullable=True, default="ok") # ok/stale/degraded/failed
329
+
330
  # Status
331
  status = Column(String(20), nullable=False, default="running") # running/success/failed
332
  error_message = Column(Text, nullable=True)
333
 
334
  def __repr__(self):
335
  return f"<PipelineRunMetrics(run_id={self.run_id}, status={self.status})>"
336
+
337
+
338
+ # =============================================================================
339
+ # Faz 2: Reproducible News Pipeline
340
+ # =============================================================================
341
+
342
class NewsRaw(Base):
    """
    Raw news article - stored as-is from RSS/API.

    Faz 2: the "golden source" for reproducibility.

    Dedup strategy:
    - url_hash: nullable + partial unique index (WHERE url_hash IS NOT NULL)
    - When the URL is missing, a title-based fallback dedup is applied at
      the processed level (NewsProcessed.dedup_key)
    """
    __tablename__ = "news_raw"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    # URL (nullable - may be missing in RSS items)
    url = Column(String(2000), nullable=True)
    url_hash = Column(String(64), nullable=True, index=True)  # sha256, partial unique

    # Content
    title = Column(String(500), nullable=False)
    description = Column(Text, nullable=True)

    # Metadata
    source = Column(String(200), nullable=True)  # "google_news", "newsapi"
    source_feed = Column(String(500), nullable=True)  # Exact RSS URL or query
    published_at = Column(DateTime(timezone=True), nullable=False, index=True)
    fetched_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)

    # Pipeline run tracking (UUID)
    run_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    # Raw payload (debug/audit)
    raw_payload = Column(JSONB, nullable=True)

    # Relationship
    processed_items = relationship("NewsProcessed", back_populates="raw")

    def __repr__(self):
        return f"<NewsRaw(id={self.id}, title='{self.title[:30]}...')>"
380
+ return f"<NewsRaw(id={self.id}, title='{self.title[:30]}...')>"
381
+
382
+
383
class NewsProcessed(Base):
    """
    Processed news article - after dedup, cleaning, and language filter.

    Faz 2: input for sentiment scoring.

    Dedup strategy:
    - dedup_key: NOT NULL + UNIQUE - the deduplication authority
    - Priority: use url_hash when available, otherwise
      sha256(source + canonical_title_hash)
    """
    __tablename__ = "news_processed"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    # FK to raw (RESTRICT - processed rows must not be dropped when raw is deleted)
    raw_id = Column(
        BigInteger,
        ForeignKey("news_raw.id", ondelete="RESTRICT"),
        nullable=False,
        index=True
    )

    # Canonical content
    canonical_title = Column(String(500), nullable=False)
    canonical_title_hash = Column(String(64), nullable=False, index=True)  # sha256
    cleaned_text = Column(Text, nullable=True)  # title + description, cleaned

    # Dedup key - THE AUTHORITY
    dedup_key = Column(String(64), unique=True, nullable=False, index=True)  # sha256

    # Language
    language = Column(String(10), nullable=True, default="en")
    language_confidence = Column(Float, nullable=True)

    # Processing metadata
    processed_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    run_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    # Future: Tone/Impact scores (Faz 3)
    # tone_score = Column(Float, nullable=True)
    # impact_direction = Column(String(20), nullable=True)  # bullish/bearish/neutral

    # Relationship
    raw = relationship("NewsRaw", back_populates="processed_items")

    def __repr__(self):
        return f"<NewsProcessed(id={self.id}, dedup_key='{self.dedup_key[:16]}...')>"
app/schemas.py CHANGED
@@ -24,18 +24,37 @@ class DataQuality(BaseModel):
24
 
25
 
26
  class AnalysisReport(BaseModel):
27
- """Full analysis report returned by /api/analysis."""
 
 
 
 
 
28
  symbol: str = Field(..., description="Trading symbol (e.g., HG=F)")
29
- current_price: float = Field(..., description="Most recent closing price")
30
- predicted_return: float = Field(..., description="Predicted next-day return")
31
- predicted_price: float = Field(..., description="Predicted next-day price")
32
- confidence_lower: float = Field(..., description="Lower bound of confidence interval")
33
- confidence_upper: float = Field(..., description="Upper bound of confidence interval")
34
- sentiment_index: float = Field(..., description="Current sentiment index (-1 to 1)")
35
- sentiment_label: str = Field(..., description="Sentiment label: Bullish, Bearish, or Neutral")
36
- top_influencers: list[Influencer] = Field(..., description="Top feature influencers")
37
- data_quality: DataQuality = Field(..., description="Data quality metrics")
38
- generated_at: str = Field(..., description="ISO timestamp of report generation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  class Config:
41
  json_schema_extra = {
@@ -57,7 +76,10 @@ class AnalysisReport(BaseModel):
57
  "missing_days": 0,
58
  "coverage_pct": 100
59
  },
60
- "generated_at": "2026-01-02T09:00:00Z"
 
 
 
61
  }
62
  }
63
 
@@ -96,17 +118,23 @@ class HealthResponse(BaseModel):
96
  timestamp: str = Field(..., description="Current server timestamp")
97
  news_count: Optional[int] = Field(None, description="Total news articles in database")
98
  price_bars_count: Optional[int] = Field(None, description="Total price bars in database")
 
 
 
 
99
 
100
  class Config:
101
  json_schema_extra = {
102
  "example": {
103
  "status": "healthy",
104
- "db_type": "sqlite",
105
  "models_found": 1,
106
  "pipeline_locked": False,
107
  "timestamp": "2026-01-02T10:00:00Z",
108
  "news_count": 1250,
109
- "price_bars_count": 1460
 
 
110
  }
111
  }
112
 
 
24
 
25
 
26
  class AnalysisReport(BaseModel):
27
+ """
28
+ Full analysis report returned by /api/analysis.
29
+
30
+ Faz 1: Snapshot-only mode - fields may be null in degraded states.
31
+ Check quality_state to determine data freshness.
32
+ """
33
  symbol: str = Field(..., description="Trading symbol (e.g., HG=F)")
34
+
35
+ # Core prediction data (nullable for degraded modes)
36
+ current_price: Optional[float] = Field(0.0, description="Most recent closing price")
37
+ predicted_return: Optional[float] = Field(0.0, description="Predicted next-day return")
38
+ predicted_price: Optional[float] = Field(0.0, description="Predicted next-day price")
39
+ confidence_lower: Optional[float] = Field(0.0, description="Lower bound of confidence interval")
40
+ confidence_upper: Optional[float] = Field(0.0, description="Upper bound of confidence interval")
41
+ sentiment_index: Optional[float] = Field(0.0, description="Current sentiment index (-1 to 1)")
42
+ sentiment_label: Optional[str] = Field("Neutral", description="Sentiment label: Bullish, Bearish, or Neutral")
43
+
44
+ # Feature influencers (may be empty)
45
+ top_influencers: list[Influencer] = Field(default_factory=list, description="Top feature influencers")
46
+
47
+ # Data quality (always present)
48
+ data_quality: Optional[DataQuality] = Field(None, description="Data quality metrics")
49
+
50
+ # Timestamps
51
+ generated_at: Optional[str] = Field(None, description="ISO timestamp of report generation")
52
+
53
+ # Faz 1: Quality state fields
54
+ quality_state: Optional[str] = Field("ok", description="Snapshot quality: ok, stale, missing")
55
+ model_state: Optional[str] = Field("ok", description="Model status: ok, degraded, offline")
56
+ snapshot_age_hours: Optional[float] = Field(None, description="Hours since snapshot was generated")
57
+ message: Optional[str] = Field(None, description="Human-readable status message")
58
 
59
  class Config:
60
  json_schema_extra = {
 
76
  "missing_days": 0,
77
  "coverage_pct": 100
78
  },
79
+ "generated_at": "2026-01-02T09:00:00Z",
80
+ "quality_state": "ok",
81
+ "model_state": "ok",
82
+ "snapshot_age_hours": 2.5
83
  }
84
  }
85
 
 
118
  timestamp: str = Field(..., description="Current server timestamp")
119
  news_count: Optional[int] = Field(None, description="Total news articles in database")
120
  price_bars_count: Optional[int] = Field(None, description="Total price bars in database")
121
+
122
+ # Faz 1: Queue and snapshot observability
123
+ redis_ok: Optional[bool] = Field(None, description="Redis queue connectivity")
124
+ last_snapshot_age_seconds: Optional[int] = Field(None, description="Age of last analysis snapshot in seconds")
125
 
126
  class Config:
127
  json_schema_extra = {
128
  "example": {
129
  "status": "healthy",
130
+ "db_type": "postgresql",
131
  "models_found": 1,
132
  "pipeline_locked": False,
133
  "timestamp": "2026-01-02T10:00:00Z",
134
  "news_count": 1250,
135
+ "price_bars_count": 1460,
136
+ "redis_ok": True,
137
+ "last_snapshot_age_seconds": 3600
138
  }
139
  }
140
 
app/settings.py CHANGED
@@ -57,10 +57,14 @@ class Settings(BaseSettings):
57
  # Futures vs Spot adjustment factor
58
  futures_spot_adjustment: float = 0.985
59
 
60
- # Scheduler
 
61
  schedule_time: str = "02:00"
62
  tz: str = "Europe/Istanbul"
63
- scheduler_enabled: bool = True
 
 
 
64
 
65
  # OpenRouter AI Commentary
66
  openrouter_api_key: Optional[str] = None
@@ -75,6 +79,12 @@ class Settings(BaseSettings):
75
  # Pipeline trigger authentication
76
  pipeline_trigger_secret: Optional[str] = None
77
 
 
 
 
 
 
 
78
  def _load_symbol_set_file(self, set_name: str) -> Optional[dict]:
79
  """Load symbol set from JSON file. Returns None on error."""
80
  try:
 
57
  # Futures vs Spot adjustment factor
58
  futures_spot_adjustment: float = 0.985
59
 
60
+ # Scheduler (DEPRECATED in API - external scheduler only)
61
+ # These are kept for backward compatibility but scheduler no longer runs in API
62
  schedule_time: str = "02:00"
63
  tz: str = "Europe/Istanbul"
64
+ scheduler_enabled: bool = False # Default to False - scheduler is external now
65
+
66
+ # Redis Queue (for worker)
67
+ redis_url: str = "redis://localhost:6379/0"
68
 
69
  # OpenRouter AI Commentary
70
  openrouter_api_key: Optional[str] = None
 
79
  # Pipeline trigger authentication
80
  pipeline_trigger_secret: Optional[str] = None
81
 
82
+ # Faz 2: Market cut-off for news aggregation
83
+ # Defines when "today's news" ends for sentiment calculation
84
+ market_timezone: str = "America/New_York" # NYSE timezone
85
+ market_close_time: str = "16:00" # 4 PM ET
86
+ cutoff_buffer_minutes: int = 30 # Allow 30 min after close for late news
87
+
88
  def _load_symbol_set_file(self, set_name: str) -> Optional[dict]:
89
  """Load symbol set from JSON file. Returns None on error."""
90
  try:
migrations/001_add_news_raw_processed.sql ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Migration: 001_add_news_raw_processed.sql
-- Faz 2: Raw/processed news tables + reproducible news pipeline
--
-- Run on: Supabase PostgreSQL
-- Date: 2026-01-28
--
-- IMPORTANT: Run this migration BEFORE deploying Faz 2 pipeline code.

-- =============================================================================
-- 1. news_raw - Raw news data (golden source)
-- =============================================================================

CREATE TABLE IF NOT EXISTS news_raw (
    id BIGSERIAL PRIMARY KEY,

    -- URL (nullable - may be missing in RSS items)
    url VARCHAR(2000),
    url_hash VARCHAR(64),  -- sha256, nullable for partial unique

    -- Content
    title VARCHAR(500) NOT NULL,
    description TEXT,

    -- Metadata
    source VARCHAR(200),       -- "google_news", "newsapi", etc.
    source_feed VARCHAR(500),  -- Exact RSS URL or API query
    published_at TIMESTAMPTZ NOT NULL,
    fetched_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Pipeline run tracking
    run_id UUID,

    -- Raw payload for debugging
    raw_payload JSONB
);

-- Basic indexes
CREATE INDEX IF NOT EXISTS ix_news_raw_published ON news_raw(published_at);
CREATE INDEX IF NOT EXISTS ix_news_raw_run ON news_raw(run_id);
CREATE INDEX IF NOT EXISTS ix_news_raw_url_hash ON news_raw(url_hash);

-- PARTIAL UNIQUE INDEX: url_hash must be unique IF it exists
-- This allows NULL url_hash (for articles without URL) while preventing duplicates
CREATE UNIQUE INDEX IF NOT EXISTS ux_news_raw_url_hash
ON news_raw(url_hash)
WHERE url_hash IS NOT NULL;


-- =============================================================================
-- 2. news_processed - Processed news (dedup authority)
-- =============================================================================

CREATE TABLE IF NOT EXISTS news_processed (
    id BIGSERIAL PRIMARY KEY,

    -- FK to raw (RESTRICT - don't allow deleting raw if processed exists)
    raw_id BIGINT NOT NULL REFERENCES news_raw(id) ON DELETE RESTRICT,

    -- Canonical content
    canonical_title VARCHAR(500) NOT NULL,
    canonical_title_hash VARCHAR(64) NOT NULL,  -- sha256
    cleaned_text TEXT,  -- title + description, cleaned

    -- Dedup key - THE AUTHORITY
    -- Priority: url_hash if available, else sha256(source + canonical_title_hash)
    dedup_key VARCHAR(64) NOT NULL UNIQUE,

    -- Language
    language VARCHAR(10) DEFAULT 'en',
    language_confidence FLOAT,

    -- Processing metadata
    processed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    run_id UUID
);

-- Indexes
CREATE INDEX IF NOT EXISTS ix_news_processed_raw_id ON news_processed(raw_id);
CREATE INDEX IF NOT EXISTS ix_news_processed_run ON news_processed(run_id);
CREATE INDEX IF NOT EXISTS ix_news_processed_title_hash ON news_processed(canonical_title_hash);


-- =============================================================================
-- 3. Add Faz 2 columns to pipeline_run_metrics
-- =============================================================================

-- Cut-off time
ALTER TABLE pipeline_run_metrics
ADD COLUMN IF NOT EXISTS news_cutoff_time TIMESTAMPTZ;

-- Raw stats
ALTER TABLE pipeline_run_metrics
ADD COLUMN IF NOT EXISTS news_raw_inserted INTEGER;

ALTER TABLE pipeline_run_metrics
ADD COLUMN IF NOT EXISTS news_raw_duplicates INTEGER;

-- Processed stats
ALTER TABLE pipeline_run_metrics
ADD COLUMN IF NOT EXISTS news_processed_inserted INTEGER;

ALTER TABLE pipeline_run_metrics
ADD COLUMN IF NOT EXISTS news_processed_duplicates INTEGER;

-- Quality state for degraded runs
ALTER TABLE pipeline_run_metrics
ADD COLUMN IF NOT EXISTS quality_state VARCHAR(20) DEFAULT 'ok';


-- =============================================================================
-- Verification queries (run after migration to verify)
-- =============================================================================

-- Check tables exist:
-- SELECT table_name FROM information_schema.tables WHERE table_name IN ('news_raw', 'news_processed');

-- Check partial unique index:
-- SELECT indexname FROM pg_indexes WHERE tablename = 'news_raw' AND indexname = 'ux_news_raw_url_hash';

-- Check FK constraint:
-- SELECT conname FROM pg_constraint WHERE conrelid = 'news_processed'::regclass AND confrelid = 'news_raw'::regclass;
pipelines/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pipelines package for Faz 2 reproducible data processing.
3
+
4
+ Modules:
5
+ - ingestion/news: RSS/API -> news_raw
6
+ - processing/news: news_raw -> news_processed
7
+ - cutoff: Market cut-off calculation
8
+ """
pipelines/cutoff.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Market cut-off calculation for news aggregation.
3
+
4
+ Faz 2: Defines which news articles belong to "today's" sentiment.
5
+ Uses market close time with buffer to determine cut-off.
6
+ """
7
+
8
+ import logging
9
+ from datetime import datetime, time, timedelta, timezone
10
+ from typing import Optional
11
+
12
+ try:
13
+ from zoneinfo import ZoneInfo
14
+ except ImportError:
15
+ from backports.zoneinfo import ZoneInfo # Python < 3.9
16
+
17
+ from app.settings import get_settings
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def compute_news_cutoff(
    run_datetime: Optional[datetime] = None,
    market_tz: Optional[str] = None,
    market_close: Optional[str] = None,
    buffer_minutes: Optional[int] = None,
) -> datetime:
    """
    Compute news cut-off datetime for a pipeline run.

    Logic:
    1. Convert run_datetime to market timezone
    2. Calculate today's close + buffer
    3. If run is before today's close+buffer, use yesterday's close+buffer
    4. If the cut-off falls on a weekend, roll back to Friday

    Args:
        run_datetime: When pipeline started (UTC). Defaults to now.
        market_tz: Market timezone (e.g., "America/New_York"). Defaults to settings.
        market_close: Market close time "HH:MM". Defaults to settings.
        buffer_minutes: Minutes after close to allow. Defaults to settings.

    Returns:
        Cut-off datetime in UTC

    Example:
        Pipeline runs at 2026-01-28 10:00 UTC (05:00 ET)
        -> Before 16:30 ET -> use 2026-01-27 16:30 ET -> 2026-01-27 21:30 UTC

        Pipeline runs at 2026-01-28 22:00 UTC (17:00 ET)
        -> After 16:30 ET -> use 2026-01-28 16:30 ET -> 2026-01-28 21:30 UTC
    """
    # Resolve defaults lazily: only touch app configuration for the
    # parameters the caller did not supply. Fully-parameterized calls
    # (e.g. unit tests) stay decoupled from get_settings().
    if run_datetime is None:
        run_datetime = datetime.now(timezone.utc)
    if market_tz is None or market_close is None or buffer_minutes is None:
        settings = get_settings()
        if market_tz is None:
            market_tz = settings.market_timezone
        if market_close is None:
            market_close = settings.market_close_time
        if buffer_minutes is None:
            buffer_minutes = settings.cutoff_buffer_minutes

    # Parse "HH:MM" close time; a malformed string raises ValueError.
    close_hour, close_minute = map(int, market_close.split(":"))
    buffer = timedelta(minutes=buffer_minutes)

    tz = ZoneInfo(market_tz)

    # Naive run datetimes are interpreted as UTC.
    if run_datetime.tzinfo is None:
        run_datetime = run_datetime.replace(tzinfo=timezone.utc)
    run_local = run_datetime.astimezone(tz)

    # Today's close + buffer, expressed in the market timezone.
    today_close = run_local.replace(
        hour=close_hour,
        minute=close_minute,
        second=0,
        microsecond=0,
    ) + buffer

    # Pick today's close if we are already at/past it, else yesterday's.
    if run_local >= today_close:
        cutoff_local = today_close
    else:
        cutoff_local = today_close - timedelta(days=1)

    # Weekend guard: roll back to Friday if cutoff falls on Sat/Sun.
    cutoff_local = _adjust_for_weekend(cutoff_local)

    # Convert back to UTC for storage/comparison.
    cutoff_utc = cutoff_local.astimezone(timezone.utc)

    logging.getLogger(__name__).debug(
        f"Cut-off computed: run={run_datetime.isoformat()}, "
        f"cutoff={cutoff_utc.isoformat()} (local: {cutoff_local.isoformat()})"
    )

    return cutoff_utc


def _adjust_for_weekend(dt: datetime) -> datetime:
    """
    Adjust datetime to Friday if it falls on a weekend.

    Args:
        dt: Datetime to adjust

    Returns:
        Adjusted datetime (Friday if input was Sat/Sun, unchanged otherwise)
    """
    weekday = dt.weekday()  # 0=Mon ... 5=Sat, 6=Sun

    if weekday == 5:  # Saturday -> roll back one day
        return dt - timedelta(days=1)
    if weekday == 6:  # Sunday -> roll back two days
        return dt - timedelta(days=2)

    return dt
+
127
+
128
def get_news_window(
    cutoff_dt: datetime,
    lookback_days: int = 7,
) -> tuple[datetime, datetime]:
    """
    Build the time window for news aggregation.

    Args:
        cutoff_dt: Cut-off datetime - the latest news to include
        lookback_days: Size of the window, in days before the cut-off

    Returns:
        Tuple of (start_dt, end_dt) for the news query
    """
    window = timedelta(days=lookback_days)
    return (cutoff_dt - window, cutoff_dt)
146
+
147
+
148
def is_market_open(
    dt: Optional[datetime] = None,
    market_tz: Optional[str] = None,
) -> bool:
    """
    Check if market is currently open (approximate).

    Note: Does not account for holidays, just weekdays 9:30-16:00 in the
    market's local timezone.

    Args:
        dt: Datetime to check. Defaults to now.
        market_tz: Market timezone. Defaults to settings.

    Returns:
        True if market is likely open
    """
    # Resolve defaults lazily: fully-parameterized calls do not touch
    # app configuration (get_settings), keeping this function testable.
    if dt is None:
        dt = datetime.now(timezone.utc)
    if market_tz is None:
        market_tz = get_settings().market_timezone

    tz = ZoneInfo(market_tz)

    # Naive datetimes are interpreted as UTC.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    local = dt.astimezone(tz)

    # Closed on weekends (weekday 5=Sat, 6=Sun).
    if local.weekday() >= 5:
        return False

    # Approximate regular session: 9:30 - 16:00 local market time.
    market_open = time(9, 30)
    market_close = time(16, 0)

    current_time = local.time()
    return market_open <= current_time <= market_close
pipelines/ingestion/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Ingestion subpackage - data source fetching."""
pipelines/ingestion/news.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ News ingestion to news_raw table.
3
+
4
+ Phase 2: Reproducible news pipeline - first stage.
5
+ Fetches from RSS/API and stores raw data for audit trail.
6
+ """
7
+
8
+ import hashlib
9
+ import logging
10
+ import uuid
11
+ from datetime import datetime, timezone
12
+ from typing import Optional
13
+
14
+ from sqlalchemy import text
15
+ from sqlalchemy.dialects.postgresql import insert as pg_insert
16
+ from sqlalchemy.orm import Session
17
+
18
+ from app.models import NewsRaw
19
+ from app.settings import get_settings
20
+ from app.utils import normalize_url, clean_text
21
+ from app.rss_ingest import fetch_google_news
22
+ from app.db import get_db_type
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
def compute_url_hash(url: Optional[str]) -> Optional[str]:
    """
    Compute deterministic hash of normalized URL.

    Args:
        url: Raw URL string (may be None or empty)

    Returns:
        sha256 hex64 of normalized URL, or None if URL is empty/invalid
    """
    if url is None or not url.strip():
        return None

    canonical = normalize_url(url)
    return hashlib.sha256(canonical.encode()).hexdigest() if canonical else None
45
+
46
+
47
def insert_raw_article(
    session: Session,
    url: Optional[str],
    title: str,
    description: Optional[str],
    source: str,
    source_feed: str,
    published_at: datetime,
    run_id: uuid.UUID,
    raw_payload: Optional[dict] = None,
) -> Optional[int]:
    """
    Insert single article to news_raw.

    Uses ON CONFLICT DO NOTHING for url_hash to handle duplicates gracefully.

    Args:
        session: Database session
        url: Article URL (can be None)
        title: Article title
        description: Article description
        source: Source name (e.g., "google_news", "newsapi")
        source_feed: Exact feed URL or query string
        published_at: Publication timestamp (UTC)
        run_id: Pipeline run UUID
        raw_payload: Original response fragment for debugging

    Returns:
        raw_id if inserted, None if duplicate or error
    """
    if not title or not title.strip():
        return None

    title = clean_text(title)[:500]  # Truncate to column limit
    url_hash = compute_url_hash(url)

    try:
        db_type = get_db_type()

        if db_type == "postgresql":
            # Use INSERT ... ON CONFLICT for PostgreSQL
            stmt = pg_insert(NewsRaw).values(
                url=url,
                url_hash=url_hash,
                title=title,
                description=description[:2000] if description else None,
                source=source,
                source_feed=source_feed[:500] if source_feed else None,
                published_at=published_at,
                run_id=run_id,
                raw_payload=raw_payload,
            )

            # Only conflict on url_hash if it's not None
            if url_hash:
                stmt = stmt.on_conflict_do_nothing(index_elements=["url_hash"])

            # RETURNING yields the new row's id atomically: a do-nothing
            # conflict returns no row, so scalar() is None for duplicates.
            # (The previous re-SELECT by url_hash never matched when
            # url_hash was NULL and was racy under concurrent inserts.)
            return session.execute(stmt.returning(NewsRaw.id)).scalar()

        else:
            # SQLite fallback - simple insert with error handling
            article = NewsRaw(
                url=url,
                url_hash=url_hash,
                title=title,
                description=description[:2000] if description else None,
                source=source,
                source_feed=source_feed[:500] if source_feed else None,
                published_at=published_at,
                run_id=run_id,
                raw_payload=raw_payload,
            )
            session.add(article)
            session.flush()
            return article.id

    except Exception as e:
        # Best-effort insert: duplicates/constraint violations are expected,
        # so failures are logged at debug level and reported as None.
        logger.debug(f"Insert raw article failed: {e}")
        session.rollback()
        return None
138
+
139
+
140
def ingest_news_to_raw(
    session: Session,
    run_id: uuid.UUID,
    sources: Optional[list[str]] = None,
) -> dict:
    """
    Ingest news from all sources into news_raw.

    Currently supports:
    - google_news: RSS feed from Google News
    - newsapi: NewsAPI.org (if API key configured)

    Args:
        session: Database session
        run_id: Pipeline run UUID
        sources: List of source types to fetch (default: all configured).
            The caller's list is never mutated.

    Returns:
        dict with stats:
        - fetched: Total items fetched from sources
        - inserted: New items inserted to news_raw
        - duplicates: Skipped due to url_hash conflict
        - errors: Items that failed to insert
    """
    settings = get_settings()

    # Copy the caller's list before extending it: the old
    # `sources.append(...)` mutated the argument in place.
    sources = list(sources) if sources else ["google_news"]

    # Add newsapi if key is configured
    if settings.newsapi_key and "newsapi" not in sources:
        sources.append("newsapi")

    stats = {
        "fetched": 0,
        "inserted": 0,
        "duplicates": 0,
        "errors": 0,
        "sources": sources,
    }

    # Strategic queries for copper market
    QUERIES = [
        "copper supply deficit",
        "copper price forecast",
        "copper mining production",
        "copper demand China",
        "copper EV battery",
        "Freeport-McMoRan copper",
        "BHP copper",
        "Rio Tinto copper",
    ]

    logger.info(f"[run_id={run_id}] Ingesting news from {sources} with {len(QUERIES)} queries")

    for source in sources:
        if source == "google_news":
            for query in QUERIES:
                try:
                    articles = fetch_google_news(
                        query=query,
                        language=settings.news_language,
                    )

                    stats["fetched"] += len(articles)

                    for article in articles:
                        raw_id = insert_raw_article(
                            session=session,
                            url=article.get("url"),
                            title=article.get("title", ""),
                            description=article.get("description"),
                            source="google_news",
                            source_feed=f"google_news:{query}",
                            published_at=article.get("published_at", datetime.now(timezone.utc)),
                            run_id=run_id,
                            raw_payload={"query": query, "source": article.get("source")},
                        )

                        # None means url_hash conflict (or insert error)
                        if raw_id:
                            stats["inserted"] += 1
                        else:
                            stats["duplicates"] += 1

                except Exception as e:
                    # One failing query must not abort the whole ingestion.
                    logger.warning(f"Error fetching {source} for '{query}': {e}")
                    stats["errors"] += 1

        elif source == "newsapi" and settings.newsapi_key:
            # NewsAPI implementation - reuse existing fetch
            from app.data_manager import fetch_newsapi_articles

            for query in QUERIES[:3]:  # Limit API calls
                try:
                    articles = fetch_newsapi_articles(
                        api_key=settings.newsapi_key,
                        query=query,
                        language=settings.news_language,
                        lookback_days=settings.lookback_days,
                    )

                    stats["fetched"] += len(articles)

                    for article in articles:
                        raw_id = insert_raw_article(
                            session=session,
                            url=article.get("url"),
                            title=article.get("title", ""),
                            description=article.get("description"),
                            source="newsapi",
                            source_feed=f"newsapi:{query}",
                            published_at=article.get("published_at", datetime.now(timezone.utc)),
                            run_id=run_id,
                            raw_payload={"query": query, "author": article.get("author")},
                        )

                        if raw_id:
                            stats["inserted"] += 1
                        else:
                            stats["duplicates"] += 1

                except Exception as e:
                    logger.warning(f"Error fetching newsapi for '{query}': {e}")
                    stats["errors"] += 1

    session.commit()

    logger.info(
        f"[run_id={run_id}] News ingestion complete: "
        f"{stats['fetched']} fetched, {stats['inserted']} inserted, "
        f"{stats['duplicates']} duplicates"
    )

    return stats
pipelines/processing/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Processing subpackage - data transformation and dedup."""
pipelines/processing/news.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Process news_raw -> news_processed with deterministic dedup.
3
+
4
+ Phase 2: Reproducible news pipeline - second stage.
5
+ Applies canonicalization, language detection, and deterministic dedup.
6
+ """
7
+
8
+ import hashlib
9
+ import logging
10
+ import uuid
11
+ from datetime import datetime, timezone
12
+ from typing import Optional
13
+
14
+ from sqlalchemy import text
15
+ from sqlalchemy.dialects.postgresql import insert as pg_insert
16
+ from sqlalchemy.orm import Session
17
+
18
+ from app.models import NewsRaw, NewsProcessed
19
+ from app.utils import canonical_title, clean_text
20
+ from app.db import get_db_type
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def compute_canonical_title_hash(title: str) -> str:
    """
    Compute hash of canonical title.

    Args:
        title: Raw title string

    Returns:
        sha256 hex64 of canonical_title(title)
    """
    return hashlib.sha256(canonical_title(title).encode()).hexdigest()
37
+
38
+
39
def compute_dedup_key(
    url_hash: Optional[str],
    source: str,
    canonical_title_hash: str,
) -> str:
    """
    Compute deterministic dedup key.

    Priority:
    1. url_hash if not None (URL is the best identifier)
    2. sha256(source + "|" + canonical_title_hash) as fallback

    Args:
        url_hash: Hash of normalized URL (may be None)
        source: Source name (e.g., "google_news")
        canonical_title_hash: Hash of canonical title

    Returns:
        sha256 hex64 dedup key
    """
    if url_hash:
        return url_hash

    # No usable URL: fall back to a source-scoped title fingerprint.
    return hashlib.sha256(f"{source}|{canonical_title_hash}".encode()).hexdigest()
65
+
66
+
67
def process_single_raw(
    session: Session,
    raw: NewsRaw,
    run_id: uuid.UUID,
) -> Optional[int]:
    """
    Process a single NewsRaw into NewsProcessed.

    Args:
        session: Database session
        raw: NewsRaw object to process
        run_id: Pipeline run UUID

    Returns:
        processed_id if inserted, None if duplicate
    """
    # Canonicalize
    canon = canonical_title(raw.title)
    canon_hash = compute_canonical_title_hash(raw.title)

    # Clean text (title + description)
    cleaned = clean_text(raw.title)
    if raw.description:
        cleaned += " " + clean_text(raw.description)
    cleaned = cleaned[:5000]  # Reasonable limit

    # Compute dedup key
    dedup = compute_dedup_key(
        url_hash=raw.url_hash,
        source=raw.source or "unknown",
        canonical_title_hash=canon_hash,
    )

    # Detect language (optional, use simple heuristic for now)
    # Full langdetect is slow; a later phase can improve this
    language = "en"  # Assume English for now
    language_confidence = None

    try:
        db_type = get_db_type()

        if db_type == "postgresql":
            # RETURNING the processed row id makes the return value match
            # the docstring and the SQLite path; the old code returned
            # raw.id instead. A do-nothing conflict yields no row -> None.
            stmt = (
                pg_insert(NewsProcessed)
                .values(
                    raw_id=raw.id,
                    canonical_title=canon[:500],
                    canonical_title_hash=canon_hash,
                    cleaned_text=cleaned,
                    dedup_key=dedup,
                    language=language,
                    language_confidence=language_confidence,
                    run_id=run_id,
                )
                .on_conflict_do_nothing(index_elements=["dedup_key"])
                .returning(NewsProcessed.id)
            )

            return session.execute(stmt).scalar()

        else:
            # SQLite fallback
            processed = NewsProcessed(
                raw_id=raw.id,
                canonical_title=canon[:500],
                canonical_title_hash=canon_hash,
                cleaned_text=cleaned,
                dedup_key=dedup,
                language=language,
                language_confidence=language_confidence,
                run_id=run_id,
            )
            session.add(processed)
            session.flush()
            return processed.id

    except Exception as e:
        # Duplicates / constraint violations are expected; report as None.
        logger.debug(f"Process raw article failed: {e}")
        session.rollback()
        return None
146
+
147
+
148
def process_raw_to_processed(
    session: Session,
    run_id: uuid.UUID,
    batch_size: int = 100,
) -> dict:
    """
    Process unprocessed raw articles.

    Finds news_raw records that don't have corresponding news_processed,
    canonicalizes them, and inserts to news_processed with dedup.

    Args:
        session: Database session
        run_id: Pipeline run UUID
        batch_size: Number of records to process per batch

    Returns:
        dict with stats:
        - processed: Total items attempted
        - inserted: New items in news_processed
        - duplicates: Skipped due to dedup_key conflict
    """
    stats = {
        "processed": 0,
        "inserted": 0,
        "duplicates": 0,
    }

    # Find unprocessed raw articles
    # LEFT JOIN to find raw records without processed counterparts
    unprocessed_query = (
        session.query(NewsRaw)
        .outerjoin(NewsProcessed, NewsRaw.id == NewsProcessed.raw_id)
        .filter(NewsProcessed.id.is_(None))
        .order_by(NewsRaw.id)
    )

    total = unprocessed_query.count()
    logger.info(f"[run_id={run_id}] Found {total} unprocessed raw articles")

    if total == 0:
        return stats

    # Keyset pagination: the filtered set SHRINKS as rows get processed
    # and committed, so OFFSET-based paging (the previous approach) would
    # skip every other batch of unprocessed rows. Instead we always take
    # the first batch_size rows with id greater than the last one seen.
    last_id = 0
    while True:
        batch = (
            unprocessed_query
            .filter(NewsRaw.id > last_id)
            .limit(batch_size)
            .all()
        )

        if not batch:
            break

        for raw in batch:
            stats["processed"] += 1

            # None return means the dedup_key already existed.
            if process_single_raw(session, raw, run_id):
                stats["inserted"] += 1
            else:
                stats["duplicates"] += 1

        # Rows are ordered by id ascending, so the last element is the
        # highest id handled in this batch.
        last_id = batch[-1].id
        session.commit()

    logger.info(
        f"[run_id={run_id}] Processing complete: "
        f"{stats['processed']} processed, {stats['inserted']} inserted, "
        f"{stats['duplicates']} duplicates"
    )

    return stats
requirements.txt CHANGED
@@ -34,6 +34,10 @@ beautifulsoup4>=4.12.2
34
  # Scheduling
35
  apscheduler>=3.10.4
36
 
 
 
 
 
37
  # Utilities
38
  python-dateutil>=2.8.2
39
  filelock>=3.13.1
 
34
  # Scheduling
35
  apscheduler>=3.10.4
36
 
37
+ # Queue (arq + Redis)
38
+ arq>=0.25.0
39
+ redis>=5.0.0
40
+
41
  # Utilities
42
  python-dateutil>=2.8.2
43
  filelock>=3.13.1
supervisord.conf ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [supervisord]
2
+ nodaemon=true
3
+ logfile=/dev/null
4
+ logfile_maxbytes=0
5
+
6
+ [program:redis]
7
+ command=redis-server --save "" --appendonly no --bind 127.0.0.1 --port 6379
8
+ autorestart=true
9
+ startsecs=1
10
+ priority=1
11
+ stdout_logfile=/dev/stdout
12
+ stdout_logfile_maxbytes=0
13
+ stderr_logfile=/dev/stderr
14
+ stderr_logfile_maxbytes=0
15
+
16
+ [program:api]
17
+ directory=/code
18
+ command=python -m uvicorn app.main:app --host 0.0.0.0 --port 7860
19
+ autorestart=true
20
+ startsecs=3
21
+ priority=10
22
+ stdout_logfile=/dev/stdout
23
+ stdout_logfile_maxbytes=0
24
+ stderr_logfile=/dev/stderr
25
+ stderr_logfile_maxbytes=0
26
+ environment=REDIS_URL="redis://127.0.0.1:6379/0",PYTHONPATH="/code"
27
+
28
+ [program:worker]
29
+ directory=/code
30
+ command=python -m worker.runner
31
+ autorestart=true
32
+ startsecs=5
33
+ priority=20
34
+ stdout_logfile=/dev/stdout
35
+ stdout_logfile_maxbytes=0
36
+ stderr_logfile=/dev/stderr
37
+ stderr_logfile_maxbytes=0
38
+ environment=REDIS_URL="redis://127.0.0.1:6379/0",PYTHONPATH="/code"
worker/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Worker service for Terra Rara pipeline execution.
3
+ Consumes jobs from Redis queue via arq.
4
+ """
worker/runner.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ arq Worker Runner.
3
+
4
+ This is the entrypoint for the worker process.
5
+ Run with: python -m worker.runner
6
+
7
+ The worker:
8
+ - Consumes jobs from Redis queue
9
+ - Executes pipeline tasks
10
+ - Has NO scheduler - scheduling is external (GitHub Actions, cron, etc.)
11
+ """
12
+
13
+ import logging
14
+ import os
15
+ import sys
16
+
17
+ # Add backend to path for imports
18
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
+
20
+ from arq import run_worker
21
+
22
+ from adapters.queue.redis import get_redis_settings
23
+ from worker.tasks import run_pipeline, startup, shutdown
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format="%(asctime)s - %(levelname)s - %(name)s - [worker] - %(message)s"
29
+ )
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class WorkerSettings:
    """
    arq worker settings.

    This class is discovered by arq when running:
    arq worker.runner.WorkerSettings

    Attribute names follow arq's settings contract (redis_settings,
    functions, on_startup, ...) and are read once at worker start-up.
    """

    # Redis connection (resolved at import time via adapters.queue.redis)
    redis_settings = get_redis_settings()

    # Task functions this worker can execute
    functions = [run_pipeline]

    # Lifecycle hooks
    on_startup = startup
    on_shutdown = shutdown

    # Job settings
    max_jobs = 1  # Only one pipeline at a time per worker
    job_timeout = 3600  # 1 hour max, in seconds
    max_tries = 1  # No automatic retries - cron will retry next cycle

    # Health check interval in seconds
    health_check_interval = 30
58
+
59
+
60
def main():
    """Entry point: log the target Redis endpoint and start the worker."""
    rs = WorkerSettings.redis_settings
    logger.info("Starting Terra Rara worker...")
    logger.info(f"Redis: {rs.host}:{rs.port}")

    # Blocks until the process is stopped
    run_worker(WorkerSettings)
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
worker/tasks.py ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Worker tasks for arq.
3
+
4
+ This module defines the tasks that the worker executes.
5
+ The main task is `run_pipeline` which orchestrates the entire pipeline.
6
+
7
+ Phase 2: Integrated news_raw/news_processed pipeline with proper
8
+ commit boundaries, metrics tracking, and degraded mode handling.
9
+ """
10
+
11
+ import logging
12
+ import os
13
+ import socket
14
+ import uuid
15
+ from datetime import datetime, timezone
16
+ from typing import Any, Optional
17
+
18
+ from sqlalchemy.orm import Session
19
+
20
+ # These imports will be updated as we refactor
21
+ import sys
22
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
+
24
+ from app.db import SessionLocal, init_db, get_db_type
25
+ from app.settings import get_settings
26
+ from app.models import PipelineRunMetrics
27
+ from adapters.db.lock import (
28
+ PIPELINE_LOCK_KEY,
29
+ try_acquire_lock,
30
+ release_lock,
31
+ write_lock_visibility,
32
+ clear_lock_visibility,
33
+ )
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ # =============================================================================
39
+ # Helper functions for metrics tracking
40
+ # =============================================================================
41
+
42
def create_run_metrics(
    session: Session,
    run_id: str,
    started_at: datetime,
) -> PipelineRunMetrics:
    """Create and flush the initial pipeline_run_metrics row (status='running')."""
    record = PipelineRunMetrics(
        run_id=run_id,
        run_started_at=started_at,
        status="running",
    )
    session.add(record)
    session.flush()
    return record
56
+
57
+
58
def update_run_metrics(
    session: Session,
    run_id: str,
    **kwargs,
) -> None:
    """Apply the given keyword updates to the run's metrics row, if present.

    Unknown attribute names are silently ignored.
    """
    row = (
        session.query(PipelineRunMetrics)
        .filter(PipelineRunMetrics.run_id == run_id)
        .first()
    )
    if row is None:
        return

    for field, new_value in kwargs.items():
        if hasattr(row, field):
            setattr(row, field, new_value)
    session.flush()
73
+
74
+
75
def finalize_run_metrics(
    session: Session,
    run_id: str,
    status: str,
    quality_state: str = "ok",
    error_message: Optional[str] = None,
) -> None:
    """Stamp completion time, status, quality state and duration onto the run's row."""
    completed_at = datetime.now(timezone.utc)

    row = (
        session.query(PipelineRunMetrics)
        .filter(PipelineRunMetrics.run_id == run_id)
        .first()
    )
    if row is None:
        return

    row.run_completed_at = completed_at
    row.status = status
    row.quality_state = quality_state
    if row.run_started_at:
        row.duration_seconds = (completed_at - row.run_started_at).total_seconds()
    if error_message:
        row.error_message = error_message
    session.flush()
98
+
99
+
100
+ # =============================================================================
101
+ # Main pipeline task
102
+ # =============================================================================
103
+
104
async def run_pipeline(
    ctx: dict,
    run_id: str,
    train_model: bool = False,
    trigger_source: str = "unknown",
    enqueued_at: Optional[str] = None,
) -> dict:
    """
    Main pipeline task - executed by arq worker.

    This is the ONLY entrypoint for pipeline execution.

    Phase 2 Flow:
    Stage 1a: News ingestion → news_raw
    Stage 1b: Raw processing → news_processed
    Stage 1c: Cut-off calculation
    Stage 1d: Price ingestion
    Stage 2: Sentiment scoring
    Stage 3: Sentiment aggregation
    Stage 4: Model training (optional)
    Stage 5: Snapshot generation
    Stage 6: Commentary generation

    Args:
        ctx: arq context (contains redis connection)
        run_id: Unique identifier for this run
        train_model: Whether to train the XGBoost model
        trigger_source: Where the trigger came from (cron, manual, api)
        enqueued_at: ISO timestamp when job was enqueued

    Returns:
        dict with run results
    """
    started_at = datetime.now(timezone.utc)
    holder_id = f"{socket.gethostname()}:{os.getpid()}"
    run_uuid = uuid.UUID(run_id) if isinstance(run_id, str) else run_id

    logger.info(f"[run_id={run_id}] Pipeline starting: trigger={trigger_source}, train_model={train_model}")

    # Initialize database
    init_db()

    # Get a dedicated session for this pipeline run.
    # IMPORTANT: This session holds the advisory lock.
    # BUG FIX: the original called undefined get_session(); the module
    # imports SessionLocal from app.db, so instantiate the factory.
    session: Session = SessionLocal()
    quality_state = "ok"
    result = {}

    try:
        # 0. Create run metrics record
        create_run_metrics(session, run_id, started_at)
        session.commit()

        # 1. Acquire distributed lock
        if not try_acquire_lock(session, PIPELINE_LOCK_KEY):
            logger.warning(f"[run_id={run_id}] Pipeline skipped: lock held by another process")
            finalize_run_metrics(session, run_id, status="skipped_locked", quality_state="skipped")
            session.commit()
            return {
                "run_id": run_id,
                "status": "skipped_locked",
                "message": "Another pipeline is running",
            }

        # Write lock visibility (best-effort)
        write_lock_visibility(session, PIPELINE_LOCK_KEY, run_id, holder_id)
        session.commit()

        logger.info(f"[run_id={run_id}] Lock acquired, executing pipeline...")

        # 2. Execute pipeline stages with proper commit boundaries
        result = await _execute_pipeline_stages_v2(
            session=session,
            run_id=run_id,
            run_uuid=run_uuid,
            train_model=train_model,
        )

        # Determine quality state from result.
        # More nuanced logic to avoid false alarms.
        raw_inserted = result.get("news_raw_inserted", 0)
        proc_inserted = result.get("news_processed_inserted", 0)
        raw_error = result.get("news_raw_error")
        proc_error = result.get("news_processed_error")

        if raw_error or proc_error:
            # Actual errors during ingestion/processing
            quality_state = "degraded"
            result["message"] = f"Pipeline errors: {raw_error or ''} {proc_error or ''}".strip()
        elif raw_inserted == 0 and proc_inserted == 0:
            # No new data at all - could be dedup working or sources haven't updated
            quality_state = "stale"
            result["message"] = "No new articles - sources may not have updated"
        elif raw_inserted > 0 and proc_inserted == 0:
            # Got raw but nothing processed - all were duplicates, which is fine
            quality_state = "ok"
            result["message"] = f"All {raw_inserted} articles were duplicates"
        else:
            quality_state = "ok"

        # 3. Record success
        finished_at = datetime.now(timezone.utc)
        duration = (finished_at - started_at).total_seconds()

        finalize_run_metrics(
            session, run_id,
            status="success",
            quality_state=quality_state,
        )
        session.commit()

        logger.info(f"[run_id={run_id}] Pipeline completed in {duration:.1f}s")

        return {
            "run_id": run_id,
            "status": "success",
            "quality_state": quality_state,
            "started_at": started_at.isoformat(),
            "finished_at": finished_at.isoformat(),
            "duration_seconds": duration,
            "train_model": train_model,
            **result,
        }

    except Exception as e:
        logger.error(f"[run_id={run_id}] Pipeline failed: {e}", exc_info=True)

        try:
            finalize_run_metrics(
                session, run_id,
                status="failed",
                quality_state="failed",
                error_message=str(e)[:1000],
            )
            session.commit()
        except Exception:
            # Metrics finalization is best-effort on the failure path.
            session.rollback()

        return {
            "run_id": run_id,
            "status": "failed",
            "error": str(e),
        }

    finally:
        # Always release lock and cleanup
        try:
            release_lock(session, PIPELINE_LOCK_KEY)
            clear_lock_visibility(session, PIPELINE_LOCK_KEY)
            session.commit()
        except Exception:
            session.rollback()
        finally:
            session.close()
258
+
259
+
260
+ async def _execute_pipeline_stages_v2(
261
+ session: Session,
262
+ run_id: str,
263
+ run_uuid: uuid.UUID,
264
+ train_model: bool,
265
+ ) -> dict:
266
+ """
267
+ Execute pipeline stages with Faz 2 news pipeline integration.
268
+
269
+ Each stage has proper commit boundaries and metrics updates.
270
+ """
271
+ from app.settings import get_settings
272
+
273
+ settings = get_settings()
274
+ result = {}
275
+
276
+ # -------------------------------------------------------------------------
277
+ # Stage 1a: News ingestion → news_raw (FAZ 2)
278
+ # -------------------------------------------------------------------------
279
+ logger.info(f"[run_id={run_id}] Stage 1a: News ingestion → news_raw")
280
+ try:
281
+ from pipelines.ingestion.news import ingest_news_to_raw
282
+
283
+ raw_stats = ingest_news_to_raw(
284
+ session=session,
285
+ run_id=run_uuid,
286
+ )
287
+ session.commit()
288
+
289
+ result["news_raw_inserted"] = raw_stats.get("inserted", 0)
290
+ result["news_raw_duplicates"] = raw_stats.get("duplicates", 0)
291
+
292
+ update_run_metrics(
293
+ session, run_id,
294
+ news_raw_inserted=raw_stats.get("inserted", 0),
295
+ news_raw_duplicates=raw_stats.get("duplicates", 0),
296
+ )
297
+ session.commit()
298
+
299
+ logger.info(f"[run_id={run_id}] news_raw: {raw_stats.get('inserted', 0)} inserted")
300
+
301
+ except Exception as e:
302
+ logger.error(f"[run_id={run_id}] Stage 1a failed: {e}")
303
+ result["news_raw_error"] = str(e)
304
+ session.rollback()
305
+
306
+ # -------------------------------------------------------------------------
307
+ # Stage 1b: Raw → Processed (FAZ 2)
308
+ # -------------------------------------------------------------------------
309
+ logger.info(f"[run_id={run_id}] Stage 1b: news_raw → news_processed")
310
+ try:
311
+ from pipelines.processing.news import process_raw_to_processed
312
+
313
+ proc_stats = process_raw_to_processed(
314
+ session=session,
315
+ run_id=run_uuid,
316
+ batch_size=200,
317
+ )
318
+ session.commit()
319
+
320
+ result["news_processed_inserted"] = proc_stats.get("inserted", 0)
321
+ result["news_processed_duplicates"] = proc_stats.get("duplicates", 0)
322
+
323
+ update_run_metrics(
324
+ session, run_id,
325
+ news_processed_inserted=proc_stats.get("inserted", 0),
326
+ news_processed_duplicates=proc_stats.get("duplicates", 0),
327
+ )
328
+ session.commit()
329
+
330
+ logger.info(f"[run_id={run_id}] news_processed: {proc_stats.get('inserted', 0)} inserted")
331
+
332
+ except Exception as e:
333
+ logger.error(f"[run_id={run_id}] Stage 1b failed: {e}")
334
+ result["news_processed_error"] = str(e)
335
+ session.rollback()
336
+
337
+ # -------------------------------------------------------------------------
338
+ # Stage 1c: Cut-off calculation (FAZ 2)
339
+ # -------------------------------------------------------------------------
340
+ logger.info(f"[run_id={run_id}] Stage 1c: Computing news cut-off")
341
+ try:
342
+ from pipelines.cutoff import compute_news_cutoff
343
+
344
+ cutoff_dt = compute_news_cutoff(
345
+ run_datetime=datetime.now(timezone.utc),
346
+ market_tz=settings.market_timezone,
347
+ market_close=settings.market_close_time,
348
+ buffer_minutes=settings.cutoff_buffer_minutes,
349
+ )
350
+
351
+ result["news_cutoff_time"] = cutoff_dt.isoformat()
352
+
353
+ update_run_metrics(session, run_id, news_cutoff_time=cutoff_dt)
354
+ session.commit()
355
+
356
+ logger.info(f"[run_id={run_id}] Cut-off: {cutoff_dt.isoformat()}")
357
+
358
+ except Exception as e:
359
+ logger.error(f"[run_id={run_id}] Stage 1c failed: {e}")
360
+ result["cutoff_error"] = str(e)
361
+
362
+ # -------------------------------------------------------------------------
363
+ # Stage 1d: Price ingestion (existing)
364
+ # -------------------------------------------------------------------------
365
+ logger.info(f"[run_id={run_id}] Stage 1d: Price ingestion")
366
+ try:
367
+ from app.data_manager import ingest_prices
368
+
369
+ price_stats = ingest_prices(session)
370
+ session.commit()
371
+
372
+ result["symbols_fetched"] = len(price_stats)
373
+ result["price_bars_updated"] = sum(
374
+ s.get("imported", 0) for s in price_stats.values()
375
+ )
376
+
377
+ update_run_metrics(
378
+ session, run_id,
379
+ price_bars_updated=result["price_bars_updated"],
380
+ )
381
+ session.commit()
382
+
383
+ except Exception as e:
384
+ logger.error(f"[run_id={run_id}] Stage 1d failed: {e}")
385
+ result["price_error"] = str(e)
386
+ session.rollback()
387
+
388
+ # -------------------------------------------------------------------------
389
+ # Stage 2: Sentiment scoring (existing - uses news_articles for now)
390
+ # -------------------------------------------------------------------------
391
+ logger.info(f"[run_id={run_id}] Stage 2: Sentiment scoring")
392
+ try:
393
+ from app.ai_engine import score_unscored_articles
394
+
395
+ scored = score_unscored_articles(session)
396
+ session.commit()
397
+
398
+ result["articles_scored"] = scored
399
+
400
+ except Exception as e:
401
+ logger.error(f"[run_id={run_id}] Stage 2 failed: {e}")
402
+ result["scoring_error"] = str(e)
403
+ session.rollback()
404
+
405
+ # -------------------------------------------------------------------------
406
+ # Stage 3: Sentiment aggregation (existing)
407
+ # -------------------------------------------------------------------------
408
+ logger.info(f"[run_id={run_id}] Stage 3: Sentiment aggregation")
409
+ try:
410
+ from app.ai_engine import aggregate_daily_sentiment
411
+
412
+ days_aggregated = aggregate_daily_sentiment(session)
413
+ session.commit()
414
+
415
+ result["days_aggregated"] = days_aggregated
416
+
417
+ except Exception as e:
418
+ logger.error(f"[run_id={run_id}] Stage 3 failed: {e}")
419
+ result["aggregation_error"] = str(e)
420
+ session.rollback()
421
+
422
+ # -------------------------------------------------------------------------
423
+ # Stage 4: Model training (optional)
424
+ # -------------------------------------------------------------------------
425
+ if train_model:
426
+ logger.info(f"[run_id={run_id}] Stage 4: Model training")
427
+ try:
428
+ from app.ai_engine import train_xgboost_model, save_model_metadata_to_db
429
+
430
+ train_result = train_xgboost_model(session)
431
+ save_model_metadata_to_db(
432
+ session,
433
+ symbol="HG=F",
434
+ importance=train_result.get("importance", []),
435
+ features=train_result.get("features", []),
436
+ metrics=train_result.get("metrics", {}),
437
+ )
438
+ session.commit()
439
+
440
+ result["model_trained"] = True
441
+ result["model_metrics"] = train_result.get("metrics", {})
442
+
443
+ update_run_metrics(
444
+ session, run_id,
445
+ train_mae=train_result.get("metrics", {}).get("mae"),
446
+ val_mae=train_result.get("metrics", {}).get("val_mae"),
447
+ )
448
+ session.commit()
449
+
450
+ except Exception as e:
451
+ logger.error(f"[run_id={run_id}] Stage 4 failed: {e}")
452
+ result["training_error"] = str(e)
453
+ result["model_trained"] = False
454
+ session.rollback()
455
+ else:
456
+ result["model_trained"] = False
457
+
458
+ # -------------------------------------------------------------------------
459
+ # Stage 5: Generate snapshot
460
+ # -------------------------------------------------------------------------
461
+ logger.info(f"[run_id={run_id}] Stage 5: Generate snapshot")
462
+ try:
463
+ from app.inference import generate_analysis_report, save_analysis_snapshot
464
+
465
+ report = generate_analysis_report(session, "HG=F")
466
+
467
+ if report:
468
+ # Add Faz 2 metadata
469
+ report["quality_state"] = "ok"
470
+ if result.get("news_processed_inserted", 0) == 0:
471
+ report["quality_state"] = "degraded"
472
+ report["message"] = "No fresh news data"
473
+
474
+ save_analysis_snapshot(session, report, "HG=F")
475
+ session.commit()
476
+
477
+ result["snapshot_generated"] = True
478
+ update_run_metrics(session, run_id, snapshot_generated=True)
479
+ session.commit()
480
+ else:
481
+ result["snapshot_generated"] = False
482
+
483
+ except Exception as e:
484
+ logger.error(f"[run_id={run_id}] Stage 5 failed: {e}")
485
+ result["snapshot_error"] = str(e)
486
+ result["snapshot_generated"] = False
487
+ session.rollback()
488
+
489
+ # -------------------------------------------------------------------------
490
+ # Stage 6: Generate commentary
491
+ # -------------------------------------------------------------------------
492
+ logger.info(f"[run_id={run_id}] Stage 6: Generate commentary")
493
+ try:
494
+ from app.commentary import generate_and_save_commentary
495
+
496
+ generate_and_save_commentary(session, "HG=F")
497
+ session.commit()
498
+
499
+ result["commentary_generated"] = True
500
+ update_run_metrics(session, run_id, commentary_generated=True)
501
+ session.commit()
502
+
503
+ except Exception as e:
504
+ logger.warning(f"[run_id={run_id}] Stage 6 failed: {e}")
505
+ result["commentary_generated"] = False
506
+
507
+ return result
508
+
509
+
510
+ # =============================================================================
511
+ # arq worker lifecycle
512
+ # =============================================================================
513
+
514
async def startup(ctx: dict) -> None:
    """arq worker lifecycle hook, invoked once when the worker process boots.

    Args:
        ctx: Shared arq worker context dict (unused here, required by the
            hook signature).

    Side effects:
        Logs the startup event, then calls ``init_db()`` — presumably this
        prepares the database engine/schema for queued jobs; confirm against
        the app's DB module.
    """
    logger.info("Worker starting up...")
    init_db()
518
+
519
+
520
+ async def shutdown(ctx: dict) -> None:
521
+ """Called when worker shuts down."""
522
+ logger.info("Worker shutting down...")