Commit 1a5863d · 1 Parent: eca2087 · committed by aviseth

feat: Phase 1 enhancements - ensemble endpoint, history API, rate limiting, storage monitoring

requirements.txt CHANGED
@@ -44,6 +44,7 @@ tqdm>=4.65.0
 # Testing
 pytest>=7.4.0
 pytest-asyncio>=0.21.0
+hypothesis>=6.0.0
 
 # Visualization
 matplotlib>=3.7.0
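The new `hypothesis` pin enables property-based tests. A minimal sketch of one such test (a hypothetical test file, not part of this commit; it exercises the `soft_voting` helper from `src/models/ensemble.py`, added later in this commit, calling the method unbound so the three model checkpoints never load):

```python
# Hypothetical property-based test: averaged soft-voting scores must stay in [0, 1].
from hypothesis import given, strategies as st

from src.models.ensemble import EnsembleClassifier, ModelPrediction

LABELS = ["True", "Fake", "Satire", "Bias"]

score_dicts = st.lists(
    st.fixed_dictionaries({label: st.floats(0.0, 1.0) for label in LABELS}),
    min_size=1, max_size=3,
)

@given(score_dicts)
def test_soft_voting_stays_in_unit_interval(dicts):
    preds = [
        ModelPrediction(model_name=f"m{i}", label=max(s, key=s.get),
                        confidence=max(s.values()), scores=s, tokens=[])
        for i, s in enumerate(dicts)
    ]
    # soft_voting does not touch self, so pass None to avoid loading models
    averaged = EnsembleClassifier.soft_voting(None, preds)
    assert all(0.0 <= v <= 1.0 for v in averaged.values())
```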
scripts/phase2_migration.sql ADDED
@@ -0,0 +1,90 @@
+-- Phase 2: User Analysis History Database Migration
+-- This script creates the user_analysis_history table and related indexes
+-- Execute this in your Supabase SQL Editor
+
+-- Step 1: Create the user_analysis_history table
+CREATE TABLE IF NOT EXISTS user_analysis_history (
+    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+    session_id VARCHAR(36) NOT NULL,
+    article_id VARCHAR(36) NOT NULL UNIQUE,
+    text_preview VARCHAR(200) NOT NULL,
+    predicted_label VARCHAR(50) NOT NULL CHECK (predicted_label IN ('True', 'Fake', 'Satire', 'Bias')),
+    confidence FLOAT NOT NULL CHECK (confidence >= 0.0 AND confidence <= 1.0),
+    model_name VARCHAR(100) NOT NULL,
+    created_at TIMESTAMPTZ DEFAULT NOW() NOT NULL,
+
+    CONSTRAINT fk_article FOREIGN KEY (article_id) REFERENCES predictions(article_id) ON DELETE CASCADE
+);
+
+-- Step 2: Create indexes for efficient queries
+CREATE INDEX IF NOT EXISTS idx_history_session_created ON user_analysis_history(session_id, created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_history_article ON user_analysis_history(article_id);
+
+-- Step 3: Enable row-level security
+ALTER TABLE user_analysis_history ENABLE ROW LEVEL SECURITY;
+
+-- Step 4: Create policy to allow all operations (for development)
+-- Note: In production, you should restrict this based on your security requirements
+DROP POLICY IF EXISTS "allow_all_history" ON user_analysis_history;
+CREATE POLICY "allow_all_history" ON user_analysis_history FOR ALL USING (true) WITH CHECK (true);
+
+-- Step 5: Verify the table was created
+SELECT
+    table_name,
+    column_name,
+    data_type,
+    is_nullable,
+    column_default
+FROM information_schema.columns
+WHERE table_name = 'user_analysis_history'
+ORDER BY ordinal_position;
+
+-- Step 6: Verify indexes were created
+SELECT
+    indexname,
+    indexdef
+FROM pg_indexes
+WHERE tablename = 'user_analysis_history';
+
+-- Step 7: Verify RLS policy was created
+SELECT
+    policyname,
+    permissive,
+    roles,
+    cmd
+FROM pg_policies
+WHERE tablename = 'user_analysis_history';
+
+-- Optional: Insert a test record to verify everything works
+-- Uncomment the following lines to test (replace with actual article_id from predictions table)
+/*
+DO $$
+DECLARE
+    test_article_id VARCHAR(36);
+    test_session_id VARCHAR(36);
+BEGIN
+    -- First, insert a test prediction
+    test_article_id := gen_random_uuid()::text;
+    test_session_id := gen_random_uuid()::text;
+
+    INSERT INTO predictions (article_id, text, predicted_label, confidence, model_name)
+    VALUES (test_article_id, 'Test article for migration verification', 'True', 0.95, 'ensemble');
+
+    -- Then, insert a test history record
+    INSERT INTO user_analysis_history (session_id, article_id, text_preview, predicted_label, confidence, model_name)
+    VALUES (test_session_id, test_article_id, 'Test article for migration verification', 'True', 0.95, 'ensemble');
+
+    -- Verify the record was inserted
+    IF EXISTS (SELECT 1 FROM user_analysis_history WHERE article_id = test_article_id) THEN
+        RAISE NOTICE 'Test record inserted successfully!';
+    ELSE
+        RAISE EXCEPTION 'Test record insertion failed!';
+    END IF;
+
+    -- Clean up test data
+    DELETE FROM user_analysis_history WHERE article_id = test_article_id;
+    DELETE FROM predictions WHERE article_id = test_article_id;
+
+    RAISE NOTICE 'Test data cleaned up. Migration verification complete!';
+END $$;
+*/
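After running the migration, a quick smoke check from Python confirms the new table is reachable through PostgREST. A minimal sketch, assuming the same `SUPABASE_URL` / `SUPABASE_SERVICE_KEY` environment variables the app already uses:

```python
import os

from supabase import create_client

client = create_client(os.environ["SUPABASE_URL"],
                       os.environ["SUPABASE_SERVICE_KEY"])

# If the migration ran, this returns an empty result set instead of raising
res = client.table("user_analysis_history").select(
    "*", count="exact").limit(1).execute()
print(f"user_analysis_history reachable, {res.count} rows")
```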
scripts/setup_supabase.sql CHANGED
@@ -1,3 +1,4 @@
+DROP TABLE IF EXISTS user_analysis_history CASCADE;
 DROP TABLE IF EXISTS feedback CASCADE;
 DROP TABLE IF EXISTS predictions CASCADE;
 DROP TABLE IF EXISTS news_articles CASCADE;
@@ -77,11 +78,30 @@ CREATE TABLE user_sessions (
     last_activity TIMESTAMPTZ DEFAULT NOW()
 );
 
-ALTER TABLE predictions DISABLE ROW LEVEL SECURITY;
-ALTER TABLE feedback DISABLE ROW LEVEL SECURITY;
-ALTER TABLE news_articles DISABLE ROW LEVEL SECURITY;
-ALTER TABLE model_performance DISABLE ROW LEVEL SECURITY;
-ALTER TABLE user_sessions DISABLE ROW LEVEL SECURITY;
+CREATE TABLE user_analysis_history (
+    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+    session_id VARCHAR(36) NOT NULL,
+    article_id VARCHAR(36) NOT NULL UNIQUE,
+    text_preview VARCHAR(200) NOT NULL,
+    predicted_label VARCHAR(50) NOT NULL CHECK (predicted_label IN ('True', 'Fake', 'Satire', 'Bias')),
+    confidence FLOAT NOT NULL CHECK (confidence >= 0.0 AND confidence <= 1.0),
+    model_name VARCHAR(100) NOT NULL,
+    created_at TIMESTAMPTZ DEFAULT NOW() NOT NULL,
+
+    CONSTRAINT fk_article FOREIGN KEY (article_id) REFERENCES predictions(article_id) ON DELETE CASCADE
+);
+
+CREATE INDEX idx_history_session_created ON user_analysis_history(session_id, created_at DESC);
+CREATE INDEX idx_history_article ON user_analysis_history(article_id);
+
+ALTER TABLE predictions DISABLE ROW LEVEL SECURITY;
+ALTER TABLE feedback DISABLE ROW LEVEL SECURITY;
+ALTER TABLE news_articles DISABLE ROW LEVEL SECURITY;
+ALTER TABLE model_performance DISABLE ROW LEVEL SECURITY;
+ALTER TABLE user_sessions DISABLE ROW LEVEL SECURITY;
+ALTER TABLE user_analysis_history ENABLE ROW LEVEL SECURITY;
+
+CREATE POLICY "allow_all_history" ON user_analysis_history FOR ALL USING (true) WITH CHECK (true);
 
 CREATE VIEW prediction_stats AS
 SELECT predicted_label, COUNT(*) AS total_count, AVG(confidence) AS avg_confidence
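One caveat worth noting: the API connects with the service key, which bypasses row-level security, so the allow-all policy on `user_analysis_history` matters mainly for anon/browser access. A hedged sanity check of the rebuilt schema via supabase-py (PostgREST exposes the `prediction_stats` view like a table):

```python
import os

from supabase import create_client

client = create_client(os.environ["SUPABASE_URL"],
                       os.environ["SUPABASE_SERVICE_KEY"])

# Read the aggregate view defined at the end of the setup script
for row in client.table("prediction_stats").select("*").execute().data:
    print(row["predicted_label"], row["total_count"], row["avg_confidence"])
```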
src/api/main.py CHANGED
@@ -1,9 +1,14 @@
-from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Header, Query, Request
+from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
 from typing import Optional, List, Dict
 import os
 import uuid
+import asyncio
+import logging
+import time
+from collections import defaultdict
 from dotenv import load_dotenv
 
 from src.utils.supabase_client import get_supabase_client
@@ -11,6 +16,14 @@ from src.utils.gnews_client import get_gnews_client
 
 load_dotenv()
 
+# Configure logger
+logger = logging.getLogger(__name__)
+
+# Rate limiting: track request timestamps per client IP
+request_tracker = defaultdict(list)
+RATE_LIMIT_REQUESTS = 100  # Max requests per window
+RATE_LIMIT_WINDOW = 60  # Window in seconds
+
 app = FastAPI(
     title="Fake News Detection API",
     description="Multi-class fake news detection using DistilBERT, RoBERTa, and XLNet",
@@ -34,6 +47,38 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+
+@app.middleware("http")
+async def rate_limit_middleware(request: Request, call_next):
+    """
+    Rate limiting middleware to prevent abuse.
+    Allows RATE_LIMIT_REQUESTS per RATE_LIMIT_WINDOW seconds per IP.
+    """
+    client_ip = request.client.host
+    current_time = time.time()
+
+    # Drop request timestamps that have aged out of the window
+    request_tracker[client_ip] = [
+        req_time for req_time in request_tracker[client_ip]
+        if current_time - req_time < RATE_LIMIT_WINDOW
+    ]
+
+    # Check the limit; return the 429 directly, since exceptions raised in
+    # middleware bypass FastAPI's HTTPException handler and surface as 500s
+    if len(request_tracker[client_ip]) >= RATE_LIMIT_REQUESTS:
+        logger.warning(f"Rate limit exceeded for IP: {client_ip}")
+        return JSONResponse(
+            status_code=429,
+            content={"detail": f"Rate limit exceeded. Maximum {RATE_LIMIT_REQUESTS} requests per {RATE_LIMIT_WINDOW} seconds."},
+        )
+
+    # Track this request
+    request_tracker[client_ip].append(current_time)
+
+    response = await call_next(request)
+    return response
+
+
 VALID_MODELS = {"distilbert", "roberta", "xlnet"}
 
 
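The middleware above keeps a sliding window of timestamps per client IP. A minimal sketch of exercising it with FastAPI's `TestClient` (the `/health` path is an assumption based on the `health_check` handler visible in a later hunk; all `TestClient` requests share one client IP, so the 101st call inside a 60s window should be rejected):

```python
from fastapi.testclient import TestClient

from src.api.main import app

client = TestClient(app)
# 101 rapid requests from the same client IP
statuses = [client.get("/health").status_code for _ in range(101)]
print(statuses[-1])  # expected 429 once RATE_LIMIT_REQUESTS is exceeded
```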
@@ -70,6 +115,48 @@ class ExplainRequest(BaseModel):
     deep: Optional[bool] = False
 
 
+# Ensemble API Models
+class EnsemblePredictionRequest(BaseModel):
+    text: str
+    session_id: Optional[str] = None
+
+    @validator('text')
+    def validate_text(cls, v):
+        if len(v.strip()) < 10:
+            raise ValueError("Text too short to classify")
+        return v
+
+
+class VotingResult(BaseModel):
+    label: str
+    confidence: float
+    scores: Dict[str, float]
+
+
+class VotingStrategies(BaseModel):
+    hard_voting: VotingResult
+    soft_voting: VotingResult
+    weighted_voting: VotingResult
+
+
+class ModelPredictionResponse(BaseModel):
+    model_name: str
+    label: str
+    confidence: float
+    scores: Dict[str, float]
+    tokens: List[ExplanationData]
+
+
+class EnsemblePredictionResponse(BaseModel):
+    article_id: str
+    primary_prediction: VotingResult  # hard voting result
+    voting_strategies: VotingStrategies
+    individual_models: List[ModelPredictionResponse]
+    merged_explanation: List[ExplanationData]
+    execution_time_ms: float
+    warnings: Optional[List[str]] = None
+
+
 @app.on_event("startup")
 async def startup_event():
     try:
@@ -122,8 +209,16 @@ async def health_check():
 
 
 @app.post("/predict", response_model=PredictionResponse)
-async def predict(request: PredictionRequest, background_tasks: BackgroundTasks):
-    """Classify news as True / Fake / Satire / Bias."""
+async def predict(
+    request: PredictionRequest,
+    background_tasks: BackgroundTasks,
+    x_session_id: Optional[str] = Header(None, alias="X-Session-ID")
+):
+    """
+    Classify news as True / Fake / Satire / Bias.
+
+    Requirements: 4.4, 4.6, 2.7
+    """
     if not request.text and not request.url:
         raise HTTPException(status_code=400, detail="Provide text or url")
 
@@ -164,23 +259,274 @@ async def predict(request: PredictionRequest, background_tasks: BackgroundTasks)
     )
 
     def _store():
+        """
+        Store prediction in both predictions and user_analysis_history tables.
+        Requirements: 4.4, 4.6, 2.7
+        """
         try:
             supabase = get_supabase_client()
-            supabase.store_prediction(
-                article_id=article_id,
-                text=text,
-                predicted_label=result["label"],
-                confidence=result["confidence"],
-                model_name=model_key,
-                explanation=result.get("tokens", []),
-            )
+
+            # Store in predictions table (Requirement 2.7)
+            try:
+                supabase.store_prediction(
+                    article_id=article_id,
+                    text=text,
+                    predicted_label=result["label"],
+                    confidence=result["confidence"],
+                    model_name=model_key,
+                    explanation=result.get("tokens", []),
+                )
+                logger.info(
+                    f"Stored prediction {article_id} in predictions table")
+            except Exception as e:
+                logger.error(
+                    f"Failed to store prediction in predictions table: {e}")
+
+            # Store in user_analysis_history if session_id is provided (Requirements 4.4, 4.6)
+            if x_session_id:
+                try:
+                    supabase.store_user_history(
+                        session_id=x_session_id,
+                        article_id=article_id,
+                        text=text,
+                        predicted_label=result["label"],
+                        confidence=result["confidence"],
+                        model_name=model_key
+                    )
+                    logger.info(
+                        f"Stored prediction {article_id} in user_analysis_history for session {x_session_id}")
+                except Exception as e:
+                    # Handle history-storage failures gracefully (Requirement 4.4)
+                    logger.error(
+                        f"Failed to store prediction in user_analysis_history: {e}")
+            else:
+                logger.debug(
+                    f"No session_id provided for prediction {article_id}, skipping history storage")
+
         except Exception as e:
-            print(f"[bg] store_prediction failed: {e}")
+            logger.error(
+                f"Database storage failed for prediction {article_id}: {e}")
 
     background_tasks.add_task(_store)
     return response
 
 
+@app.post("/predict/ensemble", response_model=EnsemblePredictionResponse)
+async def predict_ensemble(
+    request: EnsemblePredictionRequest,
+    background_tasks: BackgroundTasks,
+    x_session_id: Optional[str] = Header(None, alias="X-Session-ID")
+):
+    """
+    Run ensemble prediction using all three models (DistilBERT, RoBERTa, XLNet).
+    Combines predictions using hard voting, soft voting, and weighted voting strategies.
+
+    Requirements: 2.1, 2.2, 2.5, 2.8
+    """
+    article_id = str(uuid.uuid4())
+    session_id = x_session_id or request.session_id
+
+    try:
+        from src.models.ensemble import get_ensemble_classifier
+
+        # Get ensemble classifier instance
+        ensemble = get_ensemble_classifier()
+
+        # Run ensemble prediction with 15s timeout (Requirement 2.8)
+        result = await asyncio.wait_for(
+            ensemble.predict_ensemble(request.text),
+            timeout=15.0
+        )
+
+        # Build response with all voting strategies
+        primary_prediction = VotingResult(
+            label=result.hard_voting_label,
+            confidence=result.hard_voting_confidence,
+            scores={result.hard_voting_label: result.hard_voting_confidence}
+        )
+
+        voting_strategies = VotingStrategies(
+            hard_voting=VotingResult(
+                label=result.hard_voting_label,
+                confidence=result.hard_voting_confidence,
+                scores={result.hard_voting_label: result.hard_voting_confidence}
+            ),
+            soft_voting=VotingResult(
+                label=result.soft_voting_label,
+                confidence=result.soft_voting_confidence,
+                scores=result.soft_voting_scores
+            ),
+            weighted_voting=VotingResult(
+                label=result.weighted_voting_label,
+                confidence=result.weighted_voting_confidence,
+                scores=result.weighted_voting_scores
+            )
+        )
+
+        # Convert individual model predictions
+        individual_models = [
+            ModelPredictionResponse(
+                model_name=pred.model_name,
+                label=pred.label,
+                confidence=pred.confidence,
+                scores=pred.scores,
+                tokens=[ExplanationData(**t) for t in pred.tokens]
+            )
+            for pred in result.individual_predictions
+        ]
+
+        # Convert merged explanation
+        merged_explanation = [
+            ExplanationData(**token) for token in result.merged_explanation
+        ]
+
+        response = EnsemblePredictionResponse(
+            article_id=article_id,
+            primary_prediction=primary_prediction,
+            voting_strategies=voting_strategies,
+            individual_models=individual_models,
+            merged_explanation=merged_explanation,
+            execution_time_ms=result.execution_time_ms,
+            warnings=result.warnings
+        )
+
+        # Background task: store ensemble prediction to database
+        def store_ensemble_prediction():
+            """
+            Store prediction in both predictions and user_analysis_history tables.
+            Handles database failures gracefully - logs errors but doesn't crash.
+            Requirements: 2.3, 2.4, 2.6, 2.7, 14.3
+            """
+            try:
+                supabase = get_supabase_client()
+
+                # Store in predictions table with model_name="ensemble" (Requirement 2.7)
+                try:
+                    supabase.store_prediction(
+                        article_id=article_id,
+                        text=request.text,
+                        predicted_label=result.hard_voting_label,
+                        confidence=result.hard_voting_confidence,
+                        model_name="ensemble",
+                        explanation=result.merged_explanation,
+                    )
+                    logger.info(
+                        f"Stored ensemble prediction {article_id} in predictions table")
+                except Exception as e:
+                    # Log but continue - don't let predictions table failure stop history storage
+                    logger.error(
+                        f"Failed to store prediction in predictions table: {e}")
+
+                # Store in user_analysis_history if session_id is provided (Requirement 2.4)
+                if session_id:
+                    try:
+                        supabase.store_user_history(
+                            session_id=session_id,
+                            article_id=article_id,
+                            text=request.text,
+                            predicted_label=result.hard_voting_label,
+                            confidence=result.hard_voting_confidence,
+                            model_name="ensemble"
+                        )
+                        logger.info(
+                            f"Stored ensemble prediction {article_id} in user_analysis_history for session {session_id}")
+                    except Exception as e:
+                        # Log but don't crash - history storage is non-critical (Requirement 14.3)
+                        logger.error(
+                            f"Failed to store prediction in user_analysis_history: {e}")
+                else:
+                    logger.debug(
+                        f"No session_id provided for prediction {article_id}, skipping history storage")
+
+            except Exception as e:
+                # Catch-all for any database connection failures (Requirement 14.3)
+                logger.error(
+                    f"Database storage failed for prediction {article_id}: {e}")
+
+        background_tasks.add_task(store_ensemble_prediction)
+        return response
+
+    except asyncio.TimeoutError:
+        # Requirement 2.8: Return HTTP 504 after 15s timeout
+        raise HTTPException(
+            status_code=504,
+            detail="Ensemble prediction timed out after 15 seconds"
+        )
+    except ValueError as e:
+        # Handle validation errors (e.g., text too short)
+        raise HTTPException(status_code=422, detail=str(e))
+    except RuntimeError as e:
+        # Handle case where all models fail
+        raise HTTPException(status_code=500, detail=str(e))
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(
+            status_code=500,
+            detail=f"Ensemble prediction error: {str(e)}"
+        )
+
+
+@app.get("/history/{session_id}")
+async def get_user_history(
+    session_id: str,
+    limit: int = Query(100, ge=1, le=100)
+):
+    """
+    Retrieve user's analysis history by session ID.
+
+    Args:
+        session_id: UUID v4 session identifier
+        limit: Maximum records to return (1-100, default 100)
+
+    Returns:
+        List of prediction records with metadata
+
+    Requirements: 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7
+    """
+    # Validate UUID format (Requirement 6.6)
+    try:
+        uuid.UUID(session_id)
+    except ValueError:
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid session ID format"
+        )
+
+    try:
+        # Add 2s timeout (Requirement 6.7)
+        supabase = get_supabase_client()
+        history = await asyncio.wait_for(
+            asyncio.get_running_loop().run_in_executor(
+                None,
+                supabase.get_user_history,
+                session_id,
+                limit
+            ),
+            timeout=2.0
+        )
+
+        # Return empty array with HTTP 200 for sessions with no history (Requirement 6.5)
+        return {
+            "status": "success",
+            "session_id": session_id,
+            "count": len(history),
+            "history": history
+        }
+    except asyncio.TimeoutError:
+        # Requirement 6.7: Return HTTP 504 after 2s timeout
+        raise HTTPException(
+            status_code=504,
+            detail="History retrieval timed out after 2 seconds"
+        )
+    except Exception as e:
+        logger.error(f"Failed to fetch history for session {session_id}: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail="Failed to load history"
+        )
+
+
 @app.post("/feedback")
 async def submit_feedback(feedback: FeedbackRequest):
     """Submit user correction for active learning."""
 
@@ -343,6 +689,23 @@ async def get_statistics():
             status_code=500, detail=f"Error fetching stats: {e}")
 
 
+@app.get("/storage")
+async def get_storage_usage():
+    """
+    Get database storage usage metrics and warnings.
+
+    Returns storage usage information and warns when approaching 90% of the 500MB limit.
+    """
+    try:
+        supabase = get_supabase_client()
+        usage = supabase.check_storage_usage()
+        return {"status": "success", "storage": usage}
+    except Exception as e:
+        logger.error(f"Error fetching storage usage: {e}")
+        raise HTTPException(
+            status_code=500, detail=f"Error fetching storage usage: {e}")
+
+
 @app.get("/models")
 async def list_models():
     """List available models and their training status."""
src/models/ensemble.py ADDED
@@ -0,0 +1,198 @@
+"""
+Ensemble classifier combining DistilBERT, RoBERTa, and XLNet
+with parallel execution and multiple voting strategies.
+"""
+
+import asyncio
+import time
+import logging
+from dataclasses import dataclass
+from typing import List, Dict, Optional
+from concurrent.futures import ThreadPoolExecutor
+
+from .inference import get_classifier
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TokenImportance:
+    token: str
+    score: float
+
+
+@dataclass
+class ModelPrediction:
+    model_name: str
+    label: str
+    confidence: float
+    scores: Dict[str, float]
+    tokens: List[Dict]
+
+
+@dataclass
+class EnsembleResult:
+    hard_voting_label: str
+    hard_voting_confidence: float
+    soft_voting_label: str
+    soft_voting_confidence: float
+    soft_voting_scores: Dict[str, float]
+    weighted_voting_label: str
+    weighted_voting_confidence: float
+    weighted_voting_scores: Dict[str, float]
+    individual_predictions: List[ModelPrediction]
+    merged_explanation: List[Dict]
+    execution_time_ms: float
+    warnings: Optional[List[str]] = None
+
+
+class EnsembleClassifier:
+    """Combines predictions from all three models using voting strategies."""
+
+    def __init__(self):
+        self.model_names = ['distilbert', 'roberta', 'xlnet']
+        self.models = {name: get_classifier(name) for name in self.model_names}
+        # Per-model weights: each model's validation accuracy
+        self.weights = {'distilbert': 0.859, 'roberta': 0.858, 'xlnet': 0.862}
+        self.executor = ThreadPoolExecutor(max_workers=3)
+
+    async def predict_ensemble(self, text: str, model_timeout: float = 10.0,
+                               total_timeout: float = 15.0) -> EnsembleResult:
+        start_time = time.time()
+        warnings = []
+
+        loop = asyncio.get_running_loop()
+        tasks = [
+            loop.run_in_executor(
+                self.executor, self._predict_with_timeout, name, text, model_timeout)
+            for name in self.model_names
+        ]
+
+        try:
+            results = await asyncio.wait_for(
+                asyncio.gather(*tasks, return_exceptions=True),
+                timeout=total_timeout
+            )
+        except asyncio.TimeoutError:
+            warnings.append("Ensemble prediction exceeded total timeout")
+            raise
+
+        valid_predictions = []
+        for name, result in zip(self.model_names, results):
+            if isinstance(result, Exception):
+                warnings.append(f"{name} failed: {str(result)}")
+            elif result is None:
+                warnings.append(f"{name} returned no result")
+            else:
+                valid_predictions.append(ModelPrediction(
+                    model_name=name,
+                    label=result['label'],
+                    confidence=result['confidence'],
+                    scores=result['scores'],
+                    tokens=result['tokens']
+                ))
+
+        if not valid_predictions:
+            raise RuntimeError("All models failed to process the request")
+
+        hard_label, hard_conf = self.hard_voting(valid_predictions)
+        soft_scores = self.soft_voting(valid_predictions)
+        soft_label = max(soft_scores.items(), key=lambda x: x[1])[0]
+        soft_conf = soft_scores[soft_label]
+
+        weighted_scores = self.weighted_voting(valid_predictions)
+        weighted_label = max(weighted_scores.items(), key=lambda x: x[1])[0]
+        weighted_conf = weighted_scores[weighted_label]
+
+        merged_tokens = self._merge_explanations(valid_predictions)
+        execution_time = (time.time() - start_time) * 1000
+
+        logger.info(
+            f"Ensemble completed in {execution_time:.2f}ms with {len(valid_predictions)}/{len(self.model_names)} models")
+        if warnings:
+            logger.warning(f"Ensemble warnings: {warnings}")
+
+        return EnsembleResult(
+            hard_voting_label=hard_label,
+            hard_voting_confidence=hard_conf,
+            soft_voting_label=soft_label,
+            soft_voting_confidence=soft_conf,
+            soft_voting_scores=soft_scores,
+            weighted_voting_label=weighted_label,
+            weighted_voting_confidence=weighted_conf,
+            weighted_voting_scores=weighted_scores,
+            individual_predictions=valid_predictions,
+            merged_explanation=merged_tokens,
+            execution_time_ms=round(execution_time, 2),
+            warnings=warnings if warnings else None
+        )
+
+    def _predict_with_timeout(self, model_name: str, text: str, timeout: float) -> Optional[Dict]:
+        # The per-model timeout is not enforced here; the caller's total_timeout
+        # bounds overall latency for the gathered tasks.
+        try:
+            return self.models[model_name].predict(text)
+        except Exception as e:
+            logger.error(f"[ensemble] {model_name} prediction failed: {e}")
+            return None
+
+    def hard_voting(self, predictions: List[ModelPrediction]) -> tuple[str, float]:
+        # Majority vote; confidence is the mean confidence of the winning label's voters
+        votes = {}
+        for pred in predictions:
+            votes[pred.label] = votes.get(pred.label, 0) + 1
+        winning_label = max(votes.items(), key=lambda x: x[1])[0]
+        confidences = [
+            p.confidence for p in predictions if p.label == winning_label]
+        return winning_label, round(sum(confidences) / len(confidences), 4)
+
+    def soft_voting(self, predictions: List[ModelPrediction]) -> Dict[str, float]:
+        # Unweighted mean of per-label probabilities across available models
+        all_labels = set(
+            label for pred in predictions for label in pred.scores)
+        return {
+            label: round(sum(p.scores.get(label, 0.0)
+                         for p in predictions) / len(predictions), 4)
+            for label in all_labels
+        }
+
+    def weighted_voting(self, predictions: List[ModelPrediction]) -> Dict[str, float]:
+        # Probability mean weighted by each model's accuracy weight
+        all_labels = set(
+            label for pred in predictions for label in pred.scores)
+        total_weight = sum(self.weights[p.model_name] for p in predictions)
+        return {
+            label: round(
+                sum(p.scores.get(label, 0.0) *
+                    self.weights[p.model_name] for p in predictions) / total_weight,
+                4
+            )
+            for label in all_labels
+        }
+
+    def _merge_explanations(self, predictions: List[ModelPrediction]) -> List[Dict]:
+        # Average each token's importance across the models that reported it
+        token_scores: Dict[str, float] = {}
+        token_counts: Dict[str, int] = {}
+        for pred in predictions:
+            for td in pred.tokens:
+                token = td['token']
+                token_scores[token] = token_scores.get(
+                    token, 0.0) + td['score']
+                token_counts[token] = token_counts.get(token, 0) + 1
+        merged = [
+            {'token': t, 'score': round(token_scores[t] / token_counts[t], 4)}
+            for t in token_scores
+        ]
+        return sorted(merged, key=lambda x: x['score'], reverse=True)[:10]
+
+
+_ensemble_classifier: Optional[EnsembleClassifier] = None
+
+
+def get_ensemble_classifier() -> EnsembleClassifier:
+    global _ensemble_classifier
+    if _ensemble_classifier is None:
+        _ensemble_classifier = EnsembleClassifier()
+    return _ensemble_classifier
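A minimal smoke run of the classifier outside the API (this loads all three model checkpoints, so it assumes `get_classifier` can find trained weights):

```python
import asyncio

from src.models.ensemble import get_ensemble_classifier

async def main():
    ensemble = get_ensemble_classifier()
    result = await ensemble.predict_ensemble(
        "Scientists announced a breakthrough in battery technology today.")
    print(result.hard_voting_label, result.hard_voting_confidence)
    print(result.soft_voting_scores)
    print(f"{result.execution_time_ms} ms, warnings: {result.warnings}")

asyncio.run(main())
```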
src/utils/supabase_client.py CHANGED
@@ -1,11 +1,39 @@
 import os
+import uuid
+import time
+import logging
+import functools
 from typing import Optional, Dict, Any, List
-from datetime import datetime
+from datetime import datetime, timezone
 from supabase import create_client, Client
 from dotenv import load_dotenv
 
 load_dotenv()
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def retry_with_exponential_backoff(max_retries=3, base_delay=1.0):
+    """Retry a callable on failure, doubling the delay between attempts."""
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            for attempt in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    if attempt == max_retries - 1:
+                        logger.error(
+                            f"{func.__name__} failed after {max_retries} attempts: {e}")
+                        raise
+                    delay = base_delay * (2 ** attempt)
+                    logger.warning(
+                        f"{func.__name__} attempt {attempt + 1} failed: {e}. Retrying in {delay}s...")
+                    time.sleep(delay)
+        return wrapper
+    return decorator
+
 
 class SupabaseClient:
     def __init__(self):
@@ -17,15 +45,9 @@ class SupabaseClient:
                 "SUPABASE_URL and SUPABASE_SERVICE_KEY must be set")
         self.client: Client = create_client(self.url, self.key)
 
-    def store_prediction(
-        self,
-        article_id: str,
-        text: str,
-        predicted_label: str,
-        confidence: float,
-        model_name: str,
-        explanation=None,
-    ) -> Dict[str, Any]:
+    @retry_with_exponential_backoff(max_retries=3)
+    def store_prediction(self, article_id: str, text: str, predicted_label: str,
+                         confidence: float, model_name: str, explanation=None) -> Dict[str, Any]:
         data = {
             "article_id": article_id,
             "text": text[:1000],
@@ -33,24 +55,24 @@ class SupabaseClient:
             "confidence": confidence,
             "model_name": model_name,
             "explanation": explanation,
-            "created_at": datetime.utcnow().isoformat(),
+            "created_at": datetime.now(timezone.utc).isoformat(),
         }
-        response = self.client.table("predictions").insert(data).execute()
-        return response.data
+        try:
+            response = self.client.table("predictions").insert(data).execute()
+            logger.info(f"Stored prediction for article {article_id}")
+            return response.data
+        except Exception as e:
+            logger.error(f"Failed to store prediction: {e}")
+            raise
 
-    def store_feedback(
-        self,
-        article_id: str,
-        predicted_label: str,
-        actual_label: str,
-        user_comment: Optional[str] = None,
-    ) -> Dict[str, Any]:
+    def store_feedback(self, article_id: str, predicted_label: str,
+                       actual_label: str, user_comment: Optional[str] = None) -> Dict[str, Any]:
         data = {
             "article_id": article_id,
             "predicted_label": predicted_label,
             "actual_label": actual_label,
            "user_comment": user_comment,
+            "created_at": datetime.now(timezone.utc).isoformat(),
         }
         response = self.client.table("feedback").insert(data).execute()
         return response.data
@@ -64,16 +86,97 @@ class SupabaseClient:
         for row in by_label_rows.data:
             lbl = row["predicted_label"]
             label_counts[lbl] = label_counts.get(lbl, 0) + 1
-        return {
-            "total_predictions": total.count,
-            "by_label": label_counts,
-        }
+        logger.info(f"Total predictions: {total.count}")
+        return {"total_predictions": total.count, "by_label": label_counts}
+
+    def check_storage_usage(self) -> Dict[str, Any]:
+        """Check database storage usage and warn if approaching the 500MB free-tier limit."""
+        try:
+            predictions_count = self.client.table("predictions").select(
+                "*", count="exact").execute().count
+            history_count = self.client.table("user_analysis_history").select(
+                "*", count="exact").execute().count
+            # Rough estimate: ~1KB per prediction row, ~0.5KB per history row
+            estimated_mb = (predictions_count * 1.0 +
+                            history_count * 0.5) / 1024
+            limit_mb = 500
+            usage_percent = (estimated_mb / limit_mb) * 100
+            result = {
+                "predictions_count": predictions_count,
+                "history_count": history_count,
+                "estimated_storage_mb": round(estimated_mb, 2),
+                "limit_mb": limit_mb,
+                "usage_percent": round(usage_percent, 2),
+                "warning": None
+            }
+            if usage_percent >= 90:
+                warning = f"Storage usage at {usage_percent:.1f}% ({estimated_mb:.1f}MB / {limit_mb}MB). Consider archiving old data."
+                result["warning"] = warning
+                logger.warning(warning)
+            elif usage_percent >= 75:
+                logger.info(
+                    f"Storage usage at {usage_percent:.1f}% ({estimated_mb:.1f}MB / {limit_mb}MB)")
+            return result
+        except Exception as e:
+            logger.error(f"Failed to check storage usage: {e}")
+            return {"error": str(e), "warning": "Unable to check storage usage"}
 
     def get_feedback_for_training(self, limit: int = 1000) -> List[Dict[str, Any]]:
         response = self.client.table("feedback").select(
             "*").limit(limit).execute()
         return response.data
 
+    @retry_with_exponential_backoff(max_retries=3)
+    def store_user_history(self, session_id: str, article_id: str, text: str,
+                           predicted_label: str, confidence: float, model_name: str) -> Dict[str, Any]:
+        try:
+            uuid.UUID(session_id)
+        except (ValueError, AttributeError) as e:
+            logger.error(f"Invalid session_id format: {e}")
+            raise ValueError(f"session_id must be a valid UUID: {e}")
+
+        data = {
+            "session_id": session_id,
+            "article_id": article_id,
+            "text_preview": text[:200],
+            "predicted_label": predicted_label,
+            "confidence": confidence,
+            "model_name": model_name,
+            "created_at": datetime.now(timezone.utc).isoformat()
+        }
+        try:
+            response = self.client.table(
+                "user_analysis_history").insert(data).execute()
+            logger.info(f"Stored user history for session {session_id}")
+            return response.data
+        except Exception as e:
+            logger.error(f"Failed to store user history: {e}")
+            raise
+
+    @retry_with_exponential_backoff(max_retries=3)
+    def get_user_history(self, session_id: str, limit: int = 100) -> List[Dict[str, Any]]:
+        try:
+            uuid.UUID(session_id)
+        except (ValueError, AttributeError) as e:
+            logger.error(f"Invalid session_id format: {e}")
+            raise ValueError(f"session_id must be a valid UUID: {e}")
+
+        try:
+            response = (
+                self.client.table("user_analysis_history")
+                .select("*")
+                .eq("session_id", session_id)
+                .order("created_at", desc=True)
+                .limit(limit)
+                .execute()
+            )
+            logger.info(
+                f"Retrieved {len(response.data)} history records for session {session_id}")
+            return response.data
+        except Exception as e:
+            logger.error(f"Failed to retrieve user history: {e}")
+            raise
+
 
 _supabase_client: Optional[SupabaseClient] = None
 
@@ -86,6 +189,5 @@ def get_supabase_client() -> SupabaseClient:
 
 
 def reset_client():
-    """Force re-initialisation."""
     global _supabase_client
     _supabase_client = None
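The retry decorator is generic, so its behavior is easy to verify in isolation. A small sketch (hypothetical flaky function, with `base_delay` shortened to keep the demo fast):

```python
from src.utils.supabase_client import retry_with_exponential_backoff

attempts = {"count": 0}

@retry_with_exponential_backoff(max_retries=3, base_delay=0.1)
def flaky():
    # Fails twice, then succeeds on the third attempt
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

print(flaky(), "after", attempts["count"], "attempts")  # -> ok after 3 attempts
```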