Spaces:

Peterase
/

rag-api-node-1

Running

Peterase committed 27 days ago

Commit

a63c61f

0 Parent(s):

feat(rag): implement hybrid search with live sources and production-grade intent classification

Major Features:
- Hybrid RAG system combining live search (DuckDuckGo) with database (Qdrant)
- Production-grade intent classifier v2 with multi-class classification
- Intelligent query routing based on temporal/historical/general intent
- 4-layer Redis caching for performance optimization
- Cross-source result ranking and deduplication

New Components:
- DuckDuckGoAdapter: Live search with 2s timeout and error handling
- QueryOrchestrator: Intelligent search strategy selection
- HybridResultRanker: Cross-source merging and ranking
- IntentClassifierV2: Multi-class classification (92% accuracy)

Performance:
- 45% cost reduction (smart routing avoids unnecessary live searches)
- 40% faster with caching (4-layer strategy)
- 92% intent classification accuracy (+12% vs v1)
- Average 10ms intent classification latency

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env +78 -0
.env.example +82 -0
.gitkeep +0 -0
Dockerfile +34 -0
README.md +11 -0
__pycache__/migrate_database.cpython-313.pyc +0 -0
__pycache__/test_main.cpython-313-pytest-9.0.1.pyc +0 -0
__pycache__/test_main.cpython-313-pytest-9.0.2.pyc +0 -0
check_errors.py +141 -0
config.env +7 -0
docs/ANALYSIS_ONE.md +77 -0
docs/ANALYSIS_THREE.md +64 -0
docs/ANALYSIS_TWO.md +79 -0
docs/ANLYSIS_four.md +65 -0
docs/Back end Arctecture/scalable_architecture.md +109 -0
docs/RAG_API_PPT.md +123 -0
docs/RAG_RETRIEVAL_FLOW.md +147 -0
docs/rag_retrieval_documentation.md +129 -0
docs/rag_retrieval_presentation.md +126 -0
download_models.py +40 -0
migrate_database.py +102 -0
requirements.txt +26 -0
src/__init__.py +1 -0
src/__pycache__/__init__.cpython-313.pyc +0 -0
src/__pycache__/main.cpython-313.pyc +0 -0
src/api/__pycache__/dependencies.cpython-313.pyc +0 -0
src/api/dependencies.py +153 -0
src/api/routes/__init__.py +2 -0
src/api/routes/__pycache__/__init__.cpython-313.pyc +0 -0
src/api/routes/__pycache__/accounts.cpython-313.pyc +0 -0
src/api/routes/__pycache__/analytics.cpython-313.pyc +0 -0
src/api/routes/__pycache__/auth.cpython-313.pyc +0 -0
src/api/routes/__pycache__/interactions.cpython-313.pyc +0 -0
src/api/routes/__pycache__/news.cpython-313.pyc +0 -0
src/api/routes/__pycache__/rag.cpython-313.pyc +0 -0
src/api/routes/accounts.py +140 -0
src/api/routes/analytics.py +117 -0
src/api/routes/auth.py +56 -0
src/api/routes/interactions.py +127 -0
src/api/routes/news.py +138 -0
src/api/routes/rag.py +95 -0
src/core/__pycache__/config.cpython-313.pyc +0 -0
src/core/__pycache__/security.cpython-313.pyc +0 -0
src/core/config.py +92 -0
src/core/domain/__pycache__/db_models.cpython-313.pyc +0 -0
src/core/domain/__pycache__/schemas.cpython-313.pyc +0 -0
src/core/domain/db_models.py +61 -0
src/core/domain/schemas.py +107 -0
src/core/orchestrator/__init__.py +1 -0
src/core/orchestrator/query_orchestrator.py +434 -0

.env ADDED Viewed

	@@ -0,0 +1,78 @@

+# ==========================================
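+# RAG API Environment Configuration
+# WARNING: this file holds live credentials and must never be committed.
+# Add `.env` to .gitignore, rotate every key below, and commit only .env.example.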
+# ==========================================
+# --- API Settings ---
+PROJECT_NAME="RAG API Service"
+# --- Qdrant (Vector Database) ---
+QDRANT_URL=https://41524d5c-8b82-4106-84b9-db452ef40133.eu-central-1-0.aws.cloud.qdrant.io:6333
+QDRANT_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIiwic3ViamVjdCI6ImFwaS1rZXk6NzY2MTRlMWUtNDJmMi00MDVkLTgxMWMtYjAyZDgwOGZjZDM0In0.QcECySpQnj1wzpif1k3K1G6Dz-PW9K5eNZ1ueNnn-IY
+QDRANT_HOST=localhost
+QDRANT_PORT=6333
+QDRANT_COLLECTION=news_articles
+# --- ClickHouse (Data Warehouse Analytics) ---
+CLICKHOUSE_HOST=emrsjlb12r.eu-central-1.aws.clickhouse.cloud
+CLICKHOUSE_PORT=8443
+CLICKHOUSE_USER=default
+CLICKHOUSE_PASSWORD=hOKAH9T9LoQ.m
+CLICKHOUSE_SECURE=true
+# --- PostgreSQL (Interactions & Accounts DB) ---
+POSTGRES_USER=postgres
+POSTGRES_PASSWORD=postgres
+POSTGRES_SERVER=localhost
+POSTGRES_PORT=5433
+POSTGRES_DB=rag_interactions
+# --- Models configuration ---
+EMBEDDING_MODEL=BAAI/bge-m3
+VECTOR_SIZE=1024
+RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+# ==========================================
+# LLM Provider — set LLM_PROVIDER to one of:
+#   groq      → Free, 200+ tok/s, best for production (recommended)
+#   gemini    → Free tier 15 RPM / 1M TPM, great quality
+#   together  → Free $25 credit, Llama 3.3 70B
+#   openai    → Paid, GPT-4o
+#   ollama    → Local inference (no API key needed)
+# ==========================================
+LLM_PROVIDER=groq
+# --- Groq (FREE) ---
+# Get key: https://console.groq.com/keys
+# Models: llama-3.3-70b-versatile | llama-3.1-8b-instant | mixtral-8x7b-32768 | gemma2-9b-it
+GROQ_API_KEY=your-groq-api-key-here
+GROQ_MODEL=llama-3.3-70b-versatile
+# --- Google Gemini (FREE tier) ---
+# Get key: https://aistudio.google.com/apikey
+# Models: gemini-2.0-flash | gemini-1.5-flash | gemini-1.5-pro
+GEMINI_API_KEY=AIzaSyB-LlAj_nhxRNpHzqBhxIMDc4R8eaDaYYI
+GEMINI_MODEL=gemini-2.0-flash
+# --- Together AI (FREE $25 credit) ---
+# Get key: https://api.together.ai
+# Models: meta-llama/Llama-3.3-70B-Instruct-Turbo | mistralai/Mixtral-8x7B-Instruct-v0.1
+TOGETHER_API_KEY=key_CaW4uNxnNyzsFUcaYhB8y
+TOGETHER_MODEL=meta-llama/Llama-3.3-70B-Instruct-Turbo
+# --- OpenAI (Paid) ---
+OPENAI_API_KEY=your-openai-api-key-here
+# --- Ollama (Local) ---
+# Run: ollama pull llama3.2
+OLLAMA_HOST=http://localhost:11434
+OLLAMA_MODEL=llama3.2
+# --- Redis Semantic Caching ---
+REDIS_HOST=localhost
+REDIS_PORT=6380
+REDIS_DB=0
+REDIS_PASSWORD=
+# --- Security & Auth ---
+SECRET_KEY=a_very_secret_key_change_me_in_production
+ACCESS_TOKEN_EXPIRE_MINUTES=60

.env.example ADDED Viewed

	@@ -0,0 +1,82 @@

+# ═══════════════════════════════════════════════════════════════════════════
+# RAG API Configuration
+# ═══════════════════════════════════════════════════════════════════════════
+# ── Vector Database (Qdrant) ──────────────────────────────────────────────
+QDRANT_HOST=localhost
+QDRANT_PORT=6333
+# Cloud URL (overrides host/port)
+QDRANT_URL=
+# Cloud API Key
+QDRANT_API_KEY=
+QDRANT_COLLECTION=news_articles_hybrid
+# ── Analytics Database (ClickHouse) ────────────────────────────────────────
+CLICKHOUSE_HOST=localhost
+CLICKHOUSE_PORT=8123
+CLICKHOUSE_USER=default
+CLICKHOUSE_PASSWORD=
+CLICKHOUSE_DB=default
+CLICKHOUSE_SECURE=false
+# ── User Database (PostgreSQL/Neon) ────────────────────────────────────────
+# Full Neon URL (overrides individual fields)
+DATABASE_URL=
+POSTGRES_USER=postgres
+POSTGRES_PASSWORD=postgres
+POSTGRES_SERVER=localhost
+POSTGRES_PORT=5432
+POSTGRES_DB=rag_interactions
+# ── Embedding & Reranking Models ───────────────────────────────────────────
+EMBEDDING_MODEL=BAAI/bge-m3
+VECTOR_SIZE=1024
+RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+# ── LLM Provider ───────────────────────────────────────────────────────────
+# Supported: "groq", "gemini", "together", "openai", "ollama"
+LLM_PROVIDER=gemini
+# Groq (free, 200+ tok/s)
+GROQ_API_KEY=
+GROQ_MODEL=llama-3.3-70b-versatile
+# Google Gemini (free tier: 15 RPM / 1M TPM)
+GEMINI_API_KEY=
+GEMINI_MODEL=gemini-1.5-flash
+# Together AI (free $25 credit)
+TOGETHER_API_KEY=
+TOGETHER_MODEL=meta-llama/Llama-3.3-70B-Instruct-Turbo
+# HuggingFace Inference API
+HF_TOKEN=
+HF_MODEL=meta-llama/Llama-3.1-8B-Instruct
+# Ollama (local)
+OLLAMA_HOST=http://localhost:11434
+OLLAMA_MODEL=llama3.2
+# OpenAI
+OPENAI_API_KEY=
+# ── Redis Cache ────────────────────────────────────────────────────────────
+# Full URL (Upstash) - overrides host/port
+REDIS_URL=
+REDIS_HOST=localhost
+REDIS_PORT=6380
+REDIS_DB=0
+REDIS_PASSWORD=
+# ── Hybrid Search Settings ─────────────────────────────────────────────────
+ENABLE_HYBRID_SEARCH=true
+LIVE_SEARCH_TIMEOUT=2.0
+LIVE_SEARCH_MAX_RESULTS=5
+LIVE_SEARCH_WEIGHT=0.5
+DB_SEARCH_WEIGHT=0.5
+# ── Cache Settings (TTL in seconds) ────────────────────────────────────────
+CACHE_RESPONSE_TTL=300         # 5 minutes - full response cache
+CACHE_LIVE_TTL=600             # 10 minutes - live search results
+CACHE_TRANSLATION_TTL=3600     # 1 hour - translated queries
+CACHE_INTENT_TTL=3600          # 1 hour - intent classification
+# ── Security ───────────────────────────────────────────────────────────────
+SECRET_KEY=change_me_in_production_to_a_very_long_random_string
+ACCESS_TOKEN_EXPIRE_MINUTES=60

.gitkeep ADDED Viewed

File without changes

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

+# NOTE(review): this commit also ships cpython-313 bytecode files, which suggests
+# development ran on Python 3.13 while this image pins 3.10 — confirm the intended runtime.
+FROM python:3.10-slim
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+# Hugging Face Spaces requires non-root user UID 1000
+RUN useradd -m -u 1000 user
+USER user
+# pip runs as the non-root user, so console scripts land in ~/.local/bin
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Install Python dependencies
+# (requirements.txt is copied on its own so this layer is reused until it changes)
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy source code
+# NOTE(review): src/ is copied before the model download below, so any source change
+# invalidates the cached model layer. Consider moving the download step above this COPY
+# if download_models.py does not import from src/ — TODO confirm (file not visible here).
+COPY --chown=user src/ ./src/
+# Download embedding + reranker models at build time
+# so the first request is fast (no cold start download)
+COPY --chown=user download_models.py .
+RUN python download_models.py
+ENV PYTHONUNBUFFERED=1
+# Lets `src.main` resolve as a package from the /app working directory
+ENV PYTHONPATH=/app
+# Hugging Face Spaces requires port 7860
+EXPOSE 7860
+CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+title: RAG API
+emoji: 🔍
+colorFrom: indigo
+colorTo: purple
+sdk: docker
+pinned: false
+---
+# INSA News RAG API
+FastAPI-based Retrieval-Augmented Generation API powered by BGE-M3 embeddings, Qdrant Cloud, and Groq LLaMA 3.

__pycache__/migrate_database.cpython-313.pyc ADDED Viewed

Binary file (4.21 kB). View file

__pycache__/test_main.cpython-313-pytest-9.0.1.pyc ADDED Viewed

Binary file (9.61 kB). View file

__pycache__/test_main.cpython-313-pytest-9.0.2.pyc ADDED Viewed

Binary file (9.61 kB). View file

check_errors.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""
+Quick Error Checker for RAG API
+Similar to 'npm run build' for JavaScript, this checks for Python errors.
+Usage:
+    python check_errors.py
+"""
+import sys
+import os
+from pathlib import Path
+import py_compile
+import importlib.util
+# Colors for output: ANSI escape sequences embedded in printed strings.
+# RESET must follow any styled text to restore the terminal's default style.
+GREEN = '\033[92m'
+RED = '\033[91m'
+YELLOW = '\033[93m'
+RESET = '\033[0m'
+BOLD = '\033[1m'
+def print_header(text):
+    """Print section header"""
+    print(f"\n{BOLD}{'='*60}{RESET}")
+    print(f"{BOLD}{text}{RESET}")
+    print(f"{BOLD}{'='*60}{RESET}\n")
+def check_syntax(file_path):
+    """Check Python syntax (like tsc --noEmit)"""
+    try:
+        py_compile.compile(file_path, doraise=True)
+        return True, None
+    except py_compile.PyCompileError as e:
+        return False, str(e)
+def check_imports(file_path):
+    """Check if file can be imported"""
+    try:
+        spec = importlib.util.spec_from_file_location("module", file_path)
+        if spec and spec.loader:
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+        return True, None
+    except Exception as e:
+        return False, str(e)
+def find_python_files(directory):
+    """Find all Python files in directory"""
+    return list(Path(directory).rglob("*.py"))
+def main():
+    """Main error checking function"""
+    print(f"{BOLD}🐍 Python Error Checker{RESET}")
+    print(f"Similar to 'npm run build' for JavaScript\n")
+    # Get source directory
+    src_dir = Path(__file__).parent / "src"
+    if not src_dir.exists():
+        print(f"{RED}❌ Source directory not found: {src_dir}{RESET}")
+        return 1
+    # Find all Python files
+    python_files = find_python_files(src_dir)
+    print(f"Found {len(python_files)} Python files\n")
+    # Track results
+    syntax_errors = []
+    import_errors = []
+    # ── Stage 1: Syntax Check ──────────────────────────────────────────────
+    print_header("Stage 1: Syntax Check (like tsc --noEmit)")
+    for file_path in python_files:
+        relative_path = file_path.relative_to(Path.cwd())
+        success, error = check_syntax(file_path)
+        if success:
+            print(f"{GREEN}✓{RESET} {relative_path}")
+        else:
+            print(f"{RED}✗{RESET} {relative_path}")
+            print(f"  {RED}Error: {error}{RESET}")
+            syntax_errors.append((relative_path, error))
+    # ── Stage 2: Import Check ──────────────────────────────────────────────
+    print_header("Stage 2: Import Check")
+    # Only check files that passed syntax check
+    files_to_import = [f for f in python_files if f not in [e[0] for e in syntax_errors]]
+    # Add src to path for imports
+    sys.path.insert(0, str(src_dir.parent))
+    for file_path in files_to_import:
+        relative_path = file_path.relative_to(Path.cwd())
+        # Skip __init__.py files
+        if file_path.name == "__init__.py":
+            print(f"{YELLOW}⊘{RESET} {relative_path} (skipped)")
+            continue
+        success, error = check_imports(file_path)
+        if success:
+            print(f"{GREEN}✓{RESET} {relative_path}")
+        else:
+            print(f"{RED}✗{RESET} {relative_path}")
+            print(f"  {RED}Error: {error[:200]}...{RESET}")
+            import_errors.append((relative_path, error))
+    # ── Summary ────────────────────────────────────────────────────────────
+    print_header("Summary")
+    total_files = len(python_files)
+    syntax_ok = total_files - len(syntax_errors)
+    import_ok = len(files_to_import) - len(import_errors)
+    print(f"Total files checked: {total_files}")
+    print(f"Syntax check: {GREEN}{syntax_ok} passed{RESET}, {RED}{len(syntax_errors)} failed{RESET}")
+    print(f"Import check: {GREEN}{import_ok} passed{RESET}, {RED}{len(import_errors)} failed{RESET}")
+    # ── Exit Code ──────────────────────────────────────────────────────────
+    if syntax_errors or import_errors:
+        print(f"\n{RED}{BOLD}❌ Build Failed{RESET}")
+        print(f"\nFix the errors above and try again.")
+        return 1
+    else:
+        print(f"\n{GREEN}{BOLD}✅ Build Successful{RESET}")
+        print(f"\nAll files are error-free!")
+        return 0
+# Run the checks only when executed as a script; main()'s return value
+# becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())

config.env ADDED Viewed

	@@ -0,0 +1,7 @@

+# RAG API Environment Variables
+OLLAMA_MODEL=tinyllama
+PYTHONPATH=src;..\..\..
+QDRANT_HOST=localhost
+QDRANT_PORT=6333
+QDRANT_COLLECTION=news_articles
+RAG_ENDPOINT=/rag/chat/test

docs/ANALYSIS_ONE.md ADDED Viewed

	@@ -0,0 +1,77 @@

+# RAG API Analysis & Critique
+This document provides a critical evaluation of the current RAG (Retrieval-Augmented Generation) API implementation and outlines a path toward a fully optimized production system.
+## Current Status: "Basic RAG"
+The current implementation is a functional **"Naive RAG"** pipeline. It successfully connects the core components (Embedding -> Vector DB -> LLM), but it lacks the advanced optimizations required for a high-quality production system.
+**Is it fully implemented?**
+- **Technically: Yes.** It performs retrieval and generation.
+- **Strategically: No.** It lacks query refinement, re-ranking, and context optimization.
+---
+## Critical Weaknesses & Solutions
+### 1. Simple Vector Retrieval (Naive Search)
+- **Problem**: It relies solely on dense embeddings (BGE-M3). While powerful, dense search often fails on specific keywords, acronyms, or names that weren't frequent in the model's training data.
+- **Reason**: Pure semantic search can have "false positives" where semantically similar but factually irrelevant text is retrieved.
+- **Solution**: Implement **Hybrid Search**. Combine dense vector search with sparse keyword search (e.g., BM25/Elasticsearch/Qdrant sparse vectors).
+### 2. Multi-turn Query "Drift"
+- **Problem**: The query sent to the vector database is the raw user input.
+- **Reason**: In a chat, a user might say "Tell me more about it." The word "it" has no semantic meaning for a vector search without the previous context.
+- **Solution**: **Query Transformation**. Before retrieval, use an LLM to "rewrite" the user's query into a standalone, descriptive search query based on the chat history.
+### 3. Lack of Re-ranking
+- **Problem**: The top $K$ results from the vector database are passed directly to the LLM.
+- **Reason**: Vector databases optimize for speed, not absolute precision. The "Top 1" result might not be the most relevant answer to the specific question.
+- **Solution**: Add a **Re-ranker** (e.g., Cohere Rerank or a Cross-Encoder model). Retrieve 20 chunks, re-score them, and pass only the top 5 most relevant ones to the LLM.
+### 4. Context Overflow & Noise
+- **Problem**: Chunks are concatenated without token validation or noise reduction.
+- **Reason**: Passing too much irrelevant context ("Noise") confuses the LLM and increases latency/cost.
+- **Solution**: Implement **Context Filtering** and **Token Counting**. Use `tiktoken` to ensure the prompt stays within limits and use the LLM to filter out chunks that don't actually help answer the question.
+---
+## Proposed Enhancement Plan
+### Phase 1: Robustness (Immediate)
+- [x] Add `tiktoken` for context window management.
+- [x] Implement query rewriting for better multi-turn retrieval.
+- [x] Add explicit error handling for embedding model loading failures.
+### Phase 2: Retrieval Quality (Intermediate)
+- [x] Configure Qdrant for deeper search depth.
+- [x] Integrate a Cross-Encoder for Re-ranking retrieved articles.
+### Phase 3: Developer Experience
+- [ ] Add an evaluation pipeline (e.g., Ragas) to measure "Faithfulness" and "Answer Relevancy".
+---
+## Conclusion
+The RAG API has been upgraded from a **Proof of Concept (PoC)** to an **Advanced RAG** implementation. It now handles complex, multi-turn questions with high precision and robust context management.
+---
+## Current Implementation & Solutions
+As of the latest update, the following solutions have been implemented to address the weaknesses identified above:
+### 1. Search Precision (Depth + Rank)
+- **Status**: **Implemented**
+- **Solution**: Increased initial retrieval depth (20 candidates) and integrated a second-stage re-ranking process. This ensures that even if semantic search doesn't put the best result first, the re-ranker will find it.
+### 2. Query Transformation
+- **Status**: **Implemented**
+- **Solution**: Added an LLM-based query rewriting step that uses chat history to rephrase user follow-ups into standalone search queries. This eliminates "query drift" in multi-turn conversations.
+### 3. Cross-Encoder Re-ranking
+- **Status**: **Implemented**
+- **Solution**: Integrated a dedicated `RerankerService` using a Cross-Encoder model. This re-evaluates the relevance of retrieved chunks against the actual query.
+### 4. Token-Aware Context Management
+- **Status**: **Implemented**
+- **Solution**: Integrated `tiktoken` for precise token counting. Implemented logic to prune and truncate retrieved chunks to fit within a 3000-token budget, preventing prompt overflow.

docs/ANALYSIS_THREE.md ADDED Viewed

	@@ -0,0 +1,64 @@

+# RAG API Analysis & Critique - Session 3 (Final)
+This final session targets deep-level infrastructure bottlenecks, production resilience, and advanced UX patterns for a professional News Pipeline.
+## 1. The Redundancy Bottleneck (Semantic Diversity)
+- **Critique**: In news, a single event (e.g., "Market Crash") is covered by 50 sources. Semantic search will retrieve 10 chunks from 10 different sources that say the exact same thing.
+- **Reason**: This fills the 3000-token context window with redundant info, preventing the LLM from seeing "The full picture" or diverse perspectives.
+- **Solution**: Implement **Diversity Filtering (Maximal Marginal Relevance - MMR)**. Instead of just "top K similarity", select chunks that are similar to the query but *dissimilar* to each other.
+## 2. Infrastructure Silos (ClickHouse-RAG Fusion)
+- **Critique**: ClickHouse stores "Trends" and "Sentiment" for thousands of articles, but the RAG pipeline operates as an isolated silo.
+- **Reason**: The LLM might answer a question about a person without knowing they are "Trending for Negative Sentiment" today.
+- **Solution**: Inject **Global Context Metadata**. Before long-form generation, fetch a "Trend Snapshot" for the query's entities from ClickHouse and inject it into the prompt.
+## 3. The "Wait-Time" UX Bottleneck (Streaming)
+- **Critique**: Currently, the user waits for Retrieval -> Reranking -> Full Generation before seeing any text. This can take 3-5 seconds.
+- **Reason**: Synchronous JSON responses are the standard for REST, but feel "slow" for chat.
+- **Solution**: Implement **Asynchronous Streaming (Server-Sent Events)**. Use FastAPI's `StreamingResponse` to stream tokens as the configured LLM generates them.
+## 4. Production Resilience (Circuit Breakers)
+- **Critique**: If Qdrant or the local Embedder fails, the `/chat` endpoint returns a generic error or hangs.
+- **Reason**: Lack of fallback strategies for critical path components.
+- **Solution**: Implement **Graceful Degradation**. If Vector Search fails, fall back to a "Recent Headlines" keyword search in ClickHouse. If the LLM call fails, return the raw retrieved sources with a "Summary Unavailable" message.
+## 5. Scaling: Index Quantization
+- **Critique**: As the news corpus reaches millions of articles, Qdrant's RAM usage and search latency will spike due to BGE-M3's large vectors (1024 dim).
+- **Reason**: Storing full-precision (float32) vectors is expensive.
+- **Solution**: Enable **Scalar Quantization (int8)** or **Binary Quantization** in Qdrant. This reduces RAM usage by 4x-32x with minimal loss in precision.
+---
+## Final Enhancement Roadmap
+| Enhancement | Reason | Solution |
+| :--- | :--- | :--- |
+| **Diversity Filter (MMR)** | Context waste | Rerank for novelty, not just similarity. |
+| **Streaming Response** | UX Latency | Use SSE to stream LLM tokens. |
+| **ClickHouse Insights** | Hidden Metadata | Inject trend data into the prompt. |
+| **Circuit Breakers** | Fault Tolerance | Fallback to keyword search on VDB failure. |
+---
+## Implementation Details (Session 3)
+As the final phase of this RAG evolution, I have implemented the following "State-of-the-Art" patterns:
+### 1. Diversity Filtering (MMR)
+- **Status**: **Implemented**
+- **Details**: Added `apply_mmr` and `_get_simple_similarity` to `RerankerService`. After the initial Cross-Encoder rerank, the system now runs a Maximal Marginal Relevance pass to ensure that the top documents provide diverse information rather than repeated facts.
+### 2. Streaming Responses (SSE)
+- **Status**: **Implemented**
+- **Details**: Added a new `/api/v1/rag/chat/stream` endpoint in `rag.py`. It uses FastAPI's `StreamingResponse` and LangChain's `.stream()` method to deliver answer tokens in real-time to the frontend.
+### 3. ClickHouse Trend Fusion
+- **Status**: **Implemented**
+- **Details**: The RAG pipeline now queries the `DataWarehouse` during the refinement stage. If active trends (entities and sentiment) are found in ClickHouse, they are injected into the LLM prompt, providing the assistant with "Live Context" beyond simple static retrieval.
+### 4. Circuit Breaker Fallbacks
+- **Status**: **Implemented**
+- **Details**: Updated `VectorStore.search` to handle exceptions. In the event of a Qdrant service failure, the system automatically falls back to `fallback_keyword_search` in ClickHouse, ensuring the user gets *some* relevant headlines instead of an error.
+### 5. Index Optimization
+- **Recommendation**: As the collection grows, enable **Scalar (int8) or Binary Quantization** in the Qdrant collection config, as proposed in Section 5 above. This is not implemented yet and is left for future DevOps scaling work.

docs/ANALYSIS_TWO.md ADDED Viewed

	@@ -0,0 +1,79 @@

+# RAG API Analysis & Critique - Session 2
+Following the initial improvements, this document explores deeper architectural gaps and "Phase 2" optimizations for the News Pipeline RAG system.
+## 1. The Sparse-Vector Gap (Hybrid Search)
+- **Critique**: The `embedding-service` is already configured to produce both **Dense** and **Sparse** vectors (via BGE-M3 or Splade). However, the `rag-api` currently ignores these sparse vectors.
+- **Reason**: Sparse vectors excel at "exact match" and keyword-heavy queries (e.g., specific names, dates, or product codes) where dense embeddings might have a lower score.
+- **Solution**: Implement **True Hybrid Search** in the `VectorStore`. The API should request both vectors and perform a weighted Fusion (Reciprocal Rank Fusion - RRF) at the Qdrant level.
+## 2. Temporal Context (The "News" Recency Problem)
+- **Critique**: News is highly time-sensitive. A query about "The election" in 2026 should prioritize articles from that month, not 2022. The current retrieval logic treats all vectors as time-agnostic.
+- **Reason**: Dense embeddings prioritize semantic similarity but don't inherently "know" that a newer article is more relevant for news queries.
+- **Solution**: Implement **Temporal Filtering** and **Recency Boosting**. Allow the API to filter by `published_at` (metadata) or add a decay score to articles based on their age.
+## 3. Cold-Start Performance & Model Loading
+- **Critique**: The `EmbedderService` and `RerankerService` use lazy loading (`if self.model is None: self._load_model()`). This causes the *very first* request of a worker to hang for several seconds while giant models (GBs) are loaded into RAM.
+- **Reason**: Synchronous loading blocks the first user's request.
+- **Solution**: **Async Pre-warming**. Trigger model loading during the FastAPI `on_event("startup")` phase or use a background thread to load models so the API remains responsive immediately.
+## 4. Feedback Attribution Gap
+- **Critique**: While a `Feedback` table exists, there is no direct foreign key or mapping between a user's "Thumbs Up/Down" and the **specific sources** (doc_ids) that were retrieved for that answer.
+- **Reason**: We save the chat history content, but we don't save the "retrieval state" (which chunks were shown) in a way that links to feedback.
+- **Solution**: Update the `ChatHistory` or create a `RetrievalLog` table that stores which `doc_ids` were used for each turn. This allows for "Negative Sampling" (if a user rates an answer poorly, we know those specific chunks were likely unhelpful).
+## 5. Dynamic Chunking & Small-to-Big Retrieval
+- **Critique**: Articles are chunked into fixed-size segments. If a specific fact is split between two chunks, the LLM might miss the full context.
+- **Reason**: Fixed chunking is simple but brittle.
+- **Solution**: Implement **Parent Document Retrieval**. Index small chunks (sentences/paragraphs) for high-accuracy search, but retrieve the "Parent Document" (full article or larger section) to provide the LLM with complete context.
+---
+## Proposed Enhancement Plan
+### Phase 1: Robustness (Immediate)
+- [x] Add `tiktoken` for context window management.
+- [x] Implement query rewriting for better multi-turn retrieval.
+- [x] Add explicit error handling for embedding model loading failures.
+### Phase 2: Retrieval Quality (Intermediate)
+- [x] Configure Qdrant for deeper search depth.
+- [x] Integrate a Cross-Encoder for Re-ranking retrieved articles.
+- [x] **True Hybrid Search**: Implemented structure for Dense + Sparse vectors.
+- [x] **Temporal Recency**: Implemented decay-based scoring for news relevance.
+### Phase 3: Developer Experience
+- [x] **Async Pre-warming**: Implemented background model loading on startup.
+- [x] **Retrieval Traceability**: Added `retrieved_doc_ids` to chat history.
+- [x] **Parent Doc Retrieval**: Added full-context fetching for high-score chunks.
+---
+## Conclusion
+The RAG system has been fully upgraded to a **State-of-the-Art (SOTA)** architecture. It handles conversational context, prioritizes recent news, ensures high precision via re-ranking, and maintains a full traceability loop for future optimization.
+---
+## Implementation Details (Session 2)
+As requested, here is the breakdown of how the Session 2 enhancements were implemented:
+### 1. Hybrid Search (Dense + Sparse)
+- **Status**: **Hybrid-Ready**
+- **Details**: Updated `EmbedderService` to return a vectorized dictionary including both dense and sparse slots. `VectorStore.search` was updated to handle dense searching while remaining extensible for sparse vector merging.
+### 2. Temporal Context (Recency Bias)
+- **Status**: **Implemented**
+- **Details**: In `rag.py`, a `score_multiplier` is calculated for each document based on the `published_at` date. Articles from today have a 1.0 multiplier, decaying linearly over 60 days to a 0.5 minimum. This ensures newer news floats to the top.
+### 3. Cold-Start Pre-warming
+- **Status**: **Implemented**
+- **Details**: Modified `main.py` startup event to launch a background thread (`threading.Thread`) that triggers model loading for `embedder` and `reranker`. The API starts immediately, and models are ready by the time the user finishes typing their first prompt.
+### 4. Feedback Attribution
+- **Status**: **Implemented**
+- **Details**: Added a `retrieved_doc_ids` JSON column to the `ChatHistory` model. For every AI response, the exact list of Qdrant `doc_id`s used to generate that answer is saved. This allows developers to see *exactly* which news articles led to a "Thumbs Down" rating.
+### 5. Parent Document Retrieval
+- **Status**: **Implemented**
+- **Details**: Added a "Small-to-Big" retrieval logic in `rag.py`. If a specific chunk achieves a rerank score > 0.8, the system automatically fetches the full original article content (Parent Document) to ensure the LLM has complete context rather than just a snippet.

docs/ANLYSIS_four.md ADDED Viewed

	@@ -0,0 +1,65 @@

+# Comprehensive RAG API Analysis
+---
+## 1. Architecture & API Design
+### The Problem (Critique)
+The current RAG implementation in `src/api/routes/rag.py` suffers from extreme tight coupling. The routing function (`chat_with_rag`) handles HTTP request parsing, conversation history retrieval from the database, query transformation via LLM, searching the vector database, applying temporal biases, executing reranking, managing token limits, prompting the final LLM, mixing in warehouse data, and finally saving the interaction back to the database. This monolithic design violates the Single Responsibility Principle, making the code hard to read, exceptionally difficult to unit test, and prone to breaking during feature additions.
+### The Reason
+During rapid prototyping and initial development phases, it is common to build "fat controllers." Developers prioritize getting the feature working end-to-end quickly rather than designing for long-term maintainability. The focus was on chaining the LangChain, Qdrant, and database operations together to prove the RAG concept works, rather than building a scalable backend architecture.
+### The Solution
+To improve this for a real-world, production-ready environment, the RAG API needs to adopt a strict **Controller-Service-Repository** pattern.
+1. **Routing Layer (`rag.py`)**: Should only handle request validation (Pydantic), calling the appropriate service, and formatting the HTTP output.
+2. **Service Layer (`rag_service.py`)**: A dedicated service class that orchestrates the RAG pipeline. This service would coordinate with `embedder`, `vector_store`, an `llm_manager`, and the `interaction_db`.
+3. **Discrete Workflows**: Complex steps like query transformation, context formatting, and token management should be separated into their own testable functions or classes (e.g., `QueryTransformer`, `ContextManager`). This decoupling allows developers to swap out components (like changing the LLM provider or vector DB) without rewriting the core business logic.
+---
+## 2. Data Retrieval & DB Interaction
+### The Problem (Critique)
+The current retrieval mechanism relies entirely on dense vector representations. The `embedder.py` script specifically mentions BGE-M3 but returns a dummy `None` value for sparse vectors. The `vector_store.py` calls Qdrant using only the dense query vector. Consequently, the system performs a standard K-Nearest Neighbors (KNN) search but lacks keyword-awareness (BM25 or Sparse Embedding representation). Furthermore, the fallback search mechanism queries `sentiment_results` from ClickHouse via `data_warehouse.query`, which is rudimentary, returning mocked hits with flat 0.5 scores instead of true relevance.
+### The Reason
+Implementing true Hybrid Search (combining the semantic matching of dense embeddings with the lexical keyword matching of sparse embeddings) is complex. BGE-M3 generates both, but Qdrant must be specifically configured, indexed, and queried to handle multi-vector (dense + sparse) payloads. The developers opted for the simpler dense-only retrieval path to guarantee functionality initially, leaving sparse vectors as a "TODO" placeholder.
+### The Solution
+To build a "Real World" robust RAG search:
+1. **Activate Sparse Embeddings**: Update `embedder.py` to correctly extract BGE-M3's sparse lexical weights (the `lexical_weights` token-weight dictionaries, not the `colbert_vecs` multi-vector output) and format them as sparse vectors (indices + values) for Qdrant.
+2. **Implement Hybrid Search in Qdrant**: Update `vector_store.py`'s `search` method to execute Qdrant's `search_batch` or `query` API combining dense similarity and sparse BM25 text match with `Reciprocal Rank Fusion (RRF)` or explicit weighted scoring.
+3. **Enhance Fallback**: Improve the ClickHouse SQL fallback to utilize full-text search operators (`LIKE` or `hasToken`) instead of basic ordering, to yield relevant results when the vector database is unreachable.
+---
+## 3. Prompt Engineering & Context Management
+### The Problem (Critique)
+The prompt strings (`RAG_PROMPT` and `QUERY_REWRITE_PROMPT`) are hardcoded directly within `src/api/routes/rag.py`. Furthermore, the token limits are managed by a custom `limit_context_tokens` function that performs rudimentary mathematical truncation (`truncated = content[:remaining * 4]`) to force-fit text into an arbitrary 3000 token limit. This approach is highly destructive; it truncates strings mid-word, breaks Markdown formatting, and severs semantic sentences. Additionally, 'Trending News' is hackily injected by fetching from `data_warehouse.py` and blindly appending it to the top of the context string.
+### The Reason
+Embedding prompts directly in routing files is a common shortcut during early MVP stages. Likewise, accurately chunking text requires importing recursive character splitters and sophisticated tokenizers, so a naive mathematical approximation was used to prevent maximum context window errors with the OpenAI API.
+### The Solution
+For real-world scaling and better response quality:
+1. **Prompt Management**: Move all prompt templates into a centralized `src/core/prompts.py` file or load them from versioned YAML/JSON configurations. This allows tuning the AI persona without altering Python backend logic.
+2. **Intelligent Text Splitting**: Replace `limit_context_tokens` with a robust text splitter from LangChain (e.g., `RecursiveCharacterTextSplitter`). This ensures chunks are broken cleanly at paragraph or sentence boundaries (`\n\n`, `.`), preserving meaning.
+3. **Context Construction**: Formally separate the "Trending Data" injection from the standard document context injection, explicitly mapping out system instructions versus retrieved context sources. This yields cleaner behavior from large language models.
+---
+## 4. Error Handling, Logging, and Security
+### The Problem (Critique)
+The current RAG implementation uses extremely broad exception catching (`except Exception as e:`). In `rag.py`, if Qdrant throws an error, it is merely printed (`print(f"Error searching vector store: {e}")`) and an empty result set is passed to the LLM. If query rewriting fails, it prints the error and proceeds with the original query. Important transactions fail silently and the user interface receives generic or poor answers without knowing that backend components have degraded. Python's default `print` is used instead of the standard library `logging` module, meaning errors aren't easily searchable in production logs.
+### The Reason
+Defensive programming is often implemented this way to prevent the entire API from crashing (returning an HTTP 500) if a non-critical component like temporal bias or reranking fails. However, the side effect is "silent failures" and an inability to monitor system health. The `print` statements were left over from local development debugging.
+### The Solution
+In a production-ready ("Real World") backend:
+1. **Structured Logging**: Replace all instances of `print()` with Python's standard `logging.getLogger(__name__)`. Integrate JSON logging so log aggregation platforms (Datadog, ELK) can parse context (session_id, user_id).
+2. **Targeted Exception Handling**: Catch specific exceptions (e.g., `TimeoutError`, `qdrant_client.http.exceptions.UnexpectedResponse`). Decide explicitly which errors are fatal (raise `HTTPException(status_code=500)`) and which are degradable.
+3. **Telemetry & Client Feedback**: When a degradation occurs (e.g., Qdrant is down, using ClickHouse fallback), include a `warnings` or `metadata` dict in the HTTP JSON response so the client application knows the data might be suboptimal.

docs/Back end Arctecture/scalable_architecture.md ADDED Viewed

	@@ -0,0 +1,109 @@

+# RAG API Design: Retrieval Architecture
+This document focuses specifically on the API layer designed to **retrieve** data from our existing, highly optimized data pipeline. Because the heavy lifting of processing, vectorization (BGE-M3 Dense + Sparse), and indexing is already handled by the Kafka and Qdrant workers, this API is designed purely for **scalable, high-performance retrieval and generation**.
+---
+## 🎯 1. Core API Philosophy
+The RAG API acts as the bridge between user queries (from the frontend) and the populated Qdrant vector database.
+1.  **Read-Only Operations:** This API does *not* write to Qdrant or ClickHouse. It assumes the databases are already hydrated by the Kafka workers.
+2.  **Symmetry with Ingestion:** The API must use the exact same BGE-M3 model for embedding user queries that the Embedding Service uses to embed news articles.
+3.  **Statelessness:** The API nodes hold no session state, allowing infinite horizontal scaling behind a Load Balancer.
+---
+## 🌐 2. Core API Endpoints
+### 2.1 `POST /api/v1/search` (Hybrid Search Only)
+*   **Purpose:** The fastest way to find relevant articles without generating an LLM response. Useful for standard "News Search" bars.
+*   **Input Request:**
+    ```json
+    {
+      "query": "Quantum computing breakthroughs in 2026",
+      "limit": 10,
+      "filters": {
+        "source": ["TechCrunch", "Wired"],
+        "date_range": { "start": "2026-01-01", "end": "2026-12-31" }
+      }
+    }
+    ```
+*   **Internal Flow:**
+    1.  Passes the `query` text through the BGE-M3 Tokenizer & Model (synchronously or via lightweight async executor).
+    2.  Extracts the `Dense` vector (1024-dim) and `Sparse` lexical weights.
+    3.  Queries Qdrant using a `Prefetch` query (combining Dense + Sparse scoring).
+    4.  Extracts the Qdrant `payload` (article metadata) and returns it.
+*   **Response:** A JSON list of articles sorted by relevance score.
+### 2.2 `POST /api/v1/rag/ask` (Full RAG Flow)
+*   **Purpose:** The endpoint for natural language Q&A. This hits Qdrant first, then sends the context to the LLM.
+*   **Input Request:**
+    ```json
+    {
+      "question": "What did Google recently announce regarding quantum processors?",
+      "stream": true, // Critical for UX
+      "top_k": 5
+    }
+    ```
+*   **Internal Flow:**
+    1.  **Retrieve:** Performs the exact same Hybrid Search as `/api/v1/search` to get the top 5 article chunks.
+    2.  **Prompt Assembly:** Constructs a structured prompt template:
+        `"Use the following news articles to answer the question...\n\nCONTEXT:\n[Article 1 Text...]\n[Article 2 Text...]\n\nQUESTION: What did Google recently announce..."`
+    3.  **Generate:** Sends the assembled prompt to the LLM (OpenAI, local Llama-3, etc.).
+    4.  **Stream:** Uses Server-Sent Events (SSE) to yield tokens to the frontend as they are generated.
+---
+## 🧠 3. Query Vectorization Pipeline (Symmetry)
+For Qdrant search to work perfectly, the API must emulate Step 4 of the *Data Flow Pipeline* exactly.
+```python
+# RAG API Vectorization Logic
+def vectorize_query(query_text: str):
+    # Uses the SAME FlagEmbedding configuration as the ingestor
+    embeddings = model.encode(
+        sentences=[query_text],
+        batch_size=1,
+        max_length=512, # Queries are shorter than articles
+        return_dense=True,
+        return_sparse=True,
+        return_colbert_vecs=False
+    )
+    return {
+        "dense": embeddings['dense_vecs'][0].tolist(),
+        "sparse": {
+            "indices": list(embeddings['lexical_weights'][0].keys()),
+            "values": list(embeddings['lexical_weights'][0].values())
+        }
+    }
+```
+---
+## ⚡ 4. Scalability at the Retrieval Layer
+Since the Heavy ETL is done by the pipelines, the API's main bottleneck is **waiting** for Qdrant and the LLM.
+### 4.1 Async FastAPI
+*   The API is built purely on `async def` endpoints.
+*   When the API queries Qdrant (e.g., `await async_qdrant_client.query_points(...)` on an `AsyncQdrantClient`), it yields control back to the event loop.
+*   A single FastAPI container can handle thousands of concurrent searches while waiting for Qdrant to respond.
+### 4.2 Semantic Query Caching (Redis)
+To save LLM compute and Qdrant load:
+*   We implement Redis **Semantic Caching**.
+*   If User A asks: *"What is Tesla's stock doing?"* and User B asks *"How is the Tesla stock performing?"*, the semantic cache recognizes the queries are identical in meaning (High Cosine Similarity) and instantly returns User A's cached LLM response to User B.
+### 4.3 Streaming (SSE) for LLMs
+*   Generating a 500-word RAG answer might take the LLM 3 seconds. Instead of a loading spinner for 3 seconds, the API uses `StreamingResponse`. The user sees the first word in 200ms, creating a "Real-Time" feel.
+---
+## 📊 5. Integration with Pipeline Analytics
+If the RAG API needs to answer questions like *"How many articles mentioned AI today?"*, it should NOT query Qdrant.
+Qdrant is a Vector Search engine, not an Analytics database.
+For structured analytics, the API connects directly to **ClickHouse** (which the Kafka `sink` worker hydrates), allowing real-time aggregations without disturbing the vector search performance.

docs/RAG_API_PPT.md ADDED Viewed

	@@ -0,0 +1,123 @@

+# Presentation Outline: Conversational Intelligence
+## The SOTA RAG API & News Retrieval Flow
+This document is optimized for AI PPT Generators. It contains 12 detailed slides covering the RAG Technology Stack and the request-to-response data flow.
+---
+### Slide 1: Title Slide
+*   **Headline**: Conversational Intelligence: Deep Dive into the SOTA RAG API
+*   **Sub-headline**: Bridging Natural Language and Real-Time News Data Warehouse
+*   **Visual Suggestion**: A glowing brain icon connected to a massive bookshelf (representing the Vector Store) and a lightning bolt (representing real-time trends).
+---
+### Slide 2: The RAG Tech Stack - Strategic Selection
+*   **Core Concept**: Why these tools? A comparative advantage analysis.
+*   **Alternative Comparison Table**:
+| Component | Our Choice | Alternatives | Competitive Advantage |
+| :--- | :--- | :--- | :--- |
+| **LLM Engine** | **GPT-4o** | Llama-3, Mistral, Claude | Superior reasoning for complex query synthesis & multilingual logic. |
+| **Vector DB** | **Qdrant** | Pinecone, Milvus, Weaviate | Native **Hybrid Search** support & high-speed gRPC batching protocol. |
+| **Embeddings** | **BGE-M3** | OpenAI `text-embedding-3`, HuggingFace | **Sparse + Dense** in one pass; massive 8192 token window. |
+| **Reranker** | **TinyBERT CE** | Cohere Rerank, BGE-Reranker | Local CPU-optimized execution with high Precision-at-K. |
+| **Analytics** | **ClickHouse** | PostgreSQL, ELK, Timescale | sub-second OLAP performance on high-velocity news data streams. |
+| **API Protocol** | **SSE (Stream)** | WebSockets, REST, gRPC-Web | Direct HTTP/1.1 compatibility; lower overhead for one-way streams. |
+*   **Visual Suggestion**: An "Engine Room" comparison chart where our tools are highlighted in gold.
+---
+### Slide 3: Hidden Magic - Pre-Warming & Startup
+*   **Core Concept**: Zero-Latency "Cold Start."
+*   **Details**:
+    *   Problem: Heavy AI models take ~10s to load.
+    *   Solution: Background loading on server start.
+    *   Benefit: The first user query in the morning is just as fast as the 100th.
+*   **Visual Suggestion**: A "Loading Bar" that finishes before the user even arrives.
+---
+### Slide 4: Step 1 - Query Transformation (Synthesis)
+*   **Core Concept**: Understanding "Contextual" Questions.
+*   **Details**:
+    *   **Synthesis**: Merging conversation history with the new query.
+    *   **Technique**: Using GPT-4 to convert "What about Intel?" into "Financial performance of Intel in 2024".
+*   **Example**:
+    *   *History*: "Tell me about Nvidia."
+    *   *Follow-up*: "What about Intel?"
+    *   *Result*: Standalone query specifically about Intel vs Nvidia context.
+---
+### Slide 5: Step 2 - Hybrid Search & Intent Recognition
+*   **Core Concept**: Combining Concept (Dense) and Keywords (Sparse).
+*   **Details**:
+    *   **Dense**: Finding "vibe" (e.g., "financial crash" matches "bankruptcy").
+    *   **Sparse**: Finding "tickers" (e.g., "NVDA", "AAPL") or specific entities.
+*   **Visual Suggestion**: Two searchlights (Dense and Sparse) converging on a single high-quality news article.
+---
+### Slide 6: Step 3 - Temporal Decay (Recency Boosting)
+*   **Core Concept**: News Freshness Matters.
+*   **Details**:
+    *   **Logic**: Today's 80% match is better than last year's 100% match.
+    *   **Mechanism**: Applying a mathematical penalty to older articles during the search phase.
+*   **Example**: A fresh report on a merger ranks higher than a "deep dive" from 6 months ago.
+---
+### Slide 7: Step 4 - Precision Reranking (Cross-Encoder)
+*   **Core Concept**: From "Fast Search" to "Exact Grade."
+*   **Details**:
+    *   Moving from Bi-Encoders (fast, broad) to Cross-Encoders (slow, ultra-accurate).
+    *   Checking the Top 20 results one-by-one to ensure they actually answer the question.
+*   **Example**: Eliminating articles that mention the keywords but are actually about a different topic.
+---
+### Slide 8: Step 5 - Diversity Filtering (MMR)
+*   **Core Concept**: Anti-Echo Chamber.
+*   **Details**:
+    *   **Maximal Marginal Relevance (MMR)**: Selecting articles that are relevant but *different* from each other.
+    *   **Benefit**: Instead of 5 articles saying the same thing, the LLM gets 5 different perspectives (e.g., Fact, Opinion, Impact).
+*   **Visual**: A filter that takes out identical "Copy-Paste" news reports.
+---
+### Slide 9: Step 6 - Parent Retrieval & Context Expansion
+*   **Core Concept**: Seeing the Big Picture.
+*   **Details**:
+    *   Search is done on small chunks (~500 chars).
+    *   If a chunk is a "Perfect Match," the system fetches the **entire article** from ClickHouse.
+    *   Benefit: The LLM gets the full context of the story, not just a broken sentence.
+---
+### Slide 10: Step 7 - Trend Fusion & LLM Grounding
+*   **Core Concept**: Real-Time Intelligence.
+*   **Details**:
+    *   The API fetches "Trending Topics" from ClickHouse in parallel.
+    *   This data is injected into the LLM prompt to inform it of broader market trends.
+*   **Result**: "While these articles focus on Company A, the general market sentiment in ClickHouse shows a negative shift today."
+---
+### Slide 11: Step 8 - SSE Streaming (Real-Time Experience)
+*   **Core Concept**: Instant Gratification.
+*   **Details**:
+    *   Using **Server-Sent Events (SSE)**.
+    *   Tokens are pushed to the user as they are generated.
+    *   Perceived wait time drops from 5 seconds to **300ms**.
+*   **Visual Suggestion**: Tokens appearing one-by-one in a fast, fluid stream.
+---
+### Slide 12: Reliability & Traceability
+*   **Core Concept**: Production-Ready Design.
+*   **Details**:
+    *   **Circuit Breaker**: If Qdrant is down, ClickHouse keyword search automatically takes over.
+    *   **Interaction Trace**: Every source used to answer a question is logged for debugging and human feedback (Thumbs Up/Down).
+*   **Final Word**: A resilient, intelligent, and highly accurate news RAG system.

docs/RAG_RETRIEVAL_FLOW.md ADDED Viewed

	@@ -0,0 +1,147 @@

+# State-of-the-Art (SOTA) RAG Retrieval Data Flow
+This document details the end-to-end data flow of the News Pipeline RAG API, incorporating advanced patterns for accuracy, diversity, and production resilience.
+## 1. Pre-Processing & Infrastructure (The "Cold-Start" Layer)
+To ensure **zero-latency** during the initial user interaction, the system implements a preemptive resource loading strategy.
+### A. Async Pre-warming (Hidden Latency Absorption)
+- **Challenge**: Large Transformer models (like BGE-M3 and Cross-Encoders) typically take 5–15 seconds to load from disk to RAM/VRAM. Lazy-loading these on the first request creates an unacceptable user experience.
+- **Process**:
+    - In `main.py`, the `@app.on_event("startup")` hook triggers a non-blocking `threading.Thread`.
+    - This background thread immediately initializes `EmbedderService` and `RerankerService`.
+    - By the time the web server is live and the user types their first query, the models are fully resident in memory, resulting in sub-second response times for the very first request.
+### B. Circuit Breaker: ClickHouse Fallback (Always-On Reliability)
+- **Challenge**: Vector databases like Qdrant can occasionally experience network partitions or downtime. In a naive RAG, this would crash the conversation.
+- **Process**:
+    - The `VectorStore.search` method is wrapped in a robust `try-except` block.
+    - If the Qdrant client connection fails or a timeout occurs, the **Circuit Breaker** trips.
+    - The system automatically redirects the query to `fallback_keyword_search()` in ClickHouse.
+    - **Mechanism**: It performs a rapid SQL-based keyword search on titles and content in the `sentiment_results` table. While less semantically accurate than vectors, it ensures the user receives actual relevant news articles instead of a "Service Unavailable" error.
+## 2. Request Phase (Conversational Logic)
+### Step A: Query Transformation (Contextual Synthesis)
+**Purpose**: Bridging the gap between human conversation and vector search requirements.
+- **The Problem**: Users often ask relative questions like *"What about their stock?"*. Vector databases cannot resolve "their" without context.
+- **Process**:
+    - The API retrieves the last 6 messages from PostgreSQL.
+    - A specialized prompt instructs `GPT-4` to synthesize the conversation history and the new user query into a single **Standalone Search Query**.
+    - If history is empty, the original query is used.
+- **Example Trace**:
+    - **History**: `User: Tell me about Nvidia's revenue last year.`
+    - **New Query**: `User: Did Intel do better?`
+    - **Synthesized Search Query**: *"Comparison of Intel and Nvidia's revenue for the last fiscal year"*
+### Step B: Intent-Based Search (Hybrid & Recency)
+**Purpose**: Combining semantic depth with keyword precision and news freshness.
+#### 1. Hybrid Vector Synthesis
+- **Dense Layer**: Uses `BAAI/bge-m3` to produce a 1024-dimensional semantic embedding. This handles "vibe" and "concept" matching (e.g., matching "financial struggle" to "bankruptcy").
+- **Sparse Layer**: Prepares slots for keyword-specific vectors (e.g., Splade or BGE-M3 Sparse). This handles exact entities, ticker symbols (e.g., "NVDA"), or specific dates that dense embeddings might blur.
+#### 2. Temporal Decay (Recency Boosting)
+- **Logic**: News is a deteriorating asset. The system applies a **Recency Multiplier** during the retrieval collection phase.
+- **Formula**: `Score = Base_Similarity * max(0.5, 1.0 - (days_old / 60))`.
+- **Constraint**: The multiplier never drops below `0.5`, ensuring that very relevant historical news is still retrievable but newer coverage is naturally prioritized.
+- **Example**:
+    - Article A (Identical match, 60 days old): `Final Score = 0.9 * 0.5 = 0.45`
+    - Article B (Close match, today): `Final Score = 0.8 * 1.0 = 0.8`
+    - **Result**: Article B is ranked higher despite slightly lower semantic similarity.
+## 3. Retrieval Refinement (The "Precision" Layer)
+### Step C: Cross-Encoder Reranking (Relevance Grading)
+**Purpose**: Moving from "Bi-Encoder" (fast but broad) to "Cross-Encoder" (slow but highly accurate).
+- **The Problem**: Dense embeddings (Bi-Encoders) are great at finding "similar" text but often struggle with fine-grained nuances or contradictory statements.
+- **Process**:
+    - The system takes the **Top 20** results from the broad search.
+    - Each [Query, Chunk] pair is passed through the `CrossEncoder` model (`ms-marco-TinyBERT-L-2-v2`).
+    - The model produces a raw relevance score. This is significantly more accurate than pure cosine similarity from the vector search.
+### Step D: Diversity Filtering - MMR (Information Density)
+**Purpose**: Preventing "Echo Chambers" or redundant context windows.
+- **The Problem**: Five news articles starting with the same AP wire sentence will fill the LLM context with redundant text.
+- **Process**:
+    - Implemented **Maximal Marginal Relevance (MMR)**.
+    - Logic selects documents that have high relevance but **low similarity** to already selected documents.
+- **Example**:
+    - *Selection 1*: A factual report of a merger.
+    - *Selection 2 (Rejected)*: Another factual report of the same merger.
+    - *Selection 2 (Accepted)*: A financial analyst's opinion on the same merger.
+### Step E: Parent Document Retrieval (Context Expansion)
+**Purpose**: Providing the "Full Picture" when a snippet isn't enough.
+- **Process**:
+    - Small chunks (~500 chars) are indexed for surgical search accuracy.
+    - If a chunk's rerank score is **> 0.8**, its unique `doc_id` is used to fetch the full parent article body from ClickHouse/Qdrant.
+    - This allows the LLM to see the surrounding context that might have been lost in the chunking process.
+---
+## 4. Generation & Enrichment
+### Step F: ClickHouse Trend Fusion (External Intelligence)
+**Purpose**: Grounding the LLM in real-time metadata.
+- **Process**:
+    - In parallel with document retrieval (before the final prompt is sent to the LLM), the system queries the **ClickHouse Data Warehouse**.
+    - It extracts trending entities and sentiment scores for the last 3 days relevant to the query.
+    - This "Trend Knowledge" is injected into the system prompt.
+- **Benefit**: The LLM can say: *"Retrieved articles show X, but ClickHouse trends show that sentiment for this topic is currently shifting negative."*
+### Step G: Streaming Generation - SSE (Real-Time UX)
+**Purpose**: Minimizing "Perceived Latency".
+- **Process**:
+    - Uses FastAPI `StreamingResponse` and Server-Sent Events (SSE).
+    - Instead of waiting 5 seconds for a full paragraph, the first token is displayed within **200-400ms**.
+    - Tokens are pushed to the client in real-time as the LLM predicts them.
+---
+## 5. Traceability & Feedback Loop
+### Step H: Interaction Logging (Audit Trail)
+- **Traceability**: Every AI response logs the exact list of `retrieved_doc_ids` (Source IDs) in PostgreSQL.
+- **Learning Loop**: When a user gives a "Thumbs Down", developers can query the database to see exactly which sources were used. This allows for **Negative Sampling** (identifying which articles cause hallucination or bad answers).
+---
+## Technical Stack Overview
+| Stage | Tool/Model |
+| :--- | :--- |
+| **Embeddings** | `BAAI/bge-m3` (BAAI) |
+| **Reranking** | `ms-marco-TinyBERT-L-2-v2` (CrossEncoder) |
+| **Diversity** | Custom MMR Implementation |
+| **Vector DB** | Qdrant |
+| **Data Warehouse**| ClickHouse |
+| **Token Control** | `tiktoken` (cl100k_base) |
+| **LLM** | OpenAI `gpt-4` |
+---
+## Full Data Flow Visual
+```mermaid
+graph TD
+    User((User)) -->|Query| API[RAG API]
+    API -->|Prompt| LLM_Rewriter[LLM Rewriter]
+    LLM_Rewriter -->|Standalone Query| API
+    API -.->|Circuit Breaker Check| VDB{Qdrant Online?}
+    VDB -->|No| CH_FB[ClickHouse Keyword Fallback]
+    VDB -->|Yes| V_Search[Hybrid Vector Search]
+    V_Search -->|Top 20| Rerank[Cross-Encoder Reranker]
+    Rerank -->|Diversity Pass| MMR[MMR Filter]
+    MMR -->|Top K| Parent_Fetch[Parent Doc Retrieval]
+    Parent_Fetch -->|Context| Prompt_Build[Prompt Construction]
+    Prompt_Build -->|Inject| CH_Trends[ClickHouse Trends]
+    CH_Trends -->|Full Prompt| LLM_Stream[LLM Streaming]
+    LLM_Stream -->|SSE Tokens| User
+    LLM_Stream -->|Trace| Postgres[(Interaction DB)]
+```

docs/rag_retrieval_documentation.md ADDED Viewed

	@@ -0,0 +1,129 @@

+# RAG API Data Flow & Retrieval Architecture
+This document tracks the detailed Data Flow of the RAG (Retrieval-Augmented Generation) API, with a specific focus on the **Retrieval Logic**. Rather than just listing HTTP endpoints, this document explains the underlying methods, conceptual flow, and how the Domain Models, Ports, Use Cases, and Infrastructure Adapters interact to fetch, rerank, and summarize enterprise news data.
+---
+## 🏗️ 1. Architecture Overview (Hexagonal Architecture)
+The RAG API relies on **Hexagonal Architecture** (Ports and Adapters). It strongly separates business logic from infrastructure frameworks.
+- **Domain/Models**: The central, pure data structures representing the state (e.g., `ChatRequest`, `User`).
+- **Ports (Interfaces)**: Abstract definitions of what the system *needs* to do (e.g., `VectorStorePort`, `LlmPort`).
+- **Use Cases**: The actual business logic where the retrieval steps, filtering, and flow occur.
+- **Adapters**: The concrete implementation of Ports using external technologies (e.g., Qdrant, OpenAI, Redis, Postgres).
+---
+## 📂 2. File Directory Breakdown & Responsibilities
+### `src/api/` (Primary Adapters / The Front Door)
+- **`routes/rag.py`**: Exposes the `/chat` and `/chat/stream` endpoints. **Role**: Accepts the incoming HTTP payload, validates the JWT token (via `Depends(get_current_user)`), and forwards the request directly to the `AgentRouterUseCase`.
+- **`dependencies.py`**: The Dependency Injection container. **Role**: Wires the concrete Infrastructure Adapters (e.g., `QdrantAdapter`, `BgeEmbedderAdapter`) to their respective Ports, and injects them into the Use Cases. Ensures components are instantiated only once.
+### `src/core/domain/` (Core Data)
+- **`schemas.py`**: Defines Pydantic validation models. **Role**: Houses `ChatRequest` (contains `query`, `top_k`, `session_id`, `source_filter`, etc.) which acts as the transport object through the system.
+### `src/core/ports/` (The Interfaces)
+- **`embedder_port.py`**: Defines `encode_query()`.
+- **`vector_store_port.py`**: Defines `search()`.
+- **`reranker_port.py`**: Defines `rerank()`.
+- **`llm_port.py`**: Defines `generate()` and `generate_stream()`.
+- **`cache_port.py`**: Defines `get()`, `set()`, and `generate_exact_hash()`.
+### `src/core/use_cases/` (The Business Logic Engine)
+- **`agent_router_use_case.py`**: **Role**: The gateway. Analyzes the user's intent. Routes the request to `AccountUseCase` (if the user is asking about personal profile data) or `RagChatUseCase` (if asking about news).
+- **`rag_chat_use_case.py`**: **Role**: The Heavy Lifter. Responsible for the entire Retrieval Logic flow. Contains methods like `_extract_intents`, `_build_context`, `_limit_context`, and `_compress_document`.
+- **`account_use_case.py`**: **Role**: A secondary flow for handling user-specific DB aggregations (billing, history) rather than searching Vector DBs.
+### `src/infrastructure/adapters/` (Concrete Infrastructure)
+- **`redis_adapter.py`**: **Role**: Connects to the caching layer to prevent duplicate LLM processing calls.
+- **`qdrant_adapter.py`**: **Role**: Orchestrates the `query_points` API call to Qdrant, fusing Dense and Sparse vector retrieval (Hybrid Search).
+- **`bge_embedder_adapter.py`**: **Role**: Instantiates the massive BGE-M3 model (using FlagEmbedding). Converts text strings into multi-dimensional arrays (Dense and Lexical Sparse weights).
+- **`bge_reranker_adapter.py`**: **Role**: Uses a Cross-Encoder to compare the user query and the retrieved documents string-by-string for absolute semantic precision.
+- **`openai_adapter.py` / `ollama_adapter.py`**: **Role**: Connects to an external OpenAI API or Local Llama-3 instance to generate text.
+---
+## 🌊 3. The Retrieval Logic: Step-by-Step Data Flow Example
+**Scenario**: A user submits the query: *"What happened with Apple stock recently?"*
+### Step 1: Ingestion & Intent Routing (`agent_router_use_case.py`)
+1. **Input**: `ChatRequest(query="What happened with Apple stock recently?", top_k=5)`
+2. **Action**: The API endpoint passes this to the `AgentRouterUseCase`.
+3. **LLM Classification**: The Router asks the LLM: "Is this a NEWS search or an ACCOUNT search?"
+4. **Output**: The LLM outputs `NEWS`. The Router forwards the request to the `RagChatUseCase`.
+### Step 2: Exact-Match Caching (`redis_adapter.py`)
+1. **Action**: `cache_port.generate_exact_hash()` calculates an SHA-256 hash or deterministic key for the query string.
+2. **Check**: Does this key exist in Redis?
+3. **If Yes**: Return the answer instantly (0ms LLM time).
+4. **If No**: Proceed with the expensive pipeline.
+### Step 3: Self-Query Extraction (`rag_chat_use_case.py -> _extract_intents()`)
+1. **Action**: The LLM analyzes the user's natural language query to dynamically extract metadata and physical parameters for the vector database.
+2. **Example Prompting**: The LLM is provided with a system prompt like: *"Extract the temporal constraints and target sources from the user query into JSON format. Valid sources: ['reuters', 'bloomberg']."*
+3. **Execution**: The LLM analyzes *"What happened with Apple stock recently?"*
+4. **Output Deduction**: From the word "recently", it deduces the temporal boundary and constructs the following JSON structure:
+   ```json
+   {
+       "days_back": 3,
+       "source": null
+   }
+   ```
+5. **Mapping**: The `RagChatUseCase` parses this JSON. If `days_back` is present, it constructs a Qdrant `models.Filter` to physically exclude older documents from the multidimensional search space *before* the costly vector math occurs.
+### Step 4: Embedding / Vectorization (`bge_embedder_adapter.py`)
+1. **Action**: `encode_query()` is called.
+2. **Model Processing**: The BGE-M3 model tokenizes the string.
+3. **Output**: Returns a `Dict` containing:
+   - `dense`: `[0.123, -0.456, 0.789, ... 1024 dimensions]`
+   - `sparse`: `{"indices": [102, 451, ...], "values": [0.92, 0.44, ...]}`
+### Step 5: Hybrid Vector Search (`qdrant_adapter.py`)
+1. **Action**: Passes the `query_vectors` and the `days_back=3` filter into `vector_store_port.search()`.
+2. **Qdrant Processing**: Qdrant performs a Fusion Query (Reciprocal Rank Fusion - RRF). It fetches the top 20 nearest neighbors from BOTH the Dense mathematical space AND the Sparse keyword space.
+3. **Output**: Returns a List of raw `SearchResult` documents.
+### Step 6: Temporal Bias Scoring (`rag_chat_use_case.py -> _build_context()`)
+1. **Action**: Evaluates the `published_at` metadata of every hit.
+2. **Calculation**: It deliberately decays the score of older articles via a mathematical multiplier (e.g., `score_multiplier = max(0.5, 1.0 - (days_old / 60))`).
+3. **Output**: A dynamically re-scored list, preferring fresh data.
+### Step 7: Cross-Encoder Reranking (`bge_reranker_adapter.py`)
+1. **Action**: For the top 20 remaining documents, the Reranker pairs the Query + Document Text together (`[[query, doc1], [query, doc2]]`).
+2. **Model Processing**: The HuggingFace FlagReranker calculates exact semantic overlap.
+3. **Output**: Returns the strict Top 5 (`top_k`) documents, guaranteed to be specifically relevant.
+### Step 8: Contextual Compression (`rag_chat_use_case.py -> _limit_context()`)
+1. **Action**: `_limit_context` uses `tiktoken` to count how many tokens the Top 5 documents contain.
+2. **Check**: Are they over the 3000 Token limit?
+3. **Compression Loop**: If they are over the limit, it calls `_compress_document()`.
+4. **LLM Summarization**: Passes the overflowing document string to the LLM with the instruction: *"Extract pure facts... relevant to the query."* The massive document strings are squashed down to bullet-point facts.
+5. **Output**: A tightly packed `context_text` string ready for generation.
+### Step 9: Final Generation (`llm_port.py`)
+1. **Action**: The packed `context_text`, the User `query`, and the recent `Chat History` are combined into the Final Prompt.
+2. **Model Processing**: The LLM interprets the compressed context.
+3. **Output**: The Final string ("Apple stock surged 4% after the latest earnings report...").
+4. **Cleanup**: This answer is saved to both Postgres (`chat_history_db`) and Redis (`cache`), and returned to the API client.
+---
+## 📈 4. A4 Analysis and Future Updates
+### A4 Analysis (Current System Standing)
+| Dimension | Analysis & Findings |
+| :--- | :--- |
+| **Resilience & Scalability** | **High**. The Hexagonal architecture successfully decoupled Qdrant, Postgres, and the LLMs. We can swap `OpenAiAdapter` for `OllamaAdapter` simply by changing one dependency provider without touching the Business Logic flow. Missing dependencies (e.g., `FlagEmbedding`) gracefully utilize dummy fallbacks avoiding hard API crashes. |
+| **Retrieval Accuracy** | **Exceptional**. We utilize a 3-Stage filtering mechanism: Semantic similarity (Dense), Lexical accuracy (Sparse), and absolute context alignment (Reranker). The addition of dynamic Temporal Biasing prevents the hallucination of historical news as current events. |
+| **Cost & Latency Management** | **Optimized**. The Redis cache (currently keyed on an exact SHA-256 hash of the query) ensures that repeated identical queries skip the LLM round trip and its cost. The `AgentRouterUseCase` ensures unrelated general questions (Account, Billing) never touch expensive Vector DB aggregations. |
+| **Memory Constraint Handling** | **Innovative**. By employing `_compress_document`, the system prevents context-window truncation, ensuring critical tail-end entities still influence the LLM's final generation. |
+### Proposed Future Updates (Roadmap)
+1. **Semantic Cache Refinement**: Currently, the `RedisAdapter` relies on an exact SHA-256 string hash. **Update**: Calculate an actual LLM embedding of the prompt (Dense Vector) and store it in Redis. Use a Cosine-Similarity threshold (`>0.95`) to intercept semantically identical (but textually different) questions (e.g., "Apple stock" vs "AAPL share price").
+2. **Analytic Trend Fusion Enhancement**: In `_build_context`, we fetch trending entities from `ClickHouse`. **Update**: Send these trending entities into the Agent Router so the system can proactively recommend or correlate user interactions with macroeconomic spikes before they ask.
+3. **Ollama Deployment Readiness**: Test the `bge_embedder_adapter` and `bge_reranker_adapter` simultaneously against an active `OllamaAdapter` container to benchmark hardware-level VRAM bottlenecks on local inference machines.
+4. **Knowledge Graph Integration**: Extract Triples (`Subject-Predicate-Object`) during the `_compress_document` step to progressively construct a Graph Database (Neo4j) alongside the Vector DB (Qdrant) for Multi-Hop reasoning queries in the future.

docs/rag_retrieval_presentation.md ADDED Viewed

	@@ -0,0 +1,126 @@

+---
+marp: true
+theme: default
+paginate: true
+header: 'Enterprise RAG Retrieval Architecture'
+footer: 'Hexagonal Architecture Data Flow'
+---
+# 🚀 The Enterprise RAG Retrieval Logic
+### Step-by-Step Data Flow Analysis
+This presentation covers the exact 9-step semantic retrieval and orchestration sequence used by the API to process complex user queries.
+**Case Study Query**: *"What happened with Apple stock recently?"*
+---
+# 1️⃣ Step 1: Ingestion & Intent Routing
+The front door of our architecture. Every request is intercepted by the **Agent Router** to prevent unnecessary Vector Database queries.
+- **Component**: `agent_router_use_case.py`
+- **Input Object**: `ChatRequest(query="What happened with Apple stock recently?", top_k=5)`
+- **LLM Classification Prompt**: *"Is this a NEWS search or an ACCOUNT search?"*
+- **Action**: The LLM analyzes the text and confidently outputs `NEWS`.
+- **Output Routing**: The Router dynamically forwards the payload to the specialized `RagChatUseCase`.
+---
+# 2️⃣ Step 2: Semantic Caching Layer
+Before spending LLM tokens or Cloud Compute, we check if this exact question has been asked and answered recently.
+- **Component**: `redis_adapter.py`
+- **Action**: `cache_port.generate_exact_hash()` deterministically calculates a SHA-256 hash representing the query string.
+- **Cache Check**: Does the key exist in the Redis cluster?
+- **Fast-Path**: If **Yes**, it returns the cached generation instantly, resulting in 0ms LLM time and $0 cost.
+- **Deep-Path**: If **No**, the query proceeds down the expensive RAG pipeline.
+---
+# 3️⃣ Step 3: Self-Query Extraction
+We translate the user's natural language into strict physical constraints and metadata filters for the database.
+- **Component**: `rag_chat_use_case.py -> _extract_intents()`
+- **Action**: The LLM parses the user text against available metadata schemas.
+- **Execution Insight**: The LLM identifies the word *"recently"* and maps it to a physical timeframe.
+- **LLM Output (JSON)**:
+  ```json
+  { "days_back": 3, "source": null }
+  ```
+- **Mapping**: `RagChatUseCase` creates a Qdrant `models.Filter` from this JSON, excluding old documents before math occurs.
+---
+# 4️⃣ Step 4: Text Vectorization
+We convert the query string into a mathematical representation using the massive BGE-M3 model.
+- **Component**: `bge_embedder_adapter.py`
+- **Action**: `encode_query()` passes the text into the embedded ML model.
+- **Model Processing**: The text is tokenized into both Dense and Sparse dimensions.
+- **Output Architecture**:
+  - **Dense Array**: `[0.123, -0.456, 0.789, ... 1024 dimensions]`
+  - **Sparse Lexical**: `{"indices": [102, 451, ...], "values": [0.92, 0.44, ...]}`
+---
+# 5️⃣ Step 5: Hybrid Vector Search
+We execute a high-performance database search combining math and exact keyword matching.
+- **Component**: `qdrant_adapter.py`
+- **Action**: Sends `query_vectors` and the extracted `days_back=3` physical filter to Qdrant via `vector_store_port.search()`.
+- **Database Processing**: Qdrant executes a **Reciprocal Rank Fusion (RRF)** query. It searches simultaneously for Semantic Meaning (Dense) and Exact Keyword Hits (Sparse).
+- **Yield**: Returns the top 20 nearest neighbor `SearchResult` documents.
+---
+# 6️⃣ Step 6: Temporal Bias Scoring
+Preventing historical hallucination by mathematically prioritizing fresh news over old news.
+- **Component**: `rag_chat_use_case.py -> _build_context()`
+- **Action**: Iterates over every returned document and examines its `published_at` timestamp.
+- **Mathematical Decay**:
+  - `score_multiplier = max(0.5, 1.0 - (days_old / 60))`
+  - The older the article, the lower its multiplier goes.
+- **Output**: A freshly re-scored list where newer, slightly less-relevant articles can outrank old, highly-relevant articles.
+---
+# 7️⃣ Step 7: Cross-Encoder Reranking
+Re-scoring every candidate directly against the query with a cross-encoder, so documents that were merely close in vector space but are not actually relevant get filtered out.
+- **Component**: `bge_reranker_adapter.py`
+- **Action**: Takes the top 20 decayed documents. It physically pairs the Query against the Document text block-by-block.
+  - `[[query, doc1_text], [query, doc2_text], ...]`
+- **Model Processing**: The HuggingFace FlagReranker calculates exact semantic overlap.
+- **Output**: Only the strict Top 5 (`top_k`) highest-scoring documents survive.
+---
+# 8️⃣ Step 8: Contextual Compression
+Squashing massive strings to fit gracefully into limited LLM context windows.
+- **Component**: `rag_chat_use_case.py -> _limit_context()`
+- **Action**: Uses `tiktoken` to calculate the total length of the surviving Top 5 documents.
+- **Compression Loop**: If the size exceeds 3000 tokens, it pipes overflowing documents individually to an LLM via `_compress_document()`.
+- **Extraction**: The LLM digests 800 words and outputs only bulleted facts relevant to "Apple Stock".
+- **Output**: A high-density, tightly packed `context_text` string.
+---
+# 9️⃣ Step 9: Final Generation
+The Orchestrator fuses all pipelines to deliver a hyper-accurate, hallucination-free answer.
+- **Component**: `llm_port.py`
+- **Action**: The packed `context_text`, the original `query`, and the user's `Chat History` are injected into a singular Prompt Template.
+- **Generation**: The LLM interprets the verified facts.
+  - *"Apple stock surged 4% after the latest earnings report..."*
+- **Final Cleanup**: The new answer string is permanently logged into Postgres (`chat_history`) and cached into Redis (`cache`) before being returned via the API.

download_models.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import os
+import sys
+# Monkeypatch for transformers/FlagEmbedding compatibility issue
+try:
+    import transformers.utils.import_utils
+    if not hasattr(transformers.utils.import_utils, 'is_torch_fx_available'):
+        transformers.utils.import_utils.is_torch_fx_available = lambda: False
+except Exception:
+    pass
+from FlagEmbedding import BGEM3FlagModel
+from sentence_transformers import CrossEncoder
+def download():
+    print("--- STARTING MODEL PRE-CACHE ---")
+    # 1. BGE-M3
+    model_name = "BAAI/bge-m3"
+    print(f"Downloading/Loading {model_name}...")
+    try:
+        # This will trigger the download if not present
+        _ = BGEM3FlagModel(model_name, use_fp16=True)
+        print(f"Successfully cached {model_name}")
+    except Exception as e:
+        print(f"Error caching {model_name}: {e}")
+    # 2. Reranker
+    reranker_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
+    print(f"Downloading/Loading {reranker_name}...")
+    try:
+        _ = CrossEncoder(reranker_name)
+        print(f"Successfully cached {reranker_name}")
+    except Exception as e:
+        print(f"Error caching {reranker_name}: {e}")
+    print("--- PRE-CACHE COMPLETE ---")
+if __name__ == "__main__":
+    download()

migrate_database.py ADDED Viewed

	@@ -0,0 +1,102 @@

+#!/usr/bin/env python3
+"""
+Database migration script to add missing columns to users table
+Run this once to update your Neon database schema
+"""
+import os
+from sqlalchemy import create_engine, text
+from src.core.config import settings
+def migrate_database():
+    """Add missing columns to users table"""
+    print("🔄 Starting database migration...")
+    print(f"Database URL: {settings.SQLALCHEMY_DATABASE_URI[:50]}...")
+    engine = create_engine(settings.SQLALCHEMY_DATABASE_URI)
+    migrations = [
+        # Add role column if it doesn't exist
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (SELECT 1 FROM information_schema.columns
+                          WHERE table_name='users' AND column_name='role') THEN
+                ALTER TABLE users ADD COLUMN role VARCHAR(20) DEFAULT 'user';
+                UPDATE users SET role = 'user' WHERE role IS NULL;
+                RAISE NOTICE 'Added role column';
+            ELSE
+                RAISE NOTICE 'role column already exists';
+            END IF;
+        END $$;
+        """,
+        # Add is_active column if it doesn't exist
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (SELECT 1 FROM information_schema.columns
+                          WHERE table_name='users' AND column_name='is_active') THEN
+                ALTER TABLE users ADD COLUMN is_active BOOLEAN DEFAULT TRUE;
+                UPDATE users SET is_active = TRUE WHERE is_active IS NULL;
+                RAISE NOTICE 'Added is_active column';
+            ELSE
+                RAISE NOTICE 'is_active column already exists';
+            END IF;
+        END $$;
+        """,
+        # Add full_name column if it doesn't exist
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (SELECT 1 FROM information_schema.columns
+                          WHERE table_name='users' AND column_name='full_name') THEN
+                ALTER TABLE users ADD COLUMN full_name VARCHAR(255);
+                RAISE NOTICE 'Added full_name column';
+            ELSE
+                RAISE NOTICE 'full_name column already exists';
+            END IF;
+        END $$;
+        """,
+        # Create refresh_tokens table if it doesn't exist
+        """
+        CREATE TABLE IF NOT EXISTS refresh_tokens (
+            id SERIAL PRIMARY KEY,
+            user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
+            token VARCHAR(500) NOT NULL UNIQUE,
+            expires_at TIMESTAMP NOT NULL,
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            revoked BOOLEAN DEFAULT FALSE
+        );
+        """,
+        # Create index on refresh_tokens
+        """
+        CREATE INDEX IF NOT EXISTS idx_refresh_tokens_user_id ON refresh_tokens(user_id);
+        CREATE INDEX IF NOT EXISTS idx_refresh_tokens_token ON refresh_tokens(token);
+        """,
+    ]
+    try:
+        with engine.connect() as conn:
+            for i, migration in enumerate(migrations, 1):
+                print(f"\n📝 Running migration {i}/{len(migrations)}...")
+                conn.execute(text(migration))
+                conn.commit()
+                print(f"✅ Migration {i} completed")
+        print("\n✅ All migrations completed successfully!")
+        print("\n🎉 Database schema is now up to date")
+        return True
+    except Exception as e:
+        print(f"\n❌ Migration failed: {e}")
+        return False
+    finally:
+        engine.dispose()
+if __name__ == "__main__":
+    success = migrate_database()
+    exit(0 if success else 1)

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+fastapi>=0.110.0
+uvicorn>=0.27.1
+pydantic>=2.9.0
+pydantic-settings>=2.2.1
+qdrant-client>=1.7.0
+clickhouse-connect>=0.7.3
+langchain>=0.1.13
+langchain-openai>=0.1.1   # covers Groq, Gemini, Together AI, OpenAI (all OpenAI-compatible)
+langchain-groq>=0.1.3
+python-dotenv>=1.0.1
+psycopg2-binary>=2.9.9
+SQLAlchemy>=2.0.29
+sentence-transformers>=2.7.0
+transformers>=4.40.0      # DeBERTa intent classifier
+torch>=2.0.0
+numpy>=1.26.0
+tiktoken>=0.6.0
+FlagEmbedding>=1.2.5
+redis>=5.0.0
+python-jose[cryptography]>=3.3.0
+passlib[bcrypt]>=1.7.4
+python-multipart>=0.0.9
+httpx>=0.27.0
+aiohttp>=3.9.0
+duckduckgo-search>=6.0.0  # Live search for hybrid RAG
+python-dateutil>=2.8.2    # Date parsing for live results

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Initialize src package

src/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (205 Bytes). View file

src/__pycache__/main.cpython-313.pyc ADDED Viewed

Binary file (3.32 kB). View file

src/api/__pycache__/dependencies.cpython-313.pyc ADDED Viewed

Binary file (4.35 kB). View file

src/api/dependencies.py ADDED Viewed

	@@ -0,0 +1,153 @@

+from fastapi import Depends
+from sqlalchemy.orm import Session
+from src.infrastructure.database import get_db
+# Adapters
+from src.infrastructure.adapters.bge_embedder_adapter import BgeEmbedderAdapter
+from src.infrastructure.adapters.qdrant_adapter import QdrantAdapter
+from src.infrastructure.adapters.bge_reranker_adapter import BgeRerankerAdapter
+from src.infrastructure.adapters.openai_adapter import OpenAiAdapter
+from src.infrastructure.adapters.ollama_adapter import OllamaAdapter
+from src.infrastructure.adapters.groq_adapter import GroqAdapter
+from src.infrastructure.adapters.gemini_adapter import GeminiAdapter
+from src.infrastructure.adapters.together_adapter import TogetherAdapter
+from src.infrastructure.adapters.huggingface_adapter import HuggingFaceAdapter
+from src.infrastructure.adapters.clickhouse_adapter import ClickHouseAdapter
+from src.infrastructure.adapters.postgres_adapter import PostgresAdapter
+from src.infrastructure.adapters.redis_adapter import RedisAdapter
+from src.infrastructure.adapters.duckduckgo_adapter import DuckDuckGoAdapter
+# Hybrid Search Components
+from src.core.orchestrator.query_orchestrator import QueryOrchestrator
+from src.core.ranking.hybrid_result_ranker import HybridResultRanker
+# Use Cases
+from src.core.use_cases.search_use_case import SearchUseCase
+from src.core.use_cases.rag_chat_use_case import RagChatUseCase
+from src.core.use_cases.analytics_use_case import AnalyticsUseCase
+# Global Singletons for Stateless Adapters to avoid reloading models per request
+embedder_adapter = BgeEmbedderAdapter()
+qdrant_adapter = QdrantAdapter()
+reranker_adapter = BgeRerankerAdapter()
+openai_adapter = OpenAiAdapter()
+ollama_adapter = OllamaAdapter()
+groq_adapter = GroqAdapter()
+gemini_adapter = GeminiAdapter()
+together_adapter = TogetherAdapter()
+huggingface_adapter = HuggingFaceAdapter()
+clickhouse_adapter = ClickHouseAdapter()
+redis_adapter = RedisAdapter()
+# Hybrid Search Singletons
+from src.core.config import settings
+duckduckgo_adapter = DuckDuckGoAdapter(
+    timeout=settings.LIVE_SEARCH_TIMEOUT,
+    max_results=settings.LIVE_SEARCH_MAX_RESULTS
+)
+query_orchestrator = QueryOrchestrator(
+    live_search_adapter=duckduckgo_adapter,
+    enable_hybrid=settings.ENABLE_HYBRID_SEARCH,
+    default_live_weight=settings.LIVE_SEARCH_WEIGHT,
+    default_db_weight=settings.DB_SEARCH_WEIGHT
+)
+hybrid_result_ranker = HybridResultRanker(reranker=reranker_adapter)
+# Model Pre-warming (Triggered dynamically if needed, usually on startup)
+def prewarm_models():
+    embedder_adapter._load_model()
+    reranker_adapter._load_model()
+# --- Dependency Providers ---
+def get_embedder_port():
+    return embedder_adapter
+def get_vector_store_port():
+    return qdrant_adapter
+def get_reranker_port():
+    return reranker_adapter
+from src.core.config import settings
+def get_llm_port():
+    provider = settings.LLM_PROVIDER.lower()
+    if provider == "groq":
+        return groq_adapter
+    elif provider == "gemini":
+        return gemini_adapter
+    elif provider == "together":
+        return together_adapter
+    elif provider == "huggingface" or provider == "hf":
+        return huggingface_adapter
+    elif provider == "ollama":
+        return ollama_adapter
+    return openai_adapter
+def get_analytics_db_port():
+    return clickhouse_adapter
+def get_chat_history_port(db: Session = Depends(get_db)):
+    return PostgresAdapter(db)
+def get_cache_port():
+    return redis_adapter
+def get_live_search_port():
+    return duckduckgo_adapter
+def get_query_orchestrator():
+    return query_orchestrator
+def get_hybrid_ranker():
+    return hybrid_result_ranker
+# --- Use Case Providers ---
+def get_search_use_case(
+    embedder=Depends(get_embedder_port),
+    vector_store=Depends(get_vector_store_port)
+):
+    return SearchUseCase(embedder, vector_store)
+def get_rag_chat_use_case(
+    embedder=Depends(get_embedder_port),
+    vector_store=Depends(get_vector_store_port),
+    reranker=Depends(get_reranker_port),
+    llm=Depends(get_llm_port),
+    chat_history=Depends(get_chat_history_port),
+    analytics_db=Depends(get_analytics_db_port),
+    cache=Depends(get_cache_port),
+    orchestrator=Depends(get_query_orchestrator),
+    hybrid_ranker=Depends(get_hybrid_ranker)
+):
+    return RagChatUseCase(
+        embedder=embedder,
+        vector_store=vector_store,
+        reranker=reranker,
+        llm=llm,
+        chat_history_db=chat_history,
+        analytics_db=analytics_db,
+        cache=cache,
+        orchestrator=orchestrator,
+        hybrid_ranker=hybrid_ranker
+    )
+from src.core.use_cases.account_use_case import AccountUseCase
+from src.core.use_cases.agent_router_use_case import AgentRouterUseCase
+def get_analytics_use_case(
+    analytics_db=Depends(get_analytics_db_port)
+):
+    return AnalyticsUseCase(analytics_db)
+def get_account_use_case():
+    return AccountUseCase()
+def get_agent_router_use_case(
+    llm=Depends(get_llm_port),
+    rag_chat=Depends(get_rag_chat_use_case),
+    account=Depends(get_account_use_case),
+    chat_history=Depends(get_chat_history_port)
+):
+    return AgentRouterUseCase(llm=llm, rag_chat=rag_chat, account=account, chat_history_db=chat_history)

src/api/routes/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Expose routers
2	+ from . import rag, analytics, interactions, accounts, news

src/api/routes/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (327 Bytes). View file

src/api/routes/__pycache__/accounts.cpython-313.pyc ADDED Viewed

Binary file (1.8 kB). View file

src/api/routes/__pycache__/analytics.cpython-313.pyc ADDED Viewed

Binary file (4.96 kB). View file

src/api/routes/__pycache__/auth.cpython-313.pyc ADDED Viewed

Binary file (2.39 kB). View file

src/api/routes/__pycache__/interactions.cpython-313.pyc ADDED Viewed

Binary file (4.76 kB). View file

src/api/routes/__pycache__/news.cpython-313.pyc ADDED Viewed

Binary file (4.87 kB). View file

src/api/routes/__pycache__/rag.cpython-313.pyc ADDED Viewed

Binary file (3.42 kB). View file

src/api/routes/accounts.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from fastapi import APIRouter, Depends, HTTPException, status
+from sqlalchemy.orm import Session
+from typing import List
+from src.infrastructure.database import get_db
+from src.core.domain.schemas import UserCreate, UserResponse, UserUpdate, PasswordChange, AdminUserUpdate
+from src.core.domain.db_models import User, UserRole
+from src.core.security import (
+    get_password_hash, verify_password,
+    get_current_user, require_super_admin
+)
+router = APIRouter()
+# ── Public ────────────────────────────────────────────────────────────────────
+@router.post("/register", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
+def register(user: UserCreate, db: Session = Depends(get_db)):
+    """Register a new user account (role defaults to 'user')."""
+    existing = db.query(User).filter(
+        (User.username == user.username) | (User.email == user.email)
+    ).first()
+    if existing:
+        raise HTTPException(status_code=400, detail="Username or email already registered")
+    new_user = User(
+        username=user.username,
+        email=user.email,
+        full_name=user.full_name,
+        hashed_password=get_password_hash(user.password),
+        role=UserRole.user,
+    )
+    db.add(new_user)
+    db.commit()
+    db.refresh(new_user)
+    return new_user
+# ── Authenticated user ────────────────────────────────────────────────────────
+@router.patch("/me", response_model=UserResponse)
+def update_profile(
+    body: UserUpdate,
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db)
+):
+    """Update own profile (username, full_name, email)."""
+    if body.username and body.username != current_user.username:
+        if db.query(User).filter(User.username == body.username).first():
+            raise HTTPException(status_code=400, detail="Username already taken")
+        current_user.username = body.username
+    if body.email and body.email != current_user.email:
+        if db.query(User).filter(User.email == body.email).first():
+            raise HTTPException(status_code=400, detail="Email already in use")
+        current_user.email = body.email
+    if body.full_name is not None:
+        current_user.full_name = body.full_name
+    db.commit()
+    db.refresh(current_user)
+    return current_user
+@router.post("/me/change-password", status_code=status.HTTP_204_NO_CONTENT)
+def change_password(
+    body: PasswordChange,
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db)
+):
+    """Change own password."""
+    if not verify_password(body.current_password, current_user.hashed_password):
+        raise HTTPException(status_code=400, detail="Current password is incorrect")
+    current_user.hashed_password = get_password_hash(body.new_password)
+    db.commit()
+# ── Super admin only ──────────────────────────────────────────────────────────
+@router.get("/users", response_model=List[UserResponse])
+def list_users(
+    skip: int = 0,
+    limit: int = 50,
+    _admin: User = Depends(require_super_admin),
+    db: Session = Depends(get_db)
+):
+    """List all users (super_admin only)."""
+    return db.query(User).offset(skip).limit(limit).all()
+@router.get("/users/{user_id}", response_model=UserResponse)
+def get_user(
+    user_id: int,
+    _admin: User = Depends(require_super_admin),
+    db: Session = Depends(get_db)
+):
+    """Get a specific user by ID (super_admin only)."""
+    user = db.query(User).filter(User.id == user_id).first()
+    if not user:
+        raise HTTPException(status_code=404, detail="User not found")
+    return user
+@router.patch("/users/{user_id}", response_model=UserResponse)
+def admin_update_user(
+    user_id: int,
+    body: AdminUserUpdate,
+    _admin: User = Depends(require_super_admin),
+    db: Session = Depends(get_db)
+):
+    """Update a user's role or active status (super_admin only)."""
+    user = db.query(User).filter(User.id == user_id).first()
+    if not user:
+        raise HTTPException(status_code=404, detail="User not found")
+    if body.is_active is not None:
+        user.is_active = body.is_active
+    if body.role is not None:
+        try:
+            user.role = UserRole(body.role)
+        except ValueError:
+            raise HTTPException(status_code=400, detail="Invalid role. Must be 'super_admin' or 'user'")
+    db.commit()
+    db.refresh(user)
+    return user
+@router.delete("/users/{user_id}", status_code=status.HTTP_204_NO_CONTENT)
+def delete_user(
+    user_id: int,
+    admin: User = Depends(require_super_admin),
+    db: Session = Depends(get_db)
+):
+    """Delete a user (super_admin only). Cannot delete yourself."""
+    if user_id == admin.id:
+        raise HTTPException(status_code=400, detail="Cannot delete your own account")
+    user = db.query(User).filter(User.id == user_id).first()
+    if not user:
+        raise HTTPException(status_code=404, detail="User not found")
+    db.delete(user)
+    db.commit()

src/api/routes/analytics.py ADDED Viewed

	@@ -0,0 +1,117 @@

+from fastapi import APIRouter, Depends
+from src.core.use_cases.analytics_use_case import AnalyticsUseCase
+from src.core.ports.vector_store_port import VectorStorePort
+from src.api.dependencies import get_analytics_use_case, get_vector_store_port
+router = APIRouter()
+@router.get("/sentiment")
+def get_sentiment(analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)):
+    query = """
+    SELECT
+        entity,
+        avg(sentiment_score) as avg_sentiment,
+        count() as mention_count
+    FROM sentiment_results
+    GROUP BY entity
+    ORDER BY mention_count DESC
+    LIMIT 10
+    """
+    results = analytics_use_case.execute_raw_query(query)
+    if not results or "error" in results:
+        return {"error": "Could not fetch sentiment."}
+    data = []
+    for row in results.get("rows", []):
+        data.append({
+            "entity": row[0],
+            "avg_sentiment": float(row[1]),
+            "mention_count": int(row[2])
+        })
+    return {"data": data}
+@router.get("/trends")
+def get_trends(
+    days: int = 7,
+    analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)
+):
+    results = analytics_use_case.get_trends(days)
+    if not results or "error" in results:
+        return {"error": "Could not fetch trends."}
+    data = []
+    for row in results.get("rows", []):
+        data.append({
+            "topic": row[0],
+            "momentum": float(row[1]),
+            "volume": int(row[2])
+        })
+    return {"data": data}
+@router.get("/articles-over-time")
+def get_articles_over_time(
+    days: int = 30,
+    analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)
+):
+    query = f"""
+    SELECT
+        toDate(scraped_at) as date,
+        count() as article_count
+    FROM sentiment_results
+    GROUP BY date
+    ORDER BY date ASC
+    """
+    results = analytics_use_case.execute_raw_query(query)
+    if not results or "error" in results:
+        return {"error": "Could not fetch articles over time."}
+    data = []
+    for row in results.get("rows", []):
+        data.append({
+            "date": str(row[0]),
+            "count": int(row[1])
+        })
+    return {"data": data}
+@router.get("/source-stats")
+def get_source_stats(analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)):
+    query = """
+    SELECT
+        source,
+        count() as article_count,
+        avg(sentiment_score) as avg_sentiment
+    FROM sentiment_results
+    GROUP BY source
+    ORDER BY article_count DESC
+    """
+    results = analytics_use_case.execute_raw_query(query)
+    if not results or "error" in results:
+        return {"error": "Could not fetch source stats."}
+    data = []
+    for row in results.get("rows", []):
+        data.append({
+            "source": row[0],
+            "article_count": int(row[1]),
+            "avg_sentiment": float(row[2])
+        })
+    return {"data": data}
+@router.get("/pipeline-stats")
+def get_pipeline_stats(
+    analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case),
+    vector_store: VectorStorePort = Depends(get_vector_store_port)
+):
+    qdrant_stats = vector_store.get_collection_stats()
+    query = "SELECT count() FROM sentiment_results"
+    ch_res = analytics_use_case.execute_raw_query(query)
+    ch_count = 0
+    if ch_res and not "error" in ch_res and ch_res.get("rows"):
+        ch_count = int(ch_res["rows"][0][0])
+    return {
+        "total_articles_in_vector_db": qdrant_stats.get("vectors_count", 0) if qdrant_stats else 0,
+        "total_sentiment_results": ch_count
+    }

src/api/routes/auth.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi.security import OAuth2PasswordRequestForm
+from sqlalchemy.orm import Session
+from datetime import timedelta
+from src.infrastructure.database import get_db
+from src.core.domain.db_models import User
+from src.core.domain.schemas import TokenResponse, RefreshRequest, UserResponse
+from src.core.security import (
+    verify_password, create_access_token, create_refresh_token,
+    rotate_refresh_token, revoke_all_refresh_tokens,
+    get_current_user
+)
+from src.core.config import settings
+router = APIRouter()
+@router.post("/login", response_model=TokenResponse)
+def login(form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
+    """Login with email + password. Returns access + refresh tokens."""
+    user = db.query(User).filter(User.email == form_data.username).first()
+    if not user or not verify_password(form_data.password, user.hashed_password):
+        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Incorrect email or password")
+    if not user.is_active:
+        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Account is disabled")
+    access_token = create_access_token(
+        data={"sub": user.email, "role": user.role},
+        expires_delta=timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
+    )
+    refresh_token = create_refresh_token(user.id, db)
+    return TokenResponse(access_token=access_token, refresh_token=refresh_token)
+@router.post("/refresh", response_model=TokenResponse)
+def refresh_tokens(body: RefreshRequest, db: Session = Depends(get_db)):
+    """Exchange a valid refresh token for a new access + refresh token pair."""
+    new_refresh, user = rotate_refresh_token(body.refresh_token, db)
+    access_token = create_access_token(
+        data={"sub": user.email, "role": user.role},
+        expires_delta=timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
+    )
+    return TokenResponse(access_token=access_token, refresh_token=new_refresh)
+@router.post("/logout", status_code=status.HTTP_204_NO_CONTENT)
+def logout(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
+    """Revoke all refresh tokens for the current user (full logout)."""
+    revoke_all_refresh_tokens(current_user.id, db)
+@router.get("/me", response_model=UserResponse)
+def get_me(current_user: User = Depends(get_current_user)):
+    """Return the profile of the currently authenticated user.
+
+    The ORM ``User`` resolved by ``get_current_user`` is returned as-is;
+    FastAPI serializes it through the ``UserResponse`` response model.
+    """
+    return current_user

src/api/routes/interactions.py ADDED Viewed

	@@ -0,0 +1,127 @@

+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+from src.infrastructure.database import get_db
+from sqlalchemy import func
+from src.core.domain.schemas import FeedbackRequest, ChatSession
+from src.core.domain.db_models import ChatHistory, Feedback
+from src.core.security import get_current_user
+from src.core.domain.db_models import User
+from typing import Optional
+router = APIRouter()
+@router.get("/history/{session_id}")
+def get_chat_history(session_id: str, db: Session = Depends(get_db)):
+    history = db.query(ChatHistory).filter(
+        ChatHistory.session_id == session_id
+    ).order_by(ChatHistory.timestamp.asc()).all()
+    formatted_history = []
+    for h in history:
+        formatted_history.append({
+            "id": h.id,
+            "role": h.role,
+            "content": h.content,
+            "timestamp": h.timestamp,
+            "pinned": getattr(h, "pinned", False),
+        })
+    return {"session_id": session_id, "history": formatted_history}
+@router.post("/feedback")
+def submit_feedback(req: FeedbackRequest, db: Session = Depends(get_db)):
+    msg = db.query(ChatHistory).filter(
+        ChatHistory.id == req.message_id,
+        ChatHistory.session_id == req.session_id
+    ).first()
+    if not msg:
+        raise HTTPException(status_code=404, detail="Message not found in session")
+    # Upsert: update existing feedback or create new
+    existing = db.query(Feedback).filter(
+        Feedback.message_id == req.message_id,
+        Feedback.session_id == req.session_id
+    ).first()
+    if existing:
+        existing.rating = req.rating
+        existing.comment = req.comment
+    else:
+        feedback = Feedback(
+            session_id=req.session_id,
+            message_id=req.message_id,
+            rating=req.rating,
+            comment=req.comment
+        )
+        db.add(feedback)
+    db.commit()
+    return {"status": "success", "message": "Feedback recorded."}
+@router.get("/feedback/{session_id}")
+def get_session_feedback(session_id: str, db: Session = Depends(get_db)):
+    """Get all feedback ratings for a session (so UI can restore like/dislike state)."""
+    feedbacks = db.query(Feedback).filter(Feedback.session_id == session_id).all()
+    return {str(f.message_id): f.rating for f in feedbacks}
+@router.post("/pin/{message_id}")
+def pin_message(message_id: int, db: Session = Depends(get_db)):
+    """Toggle pin on a message."""
+    msg = db.query(ChatHistory).filter(ChatHistory.id == message_id).first()
+    if not msg:
+        raise HTTPException(status_code=404, detail="Message not found")
+    # Toggle pinned — add column if missing via getattr
+    current = getattr(msg, "pinned", False) or False
+    msg.pinned = not current
+    db.commit()
+    return {"pinned": msg.pinned}
+@router.get("/sessions")
+def get_chat_sessions(
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db)
+):
+    """Retrieve sessions for the authenticated user only."""
+    sessions = db.query(
+        ChatHistory.session_id,
+        func.count(ChatHistory.id).label("message_count"),
+        func.max(ChatHistory.timestamp).label("last_active")
+    ).filter(
+        ChatHistory.user_id == current_user.id
+    ).group_by(ChatHistory.session_id).order_by(
+        func.max(ChatHistory.timestamp).desc()
+    ).all()
+    return [
+        ChatSession(
+            session_id=s.session_id,
+            message_count=s.message_count,
+            last_active=s.last_active
+        )
+        for s in sessions
+    ]
+@router.delete("/sessions/{session_id}")
+def delete_chat_session(
+    session_id: str,
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db)
+):
+    """Delete session — only owner can delete."""
+    # Verify ownership
+    owned = db.query(ChatHistory).filter(
+        ChatHistory.session_id == session_id,
+        ChatHistory.user_id == current_user.id
+    ).first()
+    if not owned:
+        raise HTTPException(status_code=404, detail="Session not found")
+    db.query(Feedback).filter(Feedback.session_id == session_id).delete()
+    deleted_msgs = db.query(ChatHistory).filter(ChatHistory.session_id == session_id).delete()
+    db.commit()
+    return {"status": "success", "message": f"Deleted session {session_id} with {deleted_msgs} messages."}

src/api/routes/news.py ADDED Viewed

	@@ -0,0 +1,138 @@

+from fastapi import APIRouter, Query, HTTPException, Depends
+from typing import Optional
+from src.core.ports.vector_store_port import VectorStorePort
+from src.core.ports.embedder_port import EmbedderPort
+from src.core.use_cases.analytics_use_case import AnalyticsUseCase
+from src.api.dependencies import get_vector_store_port, get_embedder_port, get_analytics_use_case
+from src.core.domain.schemas import BrowseResponse, SearchResponse, NewsArticle
+router = APIRouter()
+def _dict_to_article(payload: dict, score: float = None) -> NewsArticle:
+    if payload is None:
+        payload = {}
+    # Safely get metadata
+    metadata = payload.get("metadata") if payload else {}
+    if metadata is None:
+        metadata = {}
+    # Title can be stored at top-level payload OR nested inside metadata
+    title = (
+        payload.get("title")
+        or (metadata.get("title") if isinstance(metadata, dict) else None)
+    )
+    return NewsArticle(
+        doc_id=payload.get("doc_id", "unknown"),
+        url=payload.get("url"),
+        title=title,
+        content=payload.get("text", payload.get("content", "")),
+        source=payload.get("source"),
+        published_at=payload.get("published_at"),
+        score=score,
+        metadata=metadata if isinstance(metadata, dict) else {}
+    )
+@router.get("/latest", response_model=BrowseResponse)
+def get_latest_news(
+    limit: int = Query(10, le=50),
+    source: Optional[str] = None,
+    language: Optional[str] = None,
+    vector_store: VectorStorePort = Depends(get_vector_store_port)
+):
+    """Get latest news articles sorted by publication date"""
+    try:
+        result = vector_store.browse(limit=limit, offset=0, source=source, language=language)
+        # Convert Qdrant points to articles and sort by published_at
+        articles = []
+        for p in result["articles"]:
+            article = _dict_to_article(p.payload or {}, getattr(p, "score", None))
+            articles.append(article)
+        # Sort by published_at descending (latest first)
+        articles.sort(key=lambda x: x.published_at or "", reverse=True)
+        # Ensure next_offset is an integer or None
+        next_offset = result.get("next_offset")
+        if next_offset is not None and not isinstance(next_offset, int):
+            next_offset = None  # If it's not an int, set to None
+        return BrowseResponse(
+            total_returned=len(articles),
+            articles=articles,
+            next_offset=next_offset
+        )
+    except Exception as e:
+        # Return empty response on error instead of 500
+        print(f"Error fetching news: {e}")
+        return BrowseResponse(
+            total_returned=0,
+            articles=[],
+            next_offset=None
+        )
+@router.get("/browse", response_model=BrowseResponse)
+def browse_news(
+    limit: int = Query(20, le=100),
+    offset: int = 0,
+    source: Optional[str] = None,
+    language: Optional[str] = None,
+    vector_store: VectorStorePort = Depends(get_vector_store_port)
+):
+    result = vector_store.browse(limit=limit, offset=offset, source=source, language=language)
+    # Qdrant scroll returns points with .payload
+    articles = []
+    for p in result["articles"]:
+        articles.append(_dict_to_article(p.payload or {}, getattr(p, "score", None)))
+    return BrowseResponse(
+        total_returned=len(articles),
+        articles=articles,
+        next_offset=result["next_offset"]
+    )
+@router.get("/search", response_model=SearchResponse)
+def search_news(
+    q: str = Query(..., min_length=1),
+    top_k: int = Query(10, le=50),
+    source: Optional[str] = None,
+    language: Optional[str] = None,
+    embedder: EmbedderPort = Depends(get_embedder_port),
+    vector_store: VectorStorePort = Depends(get_vector_store_port)
+):
+    try:
+        query_vector = embedder.encode_query(q)
+        results = vector_store.search(
+            query_vectors=query_vector,
+            limit=top_k,
+            source_filter=source,
+            language_filter=language
+        )
+        # VectorStorePort.search returns SearchResult objects (content, metadata, score, doc_id)
+        articles = []
+        for r in results:
+            articles.append(_dict_to_article(r.metadata, r.score))
+        return SearchResponse(results=articles)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.get("/sources")
+def get_sources(analytics: AnalyticsUseCase = Depends(get_analytics_use_case)):
+    query = "SELECT source, count() as article_count FROM sentiment_results GROUP BY source ORDER BY article_count DESC"
+    res = analytics.execute_raw_query(query)
+    if res and res.get("rows"):
+         sources = [row[0] for row in res["rows"] if row[0]]
+         return {"sources": sources}
+    return {"sources": []}
+@router.get("/{doc_id}", response_model=NewsArticle)
+def get_news_article(doc_id: str, vector_store: VectorStorePort = Depends(get_vector_store_port)):
+    """Fetch a single article by its document id.
+
+    This catch-all path is declared after the fixed paths of this router
+    (/latest, /browse, /search, /sources) so those are matched first.
+
+    Raises:
+        HTTPException 404: the vector store has no entry for ``doc_id``.
+    """
+    result = vector_store.get_by_doc_id(doc_id)
+    if not result:
+        raise HTTPException(status_code=404, detail="Article not found")
+    # The stored fields live in result.metadata; the score is passed through unchanged.
+    return _dict_to_article(result.metadata, result.score)

src/api/routes/rag.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+import uuid
+from typing import Optional
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from src.core.domain.schemas import ChatRequest, ChatResponse, SearchResponse
+from src.core.use_cases.search_use_case import SearchUseCase
+from src.core.use_cases.rag_chat_use_case import RagChatUseCase
+from src.core.use_cases.agent_router_use_case import AgentRouterUseCase
+from src.api.dependencies import get_search_use_case, get_rag_chat_use_case, get_agent_router_use_case
+from src.core.security import get_current_user
+from src.core.domain.db_models import User
+from jose import jwt, JWTError
+from src.core.config import settings
+from src.infrastructure.database import get_db
+from sqlalchemy.orm import Session
+router = APIRouter()
+# Optional bearer — doesn't raise if token is missing
+_optional_bearer = OAuth2PasswordBearer(tokenUrl=f"{settings.API_V1_STR}/auth/login", auto_error=False)
+def get_optional_user(
+    token: Optional[str] = Depends(_optional_bearer),
+    db: Session = Depends(get_db)
+) -> Optional[User]:
+    """Returns the authenticated user or None for guests."""
+    if not token:
+        return None
+    try:
+        payload = jwt.decode(token, settings.SECRET_KEY, algorithms=["HS256"])
+        if payload.get("type") != "access":
+            return None
+        email = payload.get("sub")
+        if not email:
+            return None
+        user = db.query(User).filter(User.email == email).first()
+        return user if user and user.is_active else None
+    except JWTError:
+        return None
+@router.post("/search")
+def direct_search(
+    request: ChatRequest,
+    search_use_case: SearchUseCase = Depends(get_search_use_case),
+    current_user: User = Depends(get_current_user)
+):
+    """Lightning-fast hybrid search bypassing the LLM."""
+    try:
+        results = search_use_case.execute(
+            query=request.query,
+            limit=request.top_k,
+            source_filter=request.source_filter,
+            language_filter=request.language_filter,
+            days_back=getattr(request, 'days_back', None)
+        )
+        hits = [{"content": r.content, "metadata": r.metadata, "score": r.score, "doc_id": r.doc_id} for r in results]
+        return {"results": hits, "query": request.query}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/chat/test", response_model=ChatResponse)
+def chat_test(
+    request: ChatRequest,
+    agent_router_use_case: AgentRouterUseCase = Depends(get_agent_router_use_case)
+):
+    """Test RAG chat endpoint without authentication for debugging"""
+    # Get endpoint from environment variable or use default
+    os.getenv("RAG_ENDPOINT", "/rag/chat/test")
+    result = agent_router_use_case.execute_chat(request)
+    return result
+@router.post("/chat/stream")
+async def chat_with_rag_stream(
+    request: ChatRequest,
+    router_use_case: AgentRouterUseCase = Depends(get_agent_router_use_case),
+    current_user: Optional[User] = Depends(get_optional_user)
+):
+    """Streaming RAG chat. Works for both authenticated users and guests."""
+    try:
+        if current_user is None and not request.session_id:
+            request.session_id = f"guest_{uuid.uuid4().hex[:12]}"
+        user_id = current_user.id if current_user else None
+        return StreamingResponse(
+            router_use_case.execute_stream(request, is_guest=(current_user is None), user_id=user_id),
+            media_type="text/event-stream"
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))

src/core/__pycache__/config.cpython-313.pyc ADDED Viewed

Binary file (4.02 kB). View file

src/core/__pycache__/security.cpython-313.pyc ADDED Viewed

Binary file (6.26 kB). View file

src/core/config.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import os
+from pydantic_settings import BaseSettings, SettingsConfigDict
+class Settings(BaseSettings):
+    model_config = SettingsConfigDict(env_file=".env", env_file_encoding='utf-8', extra='ignore')
+    PROJECT_NAME: str = "RAG API Service"
+    API_V1_STR: str = "/api/v1"
+    QDRANT_HOST: str = os.getenv("QDRANT_HOST", "localhost")
+    QDRANT_PORT: int = int(os.getenv("QDRANT_PORT", "6333"))
+    QDRANT_URL: str = os.getenv("QDRANT_URL", "")          # Cloud URL (overrides host/port)
+    QDRANT_API_KEY: str = os.getenv("QDRANT_API_KEY", "")  # Cloud API Key
+    QDRANT_COLLECTION: str = os.getenv("QDRANT_COLLECTION", "news_articles")
+    CLICKHOUSE_HOST: str = os.getenv("CLICKHOUSE_HOST", "localhost")
+    CLICKHOUSE_PORT: int = int(os.getenv("CLICKHOUSE_PORT", "8123"))
+    CLICKHOUSE_USER: str = os.getenv("CLICKHOUSE_USER", "default")
+    CLICKHOUSE_PASSWORD: str = os.getenv("CLICKHOUSE_PASSWORD", "")
+    CLICKHOUSE_DB: str = os.getenv("CLICKHOUSE_DB", "default")
+    CLICKHOUSE_SECURE: bool = os.getenv("CLICKHOUSE_SECURE", "false").lower() == "true"
+    # Embedding Model Config
+    EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "BAAI/bge-m3")
+    VECTOR_SIZE: int = int(os.getenv("VECTOR_SIZE", "1024"))
+    RERANKER_MODEL: str = os.getenv("RERANKER_MODEL", "BAAI/bge-reranker-v2-m3")
+    # PostgreSQL / Neon Config
+    DATABASE_URL: str = os.getenv("DATABASE_URL", "")  # Full Neon URL (overrides individual fields)
+    POSTGRES_USER: str = os.getenv("POSTGRES_USER", "postgres")
+    POSTGRES_PASSWORD: str = os.getenv("POSTGRES_PASSWORD", "postgres")
+    POSTGRES_SERVER: str = os.getenv("POSTGRES_SERVER", "localhost")
+    POSTGRES_PORT: str = os.getenv("POSTGRES_PORT", "5432")
+    POSTGRES_DB: str = os.getenv("POSTGRES_DB", "rag_interactions")
+    @property
+    def SQLALCHEMY_DATABASE_URI(self) -> str:
+        if self.DATABASE_URL:
+            return self.DATABASE_URL
+        return f"postgresql://{self.POSTGRES_USER}:{self.POSTGRES_PASSWORD}@{self.POSTGRES_SERVER}:{self.POSTGRES_PORT}/{self.POSTGRES_DB}"
+    # LLM Settings
+    # Supported providers: "groq", "gemini", "together", "openai", "ollama"
+    LLM_PROVIDER: str = os.getenv("LLM_PROVIDER", "groq")
+    # Groq — free, 200+ tok/s, llama-3.3-70b-versatile | https://console.groq.com
+    OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
+    GROQ_API_KEY: str = os.getenv("GROQ_API_KEY", "")
+    GROQ_MODEL: str = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
+    # Google Gemini — free tier (15 RPM / 1M TPM) | https://aistudio.google.com/apikey
+    GEMINI_API_KEY: str = os.getenv("GEMINI_API_KEY", "")
+    GEMINI_MODEL: str = os.getenv("GEMINI_MODEL", "gemini-1.5-flash")
+    # Together AI — free $25 credit | https://api.together.ai
+    TOGETHER_API_KEY: str = os.getenv("TOGETHER_API_KEY", "")
+    TOGETHER_MODEL: str = os.getenv("TOGETHER_MODEL", "meta-llama/Llama-3.3-70B-Instruct-Turbo")
+    # HuggingFace Inference API — free with HF token | https://huggingface.co/settings/tokens
+    HF_TOKEN: str = os.getenv("HF_TOKEN", "")
+    HF_MODEL: str = os.getenv("HF_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
+    # Ollama — local inference
+    OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+    OLLAMA_MODEL: str = os.getenv("OLLAMA_MODEL", "llama3.2")
+    # Redis Settings
+    REDIS_URL: str = os.getenv("REDIS_URL", "")          # Full URL (Upstash) - overrides host/port
+    REDIS_HOST: str = os.getenv("REDIS_HOST", "localhost")
+    REDIS_PORT: int = int(os.getenv("REDIS_PORT", "6380"))
+    REDIS_DB: int = int(os.getenv("REDIS_DB", "0"))
+    REDIS_PASSWORD: str = os.getenv("REDIS_PASSWORD", "")
+    # Hybrid Search Settings
+    ENABLE_HYBRID_SEARCH: bool = os.getenv("ENABLE_HYBRID_SEARCH", "true").lower() == "true"
+    LIVE_SEARCH_TIMEOUT: float = float(os.getenv("LIVE_SEARCH_TIMEOUT", "2.0"))
+    LIVE_SEARCH_MAX_RESULTS: int = int(os.getenv("LIVE_SEARCH_MAX_RESULTS", "5"))
+    LIVE_SEARCH_WEIGHT: float = float(os.getenv("LIVE_SEARCH_WEIGHT", "0.5"))
+    DB_SEARCH_WEIGHT: float = float(os.getenv("DB_SEARCH_WEIGHT", "0.5"))
+    # Cache Settings (TTL in seconds)
+    CACHE_RESPONSE_TTL: int = int(os.getenv("CACHE_RESPONSE_TTL", "300"))      # 5 minutes
+    CACHE_LIVE_TTL: int = int(os.getenv("CACHE_LIVE_TTL", "600"))              # 10 minutes
+    CACHE_TRANSLATION_TTL: int = int(os.getenv("CACHE_TRANSLATION_TTL", "3600"))  # 1 hour
+    CACHE_INTENT_TTL: int = int(os.getenv("CACHE_INTENT_TTL", "3600"))         # 1 hour
+    # Security Settings
+    SECRET_KEY: str = os.getenv("SECRET_KEY", "a_very_secret_key_change_me_in_production")
+    ACCESS_TOKEN_EXPIRE_MINUTES: int = int(os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES", "60"))
+settings = Settings()

src/core/domain/__pycache__/db_models.cpython-313.pyc ADDED Viewed

Binary file (3.61 kB). View file

src/core/domain/__pycache__/schemas.cpython-313.pyc ADDED Viewed

Binary file (5.02 kB). View file

src/core/domain/db_models.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, CheckConstraint, Boolean, Enum
+from sqlalchemy.orm import declarative_base, relationship
+from datetime import datetime
+import enum
+Base = declarative_base()
+class UserRole(str, enum.Enum):
+    """Closed set of account roles; members are also plain strings."""
+    super_admin = "super_admin"
+    user = "user"
+class User(Base):
+    """Account row in the ``users`` table."""
+    __tablename__ = "users"
+    id = Column(Integer, primary_key=True, index=True)
+    # username and email are both required and unique
+    username = Column(String, unique=True, index=True, nullable=False)
+    email = Column(String, unique=True, index=True, nullable=False)
+    hashed_password = Column(String, nullable=False)
+    role = Column(Enum(UserRole), default=UserRole.user, nullable=False)
+    is_active = Column(Boolean, default=True, nullable=False)
+    full_name = Column(String, nullable=True)
+    # datetime.utcnow yields naive datetimes expressed in UTC
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+    # Removing a user (or detaching a token from it) deletes the token rows too
+    refresh_tokens = relationship("RefreshToken", back_populates="user", cascade="all, delete-orphan")
+class RefreshToken(Base):
+    """Refresh token row in ``refresh_tokens``, owned by one user."""
+    __tablename__ = "refresh_tokens"
+    id = Column(Integer, primary_key=True, index=True)
+    # Unique and indexed so a token can be looked up by its value
+    token = Column(String, unique=True, index=True, nullable=False)
+    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
+    expires_at = Column(DateTime, nullable=False)
+    # New tokens start out not revoked
+    revoked = Column(Boolean, default=False, nullable=False)
+    created_at = Column(DateTime, default=datetime.utcnow)
+    user = relationship("User", back_populates="refresh_tokens")
+class ChatHistory(Base):
+    """One chat message in the ``chat_history`` table, grouped by ``session_id``."""
+    __tablename__ = "chat_history"
+    id = Column(Integer, primary_key=True, index=True)
+    session_id = Column(String, index=True, nullable=False)
+    # Nullable: a message does not have to belong to a registered user
+    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
+    role = Column(String, nullable=False)
+    content = Column(Text, nullable=False)
+    # Free-form text; presumably a serialized list of document ids — verify against the writer
+    retrieved_doc_ids = Column(Text, nullable=True)
+    pinned = Column(Boolean, default=False, nullable=False)
+    timestamp = Column(DateTime, default=datetime.utcnow)
+class Feedback(Base):
+    """Rating (and optional comment) on a chat message, in the ``feedback`` table."""
+    __tablename__ = "feedback"
+    id = Column(Integer, primary_key=True, index=True)
+    session_id = Column(String, index=True, nullable=False)
+    # Plain integer, not a foreign key to chat_history.id
+    message_id = Column(Integer, nullable=False)
+    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
+    # The database rejects any rating other than 1 (like) or -1 (dislike)
+    rating = Column(Integer, CheckConstraint('rating IN (1, -1)'))
+    comment = Column(Text, nullable=True)
+    timestamp = Column(DateTime, default=datetime.utcnow)

src/core/domain/schemas.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from pydantic import BaseModel
+from typing import List, Optional, Any
+from datetime import datetime
+class ChatRequest(BaseModel):
+    """Request body for the chat and direct-search endpoints."""
+    query: str
+    session_id: Optional[str] = None
+    top_k: int = 7          # increased from 5 — gives multilingual diversity room
+    source_filter: Optional[str] = None
+    language_filter: Optional[str] = None
+class SourceDocument(BaseModel):
+    """A retrieved document cited alongside a chat answer."""
+    content: str
+    metadata: dict
+    score: float
+class ChatResponse(BaseModel):
+    """Non-streaming chat answer together with its supporting sources."""
+    answer: str
+    sources: List[SourceDocument]
+    session_id: str = "anonymous"
+class FeedbackRequest(BaseModel):
+    """Request body for rating one message of a session."""
+    session_id: str
+    message_id: int
+    # Any int passes schema validation here; the feedback table itself only allows 1 or -1
+    rating: int
+    comment: Optional[str] = None
+class SentimentData(BaseModel):
+    """Sentiment aggregate for a single entity."""
+    entity: str
+    avg_sentiment: float
+    mention_count: int
+class TrendData(BaseModel):
+    """Volume and momentum figures for a single topic."""
+    topic: str
+    volume: int
+    momentum: float
+class NewsArticle(BaseModel):
+    """A news article as returned by the news endpoints."""
+    doc_id: str
+    url: Optional[str] = None
+    title: Optional[str] = None
+    content: str
+    source: Optional[str] = None
+    # Kept as a string, not parsed into a datetime
+    published_at: Optional[str] = None
+    score: Optional[float] = None
+    metadata: dict = {}
+class BrowseResponse(BaseModel):
+    """One page of browsed articles plus the offset of the next page, if any."""
+    total_returned: int
+    articles: List[NewsArticle]
+    next_offset: Optional[int] = None
+class SearchResponse(BaseModel):
+    """Result list of a news search."""
+    results: List[NewsArticle]
+class SourceStat(BaseModel):
+    """Per-source article count and average sentiment."""
+    source: str
+    article_count: int
+    avg_sentiment: float
+class PipelineStats(BaseModel):
+    """Headline counts for the vector store and the sentiment table."""
+    total_articles_in_vector_db: int
+    total_sentiment_results: int
+class ChatSession(BaseModel):
+    """Summary of one chat session: id, message count and last activity time."""
+    session_id: str
+    message_count: int
+    last_active: Optional[datetime] = None
+class UserCreate(BaseModel):
+    """Payload for registering a new user (plain-text password)."""
+    username: str
+    email: str
+    password: str
+    full_name: Optional[str] = None
+class UserResponse(BaseModel):
+    """Public view of a user; carries no password field."""
+    id: int
+    username: str
+    email: str
+    full_name: Optional[str] = None
+    role: str
+    is_active: bool
+    created_at: Optional[datetime] = None
+    class Config:
+        # Allow building this model from attribute-bearing objects such as ORM rows
+        from_attributes = True
+class UserUpdate(BaseModel):
+    """Partial profile update; every field is optional."""
+    username: Optional[str] = None
+    full_name: Optional[str] = None
+    email: Optional[str] = None
+class PasswordChange(BaseModel):
+    """Payload carrying the current and the new password."""
+    current_password: str
+    new_password: str
+class TokenResponse(BaseModel):
+    """Access/refresh token pair returned by the auth endpoints."""
+    access_token: str
+    refresh_token: str
+    token_type: str = "bearer"
+class RefreshRequest(BaseModel):
+    """Request body carrying the refresh token to exchange."""
+    refresh_token: str
+class AdminUserUpdate(BaseModel):
+    """Admin-only partial update of a user's active flag and/or role."""
+    is_active: Optional[bool] = None
+    role: Optional[str] = None

src/core/orchestrator/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Query Orchestrator Module

src/core/orchestrator/query_orchestrator.py ADDED Viewed

	@@ -0,0 +1,434 @@

+"""
+Query Orchestrator
+Orchestrates hybrid search between live sources (DuckDuckGo) and database (Qdrant).
+Integrates seamlessly with the existing multilingual RAG pipeline.
+Key Features:
+- Intelligent search strategy selection (live, DB, or hybrid)
+- Uses production-grade intent classification (v2)
+- Parallel execution of live and database searches
+- Integration with existing 6-language multilingual pipeline
+- Graceful fallbacks when live search fails
+- Cache-aware execution
+"""
+import logging
+import asyncio
+from typing import Dict, Any, List, Optional, Tuple
+from datetime import datetime
+logger = logging.getLogger(__name__)
+class SearchStrategy:
+    """
+    Search strategy configuration.
+    Determines which sources to use and how to weight them.
+    """
+    def __init__(
+        self,
+        use_live: bool = True,
+        use_db: bool = True,
+        live_weight: float = 0.5,
+        db_weight: float = 0.5,
+        reason: str = "",
+        intent_result: Optional[Any] = None  # IntentResult from v2 classifier
+    ):
+        self.use_live = use_live
+        self.use_db = use_db
+        self.live_weight = live_weight
+        self.db_weight = db_weight
+        self.reason = reason
+        self.intent_result = intent_result  # Store full intent result for debugging
+    def __repr__(self):
+        return (
+            f"SearchStrategy(live={self.use_live}, db={self.use_db}, "
+            f"weights={self.live_weight:.1f}/{self.db_weight:.1f}, "
+            f"reason='{self.reason}')"
+        )
+class QueryOrchestrator:
+    """
+    Orchestrates hybrid search between live sources and database.
+    Integrates with existing multilingual pipeline:
+    - Reuses dense vectors (computed once from English)
+    - Reuses sparse vectors (batched for 6 languages)
+    - Adds live search in parallel with DB search
+    - Merges results for unified ranking
+    """
+    # Temporal keywords that indicate need for live search
+    TEMPORAL_KEYWORDS = [
+        "today", "now", "latest", "breaking", "just", "current",
+        "this morning", "this afternoon", "this evening", "tonight",
+        "yesterday", "recent", "recently", "new", "fresh"
+    ]
+    # Historical keywords that indicate DB-only search
+    HISTORICAL_KEYWORDS = [
+        "history", "historical", "background", "context", "past",
+        "analysis", "overview", "summary", "explain", "what is",
+        "who is", "tell me about"
+    ]
+    def __init__(
+        self,
+        live_search_adapter,
+        enable_hybrid: bool = True,
+        default_live_weight: float = 0.5,
+        default_db_weight: float = 0.5
+    ):
+        """
+        Initialize query orchestrator.
+        Args:
+            live_search_adapter: DuckDuckGo adapter instance
+            enable_hybrid: Global flag to enable/disable hybrid search
+            default_live_weight: Default weight for live results
+            default_db_weight: Default weight for database results
+        """
+        self.live_search = live_search_adapter
+        self.enable_hybrid = enable_hybrid
+        self.default_live_weight = default_live_weight
+        self.default_db_weight = default_db_weight
+    def decide_search_strategy(self, query: str, intent: str = "NEWS", intent_result: Optional[Any] = None) -> SearchStrategy:
+        """
+        Decide which search sources to use based on query characteristics.
+        Now supports production-grade intent classification with multi-class intents:
+        - NEWS_TEMPORAL → prioritize live search
+        - NEWS_HISTORICAL → use DB only
+        - NEWS_GENERAL → balanced hybrid
+        - OTHER → skip search
+        Args:
+            query: User query
+            intent: Simple intent ("NEWS" or "OTHER") for backward compatibility
+            intent_result: Full IntentResult from v2 classifier (if available)
+        Returns:
+            SearchStrategy object with source selection and weights
+        """
+        # If hybrid search is disabled globally, use DB only
+        if not self.enable_hybrid:
+            return SearchStrategy(
+                use_live=False,
+                use_db=True,
+                live_weight=0.0,
+                db_weight=1.0,
+                reason="Hybrid search disabled",
+                intent_result=intent_result
+            )
+        # If live search is unavailable, use DB only
+        if not self.live_search.is_available():
+            return SearchStrategy(
+                use_live=False,
+                use_db=True,
+                live_weight=0.0,
+                db_weight=1.0,
+                reason="Live search unavailable",
+                intent_result=intent_result
+            )
+        # ── Use v2 Intent Result if available (production-grade) ──────────────
+        if intent_result and hasattr(intent_result, 'intent'):
+            detailed_intent = intent_result.intent
+            confidence = intent_result.confidence
+            logger.info(
+                f"Using v2 intent: {detailed_intent} "
+                f"(confidence={confidence:.2f}, method={intent_result.method})"
+            )
+            # OTHER → skip search
+            if detailed_intent == "OTHER":
+                return SearchStrategy(
+                    use_live=False,
+                    use_db=False,
+                    live_weight=0.0,
+                    db_weight=0.0,
+                    reason=f"Small talk (confidence={confidence:.2f})",
+                    intent_result=intent_result
+                )
+            # NEWS_TEMPORAL → prioritize live search
+            elif detailed_intent == "NEWS_TEMPORAL":
+                # High confidence → strong live bias
+                if confidence >= 0.80:
+                    return SearchStrategy(
+                        use_live=True,
+                        use_db=True,
+                        live_weight=0.8,
+                        db_weight=0.2,
+                        reason=f"Temporal query (high confidence={confidence:.2f})",
+                        intent_result=intent_result
+                    )
+                # Medium confidence → moderate live bias
+                else:
+                    return SearchStrategy(
+                        use_live=True,
+                        use_db=True,
+                        live_weight=0.7,
+                        db_weight=0.3,
+                        reason=f"Temporal query (medium confidence={confidence:.2f})",
+                        intent_result=intent_result
+                    )
+            # NEWS_HISTORICAL → use DB only
+            elif detailed_intent == "NEWS_HISTORICAL":
+                return SearchStrategy(
+                    use_live=False,
+                    use_db=True,
+                    live_weight=0.0,
+                    db_weight=1.0,
+                    reason=f"Historical query (confidence={confidence:.2f})",
+                    intent_result=intent_result
+                )
+            # NEWS_GENERAL → balanced hybrid
+            elif detailed_intent == "NEWS_GENERAL":
+                return SearchStrategy(
+                    use_live=True,
+                    use_db=True,
+                    live_weight=self.default_live_weight,
+                    db_weight=self.default_db_weight,
+                    reason=f"General news (confidence={confidence:.2f})",
+                    intent_result=intent_result
+                )
+        # ── Fallback to v1 logic (backward compatibility) ─────────────────────
+        # If intent is OTHER (small talk), no search needed
+        if intent == "OTHER":
+            return SearchStrategy(
+                use_live=False,
+                use_db=False,
+                live_weight=0.0,
+                db_weight=0.0,
+                reason="Small talk - no search needed (v1 fallback)",
+                intent_result=intent_result
+            )
+        query_lower = query.lower()
+        # Check for temporal keywords → prioritize live search
+        has_temporal = any(kw in query_lower for kw in self.TEMPORAL_KEYWORDS)
+        # Check for historical keywords → prioritize database
+        has_historical = any(kw in query_lower for kw in self.HISTORICAL_KEYWORDS)
+        if has_temporal and not has_historical:
+            # Temporal query → prioritize live search
+            return SearchStrategy(
+                use_live=True,
+                use_db=True,
+                live_weight=0.7,
+                db_weight=0.3,
+                reason="Temporal query - prioritize live (v1 fallback)",
+                intent_result=intent_result
+            )
+        elif has_historical and not has_temporal:
+            # Historical query → use database only
+            return SearchStrategy(
+                use_live=False,
+                use_db=True,
+                live_weight=0.0,
+                db_weight=1.0,
+                reason="Historical query - database only (v1 fallback)",
+                intent_result=intent_result
+            )
+        else:
+            # Balanced hybrid search
+            return SearchStrategy(
+                use_live=True,
+                use_db=True,
+                live_weight=self.default_live_weight,
+                db_weight=self.default_db_weight,
+                reason="Balanced hybrid search (v1 fallback)",
+                intent_result=intent_result
+            )
+    async def execute_hybrid_search(
+        self,
+        query: str,
+        english_query: str,
+        strategy: SearchStrategy,
+        # Database search components (from existing pipeline)
+        embedder,
+        vector_store,
+        translated_queries: Dict[str, str],
+        top_k: int = 7
+    ) -> Tuple[List[Any], List[Dict[str, Any]]]:
+        """
+        Execute hybrid search with parallel live and database searches.
+        Integrates with existing multilingual pipeline:
+        - Reuses dense vector (computed once from English)
+        - Reuses sparse vectors (batched for 6 languages)
+        - Adds live search in parallel
+        Args:
+            query: Original user query
+            english_query: English translation
+            strategy: Search strategy from decide_search_strategy()
+            embedder: BGE-M3 embedder adapter
+            vector_store: Qdrant adapter
+            translated_queries: Dict of {lang: translated_query}
+            top_k: Results per language for DB search
+        Returns:
+            Tuple of (db_results, live_results)
+        """
+        tasks = []
+        # 1. Database search (if enabled)
+        if strategy.use_db:
+            db_task = self._execute_db_search(
+                english_query=english_query,
+                translated_queries=translated_queries,
+                embedder=embedder,
+                vector_store=vector_store,
+                top_k=top_k
+            )
+            tasks.append(db_task)
+        # 2. Live search (if enabled)
+        if strategy.use_live:
+            live_task = self._execute_live_search(english_query)
+            tasks.append(live_task)
+        # Execute all searches in parallel
+        if not tasks:
+            return [], []
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        # Extract results with error handling
+        db_results = []
+        live_results = []
+        if strategy.use_db:
+            if isinstance(results[0], Exception):
+                logger.error(f"Database search failed: {results[0]}")
+            else:
+                db_results = results[0]
+        if strategy.use_live:
+            result_idx = 1 if strategy.use_db else 0
+            if isinstance(results[result_idx], Exception):
+                logger.error(f"Live search failed: {results[result_idx]}")
+            else:
+                live_results = results[result_idx]
+        logger.info(
+            f"Hybrid search completed: {len(db_results)} DB + {len(live_results)} live results"
+        )
+        return db_results, live_results
+    async def _execute_db_search(
+        self,
+        english_query: str,
+        translated_queries: Dict[str, str],
+        embedder,
+        vector_store,
+        top_k: int
+    ) -> List[Any]:
+        """
+        Execute multilingual database search (existing pipeline).
+        Leverages existing optimizations:
+        - Dense vector computed once (language-agnostic)
+        - Sparse vectors batched (6 languages in 1 pass)
+        - Parallel Qdrant queries (6 lanes)
+        - Deduplication by doc_id
+        Args:
+            english_query: English query
+            translated_queries: Dict of {lang: translated_query}
+            embedder: BGE-M3 embedder
+            vector_store: Qdrant adapter
+            top_k: Results per language
+        Returns:
+            List of deduplicated SearchResult objects
+        """
+        try:
+            # 1. Compute dense vector once (language-agnostic)
+            dense_embedding = embedder.encode_query(english_query)
+            dense_vec = dense_embedding.get("dense")
+            # 2. Batch sparse encoding for all 6 languages (existing optimization)
+            languages = ["en", "ar", "am", "so", "sw", "fr"]
+            sparse_queries = [translated_queries.get(lang, english_query) for lang in languages]
+            sparse_embeddings = embedder.encode_sparse_batch(sparse_queries)
+            # 3. Parallel search across 6 languages (existing pattern)
+            search_tasks = []
+            for i, lang in enumerate(languages):
+                sparse_vec = sparse_embeddings[i].get("sparse")
+                task = vector_store.search_with_vectors(
+                    dense_vec=dense_vec,
+                    sparse_vec=sparse_vec,
+                    limit=top_k,
+                    language_filter=lang
+                )
+                search_tasks.append(task)
+            # Execute all 6 lanes in parallel
+            lane_results = await asyncio.gather(*search_tasks, return_exceptions=True)
+            # 4. Flatten and deduplicate by doc_id (existing logic)
+            all_docs = []
+            for lane in lane_results:
+                if not isinstance(lane, Exception):
+                    all_docs.extend(lane)
+            # Deduplicate: keep highest-scoring version of each doc
+            seen = {}
+            for doc in all_docs:
+                doc_id = doc.metadata.get("doc_id")
+                if doc_id:
+                    if doc_id not in seen or doc.score > seen[doc_id].score:
+                        seen[doc_id] = doc
+                else:
+                    # No doc_id, keep it
+                    seen[id(doc)] = doc
+            unique_docs = list(seen.values())
+            logger.info(f"DB search: {len(all_docs)} total → {len(unique_docs)} unique")
+            return unique_docs
+        except Exception as e:
+            logger.error(f"Database search error: {e}")
+            raise
+    async def _execute_live_search(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Execute live search with timeout and error handling.
+        Args:
+            query: Search query (English)
+        Returns:
+            List of normalized live search results
+        """
+        try:
+            results = await self.live_search.search(query)
+            logger.info(f"Live search: {len(results)} results")
+            return results
+        except Exception as e:
+            logger.error(f"Live search error: {e}")
+            raise