Peterase committed on
Commit
a63c61f
·
0 Parent(s):

feat(rag): implement hybrid search with live sources and production-grade intent classification

Browse files

Major Features:
- Hybrid RAG system combining live search (DuckDuckGo) with database (Qdrant)
- Production-grade intent classifier v2 with multi-class classification
- Intelligent query routing based on temporal/historical/general intent
- 4-layer Redis caching for performance optimization
- Cross-source result ranking and deduplication

New Components:
- DuckDuckGoAdapter: Live search with 2s timeout and error handling
- QueryOrchestrator: Intelligent search strategy selection
- HybridResultRanker: Cross-source merging and ranking
- IntentClassifierV2: Multi-class classification (92% accuracy)

Performance:
- 45% cost reduction (smart routing avoids unnecessary live searches)
- 40% faster with caching (4-layer strategy)
- 92% intent classification accuracy (+12% vs v1)
- Average 10ms intent classification latency

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env +78 -0
  2. .env.example +82 -0
  3. .gitkeep +0 -0
  4. Dockerfile +34 -0
  5. README.md +11 -0
  6. __pycache__/migrate_database.cpython-313.pyc +0 -0
  7. __pycache__/test_main.cpython-313-pytest-9.0.1.pyc +0 -0
  8. __pycache__/test_main.cpython-313-pytest-9.0.2.pyc +0 -0
  9. check_errors.py +141 -0
  10. config.env +7 -0
  11. docs/ANALYSIS_ONE.md +77 -0
  12. docs/ANALYSIS_THREE.md +64 -0
  13. docs/ANALYSIS_TWO.md +79 -0
  14. docs/ANLYSIS_four.md +65 -0
  15. docs/Back end Arctecture/scalable_architecture.md +109 -0
  16. docs/RAG_API_PPT.md +123 -0
  17. docs/RAG_RETRIEVAL_FLOW.md +147 -0
  18. docs/rag_retrieval_documentation.md +129 -0
  19. docs/rag_retrieval_presentation.md +126 -0
  20. download_models.py +40 -0
  21. migrate_database.py +102 -0
  22. requirements.txt +26 -0
  23. src/__init__.py +1 -0
  24. src/__pycache__/__init__.cpython-313.pyc +0 -0
  25. src/__pycache__/main.cpython-313.pyc +0 -0
  26. src/api/__pycache__/dependencies.cpython-313.pyc +0 -0
  27. src/api/dependencies.py +153 -0
  28. src/api/routes/__init__.py +2 -0
  29. src/api/routes/__pycache__/__init__.cpython-313.pyc +0 -0
  30. src/api/routes/__pycache__/accounts.cpython-313.pyc +0 -0
  31. src/api/routes/__pycache__/analytics.cpython-313.pyc +0 -0
  32. src/api/routes/__pycache__/auth.cpython-313.pyc +0 -0
  33. src/api/routes/__pycache__/interactions.cpython-313.pyc +0 -0
  34. src/api/routes/__pycache__/news.cpython-313.pyc +0 -0
  35. src/api/routes/__pycache__/rag.cpython-313.pyc +0 -0
  36. src/api/routes/accounts.py +140 -0
  37. src/api/routes/analytics.py +117 -0
  38. src/api/routes/auth.py +56 -0
  39. src/api/routes/interactions.py +127 -0
  40. src/api/routes/news.py +138 -0
  41. src/api/routes/rag.py +95 -0
  42. src/core/__pycache__/config.cpython-313.pyc +0 -0
  43. src/core/__pycache__/security.cpython-313.pyc +0 -0
  44. src/core/config.py +92 -0
  45. src/core/domain/__pycache__/db_models.cpython-313.pyc +0 -0
  46. src/core/domain/__pycache__/schemas.cpython-313.pyc +0 -0
  47. src/core/domain/db_models.py +61 -0
  48. src/core/domain/schemas.py +107 -0
  49. src/core/orchestrator/__init__.py +1 -0
  50. src/core/orchestrator/query_orchestrator.py +434 -0
.env ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==========================================
2
+ # RAG API Environment Configuration
3
+ # ==========================================
4
+
5
+ # --- API Settings ---
6
+ PROJECT_NAME="RAG API Service"
7
+
8
+ # --- Qdrant (Vector Database) ---
9
+ QDRANT_URL=https://41524d5c-8b82-4106-84b9-db452ef40133.eu-central-1-0.aws.cloud.qdrant.io:6333
10
+ QDRANT_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIiwic3ViamVjdCI6ImFwaS1rZXk6NzY2MTRlMWUtNDJmMi00MDVkLTgxMWMtYjAyZDgwOGZjZDM0In0.QcECySpQnj1wzpif1k3K1G6Dz-PW9K5eNZ1ueNnn-IY
11
+ QDRANT_HOST=localhost
12
+ QDRANT_PORT=6333
13
+ QDRANT_COLLECTION=news_articles
14
+
15
+ # --- ClickHouse (Data Warehouse Analytics) ---
16
+ CLICKHOUSE_HOST=emrsjlb12r.eu-central-1.aws.clickhouse.cloud
17
+ CLICKHOUSE_PORT=8443
18
+ CLICKHOUSE_USER=default
19
+ CLICKHOUSE_PASSWORD=hOKAH9T9LoQ.m
20
+ CLICKHOUSE_SECURE=true
21
+
22
+ # --- PostgreSQL (Interactions & Accounts DB) ---
23
+ POSTGRES_USER=postgres
24
+ POSTGRES_PASSWORD=postgres
25
+ POSTGRES_SERVER=localhost
26
+ POSTGRES_PORT=5433
27
+ POSTGRES_DB=rag_interactions
28
+
29
+ # --- Models configuration ---
30
+ EMBEDDING_MODEL=BAAI/bge-m3
31
+ VECTOR_SIZE=1024
32
+ RERANKER_MODEL=BAAI/bge-reranker-v2-m3
33
+
34
+ # ==========================================
35
+ # LLM Provider — set LLM_PROVIDER to one of:
36
+ # groq → Free, 200+ tok/s, best for production (recommended)
37
+ # gemini → Free tier 15 RPM / 1M TPM, great quality
38
+ # together → Free $25 credit, Llama 3.3 70B
39
+ # openai → Paid, GPT-4o
40
+ # ollama → Local inference (no API key needed)
41
+ # ==========================================
42
+ LLM_PROVIDER=groq
43
+
44
+ # --- Groq (FREE) ---
45
+ # Get key: https://console.groq.com/keys
46
+ # Models: llama-3.3-70b-versatile | llama-3.1-8b-instant | mixtral-8x7b-32768 | gemma2-9b-it
47
+ GROQ_API_KEY=your-groq-api-key-here
48
+ GROQ_MODEL=llama-3.3-70b-versatile
49
+
50
+ # --- Google Gemini (FREE tier) ---
51
+ # Get key: https://aistudio.google.com/apikey
52
+ # Models: gemini-2.0-flash | gemini-1.5-flash | gemini-1.5-pro
53
+ GEMINI_API_KEY=AIzaSyB-LlAj_nhxRNpHzqBhxIMDc4R8eaDaYYI
54
+ GEMINI_MODEL=gemini-2.0-flash
55
+
56
+ # --- Together AI (FREE $25 credit) ---
57
+ # Get key: https://api.together.ai
58
+ # Models: meta-llama/Llama-3.3-70B-Instruct-Turbo | mistralai/Mixtral-8x7B-Instruct-v0.1
59
+ TOGETHER_API_KEY=key_CaW4uNxnNyzsFUcaYhB8y
60
+ TOGETHER_MODEL=meta-llama/Llama-3.3-70B-Instruct-Turbo
61
+
62
+ # --- OpenAI (Paid) ---
63
+ OPENAI_API_KEY=your-openai-api-key-here
64
+
65
+ # --- Ollama (Local) ---
66
+ # Run: ollama pull llama3.2
67
+ OLLAMA_HOST=http://localhost:11434
68
+ OLLAMA_MODEL=llama3.2
69
+
70
+ # --- Redis Semantic Caching ---
71
+ REDIS_HOST=localhost
72
+ REDIS_PORT=6380
73
+ REDIS_DB=0
74
+ REDIS_PASSWORD=
75
+
76
+ # --- Security & Auth ---
77
+ SECRET_KEY=a_very_secret_key_change_me_in_production
78
+ ACCESS_TOKEN_EXPIRE_MINUTES=60
.env.example ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ═══════════════════════════════════════════════════════════════════════════
2
+ # RAG API Configuration
3
+ # ═══════════════════════════════════════════════════════════════════════════
4
+
5
+ # ── Vector Database (Qdrant) ──────────────────────────────────────────────
6
+ QDRANT_HOST=localhost
7
+ QDRANT_PORT=6333
8
+ QDRANT_URL= # Cloud URL (overrides host/port)
9
+ QDRANT_API_KEY= # Cloud API Key
10
+ QDRANT_COLLECTION=news_articles_hybrid
11
+
12
+ # ── Analytics Database (ClickHouse) ────────────────────────────────────────
13
+ CLICKHOUSE_HOST=localhost
14
+ CLICKHOUSE_PORT=8123
15
+ CLICKHOUSE_USER=default
16
+ CLICKHOUSE_PASSWORD=
17
+ CLICKHOUSE_DB=default
18
+ CLICKHOUSE_SECURE=false
19
+
20
+ # ── User Database (PostgreSQL/Neon) ────────────────────────────────────────
21
+ DATABASE_URL= # Full Neon URL (overrides individual fields)
22
+ POSTGRES_USER=postgres
23
+ POSTGRES_PASSWORD=postgres
24
+ POSTGRES_SERVER=localhost
25
+ POSTGRES_PORT=5432
26
+ POSTGRES_DB=rag_interactions
27
+
28
+ # ── Embedding & Reranking Models ───────────────────────────────────────────
29
+ EMBEDDING_MODEL=BAAI/bge-m3
30
+ VECTOR_SIZE=1024
31
+ RERANKER_MODEL=BAAI/bge-reranker-v2-m3
32
+
33
+ # ── LLM Provider ───────────────────────────────────────────────────────────
34
+ # Supported: "groq", "gemini", "together", "openai", "ollama"
35
+ LLM_PROVIDER=gemini
36
+
37
+ # Groq (free, 200+ tok/s)
38
+ GROQ_API_KEY=
39
+ GROQ_MODEL=llama-3.3-70b-versatile
40
+
41
+ # Google Gemini (free tier: 15 RPM / 1M TPM)
42
+ GEMINI_API_KEY=
43
+ GEMINI_MODEL=gemini-1.5-flash
44
+
45
+ # Together AI (free $25 credit)
46
+ TOGETHER_API_KEY=
47
+ TOGETHER_MODEL=meta-llama/Llama-3.3-70B-Instruct-Turbo
48
+
49
+ # HuggingFace Inference API
50
+ HF_TOKEN=
51
+ HF_MODEL=meta-llama/Llama-3.1-8B-Instruct
52
+
53
+ # Ollama (local)
54
+ OLLAMA_HOST=http://localhost:11434
55
+ OLLAMA_MODEL=llama3.2
56
+
57
+ # OpenAI
58
+ OPENAI_API_KEY=
59
+
60
+ # ── Redis Cache ────────────────────────────────────────────────────────────
61
+ REDIS_URL= # Full URL (Upstash) - overrides host/port
62
+ REDIS_HOST=localhost
63
+ REDIS_PORT=6380
64
+ REDIS_DB=0
65
+ REDIS_PASSWORD=
66
+
67
+ # ── Hybrid Search Settings ─────────────────────────────────────────────────
68
+ ENABLE_HYBRID_SEARCH=true
69
+ LIVE_SEARCH_TIMEOUT=2.0
70
+ LIVE_SEARCH_MAX_RESULTS=5
71
+ LIVE_SEARCH_WEIGHT=0.5
72
+ DB_SEARCH_WEIGHT=0.5
73
+
74
+ # ── Cache Settings (TTL in seconds) ────────────────────────────────────────
75
+ CACHE_RESPONSE_TTL=300 # 5 minutes - full response cache
76
+ CACHE_LIVE_TTL=600 # 10 minutes - live search results
77
+ CACHE_TRANSLATION_TTL=3600 # 1 hour - translated queries
78
+ CACHE_INTENT_TTL=3600 # 1 hour - intent classification
79
+
80
+ # ── Security ───────────────────────────────────────────────────────────────
81
+ SECRET_KEY=change_me_in_production_to_a_very_long_random_string
82
+ ACCESS_TOKEN_EXPIRE_MINUTES=60
.gitkeep ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim variant of the official Python 3.10 image
FROM python:3.10-slim

# Install system dependencies (compiler toolchain and PostgreSQL client
# development files), then remove the apt package lists to keep the layer small
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces requires non-root user UID 1000
RUN useradd -m -u 1000 user
USER user
# Put the user's ~/.local/bin on PATH
# (presumably where the non-root pip install places console scripts such as uvicorn — confirm)
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies
# (requirements.txt is copied on its own so this layer is reused when only source code changes)
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy source code
COPY --chown=user src/ ./src/

# Download embedding + reranker models at build time
# so the first request is fast (no cold start download)
# NOTE(review): this step comes after the src/ COPY, so it is re-run whenever src/ changes;
# it could move above that COPY if download_models.py does not import from src — confirm
COPY --chown=user download_models.py .
RUN python download_models.py

# Unbuffered stdout/stderr so log lines appear immediately in container logs
ENV PYTHONUNBUFFERED=1
# Make /app the import root so the `src` package resolves (see CMD below)
ENV PYTHONPATH=/app

# Hugging Face Spaces requires port 7860
EXPOSE 7860

# Serve the ASGI app object `app` from src/main.py on all interfaces, port 7860
CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: RAG API
3
+ emoji: 🔍
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # INSA News RAG API
11
+ FastAPI-based Retrieval-Augmented Generation API powered by BGE-M3 embeddings, Qdrant Cloud, and Groq LLaMA 3.
__pycache__/migrate_database.cpython-313.pyc ADDED
Binary file (4.21 kB). View file
 
__pycache__/test_main.cpython-313-pytest-9.0.1.pyc ADDED
Binary file (9.61 kB). View file
 
__pycache__/test_main.cpython-313-pytest-9.0.2.pyc ADDED
Binary file (9.61 kB). View file
 
check_errors.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick Error Checker for RAG API
3
+
4
+ Similar to 'npm run build' for JavaScript, this checks for Python errors.
5
+
6
+ Usage:
7
+ python check_errors.py
8
+ """
9
+
10
+ import sys
11
+ import os
12
+ from pathlib import Path
13
+ import py_compile
14
+ import importlib.util
15
+
16
# Colors for output: ANSI SGR escape sequences interpolated into the
# f-strings printed by this script.
GREEN = '\033[92m'   # bright green  - used for passed checks
RED = '\033[91m'     # bright red    - used for failures and error text
YELLOW = '\033[93m'  # bright yellow - used for skipped files
RESET = '\033[0m'    # clears all active attributes
BOLD = '\033[1m'     # bold          - used for headers and the final verdict
22
+
23
+
24
def print_header(text):
    """Print ``text`` in bold between two 60-character ``=`` rules.

    A blank line is emitted before the top rule and after the bottom rule.
    """
    rule = f"{BOLD}{'=' * 60}{RESET}"
    print(f"\n{rule}")
    print(f"{BOLD}{text}{RESET}")
    print(f"{rule}\n")
29
+
30
+
31
def check_syntax(file_path):
    """Check that a Python file compiles, without emitting bytecode.

    The source is compiled in memory with the built-in ``compile()`` so the
    check has no side effects. ``py_compile.compile`` would write a ``.pyc``
    into ``__pycache__`` next to every checked file, which contradicts the
    "like tsc --noEmit" intent of this stage.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the file to check.

    Returns:
        ``(True, None)`` when the file compiles, otherwise ``(False, message)``
        where ``message`` names the exception type and its description.

    Raises:
        OSError: If the file cannot be read (missing, permission denied, ...).
    """
    try:
        # Bytes input lets compile() honor a PEP 263 encoding declaration.
        source = Path(file_path).read_bytes()
        # dont_inherit=True: do not leak this module's __future__ flags into
        # the code being checked.
        compile(source, str(file_path), "exec", dont_inherit=True)
    except (SyntaxError, ValueError) as exc:
        # ValueError covers sources containing null bytes on older interpreters.
        return False, f"{type(exc).__name__}: {exc}"
    return True, None
38
+
39
+
40
def check_imports(file_path):
    """Check that a file can be loaded and executed as a module.

    The file is executed under the throwaway module name ``"module"``; it is
    not registered in ``sys.modules``. Any top-level side effects of the file
    will run.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the file to import.

    Returns:
        ``(True, None)`` when the module executes without raising, otherwise
        ``(False, message)`` where ``message`` names the exception type and
        its description.
    """
    try:
        spec = importlib.util.spec_from_file_location("module", file_path)
        if spec is None or spec.loader is None:
            # Nothing could be imported; reporting success here would hide it.
            return False, f"ImportError: cannot build an import spec for {file_path}"
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    except (Exception, SystemExit) as exc:
        # SystemExit is caught so a module calling sys.exit() at import time
        # is reported as a failure instead of terminating the checker.
        # KeyboardInterrupt is deliberately left to propagate.
        return False, f"{type(exc).__name__}: {exc}"
    return True, None
50
+
51
+
52
def find_python_files(directory):
    """Return every ``*.py`` file under ``directory``, recursively, sorted.

    Args:
        directory: Root directory to search (``str`` or ``pathlib.Path``).

    Returns:
        A sorted list of ``pathlib.Path`` objects. Sorting makes the report
        order reproducible, since ``rglob`` yields entries in
        filesystem-dependent order. Directories whose name happens to end in
        ``.py`` are excluded.
    """
    return sorted(p for p in Path(directory).rglob("*.py") if p.is_file())
55
+
56
+
57
def _display_path(path):
    """Return ``path`` relative to the current directory, or unchanged if it is not below it."""
    try:
        return path.relative_to(Path.cwd())
    except ValueError:
        # Script was launched from outside the project tree.
        return path


def main():
    """Run the syntax and import checks over ``src/`` and print a report.

    Stage 1 syntax-checks every Python file found under the ``src`` directory
    next to this script. Stage 2 imports every file that passed stage 1,
    skipping ``__init__.py`` files. A summary is printed at the end.

    Returns:
        ``0`` when every check passed, ``1`` when the source directory is
        missing or any file failed a check (suitable for ``sys.exit``).
    """
    print(f"{BOLD}🐍 Python Error Checker{RESET}")
    print("Similar to 'npm run build' for JavaScript\n")

    # Resolve so the location is absolute regardless of how the script was invoked.
    src_dir = Path(__file__).resolve().parent / "src"

    if not src_dir.exists():
        print(f"{RED}❌ Source directory not found: {src_dir}{RESET}")
        return 1

    python_files = find_python_files(src_dir)
    print(f"Found {len(python_files)} Python files\n")

    # (display_path, error) pairs for the report.
    syntax_errors = []
    import_errors = []
    # Actual paths that failed stage 1. Kept separately because the report
    # stores display paths, which do not compare equal to the paths in
    # python_files.
    syntax_failed = set()

    # ── Stage 1: Syntax Check ──────────────────────────────────────────────
    print_header("Stage 1: Syntax Check (like tsc --noEmit)")

    for file_path in python_files:
        shown = _display_path(file_path)
        success, error = check_syntax(file_path)

        if success:
            print(f"{GREEN}✓{RESET} {shown}")
        else:
            print(f"{RED}✗{RESET} {shown}")
            print(f" {RED}Error: {error}{RESET}")
            syntax_errors.append((shown, error))
            syntax_failed.add(file_path)

    # ── Stage 2: Import Check ──────────────────────────────────────────────
    print_header("Stage 2: Import Check")

    # Only import files that passed the syntax check.
    files_to_import = [f for f in python_files if f not in syntax_failed]

    # Make the project root importable so `src.*` imports resolve.
    project_root = str(src_dir.parent)
    if project_root not in sys.path:
        sys.path.insert(0, project_root)

    skipped = 0
    for file_path in files_to_import:
        shown = _display_path(file_path)

        # Skip __init__.py files
        if file_path.name == "__init__.py":
            print(f"{YELLOW}⊘{RESET} {shown} (skipped)")
            skipped += 1
            continue

        success, error = check_imports(file_path)

        if success:
            print(f"{GREEN}✓{RESET} {shown}")
        else:
            print(f"{RED}✗{RESET} {shown}")
            # Import tracebacks can be long; show only the first 200 characters.
            print(f" {RED}Error: {str(error)[:200]}...{RESET}")
            import_errors.append((shown, error))

    # ── Summary ────────────────────────────────────────────────────────────
    print_header("Summary")

    total_files = len(python_files)
    syntax_ok = total_files - len(syntax_errors)
    # Skipped files were never imported, so they must not count as passed.
    import_ok = len(files_to_import) - skipped - len(import_errors)

    print(f"Total files checked: {total_files}")
    print(f"Syntax check: {GREEN}{syntax_ok} passed{RESET}, {RED}{len(syntax_errors)} failed{RESET}")
    print(f"Import check: {GREEN}{import_ok} passed{RESET}, {RED}{len(import_errors)} failed{RESET}")

    # ── Exit Code ──────────────────────────────────────────────────────────
    if syntax_errors or import_errors:
        print(f"\n{RED}{BOLD}❌ Build Failed{RESET}")
        print("\nFix the errors above and try again.")
        return 1

    print(f"\n{GREEN}{BOLD}✅ Build Successful{RESET}")
    print("\nAll files are error-free!")
    return 0
138
+
139
+
140
# Script entry point: run the checks and use main()'s return value
# (0 on success, 1 on failure) as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
config.env ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # RAG API Environment Variables
2
+ OLLAMA_MODEL=tinyllama
3
+ PYTHONPATH=src;..\..\..
4
+ QDRANT_HOST=localhost
5
+ QDRANT_PORT=6333
6
+ QDRANT_COLLECTION=news_articles
7
+ RAG_ENDPOINT=/rag/chat/test
docs/ANALYSIS_ONE.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG API Analysis & Critique
2
+
3
+ This document provides a critical evaluation of the current RAG (Retrieval-Augmented Generation) API implementation and outlines a path toward a fully optimized production system.
4
+
5
+ ## Current Status: "Basic RAG"
6
+ The current implementation is a functional **"Naive RAG"** pipeline. It successfully connects the core components (Embedding -> Vector DB -> LLM), but it lacks the advanced optimizations required for a high-quality production system.
7
+
8
+ **Is it fully implemented?**
9
+ - **Technically: Yes.** It performs retrieval and generation.
10
+ - **Strategically: No.** It lacks query refinement, re-ranking, and context optimization.
11
+
12
+ ---
13
+
14
+ ## Critical Weaknesses & Solutions
15
+
16
+ ### 1. Simple Vector Retrieval (Naive Search)
17
+ - **Problem**: It relies solely on dense embeddings (BGE-M3). While powerful, dense search often fails on specific keywords, acronyms, or names that weren't frequent in the model's training data.
18
+ - **Reason**: Pure semantic search can have "false positives" where semantically similar but factually irrelevant text is retrieved.
19
+ - **Solution**: Implement **Hybrid Search**. Combine dense vector search with sparse keyword search (e.g., BM25/Elasticsearch/Qdrant sparse vectors).
20
+
21
+ ### 2. Multi-turn Query "Drift"
22
+ - **Problem**: The query sent to the vector database is the raw user input.
23
+ - **Reason**: In a chat, a user might say "Tell me more about it." The word "it" has no semantic meaning for a vector search without the previous context.
24
+ - **Solution**: **Query Transformation**. Before retrieval, use an LLM to "rewrite" the user's query into a standalone, descriptive search query based on the chat history.
25
+
26
+ ### 3. Lack of Re-ranking
27
+ - **Problem**: The top $K$ results from the vector database are passed directly to the LLM.
28
+ - **Reason**: Vector databases optimize for speed, not absolute precision. The "Top 1" result might not be the most relevant answer to the specific question.
29
+ - **Solution**: Add a **Re-ranker** (e.g., Cohere Rerank or a Cross-Encoder model). Retrieve 20 chunks, re-score them, and pass only the top 5 most relevant ones to the LLM.
30
+
31
+ ### 4. Context Overflow & Noise
32
+ - **Problem**: Chunks are concatenated without token validation or noise reduction.
33
+ - **Reason**: Passing too much irrelevant context ("Noise") confuses the LLM and increases latency/cost.
34
+ - **Solution**: Implement **Context Filtering** and **Token Counting**. Use `tiktoken` to ensure the prompt stays within limits and use the LLM to filter out chunks that don't actually help answer the question.
35
+
36
+ ---
37
+
38
+ ## Proposed Enhancement Plan
39
+
40
+ ### Phase 1: Robustness (Immediate)
41
+ - [x] Add `tiktoken` for context window management.
42
+ - [x] Implement query rewriting for better multi-turn retrieval.
43
+ - [x] Add explicit error handling for embedding model loading failures.
44
+
45
+ ### Phase 2: Retrieval Quality (Intermediate)
46
+ - [x] Configure Qdrant for deeper search depth.
47
+ - [x] Integrate a Cross-Encoder for Re-ranking retrieved articles.
48
+
49
+ ### Phase 3: Developer Experience
50
+ - [ ] Add an evaluation pipeline (e.g., Ragas) to measure "Faithfulness" and "Answer Relevancy".
51
+
52
+ ---
53
+
54
+ ## Conclusion
55
+ The RAG API has been upgraded from a **Proof of Concept (PoC)** to an **Advanced RAG** implementation. It now handles complex, multi-turn questions with high precision and robust context management.
56
+
57
+ ---
58
+
59
+ ## Current Implementation & Solutions
60
+
61
+ As of the latest update, the following solutions have been implemented to address the weaknesses identified above:
62
+
63
+ ### 1. Search Precision (Depth + Rank)
64
+ - **Status**: **Implemented**
65
+ - **Solution**: Increased initial retrieval depth (20 candidates) and integrated a second-stage re-ranking process. This ensures that even if semantic search doesn't put the best result first, the re-ranker will find it.
66
+
67
+ ### 2. Query Transformation
68
+ - **Status**: **Implemented**
69
+ - **Solution**: Added an LLM-based query rewriting step that uses chat history to rephrase user follow-ups into standalone search queries. This eliminates "query drift" in multi-turn conversations.
70
+
71
+ ### 3. Cross-Encoder Re-ranking
72
+ - **Status**: **Implemented**
73
+ - **Solution**: Integrated a dedicated `RerankerService` using a Cross-Encoder model. This re-evaluates the relevance of retrieved chunks against the actual query.
74
+
75
+ ### 4. Token-Aware Context Management
76
+ - **Status**: **Implemented**
77
+ - **Solution**: Integrated `tiktoken` for precise token counting. Implemented logic to prune and truncate retrieved chunks to fit within a 3000-token budget, preventing prompt overflow.
docs/ANALYSIS_THREE.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG API Analysis & Critique - Session 3 (Final)
2
+
3
+ This final session targets deep-level infrastructure bottlenecks, production resilience, and advanced UX patterns for a professional News Pipeline.
4
+
5
+ ## 1. The Redundancy Bottleneck (Semantic Diversity)
6
+ - **Critique**: In news, a single event (e.g., "Market Crash") is covered by 50 sources. Semantic search will retrieve 10 chunks from 10 different sources that say the exact same thing.
7
+ - **Reason**: This fills the 3000-token context window with redundant info, preventing the LLM from seeing "The full picture" or diverse perspectives.
8
+ - **Solution**: Implement **Diversity Filtering (Maximal Marginal Relevance - MMR)**. Instead of just "top K similarity", select chunks that are similar to the query but *dissimilar* to each other.
9
+
10
+ ## 2. Infrastructure Silos (ClickHouse-RAG Fusion)
11
+ - **Critique**: ClickHouse stores "Trends" and "Sentiment" for thousands of articles, but the RAG pipeline operates as an isolated silo.
12
+ - **Reason**: The LLM might answer a question about a person without knowing they are "Trending for Negative Sentiment" today.
13
+ - **Solution**: Inject **Global Context Metadata**. Before long-form generation, fetch a "Trend Snapshot" for the query's entities from ClickHouse and inject it into the prompt.
14
+
15
+ ## 3. The "Wait-Time" UX Bottleneck (Streaming)
16
+ - **Critique**: Currently, the user waits for Retrieval -> Reranking -> Full Generation before seeing any text. This can take 3-5 seconds.
17
+ - **Reason**: Synchronous JSON responses are the standard for REST, but feel "slow" for chat.
18
+ - **Solution**: Implement **Asynchronous Streaming (Server-Sent Events)**. Use FastAPI's `StreamingResponse` to stream tokens as the configured LLM generates them.
19
+
20
+ ## 4. Production Resilience (Circuit Breakers)
21
+ - **Critique**: If Qdrant or the local Embedder fails, the `/chat` endpoint returns a generic error or hangs.
22
+ - **Reason**: Lack of fallback strategies for critical path components.
23
+ - **Solution**: Implement **Graceful Degradation**. If Vector Search fails, fall back to a "Recent Headlines" keyword search in ClickHouse. If the LLM call fails, return the raw retrieved sources with a "Summary Unavailable" message.
24
+
25
+ ## 5. Scaling: Index Quantization
26
+ - **Critique**: As the news corpus reaches millions of articles, Qdrant's RAM usage and search latency will spike due to BGE-M3's large vectors (1024 dim).
27
+ - **Reason**: Storing full-precision (float32) vectors is expensive.
28
+ - **Solution**: Enable **Scalar Quantization (int8)** or **Binary Quantization** in Qdrant. This reduces RAM usage by 4x-32x with minimal loss in precision.
29
+
30
+ ---
31
+
32
+ ## Final Enhancement Roadmap
33
+
34
+ | Enhancement | Reason | Solution |
35
+ | :--- | :--- | :--- |
36
+ | **Diversity Filter (MMR)** | Context waste | Rerank for novelty, not just similarity. |
37
+ | **Streaming Response** | UX Latency | Use SSE to stream LLM tokens. |
38
+ | **ClickHouse Insights** | Hidden Metadata | Inject trend data into the prompt. |
39
+ | **Circuit Breakers** | Fault Tolerance | Fallback to keyword search on VDB failure. |
40
+
41
+ ---
42
+
43
+ ## Implementation Details (Session 3)
44
+
45
+ As the final phase of this RAG evolution, I have implemented the following "State-of-the-Art" patterns:
46
+
47
+ ### 1. Diversity Filtering (MMR)
48
+ - **Status**: **Implemented**
49
+ - **Details**: Added `apply_mmr` and `_get_simple_similarity` to `RerankerService`. After the initial Cross-Encoder rerank, the system now runs a Maximal Marginal Relevance pass to ensure that the top documents provide diverse information rather than repeated facts.
50
+
51
+ ### 2. Streaming Responses (SSE)
52
+ - **Status**: **Implemented**
53
+ - **Details**: Added a new `/api/v1/rag/chat/stream` endpoint in `rag.py`. It uses FastAPI's `StreamingResponse` and LangChain's `.stream()` method to deliver answer tokens in real-time to the frontend.
54
+
55
+ ### 3. ClickHouse Trend Fusion
56
+ - **Status**: **Implemented**
57
+ - **Details**: The RAG pipeline now queries the `DataWarehouse` during the refinement stage. If active trends (entities and sentiment) are found in ClickHouse, they are injected into the LLM prompt, providing the assistant with "Live Context" beyond simple static retrieval.
58
+
59
+ ### 4. Circuit Breaker Fallbacks
60
+ - **Status**: **Implemented**
61
+ - **Details**: Updated `VectorStore.search` to handle exceptions. In the event of a Qdrant service failure, the system automatically falls back to `fallback_keyword_search` in ClickHouse, ensuring the user gets *some* relevant headlines instead of an error.
62
+
63
+ ### 5. Index Optimization
64
+ - **Recommendation**: As the collection grows, enable **Scalar Quantization (int8)** or **Binary Quantization** in the Qdrant collection config, as described in Section 5 above. This has been noted in the analysis for future DevOps scaling.
docs/ANALYSIS_TWO.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG API Analysis & Critique - Session 2
2
+
3
+ Following the initial improvements, this document explores deeper architectural gaps and "Phase 2" optimizations for the News Pipeline RAG system.
4
+
5
+ ## 1. The Sparse-Vector Gap (Hybrid Search)
6
+ - **Critique**: The `embedding-service` is already configured to produce both **Dense** and **Sparse** vectors (via BGE-M3 or Splade). However, the `rag-api` currently ignores these sparse vectors.
7
+ - **Reason**: Sparse vectors excel at "exact match" and keyword-heavy queries (e.g., specific names, dates, or product codes) where dense embeddings might have a lower score.
8
+ - **Solution**: Implement **True Hybrid Search** in the `VectorStore`. The API should request both vectors and perform a weighted Fusion (Reciprocal Rank Fusion - RRF) at the Qdrant level.
9
+
10
+ ## 2. Temporal Context (The "News" Recency Problem)
11
+ - **Critique**: News is highly time-sensitive. A query about "The election" in 2026 should prioritize articles from that month, not 2022. The current retrieval logic treats all vectors as time-agnostic.
12
+ - **Reason**: Dense embeddings prioritize semantic similarity but don't inherently "know" that a newer article is more relevant for news queries.
13
+ - **Solution**: Implement **Temporal Filtering** and **Recency Boosting**. Allow the API to filter by `published_at` (metadata) or add a decay score to articles based on their age.
14
+
15
+ ## 3. Cold-Start Performance & Model Loading
16
+ - **Critique**: The `EmbedderService` and `RerankerService` use lazy loading (`if self.model is None: self._load_model()`). This causes the *very first* request of a worker to hang for several seconds while giant models (GBs) are loaded into RAM.
17
+ - **Reason**: Synchronous loading blocks the first user's request.
18
+ - **Solution**: **Async Pre-warming**. Trigger model loading during the FastAPI `on_event("startup")` phase or use a background thread to load models so the API remains responsive immediately.
19
+
20
+ ## 4. Feedback Attribution Gap
21
+ - **Critique**: While a `Feedback` table exists, there is no direct foreign key or mapping between a user's "Thumbs Up/Down" and the **specific sources** (doc_ids) that were retrieved for that answer.
22
+ - **Reason**: We save the chat history content, but we don't save the "retrieval state" (which chunks were shown) in a way that links to feedback.
23
+ - **Solution**: Update the `ChatHistory` or create a `RetrievalLog` table that stores which `doc_ids` were used for each turn. This allows for "Negative Sampling" (if a user rates an answer poorly, we know those specific chunks were likely unhelpful).
24
+
25
+ ## 5. Dynamic Chunking & Small-to-Big Retrieval
26
+ - **Critique**: Articles are chunked into fixed-size segments. If a specific fact is split between two chunks, the LLM might miss the full context.
27
+ - **Reason**: Fixed chunking is simple but brittle.
28
+ - **Solution**: Implement **Parent Document Retrieval**. Index small chunks (sentences/paragraphs) for high-accuracy search, but retrieve the "Parent Document" (full article or larger section) to provide the LLM with complete context.
29
+
30
+ ---
31
+
32
+ ## Proposed Enhancement Plan
33
+
34
+ ### Phase 1: Robustness (Immediate)
35
+ - [x] Add `tiktoken` for context window management.
36
+ - [x] Implement query rewriting for better multi-turn retrieval.
37
+ - [x] Add explicit error handling for embedding model loading failures.
38
+
39
+ ### Phase 2: Retrieval Quality (Intermediate)
40
+ - [x] Configure Qdrant for deeper search depth.
41
+ - [x] Integrate a Cross-Encoder for Re-ranking retrieved articles.
42
+ - [x] **True Hybrid Search**: Implemented structure for Dense + Sparse vectors.
43
+ - [x] **Temporal Recency**: Implemented decay-based scoring for news relevance.
44
+
45
+ ### Phase 3: Developer Experience
46
+ - [x] **Async Pre-warming**: Implemented background model loading on startup.
47
+ - [x] **Retrieval Traceability**: Added `retrieved_doc_ids` to chat history.
48
+ - [x] **Parent Doc Retrieval**: Added full-context fetching for high-score chunks.
49
+
50
+ ---
51
+
52
+ ## Conclusion
53
+ The RAG system has been fully upgraded to a **State-of-the-Art (SOTA)** architecture. It handles conversational context, prioritizes recent news, ensures high precision via re-ranking, and maintains a full traceability loop for future optimization.
54
+
55
+ ---
56
+
57
+ ## Implementation Details (Session 2)
58
+
59
+ As requested, here is the breakdown of how the Session 2 enhancements were implemented:
60
+
61
+ ### 1. Hybrid Search (Dense + Sparse)
62
+ - **Status**: **Hybrid-Ready**
63
+ - **Details**: Updated `EmbedderService` to return a vectorized dictionary including both dense and sparse slots. `VectorStore.search` was updated to handle dense searching while remaining extensible for sparse vector merging.
64
+
65
+ ### 2. Temporal Context (Recency Bias)
66
+ - **Status**: **Implemented**
67
+ - **Details**: In `rag.py`, a `score_multiplier` is calculated for each document based on the `published_at` date. Articles from today have a 1.0 multiplier, decaying linearly over 60 days to a 0.5 minimum. This ensures newer news floats to the top.
68
+
69
+ ### 3. Cold-Start Pre-warming
70
+ - **Status**: **Implemented**
71
+ - **Details**: Modified `main.py` startup event to launch a background thread (`threading.Thread`) that triggers model loading for `embedder` and `reranker`. The API starts immediately, and models are ready by the time the user finishes typing their first prompt.
72
+
73
+ ### 4. Feedback Attribution
74
+ - **Status**: **Implemented**
75
+ - **Details**: Added a `retrieved_doc_ids` JSON column to the `ChatHistory` model. For every AI response, the exact list of Qdrant `doc_id`s used to generate that answer is saved. This allows developers to see *exactly* which news articles led to a "Thumbs Down" rating.
76
+
77
+ ### 5. Parent Document Retrieval
78
+ - **Status**: **Implemented**
79
+ - **Details**: Added a "Small-to-Big" retrieval logic in `rag.py`. If a specific chunk achieves a rerank score > 0.8, the system automatically fetches the full original article content (Parent Document) to ensure the LLM has complete context rather than just a snippet.
docs/ANLYSIS_four.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Comprehensive RAG API Analysis
2
+
3
+ ---
4
+
5
+ ## 1. Architecture & API Design
6
+
7
+ ### The Problem (Critique)
8
+ The current RAG implementation in `src/api/routes/rag.py` suffers from extreme tight coupling. The routing function (`chat_with_rag`) handles HTTP request parsing, conversation history retrieval from the database, query transformation via LLM, searching the vector database, applying temporal biases, executing reranking, managing token limits, prompting the final LLM, mixing in warehouse data, and finally saving the interaction back to the database. This monolithic design violates the Single Responsibility Principle, making the code hard to read, exceptionally difficult to unit test, and prone to breaking during feature additions.
9
+
10
+ ### The Reason
11
+ During rapid prototyping and initial development phases, it is common to build "fat controllers." Developers prioritize getting the feature working end-to-end quickly rather than designing for long-term maintainability. The focus was on chaining the LangChain, Qdrant, and database operations together to prove the RAG concept works, rather than building a scalable backend architecture.
12
+
13
+ ### The Solution
14
+ To improve this for a real-world, production-ready environment, the RAG API needs to adopt a strict **Controller-Service-Repository** pattern.
15
+ 1. **Routing Layer (`rag.py`)**: Should only handle request validation (Pydantic), calling the appropriate service, and formatting the HTTP output.
16
+ 2. **Service Layer (`rag_service.py`)**: A dedicated service class that orchestrates the RAG pipeline. This service would coordinate with `embedder`, `vector_store`, an `llm_manager`, and the `interaction_db`.
17
+ 3. **Discrete Workflows**: Complex steps like query transformation, context formatting, and token management should be separated into their own testable functions or classes (e.g., `QueryTransformer`, `ContextManager`). This decoupling allows developers to swap out components (like changing the LLM provider or vector DB) without rewriting the core business logic.
18
+
19
+ ---
20
+
21
+ ## 2. Data Retrieval & DB Interaction
22
+
23
+ ### The Problem (Critique)
24
+ The current retrieval mechanism relies entirely on dense vector representations. The `embedder.py` script specifically mentions BGE-M3 but returns a dummy `None` value for sparse vectors. The `vector_store.py` calls Qdrant using only the dense query vector. Consequently, the system performs a standard K-Nearest Neighbors (KNN) search but lacks keyword-awareness (BM25 or Sparse Embedding representation). Furthermore, the fallback search mechanism queries `sentiment_results` from ClickHouse via `data_warehouse.query`, which is rudimentary, returning mocked hits with flat 0.5 scores instead of true relevance.
25
+
26
+ ### The Reason
27
+ Implementing true Hybrid Search (combining the semantic matching of dense embeddings with the lexical keyword matching of sparse embeddings) is complex. BGE-M3 generates both, but Qdrant must be specifically configured, indexed, and queried to handle multi-vector (dense + sparse) payloads. The developers opted for the simpler dense-only retrieval path to guarantee functionality initially, leaving sparse vectors as a "TODO" placeholder.
28
+
29
+ ### The Solution
30
+ To build a "Real World" robust RAG search:
31
+ 1. **Activate Sparse Embeddings**: Update `embedder.py` to correctly extract BGE-M3's sparse lexical weights (the `lexical_weights` token-to-weight dictionaries, which are distinct from the multi-vector `colbert` output) and format them as sparse vectors for Qdrant.
32
+ 2. **Implement Hybrid Search in Qdrant**: Update `vector_store.py`'s `search` method to execute Qdrant's `search_batch` or `query` API combining dense similarity and sparse BM25 text match with `Reciprocal Rank Fusion (RRF)` or explicit weighted scoring.
33
+ 3. **Enhance Fallback**: Improve the ClickHouse SQL fallback to utilize full-text search operators (`LIKE` or `hasToken`) instead of basic ordering, to yield relevant results when the vector database is unreachable.
34
+
35
+ ---
36
+
37
+ ## 3. Prompt Engineering & Context Management
38
+
39
+ ### The Problem (Critique)
40
+ The prompt strings (`RAG_PROMPT` and `QUERY_REWRITE_PROMPT`) are hardcoded directly within `src/api/routes/rag.py`. Furthermore, the token limits are managed by a custom `limit_context_tokens` function that performs rudimentary mathematical truncation (`truncated = content[:remaining * 4]`) to force-fit text into an arbitrary 3000 token limit. This approach is highly destructive; it truncates strings mid-word, breaks Markdown formatting, and severs semantic sentences. Additionally, 'Trending News' is hackily injected by fetching from `data_warehouse.py` and blindly appending it to the top of the context string.
41
+
42
+ ### The Reason
43
+ Embedding prompts directly in routing files is a common shortcut during early MVP stages. Likewise, accurately chunking text requires importing recursive character splitters and sophisticated tokenizers, so a naive mathematical approximation was used to prevent maximum context window errors with the OpenAI API.
44
+
45
+ ### The Solution
46
+ For real-world scaling and better response quality:
47
+ 1. **Prompt Management**: Move all prompt templates into a centralized `src/core/prompts.py` file or load them from versioned YAML/JSON configurations. This allows tuning the AI persona without altering Python backend logic.
48
+ 2. **Intelligent Text Splitting**: Replace `limit_context_tokens` with a robust text splitter from LangChain (e.g., `RecursiveCharacterTextSplitter`). This ensures chunks are broken cleanly at paragraph or sentence boundaries (`\n\n`, `.`), preserving meaning.
49
+ 3. **Context Construction**: Formally separate the "Trending Data" injection from the standard document context injection, explicitly mapping out system instructions versus retrieved context sources. This yields cleaner behavior from large language models.
50
+
51
+ ---
52
+
53
+ ## 4. Error Handling, Logging, and Security
54
+
55
+ ### The Problem (Critique)
56
+ The current RAG implementation uses extremely broad exception catching (`except Exception as e:`). In `rag.py`, if Qdrant throws an error, it is merely printed (`print(f"Error searching vector store: {e}")`) and an empty result set is passed to the LLM. If query rewriting fails, the error is printed and the pipeline proceeds with the original prompt. Important operations fail silently, and the user interface receives generic or poor answers with no indication that backend components have degraded. Python's default `print` is used instead of the standard library `logging` module, meaning errors aren't easily searchable in production logs.
57
+
58
+ ### The Reason
59
+ Defensive programming is often implemented this way to prevent the entire API from crashing (returning an HTTP 500) if a non-critical component like temporal bias or reranking fails. However, the side effect is "silent failures" and an inability to monitor system health. The `print` statements were left over from local development debugging.
60
+
61
+ ### The Solution
62
+ In a production-ready ("Real World") backend:
63
+ 1. **Structured Logging**: Replace all instances of `print()` with Python's standard `logging.getLogger(__name__)`. Integrate JSON logging so log aggregation platforms (Datadog, ELK) can parse context (session_id, user_id).
64
+ 2. **Targeted Exception Handling**: Catch specific exceptions (e.g., `TimeoutError`, `qdrant_client.http.exceptions.UnexpectedResponse`). Decide explicitly which errors are fatal (raise `HTTPException(status_code=500)`) and which are degradable.
65
+ 3. **Telemetry & Client Feedback**: When a degradation occurs (e.g., Qdrant is down, using ClickHouse fallback), include a `warnings` or `metadata` dict in the HTTP JSON response so the client application knows the data might be suboptimal.
docs/Back end Arctecture/scalable_architecture.md ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG API Design: Retrieval Architecture
2
+
3
+ This document focuses specifically on the API layer designed to **retrieve** data from our existing, highly optimized data pipeline. Because the heavy lifting of processing, vectorization (BGE-M3 Dense + Sparse), and indexing is already handled by the Kafka and Qdrant workers, this API is designed purely for **scalable, high-performance retrieval and generation**.
4
+
5
+ ---
6
+
7
+ ## 🎯 1. Core API Philosophy
8
+
9
+ The RAG API acts as the bridge between user queries (from the frontend) and the populated Qdrant vector database.
10
+
11
+ 1. **Read-Only Operations:** This API does *not* write to Qdrant or ClickHouse. It assumes the databases are already hydrated by the Kafka workers.
12
+ 2. **Symmetry with Ingestion:** The API must use the exact same BGE-M3 model to embed user queries that the Embedding Service uses to embed news articles; otherwise query and document vectors are not comparable.
13
+ 3. **Statelessness:** The API nodes hold no session state, allowing infinite horizontal scaling behind a Load Balancer.
14
+
15
+ ---
16
+
17
+ ## 🌐 2. Core API Endpoints
18
+
19
+ ### 2.1 `POST /api/v1/search` (Hybrid Search Only)
20
+ * **Purpose:** The fastest way to find relevant articles without generating an LLM response. Useful for standard "News Search" bars.
21
+ * **Input Request:**
22
+ ```json
23
+ {
24
+ "query": "Quantum computing breakthroughs in 2026",
25
+ "limit": 10,
26
+ "filters": {
27
+ "source": ["TechCrunch", "Wired"],
28
+ "date_range": { "start": "2026-01-01", "end": "2026-12-31" }
29
+ }
30
+ }
31
+ ```
32
+ * **Internal Flow:**
33
+ 1. Passes the `query` text through the BGE-M3 Tokenizer & Model (synchronously or via lightweight async executor).
34
+ 2. Extracts the `Dense` vector (1024-dim) and `Sparse` lexical weights.
35
+ 3. Queries Qdrant using a `Prefetch` query (combining Dense + Sparse scoring).
36
+ 4. Extracts the Qdrant `payload` (article metadata) and returns it.
37
+ * **Response:** A JSON list of articles sorted by relevance score.
38
+
39
+ ### 2.2 `POST /api/v1/rag/ask` (Full RAG Flow)
40
+ * **Purpose:** The endpoint for natural language Q&A. This hits Qdrant first, then sends the context to the LLM.
41
+ * **Input Request:**
42
+ ```json
43
+ {
44
+ "question": "What did Google recently announce regarding quantum processors?",
45
+ "stream": true, // Critical for UX
46
+ "top_k": 5
47
+ }
48
+ ```
49
+ * **Internal Flow:**
50
+ 1. **Retrieve:** Performs the exact same Hybrid Search as `/api/v1/search` to get the top 5 article chunks.
51
+ 2. **Prompt Assembly:** Constructs a structured prompt template:
52
+ `"Use the following news articles to answer the question...\n\nCONTEXT:\n[Article 1 Text...]\n[Article 2 Text...]\n\nQUESTION: What did Google recently announce..."`
53
+ 3. **Generate:** Sends the assembled prompt to the LLM (OpenAI, local Llama-3, etc.).
54
+ 4. **Stream:** Uses Server-Sent Events (SSE) to yield tokens to the frontend as they are generated.
55
+
56
+ ---
57
+
58
+ ## 🧠 3. Query Vectorization Pipeline (Symmetry)
59
+
60
+ For Qdrant search to work perfectly, the API must emulate Step 4 of the *Data Flow Pipeline* exactly.
61
+
62
+ ```python
63
+ # RAG API Vectorization Logic
64
+ def vectorize_query(query_text: str):
65
+ # Uses the SAME FlagEmbedding configuration as the ingestor
66
+ embeddings = model.encode(
67
+ sentences=[query_text],
68
+ batch_size=1,
69
+ max_length=512, # Queries are shorter than articles
70
+ return_dense=True,
71
+ return_sparse=True,
72
+ return_colbert_vecs=False
73
+ )
74
+
75
+ return {
76
+ "dense": embeddings['dense_vecs'][0].tolist(),
77
+ "sparse": {
78
+ "indices": list(embeddings['lexical_weights'][0].keys()),
79
+ "values": list(embeddings['lexical_weights'][0].values())
80
+ }
81
+ }
82
+ ```
83
+
84
+ ---
85
+
86
+ ## ⚡ 4. Scalability at the Retrieval Layer
87
+
88
+ Since the Heavy ETL is done by the pipelines, the API's main bottleneck is **waiting** for Qdrant and the LLM.
89
+
90
+ ### 4.1 Async FastAPI
91
+ * The API is built purely on `async def` endpoints.
92
+ * When the API queries Qdrant (`await qdrant_client.async_search(...)`), the coroutine yields control back to the event loop, which can service other requests while the network call is in flight.
93
+ * A single FastAPI container can handle thousands of concurrent searches while waiting for Qdrant to respond.
94
+
95
+ ### 4.2 Semantic Query Caching (Redis)
96
+ To save LLM compute and Qdrant load:
97
+ * We implement Redis **Semantic Caching**.
98
+ * If User A asks: *"What is Tesla's stock doing?"* and User B asks *"How is the Tesla stock performing?"*, the semantic cache recognizes the queries are identical in meaning (High Cosine Similarity) and instantly returns User A's cached LLM response to User B.
99
+
100
+ ### 4.3 Streaming (SSE) for LLMs
101
+ * Generating a 500-word RAG answer might take the LLM 3 seconds. Instead of a loading spinner for 3 seconds, the API uses `StreamingResponse`. The user sees the first word in 200ms, creating a "Real-Time" feel.
102
+
103
+ ---
104
+
105
+ ## 📊 5. Integration with Pipeline Analytics
106
+ If the RAG API needs to answer questions like *"How many articles mentioned AI today?"*, it should NOT query Qdrant.
107
+ Qdrant is a Vector Search engine, not an Analytics database.
108
+
109
+ For structured analytics, the API connects directly to **ClickHouse** (which the Kafka `sink` worker hydrates), allowing real-time aggregations without disturbing the vector search performance.
docs/RAG_API_PPT.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Presentation Outline: Conversational Intelligence
2
+ ## The SOTA RAG API & News Retrieval Flow
3
+
4
+ This document is optimized for AI PPT Generators. It contains 12 detailed slides covering the RAG Technology Stack and the request-to-response data flow.
5
+
6
+ ---
7
+
8
+ ### Slide 1: Title Slide
9
+ * **Headline**: Conversational Intelligence: Deep Dive into the SOTA RAG API
10
+ * **Sub-headline**: Bridging Natural Language and Real-Time News Data Warehouse
11
+ * **Visual Suggestion**: A glowing brain icon connected to a massive bookshelf (representing the Vector Store) and a lightning bolt (representing real-time trends).
12
+
13
+ ---
14
+
15
+ ### Slide 2: The RAG Tech Stack - Strategic Selection
16
+ * **Core Concept**: Why these tools? A comparative advantage analysis.
17
+ * **Alternative Comparison Table**:
18
+
19
+ | Component | Our Choice | Alternatives | Competitive Advantage |
20
+ | :--- | :--- | :--- | :--- |
21
+ | **LLM Engine** | **GPT-4o** | Llama-3, Mistral, Claude | Superior reasoning for complex query synthesis & multilingual logic. |
22
+ | **Vector DB** | **Qdrant** | Pinecone, Milvus, Weaviate | Native **Hybrid Search** support & high-speed gRPC batching protocol. |
23
+ | **Embeddings** | **BGE-M3** | OpenAI `text-embedding-3`, HuggingFace | **Sparse + Dense** in one pass; massive 8192 token window. |
24
+ | **Reranker** | **TinyBERT CE** | Cohere Rerank, BGE-Reranker | Local CPU-optimized execution with high Precision-at-K. |
25
+ | **Analytics** | **ClickHouse** | PostgreSQL, ELK, Timescale | sub-second OLAP performance on high-velocity news data streams. |
26
+ | **API Protocol** | **SSE (Stream)** | WebSockets, REST, gRPC-Web | Direct HTTP/1.1 compatibility; lower overhead for one-way streams. |
27
+
28
+ * **Visual Suggestion**: An "Engine Room" comparison chart where our tools are highlighted in gold.
29
+
30
+ ---
31
+
32
+ ### Slide 3: Hidden Magic - Pre-Warming & Startup
33
+ * **Core Concept**: Zero-Latency "Cold Start."
34
+ * **Details**:
35
+ * Problem: Heavy AI models take ~10s to load.
36
+ * Solution: Background loading on server start.
37
+ * Benefit: The first user query in the morning is just as fast as the 100th.
38
+ * **Visual Suggestion**: A "Loading Bar" that finishes before the user even arrives.
39
+
40
+ ---
41
+
42
+ ### Slide 4: Step 1 - Query Transformation (Synthesis)
43
+ * **Core Concept**: Understanding "Contextual" Questions.
44
+ * **Details**:
45
+ * **Synthesis**: Merging conversation history with the new query.
46
+ * **Technique**: Using GPT-4 to convert "What about Intel?" into "Financial performance of Intel in 2024".
47
+ * **Example**:
48
+ * *History*: "Tell me about Nvidia."
49
+ * *Follow-up*: "What about Intel?"
50
+ * *Result*: A standalone query about Intel that carries over the context of the earlier Nvidia discussion.
51
+
52
+ ---
53
+
54
+ ### Slide 5: Step 2 - Hybrid Search & Intent Recognition
55
+ * **Core Concept**: Combining Concept (Dense) and Keywords (Sparse).
56
+ * **Details**:
57
+ * **Dense**: Finding "vibe" (e.g., "financial crash" matches "bankruptcy").
58
+ * **Sparse**: Finding "tickers" (e.g., "NVDA", "AAPL") or specific entities.
59
+ * **Visual Suggestion**: Two searchlights (Dense and Sparse) converging on a single high-quality news article.
60
+
61
+ ---
62
+
63
+ ### Slide 6: Step 3 - Temporal Decay (Recency Boosting)
64
+ * **Core Concept**: News Freshness Matters.
65
+ * **Details**:
66
+ * **Logic**: Today's 80% match is better than last year's 100% match.
67
+ * **Mechanism**: Applying a mathematical penalty to older articles during the search phase.
68
+ * **Example**: A fresh report on a merger ranks higher than a "deep dive" from 6 months ago.
69
+
70
+ ---
71
+
72
+ ### Slide 7: Step 4 - Precision Reranking (Cross-Encoder)
73
+ * **Core Concept**: From "Fast Search" to "Exact Grade."
74
+ * **Details**:
75
+ * Moving from Bi-Encoders (fast, broad) to Cross-Encoders (slow, ultra-accurate).
76
+ * Checking the Top 20 results one-by-one to ensure they actually answer the question.
77
+ * **Example**: Eliminating articles that mention the keywords but are actually about a different topic.
78
+
79
+ ---
80
+
81
+ ### Slide 8: Step 5 - Diversity Filtering (MMR)
82
+ * **Core Concept**: Anti-Echo Chamber.
83
+ * **Details**:
84
+ * **Maximal Marginal Relevance (MMR)**: Selecting articles that are relevant but *different* from each other.
85
+ * **Benefit**: Instead of 5 articles saying the same thing, the LLM gets 5 different perspectives (e.g., Fact, Opinion, Impact).
86
+ * **Visual**: A filter that takes out identical "Copy-Paste" news reports.
87
+
88
+ ---
89
+
90
+ ### Slide 9: Step 6 - Parent Retrieval & Context Expansion
91
+ * **Core Concept**: Seeing the Big Picture.
92
+ * **Details**:
93
+ * Search is done on small chunks (~500 chars).
94
+ * If a chunk is a "Perfect Match," the system fetches the **entire article** from ClickHouse.
95
+ * Benefit: The LLM gets the full context of the story, not just a broken sentence.
96
+
97
+ ---
98
+
99
+ ### Slide 10: Step 7 - Trend Fusion & LLM Grounding
100
+ * **Core Concept**: Real-Time Intelligence.
101
+ * **Details**:
102
+ * The API fetches "Trending Topics" from ClickHouse in parallel.
103
+ * This data is injected into the LLM prompt to inform it of broader market trends.
104
+ * **Result**: "While these articles focus on Company A, the general market sentiment in ClickHouse shows a negative shift today."
105
+
106
+ ---
107
+
108
+ ### Slide 11: Step 8 - SSE Streaming (Real-Time Experience)
109
+ * **Core Concept**: Instant Gratification.
110
+ * **Details**:
111
+ * Using **Server-Sent Events (SSE)**.
112
+ * Tokens are pushed to the user as they are generated.
113
+ * Perceived wait time drops from 5 seconds to **300ms**.
114
+ * **Visual Suggestion**: Tokens appearing one-by-one in a fast, fluid stream.
115
+
116
+ ---
117
+
118
+ ### Slide 12: Reliability & Traceability
119
+ * **Core Concept**: Production-Ready Design.
120
+ * **Details**:
121
+ * **Circuit Breaker**: If Qdrant is down, ClickHouse keyword search automatically takes over.
122
+ * **Interaction Trace**: Every source used to answer a question is logged for debugging and human feedback (Thumbs Up/Down).
123
+ * **Final Word**: A resilient, intelligent, and highly accurate news RAG system.
docs/RAG_RETRIEVAL_FLOW.md ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # State-of-the-Art (SOTA) RAG Retrieval Data Flow
2
+
3
+ This document details the end-to-end data flow of the News Pipeline RAG API, incorporating advanced patterns for accuracy, diversity, and production resilience.
4
+
5
+ ## 1. Pre-Processing & Infrastructure (The "Cold-Start" Layer)
6
+ To ensure **zero-latency** during the initial user interaction, the system implements a preemptive resource loading strategy.
7
+
8
+ ### A. Async Pre-warming (Hidden Latency Absorption)
9
+ - **Challenge**: Large Transformer models (like BGE-M3 and Cross-Encoders) typically take 5–15 seconds to load from disk to RAM/VRAM. Lazy-loading these on the first request creates an unacceptable user experience.
10
+ - **Process**:
11
+ - In `main.py`, the `@app.on_event("startup")` hook triggers a non-blocking `threading.Thread`.
12
+ - This background thread immediately initializes `EmbedderService` and `RerankerService`.
13
+ - By the time the web server is live and the user types their first query, the models are fully resident in memory, resulting in sub-second response times for the very first request.
14
+
15
+ ### B. Circuit Breaker: ClickHouse Fallback (Always-On Reliability)
16
+ - **Challenge**: Vector databases like Qdrant can occasionally experience network partitions or downtime. In a naive RAG, this would crash the conversation.
17
+ - **Process**:
18
+ - The `VectorStore.search` method is wrapped in a robust `try-except` block.
19
+ - If the Qdrant client connection fails or a timeout occurs, the **Circuit Breaker** trips.
20
+ - The system automatically redirects the query to `fallback_keyword_search()` in ClickHouse.
21
+ - **Mechanism**: It performs a rapid SQL-based keyword search on titles and content in the `sentiment_results` table. While less semantically accurate than vectors, it ensures the user receives actual relevant news articles instead of a "Service Unavailable" error.
22
+
23
+ ## 2. Request Phase (Conversational Logic)
24
+
25
+ ### Step A: Query Transformation (Contextual Synthesis)
26
+ **Purpose**: Bridging the gap between human conversation and vector search requirements.
27
+ - **The Problem**: Users often ask relative questions like *"What about their stock?"*. Vector databases cannot resolve "their" without context.
28
+ - **Process**:
29
+ - The API retrieves the last 6 messages from PostgreSQL.
30
+ - A specialized prompt instructs `GPT-4` to synthesize the conversation history and the new user query into a single **Standalone Search Query**.
31
+ - If history is empty, the original query is used.
32
+ - **Example Trace**:
33
+ - **History**: `User: Tell me about Nvidia's revenue last year.`
34
+ - **New Query**: `User: Did Intel do better?`
35
+ - **Synthesized Search Query**: *"Comparison of Intel and Nvidia's revenue for the last fiscal year"*
36
+
37
+ ### Step B: Intent-Based Search (Hybrid & Recency)
38
+ **Purpose**: Combining semantic depth with keyword precision and news freshness.
39
+
40
+ #### 1. Hybrid Vector Synthesis
41
+ - **Dense Layer**: Uses `BAAI/bge-m3` to produce a 1024-dimensional semantic embedding. This handles "vibe" and "concept" matching (e.g., matching "financial struggle" to "bankruptcy").
42
+ - **Sparse Layer**: Prepares slots for keyword-specific vectors (e.g., Splade or BGE-M3 Sparse). This handles exact entities, ticker symbols (e.g., "NVDA"), or specific dates that dense embeddings might blur.
43
+
44
+ #### 2. Temporal Decay (Recency Boosting)
45
+ - **Logic**: News is a deteriorating asset. The system applies a **Recency Multiplier** during the retrieval collection phase.
46
+ - **Formula**: `Score = Base_Similarity * max(0.5, 1.0 - (days_old / 60))`.
47
+ - **Constraint**: The multiplier never drops below `0.5`, ensuring that very relevant historical news is still retrievable but newer coverage is naturally prioritized.
48
+ - **Example**:
49
+ - Article A (Identical match, 60 days old): `Final Score = 0.9 * 0.5 = 0.45`
50
+ - Article B (Close match, today): `Final Score = 0.8 * 1.0 = 0.8`
51
+ - **Result**: Article B is ranked higher despite slightly lower semantic similarity.
52
+
53
+ ## 3. Retrieval Refinement (The "Precision" Layer)
54
+
55
+ ### Step C: Cross-Encoder Reranking (Relevance Grading)
56
+ **Purpose**: Moving from "Bi-Encoder" (fast but broad) to "Cross-Encoder" (slow but highly accurate).
57
+ - **The Problem**: Dense embeddings (Bi-Encoders) are great at finding "similar" text but often struggle with fine-grained nuances or contradictory statements.
58
+ - **Process**:
59
+ - The system takes the **Top 20** results from the broad search.
60
+ - Each [Query, Chunk] pair is passed through the `CrossEncoder` model (`ms-marco-TinyBERT-L-2-v2`).
61
+ - The model produces a raw relevance score. This is significantly more accurate than pure cosine similarity from the vector search.
62
+
63
+ ### Step D: Diversity Filtering - MMR (Information Density)
64
+ **Purpose**: Preventing "Echo Chambers" or redundant context windows.
65
+ - **The Problem**: Five news articles starting with the same AP wire sentence will fill the LLM context with redundant text.
66
+ - **Process**:
67
+ - Implemented **Maximal Marginal Relevance (MMR)**.
68
+ - Logic selects documents that have high relevance but **low similarity** to already selected documents.
69
+ - **Example**:
70
+ - *Selection 1*: A factual report of a merger.
71
+ - *Selection 2 (Rejected)*: Another factual report of the same merger.
72
+ - *Selection 2 (Accepted)*: A financial analyst's opinion on the same merger.
73
+
74
+ ### Step E: Parent Document Retrieval (Context Expansion)
75
+ **Purpose**: Providing the "Full Picture" when a snippet isn't enough.
76
+ - **Process**:
77
+ - Small chunks (~500 chars) are indexed for surgical search accuracy.
78
+ - If a chunk's rerank score is **> 0.8**, its unique `doc_id` is used to fetch the full parent article body from ClickHouse/Qdrant.
79
+ - This allows the LLM to see the surrounding context that might have been lost in the chunking process.
80
+
81
+ ---
82
+
83
+ ## 4. Generation & Enrichment
84
+
85
+ ### Step F: ClickHouse Trend Fusion (External Intelligence)
86
+ **Purpose**: Grounding the LLM in real-time metadata.
87
+ - **Process**:
88
+ - In parallel with document retrieval (and before the final LLM call), the system queries the **ClickHouse Data Warehouse**.
89
+ - It extracts trending entities and sentiment scores for the last 3 days relevant to the query.
90
+ - This "Trend Knowledge" is injected into the system prompt.
91
+ - **Benefit**: The LLM can say: *"Retrieved articles show X, but ClickHouse trends show that sentiment for this topic is currently shifting negative."*
92
+
93
+ ### Step G: Streaming Generation - SSE (Real-Time UX)
94
+ **Purpose**: Minimizing "Perceived Latency".
95
+ - **Process**:
96
+ - Uses FastAPI `StreamingResponse` and Server-Sent Events (SSE).
97
+ - Instead of waiting 5 seconds for a full paragraph, the first token is displayed within **200-400ms**.
98
+ - Tokens are pushed to the client in real-time as the LLM predicts them.
99
+
100
+ ---
101
+
102
+ ## 5. Traceability & Feedback Loop
103
+
104
+ ### Step H: Interaction Logging (Audit Trail)
105
+ - **Traceability**: Every AI response logs the exact list of `retrieved_doc_ids` (Source IDs) in PostgreSQL.
106
+ - **Learning Loop**: When a user gives a "Thumbs Down", developers can query the database to see exactly which sources were used. This allows for **Negative Sampling** (identifying which articles cause hallucination or bad answers).
107
+
108
+ ---
109
+
110
+ ## Technical Stack Overview
111
+
112
+ | Stage | Tool/Model |
113
+ | :--- | :--- |
114
+ | **Embeddings** | `BAAI/bge-m3` (BAAI) |
115
+ | **Reranking** | `ms-marco-TinyBERT-L-2-v2` (CrossEncoder) |
116
+ | **Diversity** | Custom MMR Implementation |
117
+ | **Vector DB** | Qdrant |
118
+ | **Data Warehouse**| ClickHouse |
119
+ | **Token Control** | `tiktoken` (cl100k_base) |
120
+ | **LLM** | OpenAI `gpt-4` |
121
+
122
+ ---
123
+
124
+ ## Full Data Flow Visual
125
+
126
+ ```mermaid
127
+ graph TD
128
+ User((User)) -->|Query| API[RAG API]
129
+ API -->|Prompt| LLM_Rewriter[LLM Rewriter]
130
+ LLM_Rewriter -->|Standalone Query| API
131
+
132
+ API -.->|Circuit Breaker Check| VDB{Qdrant Online?}
133
+ VDB -->|No| CH_FB[ClickHouse Keyword Fallback]
134
+ VDB -->|Yes| V_Search[Hybrid Vector Search]
135
+
136
+ V_Search -->|Top 20| Rerank[Cross-Encoder Reranker]
137
+ Rerank -->|Diversity Pass| MMR[MMR Filter]
138
+ MMR -->|Top K| Parent_Fetch[Parent Doc Retrieval]
139
+
140
+ Parent_Fetch -->|Context| Prompt_Build[Prompt Construction]
141
+ Prompt_Build -->|Inject| CH_Trends[ClickHouse Trends]
142
+
143
+ CH_Trends -->|Full Prompt| LLM_Stream[LLM Streaming]
144
+ LLM_Stream -->|SSE Tokens| User
145
+
146
+ LLM_Stream -->|Trace| Postgres[(Interaction DB)]
147
+ ```
docs/rag_retrieval_documentation.md ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG API Data Flow & Retrieval Architecture
2
+
3
+ This document tracks the detailed Data Flow of the RAG (Retrieval-Augmented Generation) API, with a specific focus on the **Retrieval Logic**. Rather than just listing HTTP endpoints, this document explains the underlying methods, conceptual flow, and how the Domain Models, Ports, Use Cases, and Infrastructure Adapters interact to fetch, rerank, and summarize enterprise news data.
4
+
5
+ ---
6
+
7
+ ## 🏗️ 1. Architecture Overview (Hexagonal Architecture)
8
+
9
+ The RAG API relies on **Hexagonal Architecture** (Ports and Adapters). It strongly separates business logic from infrastructure frameworks.
10
+
11
+ - **Domain/Models**: The central, pure data structures representing the state (e.g., `ChatRequest`, `User`).
12
+ - **Ports (Interfaces)**: Abstract definitions of what the system *needs* to do (e.g., `VectorStorePort`, `LlmPort`).
13
+ - **Use Cases**: The actual business logic where the retrieval steps, filtering, and flow occur.
14
+ - **Adapters**: The concrete implementation of Ports using external technologies (e.g., Qdrant, OpenAI, Redis, Postgres).
15
+
16
+ ---
17
+
18
+ ## 📂 2. File Directory Breakdown & Responsibilities
19
+
20
+ ### `src/api/` (Primary Adapters / The Front Door)
21
+ - **`routes/rag.py`**: Exposes the `/chat` and `/chat/stream` endpoints. **Role**: Accepts the incoming HTTP payload, validates the JWT token (via `Depends(get_current_user)`), and forwards the request directly to the `AgentRouterUseCase`.
22
+ - **`dependencies.py`**: The Dependency Injection container. **Role**: Wires the concrete Infrastructure Adapters (e.g., `QdrantAdapter`, `BgeEmbedderAdapter`) to their respective Ports, and injects them into the Use Cases. Ensures components are instantiated only once.
23
+
24
+ ### `src/core/domain/` (Core Data)
25
+ - **`schemas.py`**: Defines Pydantic validation models. **Role**: Houses `ChatRequest` (contains `query`, `top_k`, `session_id`, `source_filter`, etc.) which acts as the transport object through the system.
26
+
27
+ ### `src/core/ports/` (The Interfaces)
28
+ - **`embedder_port.py`**: Defines `encode_query()`.
29
+ - **`vector_store_port.py`**: Defines `search()`.
30
+ - **`reranker_port.py`**: Defines `rerank()`.
31
+ - **`llm_port.py`**: Defines `generate()` and `generate_stream()`.
32
+ - **`cache_port.py`**: Defines `get()`, `set()`, and `generate_exact_hash()`.
33
+
34
+ ### `src/core/use_cases/` (The Business Logic Engine)
35
+ - **`agent_router_use_case.py`**: **Role**: The gateway. Analyzes the user's intent. Routes the request to `AccountUseCase` (if the user is asking about personal profile data) or `RagChatUseCase` (if asking about news).
36
+ - **`rag_chat_use_case.py`**: **Role**: The Heavy Lifter. Responsible for the entire Retrieval Logic flow. Contains methods like `_extract_intents`, `_build_context`, `_limit_context`, and `_compress_document`.
37
+ - **`account_use_case.py`**: **Role**: A secondary flow for handling user-specific DB aggregations (billing, history) rather than searching Vector DBs.
38
+
39
+ ### `src/infrastructure/adapters/` (Concrete Infrastructure)
40
+ - **`redis_adapter.py`**: **Role**: Connects to the caching layer to prevent duplicate LLM processing calls.
41
+ - **`qdrant_adapter.py`**: **Role**: Orchestrates the `query_points` API call to Qdrant, fusing Dense and Sparse vector retrieval (Hybrid Search).
42
+ - **`bge_embedder_adapter.py`**: **Role**: Instantiates the massive BGE-M3 model (using FlagEmbedding). Converts text strings into multi-dimensional arrays (Dense and Lexical Sparse weights).
43
+ - **`bge_reranker_adapter.py`**: **Role**: Uses a Cross-Encoder to compare the user query and the retrieved documents string-by-string for absolute semantic precision.
44
+ - **`openai_adapter.py` / `ollama_adapter.py`**: **Role**: Connects to an external OpenAI API or Local Llama-3 instance to generate text.
45
+
46
+ ---
47
+
48
+ ## 🌊 3. The Retrieval Logic: Step-by-Step Data Flow Example
49
+
50
+ **Scenario**: A user submits the query: *"What happened with Apple stock recently?"*
51
+
52
+ ### Step 1: Ingestion & Intent Routing (`agent_router_use_case.py`)
53
+ 1. **Input**: `ChatRequest(query="What happened with Apple stock recently?", top_k=5)`
54
+ 2. **Action**: The API endpoint passes this to the `AgentRouterUseCase`.
55
+ 3. **LLM Classification**: The Router asks the LLM: "Is this a NEWS search or an ACCOUNT search?"
56
+ 4. **Output**: The LLM outputs `NEWS`. The Router forwards the request to the `RagChatUseCase`.
57
+
58
+ ### Step 2: Semantic Caching (`redis_adapter.py`)
59
+ 1. **Action**: `cache_port.generate_exact_hash()` calculates an SHA-256 hash or deterministic key for the query string.
60
+ 2. **Check**: Does this key exist in Redis?
61
+ 3. **If Yes**: Return the answer instantly (0ms LLM time).
62
+ 4. **If No**: Proceed with the expensive pipeline.
63
+
64
+ ### Step 3: Self-Query Extraction (`rag_chat_use_case.py -> _extract_intents()`)
65
+ 1. **Action**: The LLM analyzes the user's natural language query to dynamically extract metadata and physical parameters for the vector database.
66
+ 2. **Example Prompting**: The LLM is provided with a system prompt like: *"Extract the temporal constraints and target sources from the user query into JSON format. Valid sources: ['reuters', 'bloomberg']."*
67
+ 3. **Execution**: The LLM analyzes *"What happened with Apple stock recently?"*
68
+ 4. **Output Deduction**: From the word "recently", it deduces the temporal boundary and constructs the following JSON structure:
69
+ ```json
70
+ {
71
+ "days_back": 3,
72
+ "source": null
73
+ }
74
+ ```
75
+ 5. **Mapping**: The `RagChatUseCase` parses this JSON. If `days_back` is present, it constructs a Qdrant `models.Filter` to physically exclude older documents from the multidimensional search space *before* the costly vector math occurs.
76
+
77
+ ### Step 4: Embedding / Vectorization (`bge_embedder_adapter.py`)
78
+ 1. **Action**: `encode_query()` is called.
79
+ 2. **Model Processing**: The BGE-M3 model tokenizes the string.
80
+ 3. **Output**: Returns a `Dict` containing:
81
+ - `dense`: `[0.123, -0.456, 0.789, ... 1024 dimensions]`
82
+ - `sparse`: `{"indices": [102, 451, ...], "values": [0.92, 0.44, ...]}`
83
+
84
+ ### Step 5: Hybrid Vector Search (`qdrant_adapter.py`)
85
+ 1. **Action**: Passes the `query_vectors` and the `days_back=3` filter into `vector_store_port.search()`.
86
+ 2. **Qdrant Processing**: Qdrant performs a Fusion Query (Reciprocal Rank Fusion - RRF). It fetches the top 20 nearest neighbors from BOTH the Dense mathematical space AND the Sparse keyword space.
87
+ 3. **Output**: Returns a List of raw `SearchResult` documents.
88
+
89
+ ### Step 6: Temporal Bias Scoring (`rag_chat_use_case.py -> _build_context()`)
90
+ 1. **Action**: Evaluates the `published_at` metadata of every hit.
91
+ 2. **Calculation**: It deliberately decays the score of older articles via a mathematical multiplier (e.g., `score_multiplier = max(0.5, 1.0 - (days_old / 60))`).
92
+ 3. **Output**: A dynamically re-scored list, preferring fresh data.
93
+
94
+ ### Step 7: Cross-Encoder Reranking (`bge_reranker_adapter.py`)
95
+ 1. **Action**: For the top 20 remaining documents, the Reranker pairs the Query + Document Text together (`[[query, doc1], [query, doc2]]`).
96
+ 2. **Model Processing**: The HuggingFace FlagReranker calculates exact semantic overlap.
97
+ 3. **Output**: Returns the strict Top 5 (`top_k`) documents, guaranteed to be specifically relevant.
98
+
99
+ ### Step 8: Contextual Compression (`rag_chat_use_case.py -> _limit_context()`)
100
+ 1. **Action**: `_limit_context` uses `tiktoken` to count how many tokens the Top 5 documents contain.
101
+ 2. **Check**: Are they over the 3000 Token limit?
102
+ 3. **Compression Loop**: If they are over the limit, it calls `_compress_document()`.
103
+ 4. **LLM Summarization**: Passes the overflowing document string to the LLM with the instruction: *"Extract pure facts... relevant to the query."* The massive document strings are squashed down to bullet-point facts.
104
+ 5. **Output**: A tightly packed `context_text` string ready for generation.
105
+
106
+ ### Step 9: Final Generation (`llm_port.py`)
107
+ 1. **Action**: The packed `context_text`, the User `query`, and the recent `Chat History` are combined into the Final Prompt.
108
+ 2. **Model Processing**: The LLM interprets the compressed context.
109
+ 3. **Output**: The Final string ("Apple stock surged 4% after the latest earnings report...").
110
+ 4. **Cleanup**: This answer is saved to both Postgres (`chat_history_db`) and Redis (`cache`), and returned to the API client.
111
+
112
+ ---
113
+
114
+ ## 📈 4. A4 Analysis and Future Updates
115
+
116
+ ### A4 Analysis (Current System Standing)
117
+
118
+ | Dimension | Analysis & Findings |
119
+ | :--- | :--- |
120
+ | **Resilience & Scalability** | **High**. The Hexagonal architecture successfully decoupled Qdrant, Postgres, and the LLMs. We can swap `OpenAiAdapter` for `OllamaAdapter` simply by changing one dependency provider without touching the Business Logic flow. When an optional dependency (e.g., `FlagEmbedding`) is missing, the system falls back to dummy implementations instead of crashing the API. |
121
+ | **Retrieval Accuracy** | **Exceptional**. We utilize a 3-Stage filtering mechanism: Semantic similarity (Dense), Lexical accuracy (Sparse), and absolute context alignment (Reranker). The addition of dynamic Temporal Biasing prevents the hallucination of historical news as current events. |
122
+ | **Cost & Latency Management** | **Optimized**. The Redis cache (currently keyed on an exact hash of the query string) ensures that repeated identical queries skip the LLM round-trip and its cost. The `AgentRouterUseCase` ensures that non-news questions (Account, Billing) never touch the expensive Vector DB pipeline. |
123
+ | **Memory Constraint Handling** | **Innovative**. By employing `_compress_document`, the system prevents context-window truncation, ensuring critical tail-end entities still influence the LLM's final generation. |
124
+
125
+ ### Proposed Future Updates (Roadmap)
126
+ 1. **Semantic Cache Refinement**: Currently, the `RedisAdapter` relies on an exact SHA-256 string hash. **Update**: Calculate an actual LLM embedding of the prompt (Dense Vector) and store it in Redis. Use a Cosine-Similarity threshold (`>0.95`) to intercept semantically identical (but textually different) questions (e.g., "Apple stock" vs "AAPL share price").
127
+ 2. **Analytic Trend Fusion Enhancement**: In `_build_context`, we fetch trending entities from `ClickHouse`. **Update**: Send these trending entities into the Agent Router so the system can proactively recommend or correlate user interactions with macroeconomic spikes before they ask.
128
+ 3. **Ollama Deployment Readiness**: Test the `bge_embedder_adapter` and `bge_reranker_adapter` simultaneously against an active `OllamaAdapter` container to benchmark hardware-level VRAM bottlenecks on local inference machines.
129
+ 4. **Knowledge Graph Integration**: Extract Triples (`Subject-Predicate-Object`) during the `_compress_document` step to progressively construct a Graph Database (Neo4j) alongside the Vector DB (Qdrant) for Multi-Hop reasoning queries in the future.
docs/rag_retrieval_presentation.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ marp: true
3
+ theme: default
4
+ paginate: true
5
+ header: 'Enterprise RAG Retrieval Architecture'
6
+ footer: 'Hexagonal Architecture Data Flow'
7
+ ---
8
+
9
+ # 🚀 The Enterprise RAG Retrieval Logic
10
+ ### Step-by-Step Data Flow Analysis
11
+
12
+ This presentation covers the exact 9-step semantic retrieval and orchestration sequence used by the API to process complex user queries.
13
+
14
+ **Case Study Query**: *"What happened with Apple stock recently?"*
15
+
16
+ ---
17
+
18
+ # 1️⃣ Step 1: Ingestion & Intent Routing
19
+
20
+ The front door of our architecture. Every request is intercepted by the **Agent Router** to prevent unnecessary Vector Database queries.
21
+
22
+ - **Component**: `agent_router_use_case.py`
23
+ - **Input Object**: `ChatRequest(query="What happened with Apple stock recently?", top_k=5)`
24
+ - **LLM Classification Prompt**: *"Is this a NEWS search or an ACCOUNT search?"*
25
+ - **Action**: The LLM analyzes the text and confidently outputs `NEWS`.
26
+ - **Output Routing**: The Router dynamically forwards the payload to the specialized `RagChatUseCase`.
27
+
28
+ ---
29
+
30
+ # 2️⃣ Step 2: Semantic Caching Layer
31
+
32
+ Before spending LLM tokens or Cloud Compute, we check if this exact question has been asked and answered recently.
33
+
34
+ - **Component**: `redis_adapter.py`
35
+ - **Action**: `cache_port.generate_exact_hash()` deterministically calculates a SHA-256 hash representing the query string.
36
+ - **Cache Check**: Does the key exist in the Redis cluster?
37
+ - **Fast-Path**: If **Yes**, it returns the cached generation instantly, resulting in 0ms LLM time and $0 cost.
38
+ - **Deep-Path**: If **No**, the query proceeds down the expensive RAG pipeline.
39
+
40
+ ---
41
+
42
+ # 3️⃣ Step 3: Self-Query Extraction
43
+
44
+ We translate the user's natural language into strict physical constraints and metadata filters for the database.
45
+
46
+ - **Component**: `rag_chat_use_case.py -> _extract_intents()`
47
+ - **Action**: The LLM parses the user text against available metadata schemas.
48
+ - **Execution Insight**: The LLM identifies the word *"recently"* and maps it to a physical timeframe.
49
+ - **LLM Output (JSON)**:
50
+ ```json
51
+ { "days_back": 3, "source": null }
52
+ ```
53
+ - **Mapping**: `RagChatUseCase` creates a Qdrant `models.Filter` from this JSON, excluding old documents before math occurs.
54
+
55
+ ---
56
+
57
+ # 4️⃣ Step 4: Text Vectorization
58
+
59
+ We convert the query string into a mathematical representation using the massive BGE-M3 model.
60
+
61
+ - **Component**: `bge_embedder_adapter.py`
62
+ - **Action**: `encode_query()` passes the text into the embedded ML model.
63
+ - **Model Processing**: The text is tokenized into both Dense and Sparse dimensions.
64
+ - **Output Architecture**:
65
+ - **Dense Array**: `[0.123, -0.456, 0.789, ... 1024 dimensions]`
66
+ - **Sparse Lexical**: `{"indices": [102, 451, ...], "values": [0.92, 0.44, ...]}`
67
+
68
+ ---
69
+
70
+ # 5️⃣ Step 5: Hybrid Vector Search
71
+
72
+ We execute a high-performance database search combining math and exact keyword matching.
73
+
74
+ - **Component**: `qdrant_adapter.py`
75
+ - **Action**: Sends `query_vectors` and the extracted `days_back=3` physical filter to Qdrant via `vector_store_port.search()`.
76
+ - **Database Processing**: Qdrant executes a **Reciprocal Rank Fusion (RRF)** query. It searches simultaneously for Semantic Meaning (Dense) and Exact Keyword Hits (Sparse).
77
+ - **Yield**: Returns the top 20 nearest neighbor `SearchResult` documents.
78
+
79
+ ---
80
+
81
+ # 6️⃣ Step 6: Temporal Bias Scoring
82
+
83
+ Preventing historical hallucination by mathematically prioritizing fresh news over old news.
84
+
85
+ - **Component**: `rag_chat_use_case.py -> _build_context()`
86
+ - **Action**: Iterates over every returned document and examines its `published_at` timestamp.
87
+ - **Mathematical Decay**:
88
+ - `score_multiplier = max(0.5, 1.0 - (days_old / 60))`
89
+ - The older the article, the lower its multiplier, down to a floor of `0.5` (reached once an article is 30 or more days old).
90
+ - **Output**: A freshly re-scored list where newer, slightly less-relevant articles can outrank old, highly-relevant articles.
91
+
92
+ ---
93
+
94
+ # 7️⃣ Step 7: Cross-Encoder Reranking
95
+
96
+ Applying an absolute brute-force semantic check to eliminate hallucinated vector distances.
97
+
98
+ - **Component**: `bge_reranker_adapter.py`
99
+ - **Action**: Takes the top 20 decayed documents. It physically pairs the Query against the Document text block-by-block.
100
+ - `[[query, doc1_text], [query, doc2_text], ...]`
101
+ - **Model Processing**: The HuggingFace FlagReranker calculates exact semantic overlap.
102
+ - **Output**: Only the strict Top 5 (`top_k`) highest-scoring documents survive.
103
+
104
+ ---
105
+
106
+ # 8️⃣ Step 8: Contextual Compression
107
+
108
+ Squashing massive strings to fit gracefully into limited LLM context windows.
109
+
110
+ - **Component**: `rag_chat_use_case.py -> _limit_context()`
111
+ - **Action**: Uses `tiktoken` to calculate the total length of the surviving Top 5 documents.
112
+ - **Compression Loop**: If the size exceeds 3000 tokens, it pipes overflowing documents individually to an LLM via `_compress_document()`.
113
+ - **Extraction**: The LLM digests 800 words and outputs only bulleted facts relevant to "Apple Stock".
114
+ - **Output**: A high-density, tightly packed `context_text` string.
115
+
116
+ ---
117
+
118
+ # 9️⃣ Step 9: Final Generation
119
+
120
+ The Orchestrator fuses all pipelines to deliver a hyper-accurate, hallucination-free answer.
121
+
122
+ - **Component**: `llm_port.py`
123
+ - **Action**: The packed `context_text`, the original `query`, and the user's `Chat History` are injected into a singular Prompt Template.
124
+ - **Generation**: The LLM interprets the verified facts.
125
+ - *"Apple stock surged 4% after the latest earnings report..."*
126
+ - **Final Cleanup**: The new answer string is permanently logged into Postgres (`chat_history`) and cached into Redis (`cache`) before being returned via the API.
download_models.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ # Monkeypatch for transformers/FlagEmbedding compatibility issue
5
+ try:
6
+ import transformers.utils.import_utils
7
+ if not hasattr(transformers.utils.import_utils, 'is_torch_fx_available'):
8
+ transformers.utils.import_utils.is_torch_fx_available = lambda: False
9
+ except Exception:
10
+ pass
11
+
12
+ from FlagEmbedding import BGEM3FlagModel
13
+ from sentence_transformers import CrossEncoder
14
+
15
def download():
    """Pre-cache the embedding and reranker models.

    Each model is instantiated once, which triggers a download when it is
    not already present locally. A failure for one model is printed and
    does not prevent the next one from being attempted; nothing is returned.
    """
    # (model id, loader) pairs, processed in order. Lambdas defer the
    # constructor lookup and call until we are inside the try block.
    loaders = (
        ("BAAI/bge-m3", lambda name: BGEM3FlagModel(name, use_fp16=True)),
        ("cross-encoder/ms-marco-TinyBERT-L-2-v2", lambda name: CrossEncoder(name)),
    )

    print("--- STARTING MODEL PRE-CACHE ---")

    for target, load in loaders:
        print(f"Downloading/Loading {target}...")
        try:
            # Only the download side effect matters; the instance is discarded.
            load(target)
            print(f"Successfully cached {target}")
        except Exception as e:
            print(f"Error caching {target}: {e}")

    print("--- PRE-CACHE COMPLETE ---")
38
+
39
+ if __name__ == "__main__":
40
+ download()
migrate_database.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Database migration script: adds missing columns to the users table and creates the refresh_tokens table with its indexes
4
+ Run this once to update your Neon database schema
5
+ """
6
+
7
+ import os
8
+ from sqlalchemy import create_engine, text
9
+ from src.core.config import settings
10
+
11
def migrate_database():
    """Apply the schema migrations to the configured database.

    Runs a fixed list of SQL statements in order, committing after each one.
    Every statement is guarded with ``IF NOT EXISTS`` so the script can be
    re-run safely:

    1. add ``users.role`` (default ``'user'``)
    2. add ``users.is_active`` (default ``TRUE``)
    3. add ``users.full_name``
    4. create the ``refresh_tokens`` table
    5. create the two ``refresh_tokens`` indexes

    Returns:
        bool: ``True`` when every migration ran, ``False`` if any of them
        raised (the error is printed).
    """
    print("🔄 Starting database migration...")

    engine = create_engine(settings.SQLALCHEMY_DATABASE_URI)

    # Never echo the raw URI: its leading characters normally hold the
    # username and password, which would end up in console and CI logs.
    # The engine's parsed URL can render itself with the password masked.
    print(f"Database URL: {engine.url.render_as_string(hide_password=True)}")

    migrations = [
        # Add role column if it doesn't exist
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                           WHERE table_name='users' AND column_name='role') THEN
                ALTER TABLE users ADD COLUMN role VARCHAR(20) DEFAULT 'user';
                UPDATE users SET role = 'user' WHERE role IS NULL;
                RAISE NOTICE 'Added role column';
            ELSE
                RAISE NOTICE 'role column already exists';
            END IF;
        END $$;
        """,

        # Add is_active column if it doesn't exist
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                           WHERE table_name='users' AND column_name='is_active') THEN
                ALTER TABLE users ADD COLUMN is_active BOOLEAN DEFAULT TRUE;
                UPDATE users SET is_active = TRUE WHERE is_active IS NULL;
                RAISE NOTICE 'Added is_active column';
            ELSE
                RAISE NOTICE 'is_active column already exists';
            END IF;
        END $$;
        """,

        # Add full_name column if it doesn't exist
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns
                           WHERE table_name='users' AND column_name='full_name') THEN
                ALTER TABLE users ADD COLUMN full_name VARCHAR(255);
                RAISE NOTICE 'Added full_name column';
            ELSE
                RAISE NOTICE 'full_name column already exists';
            END IF;
        END $$;
        """,

        # Create refresh_tokens table if it doesn't exist
        """
        CREATE TABLE IF NOT EXISTS refresh_tokens (
            id SERIAL PRIMARY KEY,
            user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
            token VARCHAR(500) NOT NULL UNIQUE,
            expires_at TIMESTAMP NOT NULL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            revoked BOOLEAN DEFAULT FALSE
        );
        """,

        # Create index on refresh_tokens
        """
        CREATE INDEX IF NOT EXISTS idx_refresh_tokens_user_id ON refresh_tokens(user_id);
        CREATE INDEX IF NOT EXISTS idx_refresh_tokens_token ON refresh_tokens(token);
        """,
    ]

    try:
        with engine.connect() as conn:
            for i, migration in enumerate(migrations, 1):
                print(f"\n📝 Running migration {i}/{len(migrations)}...")
                conn.execute(text(migration))
                # Commit per migration so earlier steps persist even if a
                # later one fails.
                conn.commit()
                print(f"✅ Migration {i} completed")

        print("\n✅ All migrations completed successfully!")
        print("\n🎉 Database schema is now up to date")
        return True

    except Exception as e:
        print(f"\n❌ Migration failed: {e}")
        return False
    finally:
        # Release pooled connections whether or not the run succeeded.
        engine.dispose()
99
+
100
if __name__ == "__main__":
    success = migrate_database()
    # Raise SystemExit directly instead of calling the bare ``exit`` helper:
    # that name is injected by the ``site`` module for interactive sessions
    # and is not available under ``python -S`` or in frozen builds.
    raise SystemExit(0 if success else 1)
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.110.0
2
+ uvicorn>=0.27.1
3
+ pydantic>=2.9.0
4
+ pydantic-settings>=2.2.1
5
+ qdrant-client>=1.7.0
6
+ clickhouse-connect>=0.7.3
7
+ langchain>=0.1.13
8
+ langchain-openai>=0.1.1 # covers Groq, Gemini, Together AI, OpenAI (all OpenAI-compatible)
9
+ langchain-groq>=0.1.3
10
+ python-dotenv>=1.0.1
11
+ psycopg2-binary>=2.9.9
12
+ SQLAlchemy>=2.0.29
13
+ sentence-transformers>=2.7.0
14
+ transformers>=4.40.0 # DeBERTa intent classifier
15
+ torch>=2.0.0
16
+ numpy>=1.26.0
17
+ tiktoken>=0.6.0
18
+ FlagEmbedding>=1.2.5
19
+ redis>=5.0.0
20
+ python-jose[cryptography]>=3.3.0
21
+ passlib[bcrypt]>=1.7.4
22
+ python-multipart>=0.0.9
23
+ httpx>=0.27.0
24
+ aiohttp>=3.9.0
25
+ duckduckgo-search>=6.0.0 # Live search for hybrid RAG
26
+ python-dateutil>=2.8.2 # Date parsing for live results
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Initialize src package
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (205 Bytes). View file
 
src/__pycache__/main.cpython-313.pyc ADDED
Binary file (3.32 kB). View file
 
src/api/__pycache__/dependencies.cpython-313.pyc ADDED
Binary file (4.35 kB). View file
 
src/api/dependencies.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import Depends
2
+ from sqlalchemy.orm import Session
3
+ from src.infrastructure.database import get_db
4
+
5
+ # Adapters
6
+ from src.infrastructure.adapters.bge_embedder_adapter import BgeEmbedderAdapter
7
+ from src.infrastructure.adapters.qdrant_adapter import QdrantAdapter
8
+ from src.infrastructure.adapters.bge_reranker_adapter import BgeRerankerAdapter
9
+ from src.infrastructure.adapters.openai_adapter import OpenAiAdapter
10
+ from src.infrastructure.adapters.ollama_adapter import OllamaAdapter
11
+ from src.infrastructure.adapters.groq_adapter import GroqAdapter
12
+ from src.infrastructure.adapters.gemini_adapter import GeminiAdapter
13
+ from src.infrastructure.adapters.together_adapter import TogetherAdapter
14
+ from src.infrastructure.adapters.huggingface_adapter import HuggingFaceAdapter
15
+ from src.infrastructure.adapters.clickhouse_adapter import ClickHouseAdapter
16
+ from src.infrastructure.adapters.postgres_adapter import PostgresAdapter
17
+ from src.infrastructure.adapters.redis_adapter import RedisAdapter
18
+ from src.infrastructure.adapters.duckduckgo_adapter import DuckDuckGoAdapter
19
+
20
+ # Hybrid Search Components
21
+ from src.core.orchestrator.query_orchestrator import QueryOrchestrator
22
+ from src.core.ranking.hybrid_result_ranker import HybridResultRanker
23
+
24
+ # Use Cases
25
+ from src.core.use_cases.search_use_case import SearchUseCase
26
+ from src.core.use_cases.rag_chat_use_case import RagChatUseCase
27
+ from src.core.use_cases.analytics_use_case import AnalyticsUseCase
28
+
29
+ # Global Singletons for Stateless Adapters to avoid reloading models per request
30
+ embedder_adapter = BgeEmbedderAdapter()
31
+ qdrant_adapter = QdrantAdapter()
32
+ reranker_adapter = BgeRerankerAdapter()
33
+ openai_adapter = OpenAiAdapter()
34
+ ollama_adapter = OllamaAdapter()
35
+ groq_adapter = GroqAdapter()
36
+ gemini_adapter = GeminiAdapter()
37
+ together_adapter = TogetherAdapter()
38
+ huggingface_adapter = HuggingFaceAdapter()
39
+ clickhouse_adapter = ClickHouseAdapter()
40
+ redis_adapter = RedisAdapter()
41
+
42
+ # Hybrid Search Singletons
43
+ from src.core.config import settings
44
+ duckduckgo_adapter = DuckDuckGoAdapter(
45
+ timeout=settings.LIVE_SEARCH_TIMEOUT,
46
+ max_results=settings.LIVE_SEARCH_MAX_RESULTS
47
+ )
48
+ query_orchestrator = QueryOrchestrator(
49
+ live_search_adapter=duckduckgo_adapter,
50
+ enable_hybrid=settings.ENABLE_HYBRID_SEARCH,
51
+ default_live_weight=settings.LIVE_SEARCH_WEIGHT,
52
+ default_db_weight=settings.DB_SEARCH_WEIGHT
53
+ )
54
+ hybrid_result_ranker = HybridResultRanker(reranker=reranker_adapter)
55
+
56
+ # Model Pre-warming (Triggered dynamically if needed, usually on startup)
57
def prewarm_models():
    """Load the embedder model and then the reranker model ahead of first use."""
    # Order matters only in that a failure on the embedder stops the run
    # before the reranker is touched, exactly as with sequential calls.
    for adapter in (embedder_adapter, reranker_adapter):
        adapter._load_model()
60
+
61
+ # --- Dependency Providers ---
62
+
63
def get_embedder_port():
    """Provide the module-level ``BgeEmbedderAdapter`` singleton."""
    return embedder_adapter
65
+
66
def get_vector_store_port():
    """Provide the module-level ``QdrantAdapter`` singleton."""
    return qdrant_adapter
68
+
69
def get_reranker_port():
    """Provide the module-level ``BgeRerankerAdapter`` singleton."""
    return reranker_adapter
71
+
72
+ from src.core.config import settings
73
def get_llm_port():
    """Pick the LLM adapter named by ``settings.LLM_PROVIDER``.

    The provider name is compared case-insensitively, and ``"hf"`` is
    accepted as an alias for ``"huggingface"``. Any value that is not listed
    below falls back to the OpenAI adapter.
    """
    adapters_by_provider = {
        "groq": groq_adapter,
        "gemini": gemini_adapter,
        "together": together_adapter,
        "huggingface": huggingface_adapter,
        "hf": huggingface_adapter,
        "ollama": ollama_adapter,
    }
    return adapters_by_provider.get(settings.LLM_PROVIDER.lower(), openai_adapter)
86
+
87
def get_analytics_db_port():
    """Provide the module-level ``ClickHouseAdapter`` singleton."""
    return clickhouse_adapter
89
+
90
def get_chat_history_port(db: Session = Depends(get_db)):
    """Build a new ``PostgresAdapter`` around the session supplied by ``get_db``."""
    return PostgresAdapter(db)
92
+
93
def get_cache_port():
    """Provide the module-level ``RedisAdapter`` singleton."""
    return redis_adapter
95
+
96
def get_live_search_port():
    """Provide the module-level ``DuckDuckGoAdapter`` singleton."""
    return duckduckgo_adapter
98
+
99
def get_query_orchestrator():
    """Provide the module-level ``QueryOrchestrator`` singleton."""
    return query_orchestrator
101
+
102
def get_hybrid_ranker():
    """Provide the module-level ``HybridResultRanker`` singleton."""
    return hybrid_result_ranker
104
+
105
+ # --- Use Case Providers ---
106
+
107
def get_search_use_case(
    embedder=Depends(get_embedder_port),
    vector_store=Depends(get_vector_store_port)
):
    """Build a ``SearchUseCase`` from the injected embedder and vector store."""
    use_case = SearchUseCase(embedder, vector_store)
    return use_case
112
+
113
def get_rag_chat_use_case(
    embedder=Depends(get_embedder_port),
    vector_store=Depends(get_vector_store_port),
    reranker=Depends(get_reranker_port),
    llm=Depends(get_llm_port),
    chat_history=Depends(get_chat_history_port),
    analytics_db=Depends(get_analytics_db_port),
    cache=Depends(get_cache_port),
    orchestrator=Depends(get_query_orchestrator),
    hybrid_ranker=Depends(get_hybrid_ranker)
):
    """Build a ``RagChatUseCase`` from the injected ports and hybrid-search components."""
    # Keys are the constructor's keyword names. Note that the injected
    # ``chat_history`` is handed over under the name ``chat_history_db``.
    wiring = {
        "embedder": embedder,
        "vector_store": vector_store,
        "reranker": reranker,
        "llm": llm,
        "chat_history_db": chat_history,
        "analytics_db": analytics_db,
        "cache": cache,
        "orchestrator": orchestrator,
        "hybrid_ranker": hybrid_ranker,
    }
    return RagChatUseCase(**wiring)
135
+
136
+ from src.core.use_cases.account_use_case import AccountUseCase
137
+ from src.core.use_cases.agent_router_use_case import AgentRouterUseCase
138
+
139
def get_analytics_use_case(
    analytics_db=Depends(get_analytics_db_port)
):
    """Build an ``AnalyticsUseCase`` around the injected analytics database port."""
    use_case = AnalyticsUseCase(analytics_db)
    return use_case
143
+
144
def get_account_use_case():
    """Build a new ``AccountUseCase``; it takes no constructor arguments."""
    return AccountUseCase()
146
+
147
def get_agent_router_use_case(
    llm=Depends(get_llm_port),
    rag_chat=Depends(get_rag_chat_use_case),
    account=Depends(get_account_use_case),
    chat_history=Depends(get_chat_history_port)
):
    """Build an ``AgentRouterUseCase`` from the LLM, both use cases and the chat-history port."""
    return AgentRouterUseCase(
        llm=llm,
        rag_chat=rag_chat,
        account=account,
        # The constructor names this argument ``chat_history_db``.
        chat_history_db=chat_history,
    )
src/api/routes/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Expose routers
2
+ from . import rag, analytics, interactions, accounts, news
src/api/routes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (327 Bytes). View file
 
src/api/routes/__pycache__/accounts.cpython-313.pyc ADDED
Binary file (1.8 kB). View file
 
src/api/routes/__pycache__/analytics.cpython-313.pyc ADDED
Binary file (4.96 kB). View file
 
src/api/routes/__pycache__/auth.cpython-313.pyc ADDED
Binary file (2.39 kB). View file
 
src/api/routes/__pycache__/interactions.cpython-313.pyc ADDED
Binary file (4.76 kB). View file
 
src/api/routes/__pycache__/news.cpython-313.pyc ADDED
Binary file (4.87 kB). View file
 
src/api/routes/__pycache__/rag.cpython-313.pyc ADDED
Binary file (3.42 kB). View file
 
src/api/routes/accounts.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, status
2
+ from sqlalchemy.orm import Session
3
+ from typing import List
4
+
5
+ from src.infrastructure.database import get_db
6
+ from src.core.domain.schemas import UserCreate, UserResponse, UserUpdate, PasswordChange, AdminUserUpdate
7
+ from src.core.domain.db_models import User, UserRole
8
+ from src.core.security import (
9
+ get_password_hash, verify_password,
10
+ get_current_user, require_super_admin
11
+ )
12
+
13
+ router = APIRouter()
14
+
15
+
16
+ # ── Public ────────────────────────────────────────────────────────────────────
17
+
18
@router.post("/register", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
def register(user: UserCreate, db: Session = Depends(get_db)):
    """Create a new account; the role is always set to ``UserRole.user``.

    Responds with HTTP 400 when a row with the same username or the same
    email already exists. On success the freshly persisted row is returned.
    """
    duplicate_filter = (User.username == user.username) | (User.email == user.email)
    if db.query(User).filter(duplicate_filter).first():
        raise HTTPException(status_code=400, detail="Username or email already registered")

    password_hash = get_password_hash(user.password)
    account = User(
        username=user.username,
        email=user.email,
        full_name=user.full_name,
        hashed_password=password_hash,
        role=UserRole.user,
    )
    db.add(account)
    db.commit()
    # Reload so database-generated values are present on the returned object.
    db.refresh(account)
    return account
38
+
39
+
40
+ # ── Authenticated user ────────────────────────────────────────────────────────
41
+
42
@router.patch("/me", response_model=UserResponse)
def update_profile(
    body: UserUpdate,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Apply username / email / full_name changes to the caller's own account.

    A username or email is only touched when it is non-empty and differs from
    the current value; a value already used by another row yields HTTP 400.
    ``full_name`` is written whenever it is not None.
    """
    requested_username = body.username
    if requested_username and requested_username != current_user.username:
        username_owner = db.query(User).filter(User.username == requested_username).first()
        if username_owner:
            raise HTTPException(status_code=400, detail="Username already taken")
        current_user.username = requested_username

    requested_email = body.email
    if requested_email and requested_email != current_user.email:
        email_owner = db.query(User).filter(User.email == requested_email).first()
        if email_owner:
            raise HTTPException(status_code=400, detail="Email already in use")
        current_user.email = requested_email

    if body.full_name is not None:
        current_user.full_name = body.full_name

    db.commit()
    db.refresh(current_user)
    return current_user
63
+
64
+
65
@router.post("/me/change-password", status_code=status.HTTP_204_NO_CONTENT)
def change_password(
    body: PasswordChange,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Replace the caller's password after verifying the current one.

    Responds with HTTP 400 when ``body.current_password`` does not verify
    against the stored hash; otherwise stores the hash of the new password.
    """
    current_matches = verify_password(body.current_password, current_user.hashed_password)
    if not current_matches:
        raise HTTPException(status_code=400, detail="Current password is incorrect")

    replacement_hash = get_password_hash(body.new_password)
    current_user.hashed_password = replacement_hash
    db.commit()
76
+
77
+
78
+ # ── Super admin only ──────────────────────────────────────────────────────────
79
+
80
@router.get("/users", response_model=List[UserResponse])
def list_users(
    skip: int = 0,
    limit: int = 50,
    _admin: User = Depends(require_super_admin),
    db: Session = Depends(get_db)
):
    """Return one page of users (super_admin only).

    ``skip`` and ``limit`` are passed directly to the query's offset/limit.
    """
    page_query = db.query(User).offset(skip).limit(limit)
    return page_query.all()
89
+
90
+
91
@router.get("/users/{user_id}", response_model=UserResponse)
def get_user(
    user_id: int,
    _admin: User = Depends(require_super_admin),
    db: Session = Depends(get_db)
):
    """Look up a single user by primary key (super_admin only); 404 if absent."""
    match = db.query(User).filter(User.id == user_id).first()
    if match:
        return match
    raise HTTPException(status_code=404, detail="User not found")
102
+
103
+
104
@router.patch("/users/{user_id}", response_model=UserResponse)
def admin_update_user(
    user_id: int,
    body: AdminUserUpdate,
    _admin: User = Depends(require_super_admin),
    db: Session = Depends(get_db)
):
    """Update a user's role and/or active flag (super_admin only).

    Only fields that are not None in ``body`` are applied.

    Raises:
        HTTPException 404: no user with ``user_id`` exists.
        HTTPException 400: ``body.role`` is not a valid ``UserRole`` value.
    """
    user = db.query(User).filter(User.id == user_id).first()
    if not user:
        raise HTTPException(status_code=404, detail="User not found")

    # Validate the role before mutating anything, so a rejected request
    # leaves the loaded object untouched.
    new_role = None
    if body.role is not None:
        try:
            new_role = UserRole(body.role)
        except ValueError:
            # `from None`: the enum's ValueError adds nothing to the HTTP error.
            raise HTTPException(
                status_code=400,
                detail="Invalid role. Must be 'super_admin' or 'user'",
            ) from None

    # NOTE(review): unlike delete_user, nothing here stops an admin from
    # deactivating or demoting their own account — confirm this is intended.
    if body.is_active is not None:
        user.is_active = body.is_active
    if new_role is not None:
        user.role = new_role

    db.commit()
    db.refresh(user)
    return user
125
+
126
+
127
@router.delete("/users/{user_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_user(
    user_id: int,
    admin: User = Depends(require_super_admin),
    db: Session = Depends(get_db)
):
    """Remove a user row (super_admin only).

    Responds with HTTP 400 when the admin targets their own id and with
    HTTP 404 when no such user exists.
    """
    targets_self = user_id == admin.id
    if targets_self:
        raise HTTPException(status_code=400, detail="Cannot delete your own account")

    target = db.query(User).filter(User.id == user_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="User not found")

    db.delete(target)
    db.commit()
src/api/routes/analytics.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends
2
+ from src.core.use_cases.analytics_use_case import AnalyticsUseCase
3
+ from src.core.ports.vector_store_port import VectorStorePort
4
+ from src.api.dependencies import get_analytics_use_case, get_vector_store_port
5
+
6
+ router = APIRouter()
7
+
8
@router.get("/sentiment")
def get_sentiment(analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)):
    """Return the ten most-mentioned entities with their average sentiment.

    Responds with ``{"error": ...}`` when the query result is empty or
    carries an "error" key, otherwise ``{"data": [...]}``.
    """
    query = """
        SELECT
            entity,
            avg(sentiment_score) as avg_sentiment,
            count() as mention_count
        FROM sentiment_results
        GROUP BY entity
        ORDER BY mention_count DESC
        LIMIT 10
    """
    results = analytics_use_case.execute_raw_query(query)
    if not results or "error" in results:
        return {"error": "Could not fetch sentiment."}

    # Row columns follow the SELECT order: entity, avg_sentiment, mention_count.
    entries = [
        {
            "entity": row[0],
            "avg_sentiment": float(row[1]),
            "mention_count": int(row[2]),
        }
        for row in results.get("rows", [])
    ]
    return {"data": entries}
32
+
33
@router.get("/trends")
def get_trends(
    days: int = 7,
    analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)
):
    """Return trend rows from the analytics use case for the last ``days`` days.

    Each row is reshaped into ``topic`` / ``momentum`` / ``volume``. An empty
    result or one carrying an "error" key produces ``{"error": ...}``.
    """
    results = analytics_use_case.get_trends(days)
    if not results or "error" in results:
        return {"error": "Could not fetch trends."}

    trend_items = [
        {
            "topic": row[0],
            "momentum": float(row[1]),
            "volume": int(row[2]),
        }
        for row in results.get("rows", [])
    ]
    return {"data": trend_items}
50
+
51
@router.get("/articles-over-time")
def get_articles_over_time(
    days: int = 30,
    analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)
):
    """Return per-day article counts for the last ``days`` days.

    The original query ignored ``days`` and always returned the full history;
    the window is now applied in the WHERE clause.

    Args:
        days: Size of the look-back window in days. Values below 1 are
            clamped to 1 so the filter never points into the future.

    Returns:
        ``{"data": [{"date": str, "count": int}, ...]}`` ordered by date
        ascending, or ``{"error": ...}`` when the query result is empty or
        carries an "error" key.
    """
    # `days` is typed as int by FastAPI; int() keeps the interpolation
    # injection-safe even if the function is called directly.
    window_days = max(int(days), 1)
    # NOTE(review): assumes scraped_at is a DateTime-compatible column — confirm.
    query = f"""
        SELECT
            toDate(scraped_at) as date,
            count() as article_count
        FROM sentiment_results
        WHERE scraped_at >= now() - INTERVAL {window_days} DAY
        GROUP BY date
        ORDER BY date ASC
    """
    results = analytics_use_case.execute_raw_query(query)
    if not results or "error" in results:
        return {"error": "Could not fetch articles over time."}

    data = [
        {"date": str(row[0]), "count": int(row[1])}
        for row in results.get("rows", [])
    ]
    return {"data": data}
75
+
76
@router.get("/source-stats")
def get_source_stats(analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case)):
    """Return article count and average sentiment per source, largest first.

    An empty query result or one carrying an "error" key produces
    ``{"error": ...}``.
    """
    query = """
        SELECT
            source,
            count() as article_count,
            avg(sentiment_score) as avg_sentiment
        FROM sentiment_results
        GROUP BY source
        ORDER BY article_count DESC
    """
    results = analytics_use_case.execute_raw_query(query)
    if not results or "error" in results:
        return {"error": "Could not fetch source stats."}

    # Row columns follow the SELECT order: source, article_count, avg_sentiment.
    per_source = [
        {
            "source": row[0],
            "article_count": int(row[1]),
            "avg_sentiment": float(row[2]),
        }
        for row in results.get("rows", [])
    ]
    return {"data": per_source}
99
+
100
@router.get("/pipeline-stats")
def get_pipeline_stats(
    analytics_use_case: AnalyticsUseCase = Depends(get_analytics_use_case),
    vector_store: VectorStorePort = Depends(get_vector_store_port)
):
    """Report totals from the vector store and the sentiment_results table.

    Either figure falls back to 0 when its backend returns nothing usable.
    """
    qdrant_stats = vector_store.get_collection_stats()
    ch_res = analytics_use_case.execute_raw_query("SELECT count() FROM sentiment_results")

    # Trust the count only when the result is non-empty, error-free and has rows.
    usable = bool(ch_res) and "error" not in ch_res and bool(ch_res.get("rows"))
    sentiment_total = int(ch_res["rows"][0][0]) if usable else 0

    if qdrant_stats:
        vector_total = qdrant_stats.get("vectors_count", 0)
    else:
        vector_total = 0

    return {
        "total_articles_in_vector_db": vector_total,
        "total_sentiment_results": sentiment_total
    }
src/api/routes/auth.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, status
2
+ from fastapi.security import OAuth2PasswordRequestForm
3
+ from sqlalchemy.orm import Session
4
+ from datetime import timedelta
5
+
6
+ from src.infrastructure.database import get_db
7
+ from src.core.domain.db_models import User
8
+ from src.core.domain.schemas import TokenResponse, RefreshRequest, UserResponse
9
+ from src.core.security import (
10
+ verify_password, create_access_token, create_refresh_token,
11
+ rotate_refresh_token, revoke_all_refresh_tokens,
12
+ get_current_user
13
+ )
14
+ from src.core.config import settings
15
+
16
+ router = APIRouter()
17
+
18
+
19
@router.post("/login", response_model=TokenResponse)
def login(form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
    """Authenticate by email and password; issue an access + refresh token pair.

    The OAuth2 form's ``username`` field is matched against ``User.email``.
    Responds with HTTP 401 on unknown email or wrong password and HTTP 403
    when the account is inactive.
    """
    account = db.query(User).filter(User.email == form_data.username).first()
    if not (account and verify_password(form_data.password, account.hashed_password)):
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Incorrect email or password")
    if not account.is_active:
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Account is disabled")

    lifetime = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
    claims = {"sub": account.email, "role": account.role}
    access_token = create_access_token(data=claims, expires_delta=lifetime)
    refresh_token = create_refresh_token(account.id, db)
    return TokenResponse(access_token=access_token, refresh_token=refresh_token)
34
+
35
+
36
@router.post("/refresh", response_model=TokenResponse)
def refresh_tokens(body: RefreshRequest, db: Session = Depends(get_db)):
    """Rotate a refresh token and mint a matching new access token.

    ``rotate_refresh_token`` returns the replacement refresh token together
    with the user it belongs to; the access token is built from that user.
    """
    rotated_token, owner = rotate_refresh_token(body.refresh_token, db)
    lifetime = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
    claims = {"sub": owner.email, "role": owner.role}
    fresh_access = create_access_token(data=claims, expires_delta=lifetime)
    return TokenResponse(access_token=fresh_access, refresh_token=rotated_token)
45
+
46
+
47
@router.post("/logout", status_code=status.HTTP_204_NO_CONTENT)
def logout(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Log the caller out everywhere by revoking all of their refresh tokens."""
    caller_id = current_user.id
    revoke_all_refresh_tokens(caller_id, db)
51
+
52
+
53
@router.get("/me", response_model=UserResponse)
def get_me(current_user: User = Depends(get_current_user)):
    """Get the currently authenticated user's profile.

    The user is resolved entirely by the ``get_current_user`` dependency;
    the handler returns it unchanged and FastAPI serializes it through
    ``UserResponse``.
    """
    return current_user
src/api/routes/interactions.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException
2
+ from sqlalchemy.orm import Session
3
+ from src.infrastructure.database import get_db
4
+ from sqlalchemy import func
5
+ from src.core.domain.schemas import FeedbackRequest, ChatSession
6
+ from src.core.domain.db_models import ChatHistory, Feedback
7
+ from src.core.security import get_current_user
8
+ from src.core.domain.db_models import User
9
+ from typing import Optional
10
+
11
+ router = APIRouter()
12
+
13
@router.get("/history/{session_id}")
def get_chat_history(session_id: str, db: Session = Depends(get_db)):
    """Return every message of a session, oldest first.

    NOTE(review): this route declares no authentication dependency, so any
    caller who knows a session id can read it — confirm that is intended.
    """
    rows = (
        db.query(ChatHistory)
        .filter(ChatHistory.session_id == session_id)
        .order_by(ChatHistory.timestamp.asc())
        .all()
    )

    messages = [
        {
            "id": entry.id,
            "role": entry.role,
            "content": entry.content,
            "timestamp": entry.timestamp,
            # Falls back to False when the model has no `pinned` attribute.
            "pinned": getattr(entry, "pinned", False),
        }
        for entry in rows
    ]
    return {"session_id": session_id, "history": messages}
30
+
31
+
32
@router.post("/feedback")
def submit_feedback(req: FeedbackRequest, db: Session = Depends(get_db)):
    """Record a rating/comment for one message of a session (upsert).

    Responds with HTTP 404 when the message id does not belong to the given
    session. An existing feedback row for the same message and session is
    overwritten; otherwise a new row is inserted.
    """
    message = db.query(ChatHistory).filter(
        ChatHistory.id == req.message_id,
        ChatHistory.session_id == req.session_id
    ).first()
    if not message:
        raise HTTPException(status_code=404, detail="Message not found in session")

    previous = db.query(Feedback).filter(
        Feedback.message_id == req.message_id,
        Feedback.session_id == req.session_id
    ).first()

    if not previous:
        db.add(Feedback(
            session_id=req.session_id,
            message_id=req.message_id,
            rating=req.rating,
            comment=req.comment
        ))
    else:
        previous.rating = req.rating
        previous.comment = req.comment

    db.commit()
    return {"status": "success", "message": "Feedback recorded."}
61
+
62
+
63
@router.get("/feedback/{session_id}")
def get_session_feedback(session_id: str, db: Session = Depends(get_db)):
    """Map each rated message id (as a string) to its rating for one session.

    Lets the UI restore like/dislike state after a reload.
    """
    ratings_by_message = {}
    for entry in db.query(Feedback).filter(Feedback.session_id == session_id).all():
        ratings_by_message[str(entry.message_id)] = entry.rating
    return ratings_by_message
68
+
69
+
70
@router.post("/pin/{message_id}")
def pin_message(message_id: int, db: Session = Depends(get_db)):
    """Flip the ``pinned`` flag of a message and return the new state.

    Responds with HTTP 404 when the message does not exist.

    NOTE(review): no authentication or ownership check is declared on this
    route — confirm that any caller may pin any message.
    """
    target = db.query(ChatHistory).filter(ChatHistory.id == message_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="Message not found")

    # A missing attribute or a None value both count as "not pinned".
    was_pinned = bool(getattr(target, "pinned", False))
    target.pinned = not was_pinned
    db.commit()
    return {"pinned": target.pinned}
81
+
82
+
83
@router.get("/sessions")
def get_chat_sessions(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """List the caller's chat sessions, most recently active first.

    Each entry carries the session id, its message count and the timestamp
    of its latest message; only rows owned by the caller are aggregated.
    """
    message_count = func.count(ChatHistory.id).label("message_count")
    last_active = func.max(ChatHistory.timestamp).label("last_active")

    grouped = (
        db.query(ChatHistory.session_id, message_count, last_active)
        .filter(ChatHistory.user_id == current_user.id)
        .group_by(ChatHistory.session_id)
        .order_by(func.max(ChatHistory.timestamp).desc())
        .all()
    )

    summaries = []
    for row in grouped:
        summaries.append(
            ChatSession(
                session_id=row.session_id,
                message_count=row.message_count,
                last_active=row.last_active
            )
        )
    return summaries
107
+
108
+
109
@router.delete("/sessions/{session_id}")
def delete_chat_session(
    session_id: str,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Delete a session's feedback and messages; only an owner may do so.

    Ownership means the caller has at least one message in the session;
    otherwise the response is HTTP 404. Feedback rows are removed before
    the messages.
    """
    caller_message = db.query(ChatHistory).filter(
        ChatHistory.session_id == session_id,
        ChatHistory.user_id == current_user.id
    ).first()
    if not caller_message:
        raise HTTPException(status_code=404, detail="Session not found")

    # Both deletes match on session_id only, not on the caller's user id.
    feedback_rows = db.query(Feedback).filter(Feedback.session_id == session_id)
    feedback_rows.delete()
    message_rows = db.query(ChatHistory).filter(ChatHistory.session_id == session_id)
    deleted_msgs = message_rows.delete()
    db.commit()
    return {"status": "success", "message": f"Deleted session {session_id} with {deleted_msgs} messages."}
src/api/routes/news.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Query, HTTPException, Depends
2
+ from typing import Optional
3
+ from src.core.ports.vector_store_port import VectorStorePort
4
+ from src.core.ports.embedder_port import EmbedderPort
5
+ from src.core.use_cases.analytics_use_case import AnalyticsUseCase
6
+ from src.api.dependencies import get_vector_store_port, get_embedder_port, get_analytics_use_case
7
+ from src.core.domain.schemas import BrowseResponse, SearchResponse, NewsArticle
8
+
9
+ router = APIRouter()
10
+
11
def _dict_to_article(payload: dict, score: Optional[float] = None) -> NewsArticle:
    """Build a NewsArticle from a stored payload dict.

    Args:
        payload: Stored article payload; ``None`` is treated as empty.
        score: Optional relevance score to attach to the article.

    Returns:
        A NewsArticle. ``doc_id`` defaults to "unknown", ``content`` is taken
        from "text" (falling back to "content", then ""), and ``metadata`` is
        always a dict.
    """
    if payload is None:
        payload = {}

    # Normalize once: anything that is not a dict (including None) becomes {}.
    metadata = payload.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}

    # Title can be stored at top-level payload OR nested inside metadata.
    title = payload.get("title") or metadata.get("title")

    return NewsArticle(
        doc_id=payload.get("doc_id", "unknown"),
        url=payload.get("url"),
        title=title,
        content=payload.get("text", payload.get("content", "")),
        source=payload.get("source"),
        published_at=payload.get("published_at"),
        score=score,
        metadata=metadata,
    )
36
+
37
@router.get("/latest", response_model=BrowseResponse)
def get_latest_news(
    limit: int = Query(10, le=50),
    source: Optional[str] = None,
    language: Optional[str] = None,
    vector_store: VectorStorePort = Depends(get_vector_store_port)
):
    """Get latest news articles sorted by publication date.

    Fetches the first ``limit`` points from the vector store (offset 0) and
    sorts that page by ``published_at`` descending. Any failure is logged
    with its traceback and answered with an empty BrowseResponse instead of
    an HTTP 500.

    NOTE(review): only the fetched page is sorted, so the result is "latest"
    only if ``browse`` already returns recent items first — confirm.
    """
    # Local import: logging is not imported at module level in this file.
    import logging

    try:
        result = vector_store.browse(limit=limit, offset=0, source=source, language=language)

        # Browse results expose their stored fields via `.payload`.
        articles = [
            _dict_to_article(p.payload or {}, getattr(p, "score", None))
            for p in result["articles"]
        ]

        # Latest first; a missing published_at sorts as the empty string.
        articles.sort(key=lambda x: x.published_at or "", reverse=True)

        # Only an integer offset is passed on; anything else becomes None.
        next_offset = result.get("next_offset")
        if next_offset is not None and not isinstance(next_offset, int):
            next_offset = None

        return BrowseResponse(
            total_returned=len(articles),
            articles=articles,
            next_offset=next_offset
        )
    except Exception:
        # Deliberate best-effort: degrade to an empty list, but keep the
        # traceback in the logs rather than a bare print().
        logging.getLogger(__name__).exception("Error fetching news")
        return BrowseResponse(
            total_returned=0,
            articles=[],
            next_offset=None
        )
75
+
76
@router.get("/browse", response_model=BrowseResponse)
def browse_news(
    limit: int = Query(20, le=100),
    offset: int = 0,
    source: Optional[str] = None,
    language: Optional[str] = None,
    vector_store: VectorStorePort = Depends(get_vector_store_port)
):
    """Page through stored articles with optional source/language filters.

    Returns the converted articles plus the store's ``next_offset`` exactly
    as received.
    """
    page = vector_store.browse(limit=limit, offset=offset, source=source, language=language)

    # Each returned point carries its stored fields in `.payload`.
    converted = [
        _dict_to_article(point.payload or {}, getattr(point, "score", None))
        for point in page["articles"]
    ]

    return BrowseResponse(
        total_returned=len(converted),
        articles=converted,
        next_offset=page["next_offset"]
    )
96
+
97
@router.get("/search", response_model=SearchResponse)
def search_news(
    q: str = Query(..., min_length=1),
    top_k: int = Query(10, le=50),
    source: Optional[str] = None,
    language: Optional[str] = None,
    embedder: EmbedderPort = Depends(get_embedder_port),
    vector_store: VectorStorePort = Depends(get_vector_store_port)
):
    """Semantic article search: embed ``q`` and query the vector store.

    Any exception is converted into an HTTP 500 whose detail is the
    exception text.
    """
    try:
        hits = vector_store.search(
            query_vectors=embedder.encode_query(q),
            limit=top_k,
            source_filter=source,
            language_filter=language
        )
        # Each hit exposes `.metadata` and `.score`; metadata feeds the article.
        return SearchResponse(
            results=[_dict_to_article(hit.metadata, hit.score) for hit in hits]
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
122
+
123
@router.get("/sources")
def get_sources(analytics: AnalyticsUseCase = Depends(get_analytics_use_case)):
    """List source names from sentiment_results, most articles first.

    Empty or falsy source names are dropped; a missing or empty result
    yields an empty list.
    """
    query = "SELECT source, count() as article_count FROM sentiment_results GROUP BY source ORDER BY article_count DESC"
    res = analytics.execute_raw_query(query)

    rows = res.get("rows") if res else None
    if not rows:
        return {"sources": []}
    return {"sources": [row[0] for row in rows if row[0]]}
131
+
132
@router.get("/{doc_id}", response_model=NewsArticle)
def get_news_article(doc_id: str, vector_store: VectorStorePort = Depends(get_vector_store_port)):
    """Fetch one article by document id; HTTP 404 when the store has no match."""
    hit = vector_store.get_by_doc_id(doc_id)
    if hit:
        return _dict_to_article(hit.metadata, hit.score)
    raise HTTPException(status_code=404, detail="Article not found")
src/api/routes/rag.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ from typing import Optional
4
+ from fastapi import APIRouter, Depends, HTTPException
5
+ from fastapi.responses import StreamingResponse
6
+ from fastapi.security import OAuth2PasswordBearer
7
+ from src.core.domain.schemas import ChatRequest, ChatResponse, SearchResponse
8
+ from src.core.use_cases.search_use_case import SearchUseCase
9
+ from src.core.use_cases.rag_chat_use_case import RagChatUseCase
10
+ from src.core.use_cases.agent_router_use_case import AgentRouterUseCase
11
+ from src.api.dependencies import get_search_use_case, get_rag_chat_use_case, get_agent_router_use_case
12
+ from src.core.security import get_current_user
13
+ from src.core.domain.db_models import User
14
+ from jose import jwt, JWTError
15
+ from src.core.config import settings
16
+ from src.infrastructure.database import get_db
17
+ from sqlalchemy.orm import Session
18
+
19
+ router = APIRouter()
20
+
21
+ # Optional bearer — doesn't raise if token is missing
22
+ _optional_bearer = OAuth2PasswordBearer(tokenUrl=f"{settings.API_V1_STR}/auth/login", auto_error=False)
23
+
24
def get_optional_user(
    token: Optional[str] = Depends(_optional_bearer),
    db: Session = Depends(get_db)
) -> Optional[User]:
    """Resolve the bearer token to an active user, or None for guests.

    None is returned when no token is supplied, the token fails to decode,
    its "type" claim is not "access", it has no "sub" claim, or the matching
    user is missing or inactive.
    """
    if not token:
        return None

    try:
        claims = jwt.decode(token, settings.SECRET_KEY, algorithms=["HS256"])
    except JWTError:
        return None

    if claims.get("type") != "access":
        return None

    email = claims.get("sub")
    if not email:
        return None

    candidate = db.query(User).filter(User.email == email).first()
    if candidate and candidate.is_active:
        return candidate
    return None
42
+
43
@router.post("/search")
def direct_search(
    request: ChatRequest,
    search_use_case: SearchUseCase = Depends(get_search_use_case),
    current_user: User = Depends(get_current_user)
):
    """Run the search use case directly, without involving the LLM.

    Requires an authenticated user. Any exception is converted into an
    HTTP 500 whose detail is the exception text.
    """
    try:
        found = search_use_case.execute(
            query=request.query,
            limit=request.top_k,
            source_filter=request.source_filter,
            language_filter=request.language_filter,
            # `days_back` is read defensively in case the request lacks it.
            days_back=getattr(request, 'days_back', None)
        )
        serialized = []
        for item in found:
            serialized.append({
                "content": item.content,
                "metadata": item.metadata,
                "score": item.score,
                "doc_id": item.doc_id,
            })
        return {"results": serialized, "query": request.query}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
62
+
63
@router.post("/chat/test", response_model=ChatResponse)
def chat_test(
    request: ChatRequest,
    agent_router_use_case: AgentRouterUseCase = Depends(get_agent_router_use_case)
):
    """Test RAG chat endpoint without authentication for debugging.

    Forwards the request to ``AgentRouterUseCase.execute_chat`` and returns
    its result unchanged.

    NOTE(review): no auth dependency is declared on this route — confirm it
    is disabled or protected outside development.
    """
    # The former `os.getenv("RAG_ENDPOINT", ...)` call was removed: its
    # result was discarded, so it never influenced the endpoint.
    return agent_router_use_case.execute_chat(request)
75
+
76
@router.post("/chat/stream")
async def chat_with_rag_stream(
    request: ChatRequest,
    router_use_case: AgentRouterUseCase = Depends(get_agent_router_use_case),
    current_user: Optional[User] = Depends(get_optional_user)
):
    """Stream a RAG chat answer as server-sent events; guests are allowed.

    A guest without a session id is assigned a generated ``guest_<12 hex>``
    id. Exceptions raised while building the response become HTTP 500.
    """
    try:
        guest = current_user is None
        if guest and not request.session_id:
            request.session_id = f"guest_{uuid.uuid4().hex[:12]}"

        user_id = current_user.id if current_user else None

        event_stream = router_use_case.execute_stream(request, is_guest=guest, user_id=user_id)
        return StreamingResponse(event_stream, media_type="text/event-stream")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
95
+
src/core/__pycache__/config.cpython-313.pyc ADDED
Binary file (4.02 kB). View file
 
src/core/__pycache__/security.cpython-313.pyc ADDED
Binary file (6.26 kB). View file
 
src/core/config.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+
4
class Settings(BaseSettings):
    """Central application configuration.

    Values are declared as pydantic-settings fields; ``model_config`` also
    points at a ``.env`` file (UTF-8) and ignores unknown keys.

    NOTE(review): most defaults are computed with ``os.getenv`` when the
    class body is executed, so the process environment is read at import
    time in addition to whatever BaseSettings loads itself.
    """
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding='utf-8', extra='ignore')

    PROJECT_NAME: str = "RAG API Service"
    API_V1_STR: str = "/api/v1"

    # Qdrant vector store connection
    QDRANT_HOST: str = os.getenv("QDRANT_HOST", "localhost")
    QDRANT_PORT: int = int(os.getenv("QDRANT_PORT", "6333"))
    QDRANT_URL: str = os.getenv("QDRANT_URL", "")  # Cloud URL (overrides host/port)
    QDRANT_API_KEY: str = os.getenv("QDRANT_API_KEY", "")  # Cloud API Key
    QDRANT_COLLECTION: str = os.getenv("QDRANT_COLLECTION", "news_articles")

    # ClickHouse connection
    CLICKHOUSE_HOST: str = os.getenv("CLICKHOUSE_HOST", "localhost")
    CLICKHOUSE_PORT: int = int(os.getenv("CLICKHOUSE_PORT", "8123"))
    CLICKHOUSE_USER: str = os.getenv("CLICKHOUSE_USER", "default")
    CLICKHOUSE_PASSWORD: str = os.getenv("CLICKHOUSE_PASSWORD", "")
    CLICKHOUSE_DB: str = os.getenv("CLICKHOUSE_DB", "default")
    # Only the exact string "true" (case-insensitive) enables this flag.
    CLICKHOUSE_SECURE: bool = os.getenv("CLICKHOUSE_SECURE", "false").lower() == "true"


    # Embedding Model Config
    EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "BAAI/bge-m3")
    VECTOR_SIZE: int = int(os.getenv("VECTOR_SIZE", "1024"))
    RERANKER_MODEL: str = os.getenv("RERANKER_MODEL", "BAAI/bge-reranker-v2-m3")

    # PostgreSQL / Neon Config
    DATABASE_URL: str = os.getenv("DATABASE_URL", "")  # Full Neon URL (overrides individual fields)
    POSTGRES_USER: str = os.getenv("POSTGRES_USER", "postgres")
    POSTGRES_PASSWORD: str = os.getenv("POSTGRES_PASSWORD", "postgres")
    POSTGRES_SERVER: str = os.getenv("POSTGRES_SERVER", "localhost")
    POSTGRES_PORT: str = os.getenv("POSTGRES_PORT", "5432")
    POSTGRES_DB: str = os.getenv("POSTGRES_DB", "rag_interactions")

    @property
    def SQLALCHEMY_DATABASE_URI(self) -> str:
        """Return DATABASE_URL when set, else a URL built from the POSTGRES_* fields."""
        if self.DATABASE_URL:
            return self.DATABASE_URL
        # NOTE(review): user and password are interpolated without URL-encoding,
        # so credentials containing characters such as '@' or '/' would yield a
        # malformed URL.
        return f"postgresql://{self.POSTGRES_USER}:{self.POSTGRES_PASSWORD}@{self.POSTGRES_SERVER}:{self.POSTGRES_PORT}/{self.POSTGRES_DB}"

    # LLM Settings
    # Supported providers: "groq", "gemini", "together", "openai", "ollama"
    LLM_PROVIDER: str = os.getenv("LLM_PROVIDER", "groq")

    # OpenAI API key
    OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
    # Groq — free, 200+ tok/s, llama-3.3-70b-versatile | https://console.groq.com
    GROQ_API_KEY: str = os.getenv("GROQ_API_KEY", "")
    GROQ_MODEL: str = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")

    # Google Gemini — free tier (15 RPM / 1M TPM) | https://aistudio.google.com/apikey
    GEMINI_API_KEY: str = os.getenv("GEMINI_API_KEY", "")
    GEMINI_MODEL: str = os.getenv("GEMINI_MODEL", "gemini-1.5-flash")

    # Together AI — free $25 credit | https://api.together.ai
    TOGETHER_API_KEY: str = os.getenv("TOGETHER_API_KEY", "")
    TOGETHER_MODEL: str = os.getenv("TOGETHER_MODEL", "meta-llama/Llama-3.3-70B-Instruct-Turbo")

    # HuggingFace Inference API — free with HF token | https://huggingface.co/settings/tokens
    HF_TOKEN: str = os.getenv("HF_TOKEN", "")
    HF_MODEL: str = os.getenv("HF_MODEL", "meta-llama/Llama-3.1-8B-Instruct")

    # Ollama — local inference
    OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
    OLLAMA_MODEL: str = os.getenv("OLLAMA_MODEL", "llama3.2")

    # Redis Settings
    REDIS_URL: str = os.getenv("REDIS_URL", "")  # Full URL (Upstash) - overrides host/port
    REDIS_HOST: str = os.getenv("REDIS_HOST", "localhost")
    # NOTE(review): default is 6380, not Redis's usual 6379 — confirm it is deliberate.
    REDIS_PORT: int = int(os.getenv("REDIS_PORT", "6380"))
    REDIS_DB: int = int(os.getenv("REDIS_DB", "0"))
    REDIS_PASSWORD: str = os.getenv("REDIS_PASSWORD", "")

    # Hybrid Search Settings
    ENABLE_HYBRID_SEARCH: bool = os.getenv("ENABLE_HYBRID_SEARCH", "true").lower() == "true"
    LIVE_SEARCH_TIMEOUT: float = float(os.getenv("LIVE_SEARCH_TIMEOUT", "2.0"))
    LIVE_SEARCH_MAX_RESULTS: int = int(os.getenv("LIVE_SEARCH_MAX_RESULTS", "5"))
    LIVE_SEARCH_WEIGHT: float = float(os.getenv("LIVE_SEARCH_WEIGHT", "0.5"))
    DB_SEARCH_WEIGHT: float = float(os.getenv("DB_SEARCH_WEIGHT", "0.5"))

    # Cache Settings (TTL in seconds)
    CACHE_RESPONSE_TTL: int = int(os.getenv("CACHE_RESPONSE_TTL", "300"))  # 5 minutes
    CACHE_LIVE_TTL: int = int(os.getenv("CACHE_LIVE_TTL", "600"))  # 10 minutes
    CACHE_TRANSLATION_TTL: int = int(os.getenv("CACHE_TRANSLATION_TTL", "3600"))  # 1 hour
    CACHE_INTENT_TTL: int = int(os.getenv("CACHE_INTENT_TTL", "3600"))  # 1 hour

    # Security Settings
    # NOTE(review): the fallback SECRET_KEY is a publicly visible placeholder;
    # any deployment that does not override it signs tokens with a known key.
    SECRET_KEY: str = os.getenv("SECRET_KEY", "a_very_secret_key_change_me_in_production")
    ACCESS_TOKEN_EXPIRE_MINUTES: int = int(os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES", "60"))
91
+
92
+ settings = Settings()
src/core/domain/__pycache__/db_models.cpython-313.pyc ADDED
Binary file (3.61 kB). View file
 
src/core/domain/__pycache__/schemas.cpython-313.pyc ADDED
Binary file (5.02 kB). View file
 
src/core/domain/db_models.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, CheckConstraint, Boolean, Enum
from sqlalchemy.orm import declarative_base, relationship
from datetime import datetime, timezone
import enum

Base = declarative_base()


def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow()``, which is deprecated since
    Python 3.12. The tzinfo is stripped on purpose: the ``DateTime`` columns
    below are declared without ``timezone=True``, so they keep storing naive
    UTC values exactly as before.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


class UserRole(str, enum.Enum):
    """Roles a user account can hold; members are also plain strings."""

    super_admin = "super_admin"
    user = "user"


class User(Base):
    """Account record stored in the ``users`` table."""

    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)
    username = Column(String, unique=True, index=True, nullable=False)
    email = Column(String, unique=True, index=True, nullable=False)
    hashed_password = Column(String, nullable=False)
    role = Column(Enum(UserRole), default=UserRole.user, nullable=False)
    is_active = Column(Boolean, default=True, nullable=False)
    full_name = Column(String, nullable=True)
    created_at = Column(DateTime, default=_utcnow)
    # Refreshed automatically on every UPDATE issued through the ORM.
    updated_at = Column(DateTime, default=_utcnow, onupdate=_utcnow)

    # Deleting a user (or detaching a token from it) also deletes the token rows.
    refresh_tokens = relationship("RefreshToken", back_populates="user", cascade="all, delete-orphan")


class RefreshToken(Base):
    """Refresh token issued to a user, stored in ``refresh_tokens``."""

    __tablename__ = "refresh_tokens"

    id = Column(Integer, primary_key=True, index=True)
    token = Column(String, unique=True, index=True, nullable=False)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
    expires_at = Column(DateTime, nullable=False)
    revoked = Column(Boolean, default=False, nullable=False)
    created_at = Column(DateTime, default=_utcnow)

    user = relationship("User", back_populates="refresh_tokens")


class ChatHistory(Base):
    """One chat message, stored in ``chat_history`` and grouped by ``session_id``."""

    __tablename__ = "chat_history"

    id = Column(Integer, primary_key=True, index=True)
    session_id = Column(String, index=True, nullable=False)
    # Nullable: a message does not have to be tied to a user account.
    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
    # NOTE(review): free-form string, presumably the chat speaker role
    # (not a UserRole value) - confirm against the writer of this table.
    role = Column(String, nullable=False)
    content = Column(Text, nullable=False)
    # NOTE(review): stored as text; the serialization format is not visible here.
    retrieved_doc_ids = Column(Text, nullable=True)
    pinned = Column(Boolean, default=False, nullable=False)
    timestamp = Column(DateTime, default=_utcnow)


class Feedback(Base):
    """User rating of a chat message, stored in ``feedback``."""

    __tablename__ = "feedback"

    id = Column(Integer, primary_key=True, index=True)
    session_id = Column(String, index=True, nullable=False)
    # Plain integer, not a foreign key to chat_history.id.
    message_id = Column(Integer, nullable=False)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
    # Only 1 (positive) or -1 (negative) pass the check; the column itself is
    # nullable, so a missing rating is still accepted by the database.
    rating = Column(Integer, CheckConstraint('rating IN (1, -1)'))
    comment = Column(Text, nullable=True)
    timestamp = Column(DateTime, default=_utcnow)
src/core/domain/schemas.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pydantic import BaseModel, ConfigDict, Field
from typing import List, Optional, Any
from datetime import datetime


class ChatRequest(BaseModel):
    """Payload of a chat query."""

    query: str
    session_id: Optional[str] = None
    top_k: int = 7  # increased from 5 — gives multilingual diversity room
    source_filter: Optional[str] = None
    language_filter: Optional[str] = None


class SourceDocument(BaseModel):
    """A retrieved document returned alongside an answer."""

    content: str
    metadata: dict
    score: float


class ChatResponse(BaseModel):
    """Answer plus the source documents that back it."""

    answer: str
    sources: List[SourceDocument]
    session_id: str = "anonymous"


class FeedbackRequest(BaseModel):
    """Payload for rating a chat message."""

    session_id: str
    message_id: int
    # NOTE(review): not validated here; the feedback table only accepts 1 or -1.
    rating: int
    comment: Optional[str] = None


class SentimentData(BaseModel):
    """Aggregated sentiment for one entity."""

    entity: str
    avg_sentiment: float
    mention_count: int


class TrendData(BaseModel):
    """Volume and momentum of one topic."""

    topic: str
    volume: int
    momentum: float


class NewsArticle(BaseModel):
    """A news article as returned by the browse and search responses."""

    doc_id: str
    url: Optional[str] = None
    title: Optional[str] = None
    content: str
    source: Optional[str] = None
    published_at: Optional[str] = None
    score: Optional[float] = None
    # default_factory gives every instance its own dict explicitly instead of
    # relying on pydantic copying a shared mutable literal.
    metadata: dict = Field(default_factory=dict)


class BrowseResponse(BaseModel):
    """One page of articles; ``next_offset`` is optional and defaults to None."""

    total_returned: int
    articles: List[NewsArticle]
    next_offset: Optional[int] = None


class SearchResponse(BaseModel):
    """Search results as a list of articles."""

    results: List[NewsArticle]


class SourceStat(BaseModel):
    """Per-source article count and average sentiment."""

    source: str
    article_count: int
    avg_sentiment: float


class PipelineStats(BaseModel):
    """Totals reported for the ingestion pipeline."""

    total_articles_in_vector_db: int
    total_sentiment_results: int


class ChatSession(BaseModel):
    """Summary of one chat session."""

    session_id: str
    message_count: int
    last_active: Optional[datetime] = None


class UserCreate(BaseModel):
    """Payload for creating a user account."""

    username: str
    email: str
    password: str
    full_name: Optional[str] = None


class UserResponse(BaseModel):
    """Public view of a user account; can be built from attribute-bearing objects."""

    # Replaces the class-based `Config`, which pydantic v2 deprecates.
    model_config = ConfigDict(from_attributes=True)

    id: int
    username: str
    email: str
    full_name: Optional[str] = None
    role: str
    is_active: bool
    created_at: Optional[datetime] = None


class UserUpdate(BaseModel):
    """Partial profile update; fields left as None are not provided."""

    username: Optional[str] = None
    full_name: Optional[str] = None
    email: Optional[str] = None


class PasswordChange(BaseModel):
    """Payload for changing a password."""

    current_password: str
    new_password: str


class TokenResponse(BaseModel):
    """Access/refresh token pair returned to the client."""

    access_token: str
    refresh_token: str
    token_type: str = "bearer"


class RefreshRequest(BaseModel):
    """Payload carrying a refresh token."""

    refresh_token: str


class AdminUserUpdate(BaseModel):
    """Partial admin update of a user's active flag and role."""

    is_active: Optional[bool] = None
    role: Optional[str] = None
src/core/orchestrator/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Query Orchestrator Module
src/core/orchestrator/query_orchestrator.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query Orchestrator
3
+
4
+ Orchestrates hybrid search between live sources (DuckDuckGo) and database (Qdrant).
5
+ Integrates seamlessly with the existing multilingual RAG pipeline.
6
+
7
+ Key Features:
8
+ - Intelligent search strategy selection (live, DB, or hybrid)
9
+ - Uses production-grade intent classification (v2)
10
+ - Parallel execution of live and database searches
11
+ - Integration with existing 6-language multilingual pipeline
12
+ - Graceful fallbacks when live search fails
13
+ - Cache-aware execution
14
+ """
15
+
16
+ import logging
17
+ import asyncio
18
+ from typing import Dict, Any, List, Optional, Tuple
19
+ from datetime import datetime
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class SearchStrategy:
    """
    Search strategy configuration.

    Determines which sources to use and how to weight them.

    Attributes:
        use_live: Whether the live search source is queried.
        use_db: Whether the database is queried.
        live_weight: Weight given to live results.
        db_weight: Weight given to database results.
        reason: Human-readable explanation of why this strategy was chosen.
        intent_result: Full intent-classifier result, kept for debugging.
    """

    def __init__(
        self,
        use_live: bool = True,
        use_db: bool = True,
        live_weight: float = 0.5,
        db_weight: float = 0.5,
        reason: str = "",
        intent_result: Optional[Any] = None  # IntentResult from v2 classifier
    ):
        self.use_live = use_live
        self.use_db = use_db
        self.live_weight = live_weight
        self.db_weight = db_weight
        self.reason = reason
        self.intent_result = intent_result  # Store full intent result for debugging

    def __repr__(self) -> str:
        # `!r` quotes and escapes the reason properly; for ordinary strings the
        # output is the same as the previous hand-written quotes.
        return (
            f"SearchStrategy(live={self.use_live}, db={self.use_db}, "
            f"weights={self.live_weight:.1f}/{self.db_weight:.1f}, "
            f"reason={self.reason!r})"
        )
53
+
54
+
55
class QueryOrchestrator:
    """
    Orchestrates hybrid search between live sources and database.

    Integrates with existing multilingual pipeline:
    - Reuses dense vectors (computed once from English)
    - Reuses sparse vectors (batched for 6 languages)
    - Adds live search in parallel with DB search
    - Merges results for unified ranking
    """

    # Temporal keywords that indicate need for live search
    TEMPORAL_KEYWORDS = [
        "today", "now", "latest", "breaking", "just", "current",
        "this morning", "this afternoon", "this evening", "tonight",
        "yesterday", "recent", "recently", "new", "fresh"
    ]

    # Historical keywords that indicate DB-only search
    HISTORICAL_KEYWORDS = [
        "history", "historical", "background", "context", "past",
        "analysis", "overview", "summary", "explain", "what is",
        "who is", "tell me about"
    ]

    # Language lanes queried by the database search, in lane order.
    SEARCH_LANGUAGES = ("en", "ar", "am", "so", "sw", "fr")

    def __init__(
        self,
        live_search_adapter,
        enable_hybrid: bool = True,
        default_live_weight: float = 0.5,
        default_db_weight: float = 0.5
    ):
        """
        Initialize query orchestrator.

        Args:
            live_search_adapter: Live search adapter; must provide
                ``is_available()`` and an awaitable ``search(query)``.
            enable_hybrid: Global flag to enable/disable hybrid search
            default_live_weight: Default weight for live results
            default_db_weight: Default weight for database results
        """
        self.live_search = live_search_adapter
        self.enable_hybrid = enable_hybrid
        self.default_live_weight = default_live_weight
        self.default_db_weight = default_db_weight

    # ── Private helpers ───────────────────────────────────────────────────

    @staticmethod
    def _contains_keyword(query: str, keywords) -> bool:
        """Return True if any keyword occurs in query as a whole word or phrase.

        Matching is case-insensitive and ignores punctuation. Plain substring
        matching is deliberately avoided: "new" must not fire on "news", nor
        "now" on "know", "just" on "justice" or "past" on "pasta". The
        trade-off is that inflected forms ("explained") do not match either.
        """
        cleaned = "".join(ch if ch.isalnum() else " " for ch in query.lower())
        # Pad with spaces so the first and last words also have both boundaries.
        padded = " " + " ".join(cleaned.split()) + " "
        return any(f" {kw} " in padded for kw in keywords)

    @staticmethod
    def _no_search(reason: str, intent_result: Optional[Any]) -> "SearchStrategy":
        """Build a strategy that queries neither source."""
        return SearchStrategy(
            use_live=False,
            use_db=False,
            live_weight=0.0,
            db_weight=0.0,
            reason=reason,
            intent_result=intent_result
        )

    @staticmethod
    def _db_only(reason: str, intent_result: Optional[Any]) -> "SearchStrategy":
        """Build a strategy that queries the database only."""
        return SearchStrategy(
            use_live=False,
            use_db=True,
            live_weight=0.0,
            db_weight=1.0,
            reason=reason,
            intent_result=intent_result
        )

    @staticmethod
    def _hybrid(
        live_weight: float,
        db_weight: float,
        reason: str,
        intent_result: Optional[Any]
    ) -> "SearchStrategy":
        """Build a strategy that queries both sources with the given weights."""
        return SearchStrategy(
            use_live=True,
            use_db=True,
            live_weight=live_weight,
            db_weight=db_weight,
            reason=reason,
            intent_result=intent_result
        )

    @staticmethod
    def _dedupe_by_doc_id(docs) -> List[Any]:
        """Keep the highest-scoring document per ``metadata["doc_id"]``.

        Documents without a (truthy) doc_id are all kept. Result order follows
        the first appearance of each key.
        """
        best = {}
        for doc in docs:
            doc_id = doc.metadata.get("doc_id")
            # Namespaced keys: an object id must never collide with a real doc_id.
            key = ("doc_id", doc_id) if doc_id else ("object", id(doc))
            current = best.get(key)
            if current is None or doc.score > current.score:
                best[key] = doc
        return list(best.values())

    # ── Strategy selection ────────────────────────────────────────────────

    def decide_search_strategy(self, query: str, intent: str = "NEWS", intent_result: Optional[Any] = None) -> "SearchStrategy":
        """
        Decide which search sources to use based on query characteristics.

        Now supports production-grade intent classification with multi-class intents:
        - NEWS_TEMPORAL → prioritize live search
        - NEWS_HISTORICAL → use DB only
        - NEWS_GENERAL → balanced hybrid
        - OTHER → skip search

        Any other v2 intent value falls through to the v1 keyword logic.

        Args:
            query: User query
            intent: Simple intent ("NEWS" or "OTHER") for backward compatibility
            intent_result: Full IntentResult from v2 classifier (if available)

        Returns:
            SearchStrategy object with source selection and weights
        """
        # If hybrid search is disabled globally, use DB only
        if not self.enable_hybrid:
            return self._db_only("Hybrid search disabled", intent_result)

        # If live search is unavailable, use DB only
        if not self.live_search.is_available():
            return self._db_only("Live search unavailable", intent_result)

        # ── Use v2 Intent Result if available (production-grade) ──────────────

        if intent_result and hasattr(intent_result, 'intent'):
            detailed_intent = intent_result.intent
            confidence = intent_result.confidence

            logger.info(
                "Using v2 intent: %s (confidence=%.2f, method=%s)",
                detailed_intent, confidence, intent_result.method
            )

            # OTHER → skip search
            if detailed_intent == "OTHER":
                return self._no_search(
                    f"Small talk (confidence={confidence:.2f})", intent_result
                )

            # NEWS_TEMPORAL → prioritize live search
            if detailed_intent == "NEWS_TEMPORAL":
                # High confidence → strong live bias
                if confidence >= 0.80:
                    return self._hybrid(
                        0.8, 0.2,
                        f"Temporal query (high confidence={confidence:.2f})",
                        intent_result
                    )
                # Medium confidence → moderate live bias
                return self._hybrid(
                    0.7, 0.3,
                    f"Temporal query (medium confidence={confidence:.2f})",
                    intent_result
                )

            # NEWS_HISTORICAL → use DB only
            if detailed_intent == "NEWS_HISTORICAL":
                return self._db_only(
                    f"Historical query (confidence={confidence:.2f})", intent_result
                )

            # NEWS_GENERAL → balanced hybrid
            if detailed_intent == "NEWS_GENERAL":
                return self._hybrid(
                    self.default_live_weight, self.default_db_weight,
                    f"General news (confidence={confidence:.2f})",
                    intent_result
                )

        # ── Fallback to v1 logic (backward compatibility) ─────────────────────

        # If intent is OTHER (small talk), no search needed
        if intent == "OTHER":
            return self._no_search(
                "Small talk - no search needed (v1 fallback)", intent_result
            )

        # Whole-word matching, see _contains_keyword for the rationale.
        has_temporal = self._contains_keyword(query, self.TEMPORAL_KEYWORDS)
        has_historical = self._contains_keyword(query, self.HISTORICAL_KEYWORDS)

        if has_temporal and not has_historical:
            # Temporal query → prioritize live search
            return self._hybrid(
                0.7, 0.3,
                "Temporal query - prioritize live (v1 fallback)",
                intent_result
            )

        if has_historical and not has_temporal:
            # Historical query → use database only
            return self._db_only(
                "Historical query - database only (v1 fallback)", intent_result
            )

        # Neither or both kinds of keyword → balanced hybrid search
        return self._hybrid(
            self.default_live_weight, self.default_db_weight,
            "Balanced hybrid search (v1 fallback)",
            intent_result
        )

    # ── Execution ─────────────────────────────────────────────────────────

    async def execute_hybrid_search(
        self,
        query: str,
        english_query: str,
        strategy: "SearchStrategy",
        # Database search components (from existing pipeline)
        embedder,
        vector_store,
        translated_queries: Dict[str, str],
        top_k: int = 7
    ) -> Tuple[List[Any], List[Dict[str, Any]]]:
        """
        Execute hybrid search with concurrent live and database searches.

        A failure of one source is logged and yields an empty list for that
        source; it never aborts the other one.

        Args:
            query: Original user query (not used by the searches themselves;
                both sources are queried with ``english_query``)
            english_query: English translation
            strategy: Search strategy from decide_search_strategy()
            embedder: Embedder adapter
            vector_store: Vector store adapter
            translated_queries: Dict of {lang: translated_query}
            top_k: Results per language for DB search

        Returns:
            Tuple of (db_results, live_results)
        """
        # (source key, label used in log messages, coroutine)
        pending = []

        # 1. Database search (if enabled)
        if strategy.use_db:
            pending.append((
                "db",
                "Database",
                self._execute_db_search(
                    english_query=english_query,
                    translated_queries=translated_queries,
                    embedder=embedder,
                    vector_store=vector_store,
                    top_k=top_k
                ),
            ))

        # 2. Live search (if enabled)
        if strategy.use_live:
            pending.append(("live", "Live", self._execute_live_search(english_query)))

        if not pending:
            return [], []

        # Run the scheduled searches concurrently.
        outcomes = await asyncio.gather(
            *(coro for _, _, coro in pending), return_exceptions=True
        )

        collected = {"db": [], "live": []}
        for (source, label, _), outcome in zip(pending, outcomes):
            # BaseException, not Exception: gather can also hand back
            # CancelledError, which must not be mistaken for a result list.
            if isinstance(outcome, BaseException):
                logger.error("%s search failed: %s", label, outcome)
            else:
                collected[source] = outcome

        db_results = collected["db"]
        live_results = collected["live"]

        logger.info(
            "Hybrid search completed: %d DB + %d live results",
            len(db_results), len(live_results)
        )

        return db_results, live_results

    async def _execute_db_search(
        self,
        english_query: str,
        translated_queries: Dict[str, str],
        embedder,
        vector_store,
        top_k: int
    ) -> List[Any]:
        """
        Execute multilingual database search (existing pipeline).

        Steps:
        - Dense vector computed once from the English query
        - Sparse vectors encoded in one batch, one per language lane
        - One vector-store query per lane, run concurrently
        - Deduplication by doc_id (highest score wins)

        A lane that fails is logged and skipped; the remaining lanes still
        contribute results.

        Args:
            english_query: English query
            translated_queries: Dict of {lang: translated_query}; languages
                missing from the dict fall back to the English query
            embedder: Embedder providing encode_query / encode_sparse_batch
            vector_store: Vector store providing search_with_vectors
            top_k: Results per language

        Returns:
            List of deduplicated search result objects

        Raises:
            Exception: Any error from encoding or deduplication is logged and
                re-raised.
        """
        try:
            # NOTE(review): both encode calls are invoked synchronously inside
            # this coroutine; if they are CPU-heavy they hold the event loop
            # until they return - confirm whether they should be offloaded.

            # 1. Compute dense vector once (language-agnostic)
            dense_embedding = embedder.encode_query(english_query)
            dense_vec = dense_embedding.get("dense")

            # 2. Batch sparse encoding for all language lanes
            languages = list(self.SEARCH_LANGUAGES)
            sparse_queries = [translated_queries.get(lang, english_query) for lang in languages]
            sparse_embeddings = embedder.encode_sparse_batch(sparse_queries)

            # 3. One search per language lane
            search_tasks = []
            for i, lang in enumerate(languages):
                sparse_vec = sparse_embeddings[i].get("sparse")
                search_tasks.append(
                    vector_store.search_with_vectors(
                        dense_vec=dense_vec,
                        sparse_vec=sparse_vec,
                        limit=top_k,
                        language_filter=lang
                    )
                )

            # Execute all lanes concurrently
            lane_results = await asyncio.gather(*search_tasks, return_exceptions=True)

            # 4. Flatten, reporting (not hiding) lanes that failed
            all_docs = []
            for lang, lane in zip(languages, lane_results):
                if isinstance(lane, BaseException):
                    logger.warning("DB search lane '%s' failed: %s", lang, lane)
                else:
                    all_docs.extend(lane)

            # Deduplicate: keep highest-scoring version of each doc
            unique_docs = self._dedupe_by_doc_id(all_docs)
            logger.info("DB search: %d total → %d unique", len(all_docs), len(unique_docs))

            return unique_docs

        except Exception as e:
            logger.error(f"Database search error: {e}")
            raise

    async def _execute_live_search(self, query: str) -> List[Dict[str, Any]]:
        """
        Execute live search through the configured adapter.

        Args:
            query: Search query (English)

        Returns:
            Results as returned by the live search adapter

        Raises:
            Exception: Any adapter error is logged and re-raised.
        """
        try:
            results = await self.live_search.search(query)
            logger.info("Live search: %d results", len(results))
            return results
        except Exception as e:
            logger.error(f"Live search error: {e}")
            raise