plexdx commited on
Commit
64d289f
Β·
verified Β·
1 Parent(s): be7e3bf

Upload 26 files

Browse files
.env ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # Omnichannel Fact & Hallucination Intelligence System
3
+ # Environment Configuration Template
4
+ # Copy to .env and fill in your values
5
+ # =============================================================================
6
+
7
+ # ---------------------------------------------------------------------------
8
+ # LLM API Keys (set these in HuggingFace Spaces β†’ Settings β†’ Secrets)
9
+ # ---------------------------------------------------------------------------
10
+
11
+ # Groq API key β€” used for gatekeeper (llama3-8b), misinformation agent (mixtral-8x7b),
12
+ # AND hallucination agent (llama3-70b). All free via Groq's free tier (30 req/min).
13
+ # Get one at: https://console.groq.com
14
+ GROQ_API_KEY=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
15
+
16
+ # X (Twitter) API v2 Bearer Token β€” used for tweet velocity + Community Notes
17
+ # Optional β€” system falls back to deterministic mock data without it.
18
+ # Get one at: https://developer.twitter.com
19
+ X_BEARER_TOKEN=AAAAAAAAAAAAAAAAAAAAAGLQ8wEAAAAAH6WkY9y9Iw9n8YB9PqMeVA2MIHI%3D9OXISm6Q9fyRNm0DMEAupynHrYZjb1S7AVIU84swKP2IBxpChQ
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Infrastructure (auto-configured in Docker Compose β€” only change for custom setups)
23
+ # ---------------------------------------------------------------------------
24
+
25
+ QDRANT_HOST=localhost
26
+ QDRANT_PORT=6333
27
+
28
+ MEMGRAPH_HOST=localhost
29
+ MEMGRAPH_PORT=7687
30
+ MEMGRAPH_PASSWORD=memgraph123
31
+
32
+ REDPANDA_BROKERS=localhost:9092
33
+
34
+ REDIS_URL=redis://localhost:6379
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # App Configuration
38
+ # ---------------------------------------------------------------------------
39
+
40
+ PORT=7860
41
+ LOG_LEVEL=INFO
42
+
43
+ # DEMO_MODE=true: Use mock data for all external APIs (LLMs, X API)
44
+ # Useful for exploring the UI/architecture without any API credentials.
45
+ # The system still runs the full pipeline β€” just with deterministic mock outputs.
46
+ DEMO_MODE=false
.gitignore ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ .venv/
6
+ .env
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+
11
+ # uv
12
+ uv.lock
13
+
14
+ # Extension
15
+ extension/node_modules/
16
+ extension/.output/
17
+ extension/.wxt/
18
+
19
+ # Infra
20
+ *.pem
21
+ *.key
22
+ .cloudflared/
23
+
24
+ # Data
25
+ *.jsonl
26
+ *.vtt
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
31
+ *.swp
32
+
33
+ # Docker
34
+ .docker/
35
+
36
+ # Logs
37
+ *.log
38
+
39
+ # OS
40
+ .DS_Store
41
+ Thumbs.db
README.md CHANGED
@@ -1,14 +1,239 @@
1
  ---
2
- title: Rwttrter
3
- emoji: πŸ¦€
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 6.12.0
8
- app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: trytryry
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Omnichannel Fact & Hallucination Intelligence System
3
+ emoji: πŸ”
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
 
 
7
  pinned: false
8
+ license: mit
9
+ app_port: 7860
10
  ---
11
 
12
+ # Omnichannel Fact & Hallucination Intelligence System
13
+
14
+ **Near-zero-latency real-time fact-checking and AI hallucination detection β€” deployed universally via a browser extension across X/Twitter, YouTube, Instagram, news sites, and AI chat interfaces.**
15
+
16
+ ---
17
+
18
+ ## Architecture
19
+
20
+ ```
21
+ Browser Extension (WXT + React 19 + Framer Motion)
22
+ β”‚ WebSocket (wss://)
23
+ β–Ό
24
+ FastAPI Backend ──► Redis Stack (cache, 6h/15min TTL)
25
+ β”‚
26
+ β”œβ”€β”€β–Ί Gatekeeper: Groq llama3-8b-8192 (<120ms p95)
27
+ β”‚ └── noise β†’ drop | fact β†’ continue
28
+ β”‚
29
+ β”œβ”€β”€β–Ί RAG Pipeline (concurrent)
30
+ β”‚ β”œβ”€β”€ FastEmbed BGE-M3 embeddings (CPU, multilingual)
31
+ β”‚ β”œβ”€β”€ Qdrant ANN search (HNSW ef=128, top-8, 72h window)
32
+ β”‚ └── Memgraph trust graph traversal (in-memory Cypher)
33
+ β”‚
34
+ β”œβ”€β”€β–Ί Grok Sensor (concurrent)
35
+ β”‚ └── X API v2 velocity + Community Notes
36
+ β”‚
37
+ └──► Prefect Flow (multi-agent evaluation)
38
+ β”œβ”€β”€ misinformation_task: Groq mixtral-8x7b-32768
39
+ └── hallucination_task: Claude Haiku (AI platforms only)
40
+ β”‚
41
+ β–Ό
42
+ AnalysisResult β†’ WebSocket β†’ Extension β†’ DOM highlight + hover card
43
+ ```
44
+
45
+ ---
46
+
47
+ ## Stack
48
+
49
+ | Layer | Technology | Why |
50
+ |-------|-----------|-----|
51
+ | Extension framework | WXT v0.19 + React 19 | HMR, multi-browser, TypeScript-first, Vite |
52
+ | Extension state | Zustand + chrome.storage.sync | Persistent, reactive, cross-context |
53
+ | LLM gatekeeper | Groq llama3-8b-8192 | 800+ tok/s, <100ms, no GPU needed |
54
+ | LLM evaluation | LiteLLM β†’ Groq mixtral-8x7b / llama3-70b | All free via Groq β€” swap providers without code changes |
55
+ | Embeddings | BGE-M3 via FastEmbed | 100+ languages, 1024-dim, CPU-native, free |
56
+ | Vector DB | Qdrant (self-hosted) | Sub-ms HNSW search, no vendor lock-in |
57
+ | Graph DB | Memgraph (in-memory) | 10–100x faster than Neo4j for trust scoring |
58
+ | Message queue | Redpanda | Kafka-compatible, no JVM, 10x lower latency |
59
+ | Orchestration | Prefect | Native async, DAG flows, built-in retry |
60
+ | Cache | Redis Stack (RedisJSON) | Structured claim cache, TTL per verdict color |
61
+ | Package manager | uv | 10–100x faster than pip, lockfiles |
62
+ | Hashing | xxhash (client + server) | Sub-microsecond content deduplication |
63
+ | Edge tunnel | Cloudflare Tunnel | Zero-config TLS, no exposed ports |
64
+ | Observability | structlog + rich | Structured JSON logs, colorized dev output |
65
+
66
+ ---
67
+
68
+ ## Quick Start (HuggingFace Spaces)
69
+
70
+ This Space runs the **backend + demo UI** via Docker. The browser extension is a separate build.
71
+
72
+ ### Required Secrets (set in Space settings β†’ Secrets)
73
+
74
+ | Secret | Required | Description |
75
+ |--------|----------|-------------|
76
+ | `GROQ_API_KEY` | Recommended | Groq API key β€” powers all 3 LLM agents (gatekeeper, misinformation, hallucination). Free tier: 30 req/min |
77
+ | `X_BEARER_TOKEN` | Optional | X API v2 bearer token for tweet velocity + Community Notes |
78
+
79
+ **Without any API keys**: The system runs in `DEMO_MODE=true` with deterministic mock results β€” great for exploring the UI and architecture without credentials.
80
+
81
+ Get a free key:
82
+ - Groq: https://console.groq.com (free tier: 30 req/min β€” covers all 3 LLM agents)
83
+
84
+ ### Run Locally
85
+
86
+ ```bash
87
+ git clone <repo>
88
+ cd omnichannel-fact-intelligence
89
+
90
+ # Copy env template
91
+ cp .env.example .env
92
+ # Edit .env with your API keys
93
+
94
+ # Start all services (Qdrant, Memgraph, Redpanda, Redis, FastAPI)
95
+ docker compose up
96
+
97
+ # Visit http://localhost:7860 for the demo UI
98
+ ```
99
+
100
+ ### Run Backend Only (no Docker for infra)
101
+
102
+ ```bash
103
+ cd backend
104
+
105
+ # Install uv (if not installed)
106
+ curl -LsSf https://astral.sh/uv/install.sh | sh
107
+
108
+ # Install dependencies
109
+ uv sync
110
+
111
+ # Set env vars
112
+ export GROQ_API_KEY=your_key
113
+ export DEMO_MODE=true # Skip infrastructure deps for quick testing
114
+
115
+ # Start FastAPI
116
+ uv run uvicorn main:app --host 0.0.0.0 --port 7860 --reload
117
+ ```
118
+
119
+ ---
120
+
121
+ ## Browser Extension Setup
122
+
123
+ ### Prerequisites
124
+ ```bash
125
+ cd extension
126
+ npm install # or: bun install
127
+ ```
128
+
129
+ ### Development (Chrome)
130
+ ```bash
131
+ # Set your backend URL (or use cloudflared tunnel)
132
+ WS_URL=ws://localhost:7860/ws npx wxt dev --browser chrome
133
+ ```
134
+
135
+ ### Production Build
136
+ ```bash
137
+ # Build for all browsers
138
+ WS_URL=wss://fact-engine.your-domain.com/ws npx wxt build
139
+
140
+ # Chrome: .output/chrome-mv3/
141
+ # Firefox: .output/firefox-mv3/
142
+ ```
143
+
144
+ ### Load in Chrome
145
+ 1. Navigate to `chrome://extensions`
146
+ 2. Enable **Developer mode** (top right)
147
+ 3. Click **Load unpacked** β†’ select `.output/chrome-mv3/`
148
+ 4. Visit X/Twitter, YouTube, or any news site β€” facts will begin highlighting
149
+
150
+ ---
151
+
152
+ ## Highlight Color Semantics
153
+
154
+ | Color | Hex | Meaning |
155
+ |-------|-----|---------|
156
+ | 🟒 Green | `#22c55e` | Fact-checked β€” corroborated by β‰₯2 sources, trust score β‰₯ 0.65 |
157
+ | 🟑 Yellow | `#eab308` | Unverified β€” breaking news, weak corroboration, high velocity |
158
+ | πŸ”΄ Red | `#ef4444` | Debunked β€” refuted by β‰₯2 independent sources or Community Note active |
159
+ | 🟣 Purple | `#a855f7` | AI hallucination β€” fabricated citation, impossibility, contradiction |
160
+
161
+ ---
162
+
163
+ ## Trust Score Algorithm
164
+
165
+ ```
166
+ score = 0.5 (baseline)
167
+ + 0.30 if Author.verified AND account_type IN ['government', 'official_news']
168
+ + 0.05 per corroborating Source node (capped at +0.25, i.e. 5 sources)
169
+ - 0.40 if any Source has an active Community Note
170
+ = clamp(score, 0.0, 1.0)
171
+ ```
172
+
173
+ ---
174
+
175
+ ## Data Pipeline
176
+
177
+ Three async Redpanda producers simulate the omnichannel firehose:
178
+
179
+ | Producer | Topic | Rate | Source |
180
+ |----------|-------|------|--------|
181
+ | twitter_producer | `raw.twitter` | 50 eps | Mock X posts |
182
+ | instagram_producer | `raw.instagram` | 20 eps | Mock story text (OCR-extracted) |
183
+ | youtube_producer | `raw.youtube` | 10 eps | Mock VTT transcript chunks |
184
+
185
+ A single async consumer aggregates all three, deduplicates by `content_hash`, and upserts into Qdrant + Memgraph.
186
+
187
+ ---
188
+
189
+ ## Extension Modes
190
+
191
+ | Mode | Shows |
192
+ |------|-------|
193
+ | Minimal | Red + Purple only |
194
+ | Normal (default) | Red + Purple + Yellow |
195
+ | Advanced | All colors including Green |
196
+
197
+ ---
198
+
199
+ ## File Structure
200
+
201
+ ```
202
+ omnichannel-fact-intelligence/
203
+ β”œβ”€β”€ docker-compose.yml # All services in one command
204
+ β”œβ”€β”€ .env.example # Environment template
205
+ β”‚
206
+ β”œβ”€β”€ backend/
207
+ β”‚ β”œβ”€β”€ Dockerfile # uv + Python 3.12
208
+ β”‚ β”œβ”€β”€ pyproject.toml # All deps pinned (uv-compatible)
209
+ β”‚ β”œβ”€β”€ main.py # FastAPI app, WebSocket, Redis cache
210
+ β”‚ β”œβ”€β”€ gatekeeper.py # Groq fact/noise classifier (<120ms p95)
211
+ β”‚ β”œβ”€β”€ rag_pipeline.py # BGE-M3 + Qdrant + Memgraph trust graph
212
+ β”‚ β”œβ”€β”€ grok_sensor.py # X API v2 + Community Notes
213
+ β”‚ β”œβ”€β”€ agents.py # Prefect flow + LiteLLM multi-agent eval
214
+ β”‚ β”œβ”€β”€ core/
215
+ β”‚ β”‚ β”œβ”€β”€ config.py # Pydantic-settings centralized config
216
+ β”‚ β”‚ └── models.py # All Pydantic v2 models
217
+ β”‚ β”œβ”€β”€ producers/
218
+ β”‚ β”‚ └── producers.py # Twitter + Instagram + YouTube + consumer
219
+ β”‚ └── static/
220
+ β”‚ └── index.html # Demo UI (served at /)
221
+ β”‚
222
+ β”œβ”€β”€ extension/
223
+ β”‚ β”œβ”€β”€ wxt.config.ts # WXT framework config
224
+ β”‚ β”œβ”€β”€ stores/
225
+ β”‚ β”‚ └── extensionStore.ts # Zustand + chrome.storage.sync
226
+ β”‚ └── entrypoints/
227
+ β”‚ β”œβ”€β”€ background.ts # Persistent WS connection + message routing
228
+ β”‚ β”œβ”€β”€ content.tsx # MutationObserver + highlight + hover card
229
+ β”‚ └── popup.tsx # Master toggle + mode selector + badge
230
+ β”‚
231
+ └── infra/
232
+ └── tunnel_setup.sh # Cloudflare Tunnel setup script
233
+ ```
234
+
235
+ ---
236
+
237
+ ## License
238
+
239
+ MIT β€” see LICENSE for details.
backend/Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ # Install uv β€” 10-100x faster than pip, proper lockfiles
4
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
5
+
6
+ WORKDIR /app
7
+
8
+ # Install system deps for FastEmbed / BGE-M3 CPU inference
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ build-essential curl git \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Copy dependency files first (layer cache optimization)
14
+ COPY pyproject.toml uv.lock* ./
15
+
16
+ # Install all Python dependencies into the project virtual env
17
+ RUN uv sync --frozen --no-dev
18
+
19
+ # Copy application source
20
+ COPY . .
21
+
22
+ # Pre-download BGE-M3 model so cold starts are instant
23
+ RUN uv run python -c "from fastembed import TextEmbedding; TextEmbedding('BAAI/bge-m3')" || true
24
+
25
+ EXPOSE 7860
26
+
27
+ CMD ["uv", "run", "python", "app.py"]
backend/agents.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agents.py β€” Prefect-orchestrated multi-agent evaluation layer.
3
+
4
+ Two concurrent agents evaluate each claim:
5
+
6
+ 1. misinformation_task β†’ Groq mixtral-8x7b-32768
7
+ Given: claim + top-3 RAG evidence chunks + trust score
8
+ Output: color (red|yellow|green), confidence, explanation, sources
9
+
10
+ 2. hallucination_task β†’ Claude Haiku (runs ONLY on AI chat platforms)
11
+ Given: claim text
12
+ Output: color (purple|green), confidence, explanation
13
+ Checks for: fabricated citations, statistical impossibilities,
14
+ internal contradictions, LLM-specific failure patterns
15
+
16
+ Both tasks run concurrently via asyncio.gather. Prefect merges results,
17
+ picks higher-severity color, returns the final AnalysisResult.
18
+
19
+ Why Prefect over Celery:
20
+ - Dynamic DAG-based orchestration (no pre-declared task graph)
21
+ - Native async support β€” no gevent hacks needed
22
+ - Built-in retry with exponential backoff per task
23
+ - Far better observability: every flow run gets a full execution trace
24
+ - Deployable without a separate worker process (embedded server mode)
25
+ """
26
+
27
+ import asyncio
28
+ import time
29
+ from typing import Literal
30
+
31
+ import structlog
32
+ from litellm import acompletion
33
+ from prefect import flow, task
34
+ from prefect.tasks import task_input_hash
35
+
36
+ from core.config import HighlightColor, Platform, Settings, get_settings
37
+ from core.models import AnalysisResult, EvidenceChunk, GrokSensorResult, RAGResult, SourceRef, TrustScore
38
+
39
+ log = structlog.get_logger(__name__)
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Color severity ordering (higher index = more severe)
43
+ # ---------------------------------------------------------------------------
44
+ SEVERITY: dict[HighlightColor, int] = {
45
+ HighlightColor.GREEN: 0,
46
+ HighlightColor.YELLOW: 1,
47
+ HighlightColor.RED: 2,
48
+ HighlightColor.PURPLE: 3,
49
+ }
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # LiteLLM prompts
53
+ # ---------------------------------------------------------------------------
54
+
55
+ MISINFO_SYSTEM = """You are a professional fact-checker with access to recent evidence.
56
+ Analyze the claim against the evidence chunks and trust score. Output ONLY valid JSON.
57
+
58
+ Output schema (no markdown, no preamble):
59
+ {
60
+ "color": "red" | "yellow" | "green",
61
+ "confidence": <integer 0-100>,
62
+ "explanation": "<2-3 sentence explanation for the hover card>",
63
+ "verdict_label": "<8 words max, e.g. 'Debunked by Reuters and AP'>",
64
+ "sources": ["<url1>", "<url2>", "<url3>"]
65
+ }
66
+
67
+ Color logic:
68
+ - "green": Claim is factually accurate, corroborated by β‰₯2 independent sources, trust score β‰₯ 0.65
69
+ - "yellow": Claim is unverified, breaking news, or evidence is weak/contradictory
70
+ - "red": Claim is demonstrably false, debunked by β‰₯2 sources, OR trust score < 0.25, OR community note active"""
71
+
72
+ MISINFO_USER_TMPL = """Claim: {claim}
73
+
74
+ Trust score: {trust_score:.2f} (0=untrustworthy, 1=highly trusted)
75
+ Author verified: {verified}
76
+ Active Community Note: {has_note}{note_text_part}
77
+ Corroborating sources in database: {source_count}
78
+
79
+ Evidence chunks (cosine similarity descending):
80
+ {evidence_text}
81
+
82
+ Analyze and output JSON."""
83
+
84
+ HALLUCINATION_SYSTEM = """You are an LLM output auditor specializing in detecting AI hallucinations.
85
+ Analyze the following text that was generated by an AI system. Output ONLY valid JSON.
86
+
87
+ Output schema:
88
+ {
89
+ "color": "purple" | "green",
90
+ "confidence": <integer 0-100>,
91
+ "explanation": "<specific explanation of what's wrong, or confirmation it's accurate>"
92
+ }
93
+
94
+ Check for:
95
+ 1. Fabricated citations: URLs, paper titles, author names that don't exist
96
+ 2. Statistical impossibilities: numbers that exceed known bounds (e.g., "500% of people")
97
+ 3. Internal contradictions: statements that contradict each other within the text
98
+ 4. Temporal paradoxes: referencing future events as past, or anachronistic details
99
+ 5. Entity confusion: mixing attributes of different real-world entities
100
+
101
+ Color "purple" only if you find a clear, specific hallucination pattern.
102
+ Color "green" if the text appears factually coherent (you cannot verify external facts)."""
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Prefect tasks β€” each is independently retried with exponential backoff
107
+ # ---------------------------------------------------------------------------
108
+
109
+ @task(
110
+ name="misinformation-agent",
111
+ retries=2,
112
+ retry_delay_seconds=[1, 3],
113
+ cache_key_fn=task_input_hash,
114
+ cache_expiration=None,
115
+ log_prints=False,
116
+ )
117
+ async def misinformation_task(
118
+ claim: str,
119
+ evidence: list[EvidenceChunk],
120
+ trust: TrustScore,
121
+ grok: GrokSensorResult,
122
+ settings: Settings,
123
+ ) -> dict:
124
+ """
125
+ Groq mixtral-8x7b-32768 evaluates the claim against RAG evidence.
126
+ 32k context window accommodates all 8 evidence chunks comfortably.
127
+ """
128
+ # Build evidence text block (top-3 by cosine score for the prompt)
129
+ top_evidence = sorted(evidence, key=lambda e: e.score, reverse=True)[:3]
130
+ evidence_text = "\n\n".join(
131
+ f"[{i+1}] Source: {e.domain} (similarity: {e.score:.3f})\n{e.text[:400]}"
132
+ for i, e in enumerate(top_evidence)
133
+ ) or "No evidence chunks retrieved (claim may be too recent or niche)."
134
+
135
+ note_part = f"\nCommunity Note: {trust.community_note_text}" if trust.community_note_text else ""
136
+
137
+ user_prompt = MISINFO_USER_TMPL.format(
138
+ claim=claim[:500],
139
+ trust_score=trust.score,
140
+ verified=trust.author_verified,
141
+ has_note=trust.has_community_note,
142
+ note_text_part=note_part,
143
+ source_count=trust.corroborating_sources,
144
+ evidence_text=evidence_text,
145
+ )
146
+
147
+ # LiteLLM routes to Groq β€” swap to "openai/gpt-4o" or "groq/llama3-70b-8192"
148
+ # by changing a single string, zero code changes elsewhere
149
+ response = await acompletion(
150
+ model=settings.misinformation_model,
151
+ messages=[
152
+ {"role": "system", "content": MISINFO_SYSTEM},
153
+ {"role": "user", "content": user_prompt},
154
+ ],
155
+ response_format={"type": "json_object"},
156
+ temperature=0.1,
157
+ max_tokens=400,
158
+ api_key=settings.groq_api_key or None,
159
+ )
160
+
161
+ import json
162
+ raw = response.choices[0].message.content or "{}"
163
+ return json.loads(raw)
164
+
165
+
166
+ @task(
167
+ name="hallucination-agent",
168
+ retries=2,
169
+ retry_delay_seconds=[1, 3],
170
+ log_prints=False,
171
+ )
172
+ async def hallucination_task(claim: str, settings: Settings) -> dict:
173
+ """
174
+ Groq llama3-70b-8192 audits AI-generated text for hallucination patterns.
175
+ Previously Claude Haiku β€” now fully free via Groq, same prompt, same output schema.
176
+ Only invoked when the source platform is an AI chat interface.
177
+ """
178
+ response = await acompletion(
179
+ model=settings.hallucination_model, # groq/llama3-70b-8192
180
+ messages=[
181
+ {"role": "system", "content": HALLUCINATION_SYSTEM},
182
+ {"role": "user", "content": f"Audit this AI-generated text:\n\n{claim[:1000]}"},
183
+ ],
184
+ response_format={"type": "json_object"},
185
+ temperature=0.0,
186
+ max_tokens=300,
187
+ api_key=settings.groq_api_key or None,
188
+ )
189
+
190
+ import json
191
+ raw = response.choices[0].message.content or "{}"
192
+ return json.loads(raw)
193
+
194
+
195
+ def _demo_misinfo_result(trust_score: float, has_note: bool) -> dict:
196
+ """Deterministic demo result when LLM keys are absent."""
197
+ if has_note or trust_score < 0.25:
198
+ return {
199
+ "color": "red", "confidence": 82,
200
+ "explanation": "Demo mode: trust score below threshold and/or active community note detected.",
201
+ "verdict_label": "Low trust signal detected",
202
+ "sources": [],
203
+ }
204
+ elif trust_score < 0.55:
205
+ return {
206
+ "color": "yellow", "confidence": 61,
207
+ "explanation": "Demo mode: insufficient corroboration to confirm or deny this claim.",
208
+ "verdict_label": "Unverified β€” insufficient evidence",
209
+ "sources": [],
210
+ }
211
+ return {
212
+ "color": "green", "confidence": 78,
213
+ "explanation": "Demo mode: claim appears well-corroborated based on trust graph signals.",
214
+ "verdict_label": "Appears credible",
215
+ "sources": [],
216
+ }
217
+
218
+
219
+ def _demo_hallucination_result() -> dict:
220
+ return {
221
+ "color": "purple", "confidence": 71,
222
+ "explanation": "Demo mode: AI-generated content detected. Unable to verify external citations without live API.",
223
+ }
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # Main Prefect flow
228
+ # ---------------------------------------------------------------------------
229
+
230
+ @flow(name="fact-intelligence-pipeline", log_prints=False)
231
+ async def evaluate_claim(
232
+ claim: str,
233
+ claim_hash: str,
234
+ element_id: str,
235
+ platform: Platform,
236
+ rag_result: RAGResult,
237
+ grok_result: GrokSensorResult,
238
+ settings: Settings | None = None,
239
+ ) -> AnalysisResult:
240
+ """
241
+ Orchestrates the full multi-agent evaluation as a Prefect flow.
242
+
243
+ Concurrent execution:
244
+ - misinformation_task always runs
245
+ - hallucination_task runs only for AI chat platforms
246
+
247
+ Results are merged by taking the higher-severity color.
248
+ The final AnalysisResult is returned directly (no Celery queue needed).
249
+ """
250
+ cfg = settings or get_settings()
251
+ t0 = time.perf_counter()
252
+
253
+ is_ai_platform = platform in (Platform.CHATGPT, Platform.CLAUDE, Platform.GEMINI)
254
+
255
+ # Determine whether to use demo mode
256
+ use_demo = cfg.demo_mode or not cfg.has_groq
257
+
258
+ if use_demo:
259
+ misinfo_raw = _demo_misinfo_result(rag_result.trust.score, grok_result.community_note)
260
+ halluc_raw = _demo_hallucination_result() if is_ai_platform else None
261
+ else:
262
+ # Concurrently run both agents when applicable
263
+ # Both agents now use Groq (free) β€” no Anthropic key needed
264
+ if is_ai_platform and cfg.has_groq:
265
+ misinfo_raw, halluc_raw = await asyncio.gather(
266
+ misinformation_task(claim, rag_result.evidence, rag_result.trust, grok_result, cfg),
267
+ hallucination_task(claim, cfg),
268
+ )
269
+ else:
270
+ misinfo_raw = await misinformation_task(
271
+ claim, rag_result.evidence, rag_result.trust, grok_result, cfg
272
+ )
273
+ halluc_raw = None
274
+
275
+ # --- Merge results: pick higher-severity color ---
276
+ misinfo_color = HighlightColor(misinfo_raw.get("color", "yellow"))
277
+ final_color = misinfo_color
278
+ final_confidence = misinfo_raw.get("confidence", 50)
279
+ final_explanation = misinfo_raw.get("explanation", "")
280
+ final_verdict = misinfo_raw.get("verdict_label", "Under review")
281
+
282
+ if halluc_raw:
283
+ halluc_color = HighlightColor(halluc_raw.get("color", "green"))
284
+ if SEVERITY[halluc_color] > SEVERITY[final_color]:
285
+ final_color = halluc_color
286
+ final_confidence = halluc_raw.get("confidence", final_confidence)
287
+ final_explanation = halluc_raw.get("explanation", final_explanation)
288
+ final_verdict = "AI hallucination detected"
289
+
290
+ # Build SourceRef list from evidence + misinfo agent sources
291
+ raw_sources: list[str] = misinfo_raw.get("sources", [])
292
+ evidence_sources = [e.source_url for e in rag_result.evidence[:3] if e.source_url]
293
+ combined = list(dict.fromkeys(raw_sources + evidence_sources))[:3] # deduplicated, max 3
294
+
295
+ source_refs = [
296
+ SourceRef(
297
+ url=url,
298
+ domain=_extract_domain(url),
299
+ favicon_url=f"https://www.google.com/s2/favicons?domain={_extract_domain(url)}&sz=16",
300
+ snippet="",
301
+ )
302
+ for url in combined
303
+ ]
304
+
305
+ latency_ms = round((time.perf_counter() - t0) * 1000, 2)
306
+
307
+ log.info(
308
+ "agents.flow.complete",
309
+ color=final_color,
310
+ confidence=final_confidence,
311
+ platform=platform,
312
+ latency_ms=latency_ms,
313
+ demo=use_demo,
314
+ )
315
+
316
+ return AnalysisResult(
317
+ element_id=element_id,
318
+ content_hash=claim_hash,
319
+ platform=platform,
320
+ color=final_color,
321
+ confidence=final_confidence,
322
+ verdict_label=final_verdict,
323
+ explanation=final_explanation,
324
+ sources=source_refs,
325
+ gatekeeper_label="fact",
326
+ trust_score=rag_result.trust.score,
327
+ velocity=grok_result.velocity,
328
+ has_community_note=grok_result.community_note,
329
+ latency_ms=latency_ms,
330
+ )
331
+
332
+
333
+ def _extract_domain(url: str) -> str:
334
+ try:
335
+ from urllib.parse import urlparse
336
+ return urlparse(url).netloc.lstrip("www.")
337
+ except Exception:
338
+ return url
backend/app.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py β€” Single entry point for HuggingFace Spaces.
3
+
4
+ Run with:
5
+ uv run python app.py ← HuggingFace Spaces / production
6
+ uv run uvicorn app:app --reload ← local dev
7
+
8
+ Lifecycle on startup:
9
+ 1. Configures structured logging
10
+ 2. Waits for Redis / Qdrant / Memgraph to be healthy (skipped in DEMO_MODE)
11
+ 3. Initialises Qdrant collection + Memgraph schema
12
+ 4. Seeds demo evidence chunks into Qdrant
13
+ 5. Warms up BGE-M3 embedder in the background
14
+ 6. Serves FastAPI on port 7860 (HuggingFace default)
15
+
16
+ WebSocket message lifecycle (per text segment):
17
+ 1. Extension sends TextBatch β†’ Redis cache check (xxhash key)
18
+ 2. Cache miss β†’ Gatekeeper (Groq llama3-8b, <120 ms p95)
19
+ 3. Noise β†’ dropped. Fact β†’ continue
20
+ 4. Concurrent: RAG pipeline (BGE-M3 + Qdrant + Memgraph) + Grok sensor
21
+ 5. Prefect flow: misinformation agent + hallucination agent (both Groq, free)
22
+ 6. AnalysisResult cached in Redis (TTL: 6 h green/red, 15 min yellow, no-cache purple)
23
+ 7. Result streamed back over WebSocket β†’ extension applies DOM highlight + hover card
24
+ """
25
+
26
+ import asyncio
27
+ import os
28
+ import sys
29
+ import time
30
+ from contextlib import asynccontextmanager
31
+ from typing import Any
32
+
33
+ import orjson
34
+ import redis.asyncio as aioredis
35
+ import structlog
36
+ import xxhash
37
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
38
+ from fastapi.middleware.cors import CORSMiddleware
39
+ from fastapi.responses import HTMLResponse
40
+ from pydantic import ValidationError
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Bootstrap logging FIRST so every subsequent import logs correctly
44
+ # ---------------------------------------------------------------------------
45
+ from core.logging import configure_logging
46
+ from core.config import HighlightColor, Platform, get_settings
47
+
48
+ settings = get_settings()
49
+ configure_logging(
50
+ log_level=settings.log_level,
51
+ json_output=os.environ.get("JSON_LOGS", "false").lower() == "true",
52
+ )
53
+ log = structlog.get_logger("app")
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Remaining imports (after logging is configured)
57
+ # ---------------------------------------------------------------------------
58
+ from agents import evaluate_claim
59
+ from core.models import AnalysisResult, GatekeeperResult, TextBatch, WSInbound, WSOutbound
60
+ from gatekeeper import classify_claim
61
+ from grok_sensor import query_grok_sensor
62
+ from rag_pipeline import run_rag_pipeline
63
+
64
+ # ============================================================================
65
+ # SECTION 1 β€” Infrastructure health checks (used during startup)
66
+ # ============================================================================
67
+
68
+ async def _wait_for_redis(url: str, timeout: int = 30) -> bool:
69
+ deadline = time.time() + timeout
70
+ while time.time() < deadline:
71
+ try:
72
+ r = await aioredis.from_url(url, decode_responses=True)
73
+ await r.ping()
74
+ await r.aclose()
75
+ return True
76
+ except Exception:
77
+ await asyncio.sleep(1)
78
+ return False
79
+
80
+
81
+ async def _wait_for_qdrant(host: str, port: int, timeout: int = 30) -> bool:
82
+ import httpx
83
+ deadline = time.time() + timeout
84
+ while time.time() < deadline:
85
+ try:
86
+ async with httpx.AsyncClient(timeout=2.0) as client:
87
+ resp = await client.get(f"http://{host}:{port}/readyz")
88
+ if resp.status_code == 200:
89
+ return True
90
+ except Exception:
91
+ await asyncio.sleep(1)
92
+ return False
93
+
94
+
95
+ async def _wait_for_memgraph(host: str, port: int, timeout: int = 30) -> bool:
96
+ from neo4j import AsyncGraphDatabase
97
+ deadline = time.time() + timeout
98
+ while time.time() < deadline:
99
+ try:
100
+ driver = AsyncGraphDatabase.driver(
101
+ f"bolt://{host}:{port}",
102
+ auth=("", settings.memgraph_password),
103
+ encrypted=False,
104
+ )
105
+ async with driver.session() as session:
106
+ await session.run("RETURN 1;")
107
+ await driver.close()
108
+ return True
109
+ except Exception:
110
+ await asyncio.sleep(2)
111
+ return False
112
+
113
+
114
+ # ============================================================================
115
+ # SECTION 2 β€” Demo data seeding (populates Qdrant for the HF Spaces demo UI)
116
+ # ============================================================================
117
+
118
+ _DEMO_EVIDENCE = [
119
+ {
120
+ "text": "mRNA vaccines demonstrated sustained immune responses lasting 18-24 months across multiple peer-reviewed studies.",
121
+ "url": "https://www.nejm.org/doi/10.1056/NEJMoa2034577",
122
+ "domain": "nejm.org",
123
+ },
124
+ {
125
+ "text": "The Federal Reserve raised interest rates by 75 basis points in June 2022, the largest single hike since 1994.",
126
+ "url": "https://reuters.com/markets/us/fed-hikes-rates-2022-06-15",
127
+ "domain": "reuters.com",
128
+ },
129
+ {
130
+ "text": "Amazon deforestation data showed over 11,000 sq km lost in a single year at record levels.",
131
+ "url": "https://apnews.com/article/amazon-deforestation-record",
132
+ "domain": "apnews.com",
133
+ },
134
+ {
135
+ "text": "The United Nations projects global population will peak around 10.4 billion in the 2080s based on current demographic trends.",
136
+ "url": "https://www.un.org/development/desa/pd/",
137
+ "domain": "un.org",
138
+ },
139
+ {
140
+ "text": "Renewable energy accounted for 30% of global electricity generation in 2023 according to the International Energy Agency.",
141
+ "url": "https://www.iea.org/reports/renewables-2023",
142
+ "domain": "iea.org",
143
+ },
144
+ {
145
+ "text": "Social media use exceeding 3 hours daily correlates with higher anxiety rates in adolescents per multiple longitudinal studies.",
146
+ "url": "https://jamanetwork.com/journals/jamapediatrics/fullarticle/2767581",
147
+ "domain": "jamanetwork.com",
148
+ },
149
+ ]
150
+
151
+
152
+ async def _seed_demo_data() -> None:
153
+ """Upsert demo evidence chunks into Qdrant so the demo UI returns real RAG results."""
154
+ import uuid
155
+ from qdrant_client.models import PointStruct
156
+ from rag_pipeline import embed_texts, get_qdrant
157
+
158
+ log.info("demo.seed.start", count=len(_DEMO_EVIDENCE))
159
+ client = await get_qdrant(settings)
160
+ texts = [e["text"] for e in _DEMO_EVIDENCE]
161
+ vectors = await embed_texts(texts)
162
+
163
+ points = [
164
+ PointStruct(
165
+ id=str(uuid.uuid4()),
166
+ vector=vec,
167
+ payload={
168
+ "text": ev["text"],
169
+ "source_url": ev["url"],
170
+ "domain": ev["domain"],
171
+ "platform": "news",
172
+ "content_hash": f"demo_{i:04d}",
173
+ "ingested_at_ts": time.time(),
174
+ "author_handle": "demo_seed",
175
+ "bias_rating": "center",
176
+ },
177
+ )
178
+ for i, (ev, vec) in enumerate(zip(_DEMO_EVIDENCE, vectors))
179
+ ]
180
+ await client.upsert(collection_name=settings.qdrant_collection, points=points)
181
+ log.info("demo.seed.complete", count=len(points))
182
+
183
+
184
+ # ============================================================================
185
+ # SECTION 3 β€” Redis singleton
186
+ # ============================================================================
187
+
188
+ _redis: aioredis.Redis | None = None
189
+
190
+
191
+ async def get_redis() -> aioredis.Redis:
192
+ global _redis
193
+ if _redis is None:
194
+ _redis = await aioredis.from_url(settings.redis_url, decode_responses=True)
195
+ return _redis
196
+
197
+
198
+ # ============================================================================
199
+ # SECTION 4 β€” WebSocket connection manager
200
+ # ============================================================================
201
+
202
+ class ConnectionManager:
203
+ def __init__(self) -> None:
204
+ self.active: dict[str, WebSocket] = {}
205
+
206
+ async def connect(self, session_id: str, ws: WebSocket) -> None:
207
+ await ws.accept()
208
+ self.active[session_id] = ws
209
+ log.info("ws.connected", session_id=session_id, total=len(self.active))
210
+
211
+ def disconnect(self, session_id: str) -> None:
212
+ self.active.pop(session_id, None)
213
+ log.info("ws.disconnected", session_id=session_id, total=len(self.active))
214
+
215
+ async def send(self, session_id: str, payload: Any) -> None:
216
+ ws = self.active.get(session_id)
217
+ if ws:
218
+ msg = WSOutbound(type="result", payload=payload)
219
+ await ws.send_bytes(orjson.dumps(msg.model_dump(mode="json")))
220
+
221
+
222
+ manager = ConnectionManager()
223
+
224
+
225
+ # ============================================================================
226
+ # SECTION 5 β€” FastAPI lifespan (startup + shutdown)
227
+ # ============================================================================
228
+
229
+ @asynccontextmanager
230
+ async def lifespan(app: FastAPI):
231
+ log.info("startup.begin", demo_mode=settings.demo_mode, port=settings.port)
232
+
233
+ if not settings.demo_mode:
234
+ # Wait for all infrastructure services
235
+ log.info("startup.waiting_for_services")
236
+
237
+ if not await _wait_for_redis(settings.redis_url):
238
+ log.error("startup.redis.timeout"); sys.exit(1)
239
+ log.info("startup.redis.ok")
240
+
241
+ if not await _wait_for_qdrant(settings.qdrant_host, settings.qdrant_port):
242
+ log.error("startup.qdrant.timeout"); sys.exit(1)
243
+ log.info("startup.qdrant.ok")
244
+
245
+ if not await _wait_for_memgraph(settings.memgraph_host, settings.memgraph_port):
246
+ log.warning("startup.memgraph.timeout β€” trust scores will use neutral 0.5 fallback")
247
+ else:
248
+ log.info("startup.memgraph.ok")
249
+
250
+ # Initialise DB schemas (idempotent)
251
+ from core.db_init import init_all
252
+ await init_all(settings)
253
+
254
+ # Seed demo evidence into Qdrant
255
+ try:
256
+ await _seed_demo_data()
257
+ except Exception as exc:
258
+ log.warning("startup.seed.failed", error=str(exc))
259
+ else:
260
+ # Demo mode: just make sure Redis is reachable (may be local or absent)
261
+ try:
262
+ r = await get_redis()
263
+ await r.ping()
264
+ log.info("startup.redis.ok")
265
+ except Exception:
266
+ log.warning("startup.redis.unavailable β€” cache disabled in demo mode")
267
+
268
+ # Pre-warm BGE-M3 embedder in the background (avoids cold-start spike on first request)
269
+ async def _warm():
270
+ try:
271
+ from rag_pipeline import embed_texts
272
+ await embed_texts(["warm up"])
273
+ log.info("startup.embedder.warm")
274
+ except Exception as exc:
275
+ log.warning("startup.embedder.warn", error=str(exc))
276
+
277
+ asyncio.create_task(_warm())
278
+ log.info("startup.complete")
279
+
280
+ yield # ← app is live and serving
281
+
282
+ # Graceful shutdown
283
+ if _redis:
284
+ await _redis.aclose()
285
+ log.info("shutdown.complete")
286
+
287
+
288
+ # ============================================================================
289
+ # SECTION 6 β€” FastAPI application
290
+ # ============================================================================
291
+
292
+ app = FastAPI(
293
+ title="Omnichannel Fact & Hallucination Intelligence API",
294
+ version="1.0.0",
295
+ description="Near-zero-latency fact-checking and hallucination detection via WebSocket",
296
+ lifespan=lifespan,
297
+ )
298
+
299
+ app.add_middleware(
300
+ CORSMiddleware,
301
+ allow_origins=["*"],
302
+ allow_methods=["*"],
303
+ allow_headers=["*"],
304
+ )
305
+
306
+
307
+ # ============================================================================
308
+ # SECTION 7 β€” Core analysis pipeline
309
+ # ============================================================================
310
+
311
+ async def process_segment(
312
+ text: str,
313
+ content_hash: str,
314
+ element_id: str,
315
+ platform: Platform,
316
+ ) -> AnalysisResult | None:
317
+ """
318
+ Full pipeline for a single text segment. Returns None if noise.
319
+
320
+ Cache key: verdict:{content_hash}
321
+ TTL: 6 h β†’ green / red
322
+ 15 m β†’ yellow
323
+ none β†’ purple (hallucination results are context-specific)
324
+ """
325
+ # 1 β€” Redis cache check (sub-millisecond)
326
+ try:
327
+ r = await get_redis()
328
+ cached_json = await r.get(f"verdict:{content_hash}")
329
+ if cached_json:
330
+ result = AnalysisResult.model_validate_json(cached_json)
331
+ result.cached = True
332
+ result.element_id = element_id
333
+ log.debug("cache.hit", hash=content_hash[:8])
334
+ return result
335
+ except Exception:
336
+ pass # Redis unavailable in demo mode β€” continue without cache
337
+
338
+ # 2 β€” Gatekeeper: fact vs noise (<120 ms p95)
339
+ try:
340
+ gate: GatekeeperResult = await classify_claim(text, settings)
341
+ except Exception as exc:
342
+ log.error("gatekeeper.error", error=str(exc))
343
+ return None
344
+
345
+ if gate.label == "noise":
346
+ log.debug("gatekeeper.noise_dropped", hash=content_hash[:8])
347
+ return None
348
+
349
+ # 3 β€” Concurrent: RAG pipeline + Grok sensor
350
+ rag_result, grok_result = await asyncio.gather(
351
+ run_rag_pipeline(text, content_hash, settings),
352
+ query_grok_sensor(text, content_hash, settings),
353
+ )
354
+
355
+ # 4 β€” Multi-agent Prefect flow
356
+ result: AnalysisResult = await evaluate_claim(
357
+ claim=text,
358
+ claim_hash=content_hash,
359
+ element_id=element_id,
360
+ platform=platform,
361
+ rag_result=rag_result,
362
+ grok_result=grok_result,
363
+ settings=settings,
364
+ )
365
+
366
+ # 5 β€” Cache with color-appropriate TTL
367
+ try:
368
+ r = await get_redis()
369
+ if result.color != HighlightColor.PURPLE:
370
+ ttl = (
371
+ settings.cache_ttl_green_red
372
+ if result.color in (HighlightColor.GREEN, HighlightColor.RED)
373
+ else settings.cache_ttl_yellow
374
+ )
375
+ await r.setex(f"verdict:{content_hash}", ttl, result.model_dump_json())
376
+ except Exception:
377
+ pass
378
+
379
+ return result
380
+
381
+
382
+ # ============================================================================
383
+ # SECTION 8 β€” WebSocket endpoint
384
+ # ============================================================================
385
+
386
+ @app.websocket("/ws/{session_id}")
387
+ async def websocket_endpoint(ws: WebSocket, session_id: str):
388
+ """
389
+ Persistent WebSocket connection from the browser extension.
390
+
391
+ Inbound: { type: "batch", payload: TextBatch }
392
+ | { type: "ping" }
393
+ Outbound: { type: "result", payload: AnalysisResult }
394
+ | { type: "pong" }
395
+ | { type: "error", payload: { message: str } }
396
+ | { type: "status", payload: { connected: bool, demo_mode: bool, … } }
397
+ """
398
+ await manager.connect(session_id, ws)
399
+
400
+ # Initial handshake
401
+ await ws.send_bytes(orjson.dumps(
402
+ WSOutbound(type="status", payload={
403
+ "connected": True,
404
+ "demo_mode": settings.demo_mode,
405
+ "has_groq": settings.has_groq,
406
+ "has_x_api": settings.has_x_api,
407
+ }).model_dump(mode="json")
408
+ ))
409
+
410
+ try:
411
+ while True:
412
+ raw = await ws.receive_bytes()
413
+ envelope = WSInbound.model_validate_json(raw)
414
+
415
+ if envelope.type == "ping":
416
+ await ws.send_bytes(orjson.dumps(
417
+ WSOutbound(type="pong", payload=None).model_dump(mode="json")
418
+ ))
419
+ continue
420
+
421
+ if envelope.type != "batch" or not envelope.payload:
422
+ continue
423
+
424
+ try:
425
+ batch = TextBatch.model_validate(envelope.payload)
426
+ except ValidationError as exc:
427
+ await ws.send_bytes(orjson.dumps(
428
+ WSOutbound(type="error", payload={"message": str(exc)}).model_dump(mode="json")
429
+ ))
430
+ continue
431
+
432
+ # Process all segments in the batch concurrently
433
+ async def _process_and_send(segment):
434
+ t0 = time.perf_counter()
435
+ result = await process_segment(
436
+ text=segment.text,
437
+ content_hash=segment.content_hash,
438
+ element_id=segment.element_id,
439
+ platform=batch.platform,
440
+ )
441
+ if result:
442
+ result.latency_ms = round((time.perf_counter() - t0) * 1000, 2)
443
+ await manager.send(session_id, result.model_dump(mode="json"))
444
+
445
+ await asyncio.gather(*[_process_and_send(seg) for seg in batch.segments])
446
+
447
+ except WebSocketDisconnect:
448
+ manager.disconnect(session_id)
449
+ except Exception as exc:
450
+ log.error("ws.unexpected_error", session_id=session_id, error=str(exc))
451
+ manager.disconnect(session_id)
452
+
453
+
454
+ # ============================================================================
455
+ # SECTION 9 β€” REST endpoints
456
+ # ============================================================================
457
+
458
+ @app.get("/health")
459
+ async def health():
460
+ try:
461
+ r = await get_redis()
462
+ redis_ok = await r.ping()
463
+ except Exception:
464
+ redis_ok = False
465
+ return {
466
+ "status": "ok",
467
+ "redis": redis_ok,
468
+ "demo_mode": settings.demo_mode,
469
+ "version": "1.0.0",
470
+ }
471
+
472
+
473
+ @app.get("/metrics")
474
+ async def metrics():
475
+ try:
476
+ r = await get_redis()
477
+ cached_verdicts = await r.dbsize()
478
+ except Exception:
479
+ cached_verdicts = 0
480
+ return {
481
+ "active_connections": len(manager.active),
482
+ "cached_verdicts": cached_verdicts,
483
+ }
484
+
485
+
486
+ @app.get("/", response_class=HTMLResponse)
487
+ async def demo_ui():
488
+ """Serves the interactive demo UI at the root path (HuggingFace Spaces landing page)."""
489
+ ui_path = os.path.join(os.path.dirname(__file__), "static", "index.html")
490
+ if os.path.exists(ui_path):
491
+ with open(ui_path) as f:
492
+ return HTMLResponse(f.read())
493
+ return HTMLResponse(
494
+ "<h1>Fact Intelligence API</h1>"
495
+ "<p>Connect via WebSocket at <code>/ws/{session_id}</code></p>"
496
+ )
497
+
498
+
499
+ # ============================================================================
500
+ # SECTION 10 β€” __main__ block (python app.py)
501
+ # ============================================================================
502
+
503
+ if __name__ == "__main__":
504
+ import uvicorn
505
+ uvicorn.run(
506
+ "app:app",
507
+ host="0.0.0.0",
508
+ port=settings.port,
509
+ log_level=settings.log_level.lower(),
510
+ access_log=False,
511
+ ws_ping_interval=20,
512
+ ws_ping_timeout=60,
513
+ )
backend/core/config.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ core/config.py β€” Centralized settings via pydantic-settings.
3
+ All values read from environment variables (set in HF Spaces secrets).
4
+ """
5
+
6
+ from enum import Enum
7
+ from functools import lru_cache
8
+
9
+ from pydantic import Field, computed_field
10
+ from pydantic_settings import BaseSettings, SettingsConfigDict
11
+
12
+
13
+ class HighlightColor(str, Enum):
14
+ GREEN = "green" # Fact-checked, widely corroborated
15
+ YELLOW = "yellow" # Breaking / unverified / weak signal
16
+ RED = "red" # Debunked, active community note
17
+ PURPLE = "purple" # LLM hallucination detected
18
+
19
+
20
+ class Platform(str, Enum):
21
+ TWITTER = "twitter"
22
+ INSTAGRAM = "instagram"
23
+ YOUTUBE = "youtube"
24
+ CHATGPT = "chatgpt"
25
+ CLAUDE = "claude"
26
+ GEMINI = "gemini"
27
+ NEWS = "news"
28
+ UNKNOWN = "unknown"
29
+
30
+
31
+ class Settings(BaseSettings):
32
+ model_config = SettingsConfigDict(env_file=".env", extra="ignore")
33
+
34
+ # LLM API keys
35
+ groq_api_key: str = Field(default="", alias="GROQ_API_KEY")
36
+ x_bearer_token: str = Field(default="", alias="X_BEARER_TOKEN")
37
+
38
+ # Infrastructure
39
+ qdrant_host: str = Field(default="localhost", alias="QDRANT_HOST")
40
+ qdrant_port: int = Field(default=6333, alias="QDRANT_PORT")
41
+ memgraph_host: str = Field(default="localhost", alias="MEMGRAPH_HOST")
42
+ memgraph_port: int = Field(default=7687, alias="MEMGRAPH_PORT")
43
+ memgraph_password: str = Field(default="memgraph123", alias="MEMGRAPH_PASSWORD")
44
+ redpanda_brokers: str = Field(default="localhost:9092", alias="REDPANDA_BROKERS")
45
+ redis_url: str = Field(default="redis://localhost:6379", alias="REDIS_URL")
46
+
47
+ # App
48
+ port: int = Field(default=7860, alias="PORT")
49
+ log_level: str = Field(default="INFO", alias="LOG_LEVEL")
50
+ demo_mode: bool = Field(default=False, alias="DEMO_MODE")
51
+
52
+ # Model identifiers for LiteLLM routing
53
+ gatekeeper_model: str = "groq/llama3-8b-8192"
54
+ misinformation_model: str = "groq/mixtral-8x7b-32768"
55
+ hallucination_model: str = "groq/llama3-70b-8192" # Free via Groq β€” replaces Claude Haiku
56
+
57
+ # Gatekeeper latency SLO: p95 < 120ms
58
+ gatekeeper_timeout_ms: int = 120
59
+
60
+ # Cache TTLs (seconds)
61
+ cache_ttl_green_red: int = 21_600 # 6 hours
62
+ cache_ttl_yellow: int = 900 # 15 minutes
63
+ # Purple: no cache β€” hallucination checks are context-specific
64
+
65
+ # RAG retrieval
66
+ qdrant_collection: str = "claims"
67
+ qdrant_ef: int = 128 # HNSW ef parameter β€” higher = more accurate, slower
68
+ qdrant_top_k: int = 8 # nearest neighbors to retrieve
69
+ evidence_window_hours: int = 72 # only retrieve evidence newer than 72h
70
+
71
+ # Minimum text length for analysis (words)
72
+ min_word_count: int = 12
73
+
74
+ @computed_field
75
+ @property
76
+ def has_groq(self) -> bool:
77
+ return bool(self.groq_api_key)
78
+
79
+ @computed_field
80
+ @property
81
+ def has_hallucination_llm(self) -> bool:
82
+ # Hallucination agent uses Groq llama3-70b (free) β€” same key as gatekeeper
83
+ return bool(self.groq_api_key)
84
+
85
+ @computed_field
86
+ @property
87
+ def has_x_api(self) -> bool:
88
+ return bool(self.x_bearer_token)
89
+
90
+ @computed_field
91
+ @property
92
+ def broker_list(self) -> list[str]:
93
+ return self.redpanda_brokers.split(",")
94
+
95
+
96
+ @lru_cache
97
+ def get_settings() -> Settings:
98
+ return Settings()
backend/core/db_init.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ core/db_init.py β€” Initialize Qdrant collection and Memgraph graph schema.
3
+
4
+ Run once on startup (called from main.py lifespan) or manually:
5
+ uv run python -m core.db_init
6
+
7
+ Memgraph graph schema:
8
+ (Author {handle, verified, account_type})
9
+ -[:REPORTED {timestamp}]->
10
+ (Claim {text, embedding_id, hash})
11
+ <-[:CORROBORATED_BY {confidence}]-
12
+ (Source {url, domain, bias_rating})
13
+ -[:HAS_NOTE]->
14
+ (CommunityNote {text, active, created_at})
15
+
16
+ This schema supports:
17
+ - Trust score computation (Author.verified, Source count, CommunityNote presence)
18
+ - Claim deduplication by hash
19
+ - Source credibility tracking (bias_rating from Media Bias/Fact Check)
20
+ """
21
+
22
+ import asyncio
23
+
24
+ import structlog
25
+ from neo4j import AsyncGraphDatabase
26
+ from qdrant_client import AsyncQdrantClient
27
+ from qdrant_client.models import Distance, PayloadSchemaType, VectorParams
28
+
29
+ from core.config import get_settings
30
+
31
+ log = structlog.get_logger(__name__)
32
+
33
+
34
+ async def init_qdrant(settings=None) -> None:
35
+ """
36
+ Create the Qdrant 'claims' collection if it doesn't exist.
37
+ BGE-M3 outputs 1024-dimensional dense vectors.
38
+ HNSW index created automatically by Qdrant on collection creation.
39
+ """
40
+ cfg = settings or get_settings()
41
+ client = AsyncQdrantClient(host=cfg.qdrant_host, port=cfg.qdrant_port)
42
+
43
+ try:
44
+ collections = await client.get_collections()
45
+ existing = {c.name for c in collections.collections}
46
+
47
+ if cfg.qdrant_collection not in existing:
48
+ await client.create_collection(
49
+ collection_name=cfg.qdrant_collection,
50
+ vectors_config=VectorParams(
51
+ size=1024, # BGE-M3 output dimension
52
+ distance=Distance.COSINE,
53
+ ),
54
+ )
55
+ log.info("qdrant.collection.created", name=cfg.qdrant_collection)
56
+
57
+ # Payload indexes for fast filtering
58
+ for field, schema in [
59
+ ("ingested_at_ts", PayloadSchemaType.FLOAT),
60
+ ("platform", PayloadSchemaType.KEYWORD),
61
+ ("content_hash", PayloadSchemaType.KEYWORD),
62
+ ("author_handle", PayloadSchemaType.KEYWORD),
63
+ ]:
64
+ await client.create_payload_index(
65
+ collection_name=cfg.qdrant_collection,
66
+ field_name=field,
67
+ field_schema=schema,
68
+ )
69
+ log.debug("qdrant.index.created", field=field)
70
+ else:
71
+ log.info("qdrant.collection.exists", name=cfg.qdrant_collection)
72
+
73
+ finally:
74
+ await client.close()
75
+
76
+
77
+ async def init_memgraph(settings=None) -> None:
78
+ """
79
+ Create Memgraph constraints and indexes for the trust graph schema.
80
+ Memgraph is in-memory β€” indexes are re-created on restart (data too, unless persistence enabled).
81
+ """
82
+ cfg = settings or get_settings()
83
+ driver = AsyncGraphDatabase.driver(
84
+ f"bolt://{cfg.memgraph_host}:{cfg.memgraph_port}",
85
+ auth=("", cfg.memgraph_password),
86
+ encrypted=False,
87
+ )
88
+
89
+ schema_queries = [
90
+ # Uniqueness constraints (also create indexes automatically)
91
+ "CREATE CONSTRAINT ON (a:Author) ASSERT a.handle IS UNIQUE;",
92
+ "CREATE CONSTRAINT ON (c:Claim) ASSERT c.hash IS UNIQUE;",
93
+ "CREATE CONSTRAINT ON (s:Source) ASSERT s.url IS UNIQUE;",
94
+
95
+ # Additional indexes for traversal performance
96
+ "CREATE INDEX ON :Author(verified);",
97
+ "CREATE INDEX ON :Author(account_type);",
98
+ "CREATE INDEX ON :CommunityNote(active);",
99
+
100
+ # Seed a few known authoritative sources with high trust
101
+ """
102
+ MERGE (s:Source {url: 'https://reuters.com', domain: 'reuters.com'})
103
+ SET s.bias_rating = 'center', s.trust_tier = 'tier1';
104
+ """,
105
+ """
106
+ MERGE (s:Source {url: 'https://apnews.com', domain: 'apnews.com'})
107
+ SET s.bias_rating = 'center', s.trust_tier = 'tier1';
108
+ """,
109
+ """
110
+ MERGE (s:Source {url: 'https://who.int', domain: 'who.int'})
111
+ SET s.bias_rating = 'center', s.trust_tier = 'government';
112
+ """,
113
+ """
114
+ MERGE (s:Source {url: 'https://cdc.gov', domain: 'cdc.gov'})
115
+ SET s.bias_rating = 'center', s.trust_tier = 'government';
116
+ """,
117
+ ]
118
+
119
+ async with driver.session() as session:
120
+ for query in schema_queries:
121
+ try:
122
+ await session.run(query)
123
+ except Exception as exc:
124
+ # Constraints/indexes may already exist β€” not an error
125
+ if "already exists" not in str(exc).lower():
126
+ log.warning("memgraph.schema.warn", query=query[:60], error=str(exc))
127
+
128
+ await driver.close()
129
+ log.info("memgraph.schema.initialized")
130
+
131
+
132
+ async def init_all(settings=None) -> None:
133
+ """Initialize both Qdrant and Memgraph. Called from FastAPI lifespan."""
134
+ cfg = settings or get_settings()
135
+ await asyncio.gather(
136
+ init_qdrant(cfg),
137
+ init_memgraph(cfg),
138
+ )
139
+ log.info("db.init.complete")
140
+
141
+
142
+ if __name__ == "__main__":
143
+ asyncio.run(init_all())
backend/core/logging.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ core/logging.py β€” Structured logging setup using structlog + rich.
3
+
4
+ structlog provides machine-readable JSON in production and
5
+ colorized human-readable output in development, with zero config change.
6
+
7
+ Usage:
8
+ import structlog
9
+ log = structlog.get_logger(__name__)
10
+ log.info("event.name", key="value", latency_ms=42.1)
11
+ """
12
+
13
+ import logging
14
+ import sys
15
+
16
+ import structlog
17
+
18
+
19
+ def configure_logging(log_level: str = "INFO", json_output: bool = False) -> None:
20
+ """
21
+ Configure structlog for the application.
22
+
23
+ In production (json_output=True): Outputs newline-delimited JSON β€”
24
+ compatible with Datadog, Grafana Loki, AWS CloudWatch, etc.
25
+
26
+ In development (json_output=False): Outputs colorized, human-readable
27
+ logs using rich ConsoleRenderer.
28
+ """
29
+ shared_processors = [
30
+ # Add log level as a field
31
+ structlog.stdlib.add_log_level,
32
+ # Add logger name
33
+ structlog.stdlib.add_logger_name,
34
+ # Add timestamp in ISO 8601
35
+ structlog.processors.TimeStamper(fmt="iso"),
36
+ # Render exceptions as structured dicts
37
+ structlog.processors.format_exc_info,
38
+ # Render stack info
39
+ structlog.processors.StackInfoRenderer(),
40
+ ]
41
+
42
+ if json_output:
43
+ renderer = structlog.processors.JSONRenderer()
44
+ else:
45
+ renderer = structlog.dev.ConsoleRenderer(colors=True, exception_formatter=structlog.dev.plain_traceback)
46
+
47
+ structlog.configure(
48
+ processors=[
49
+ *shared_processors,
50
+ # Final renderer must be last
51
+ renderer,
52
+ ],
53
+ wrapper_class=structlog.make_filtering_bound_logger(
54
+ logging.getLevelName(log_level.upper())
55
+ ),
56
+ context_class=dict,
57
+ logger_factory=structlog.PrintLoggerFactory(file=sys.stdout),
58
+ cache_logger_on_first_use=True,
59
+ )
60
+
61
+ # Also configure stdlib logging to route through structlog
62
+ logging.basicConfig(
63
+ format="%(message)s",
64
+ stream=sys.stdout,
65
+ level=logging.getLevelName(log_level.upper()),
66
+ )
67
+ # Silence noisy libraries
68
+ for lib in ["httpx", "httpcore", "aiokafka", "neo4j", "qdrant_client", "uvicorn.access"]:
69
+ logging.getLogger(lib).setLevel(logging.WARNING)
backend/core/models.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ core/models.py β€” Pydantic v2 models for the entire pipeline.
3
+
4
+ All models use strict typing with no implicit coercion, leveraging
5
+ Pydantic v2's Rust-backed validation for maximum throughput.
6
+ """
7
+
8
+ from datetime import datetime
9
+ from typing import Any
10
+ from uuid import UUID, uuid4
11
+
12
+ from pydantic import BaseModel, Field, field_validator, model_validator
13
+
14
+ from core.config import HighlightColor, Platform
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Inbound β€” what the browser extension sends us over WebSocket
19
+ # ---------------------------------------------------------------------------
20
+
21
+ class TextBatch(BaseModel):
22
+ """
23
+ A deduplicated batch of text segments flushed from the extension's
24
+ ring buffer every 1200ms. Each segment carries its own xxhash for
25
+ upstream deduplication and cache lookup.
26
+ """
27
+ session_id: str
28
+ platform: Platform
29
+ segments: list["TextSegment"]
30
+ sent_at: datetime = Field(default_factory=datetime.utcnow)
31
+
32
+
33
+ class TextSegment(BaseModel):
34
+ content_hash: str # xxhash64 hex β€” used as Redis cache key
35
+ text: str
36
+ element_id: str # DOM node ID from the extension for highlight targeting
37
+ word_count: int
38
+
39
+ @field_validator("word_count")
40
+ @classmethod
41
+ def must_meet_minimum(cls, v: int) -> int:
42
+ if v < 12:
43
+ raise ValueError("Segments shorter than 12 words must be filtered client-side")
44
+ return v
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Gatekeeper output
49
+ # ---------------------------------------------------------------------------
50
+
51
+ class GatekeeperResult(BaseModel):
52
+ """
53
+ Groq llama3-8b-8192 classifies each claim as fact or noise.
54
+ Structured JSON output β€” parsed with model_validate_json(), no try-except.
55
+ """
56
+ label: str # "fact" | "noise"
57
+ reason: str # one-sentence reasoning for the classification
58
+ confidence: float = Field(ge=0.0, le=1.0)
59
+
60
+ @field_validator("label")
61
+ @classmethod
62
+ def valid_label(cls, v: str) -> str:
63
+ if v not in {"fact", "noise"}:
64
+ raise ValueError(f"Label must be 'fact' or 'noise', got '{v}'")
65
+ return v
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # RAG pipeline output
70
+ # ---------------------------------------------------------------------------
71
+
72
+ class EvidenceChunk(BaseModel):
73
+ """A retrieved evidence chunk from Qdrant."""
74
+ chunk_id: str
75
+ text: str
76
+ source_url: str
77
+ domain: str
78
+ score: float = Field(ge=0.0, le=1.0) # cosine similarity
79
+ ingested_at: datetime
80
+ bias_rating: str | None = None
81
+
82
+
83
+ class TrustScore(BaseModel):
84
+ """
85
+ Computed from the Memgraph trust graph traversal.
86
+ Algorithm: start 0.5, +0.3 verified official, +0.05/source (max 0.25),
87
+ -0.4 if Community Note active. Clamped to [0.0, 1.0].
88
+ """
89
+ score: float = Field(ge=0.0, le=1.0)
90
+ author_verified: bool
91
+ corroborating_sources: int
92
+ has_community_note: bool
93
+ community_note_text: str | None = None
94
+
95
+
96
+ class RAGResult(BaseModel):
97
+ evidence: list[EvidenceChunk]
98
+ trust: TrustScore
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Grok/X sensor output
103
+ # ---------------------------------------------------------------------------
104
+
105
+ class GrokSensorResult(BaseModel):
106
+ velocity: int # 7-day tweet volume for core keywords
107
+ community_note: bool
108
+ note_text: str | None = None
109
+ is_mock: bool = False # True when X API key is absent
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Final analysis result β€” sent back to extension over WebSocket
114
+ # ---------------------------------------------------------------------------
115
+
116
+ class SourceRef(BaseModel):
117
+ url: str
118
+ domain: str
119
+ favicon_url: str
120
+ snippet: str
121
+
122
+
123
+ class AnalysisResult(BaseModel):
124
+ """
125
+ The final enriched verdict returned to the browser extension.
126
+ The extension uses color + element_id to apply highlight + hover card.
127
+ """
128
+ request_id: UUID = Field(default_factory=uuid4)
129
+ element_id: str # mirrors TextSegment.element_id for DOM targeting
130
+ content_hash: str
131
+ platform: Platform
132
+
133
+ # Verdict
134
+ color: HighlightColor
135
+ confidence: int = Field(ge=0, le=100)
136
+ verdict_label: str # human-readable summary e.g. "Debunked by Reuters"
137
+ explanation: str # full explanation string for hover card
138
+
139
+ # Top 3 sources shown in hover card
140
+ sources: list[SourceRef] = Field(max_length=3)
141
+
142
+ # Debug / provenance metadata
143
+ gatekeeper_label: str
144
+ trust_score: float
145
+ velocity: int
146
+ has_community_note: bool
147
+ latency_ms: float # total pipeline latency for observability
148
+ cached: bool = False
149
+ timestamp: datetime = Field(default_factory=datetime.utcnow)
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # WebSocket protocol messages
154
+ # ---------------------------------------------------------------------------
155
+
156
+ class WSMessageType(str):
157
+ BATCH = "batch"
158
+ RESULT = "result"
159
+ ERROR = "error"
160
+ PING = "ping"
161
+ PONG = "pong"
162
+ STATUS = "status"
163
+
164
+
165
+ class WSInbound(BaseModel):
166
+ type: str
167
+ payload: dict[str, Any] | None = None
168
+
169
+
170
+ class WSOutbound(BaseModel):
171
+ type: str
172
+ payload: Any
173
+ timestamp: datetime = Field(default_factory=datetime.utcnow)
174
+
175
+
176
+ # ---------------------------------------------------------------------------
177
+ # Kafka/Redpanda event envelope
178
+ # ---------------------------------------------------------------------------
179
+
180
+ class IngestionEvent(BaseModel):
181
+ """
182
+ Envelope for all three Redpanda topics (twitter, instagram, youtube).
183
+ Producers wrap their platform-specific data in this common schema.
184
+ """
185
+ event_id: str = Field(default_factory=lambda: str(uuid4()))
186
+ platform: Platform
187
+ content_hash: str
188
+ text: str
189
+ author_handle: str | None = None
190
+ author_verified: bool = False
191
+ source_url: str | None = None
192
+ ingested_at: datetime = Field(default_factory=datetime.utcnow)
193
+
194
+ @model_validator(mode="after")
195
+ def strip_whitespace(self) -> "IngestionEvent":
196
+ self.text = self.text.strip()
197
+ return self
backend/gatekeeper.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ gatekeeper.py β€” Groq-powered edge router.
3
+
4
+ Every incoming text batch hits this first. The Groq API with llama3-8b-8192
5
+ gives us 800+ tokens/second inference, sub-100ms p95 latency, no GPU needed.
6
+
7
+ If the classifier returns "noise" (opinion, meme, rhetoric, social noise),
8
+ the request is dropped immediately β€” no downstream pipeline costs incurred.
9
+
10
+ SLO: p95 < 120ms end-to-end, measured at the FastAPI WebSocket handler.
11
+ """
12
+
13
+ import time
14
+
15
+ import structlog
16
+ from groq import AsyncGroq
17
+ from pydantic import ValidationError
18
+
19
+ from core.config import Settings, get_settings
20
+ from core.models import GatekeeperResult
21
+
22
+ log = structlog.get_logger(__name__)
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Strict JSON schema prompt β€” forces the model to output parseable JSON.
26
+ # Pydantic v2's model_validate_json() parses this without a try-except
27
+ # because if validation fails we WANT the exception to surface.
28
+ # ---------------------------------------------------------------------------
29
+
30
+ GATEKEEPER_SYSTEM = """You are a claim classifier. Analyze the given text and output ONLY valid JSON.
31
+
32
+ Output schema (strict β€” no extra keys, no markdown, no preamble):
33
+ {
34
+ "label": "fact" | "noise",
35
+ "reason": "<one concise sentence>",
36
+ "confidence": <float 0.0–1.0>
37
+ }
38
+
39
+ Classify as "fact" if the text contains a falsifiable factual claim β€” a statement
40
+ about the real world that could be verified or refuted with evidence.
41
+
42
+ Classify as "noise" if the text is:
43
+ - A personal opinion or sentiment ("I think...", "I believe...")
44
+ - Rhetorical question
45
+ - Meme, humor, sarcasm, or social commentary without factual claims
46
+ - Pure emotional reaction ("this is amazing!", "so sad")
47
+ - Call-to-action without factual content
48
+ - Filler text or social pleasantries
49
+
50
+ Be conservative: when in doubt, label "fact" to avoid false negatives."""
51
+
52
+ GATEKEEPER_USER_TMPL = 'Classify this text: "{text}"'
53
+
54
+
55
+ async def classify_claim(text: str, settings: Settings | None = None) -> GatekeeperResult:
56
+ """
57
+ Classify whether `text` contains a falsifiable factual claim.
58
+
59
+ Returns GatekeeperResult with label="fact"|"noise".
60
+ Raises on timeout (>120ms) or model failure β€” caller handles fallback.
61
+
62
+ In DEMO_MODE (no GROQ_API_KEY), uses a simple heuristic classifier
63
+ so the system runs end-to-end without any API keys.
64
+ """
65
+ cfg = settings or get_settings()
66
+ t0 = time.perf_counter()
67
+
68
+ if cfg.demo_mode or not cfg.has_groq:
69
+ result = _heuristic_classify(text)
70
+ log.debug("gatekeeper.heuristic", label=result.label, latency_ms=round((time.perf_counter() - t0) * 1000, 2))
71
+ return result
72
+
73
+ client = AsyncGroq(api_key=cfg.groq_api_key)
74
+
75
+ # Use json_object response format β€” Groq enforces valid JSON output
76
+ response = await client.chat.completions.create(
77
+ model="llama3-8b-8192",
78
+ messages=[
79
+ {"role": "system", "content": GATEKEEPER_SYSTEM},
80
+ {"role": "user", "content": GATEKEEPER_USER_TMPL.format(text=text[:800])},
81
+ ],
82
+ response_format={"type": "json_object"},
83
+ temperature=0.0, # Deterministic classification
84
+ max_tokens=120, # JSON output is short β€” cap tokens to reduce latency
85
+ timeout=0.115, # 115ms hard timeout preserves the 120ms p95 SLO
86
+ )
87
+
88
+ latency_ms = round((time.perf_counter() - t0) * 1000, 2)
89
+ raw_json = response.choices[0].message.content or "{}"
90
+
91
+ # model_validate_json() uses Pydantic v2's Rust validator β€” no try-except
92
+ # needed for the happy path; ValidationError propagates to caller.
93
+ result = GatekeeperResult.model_validate_json(raw_json)
94
+
95
+ log.info(
96
+ "gatekeeper.groq",
97
+ label=result.label,
98
+ confidence=result.confidence,
99
+ latency_ms=latency_ms,
100
+ tokens=response.usage.total_tokens if response.usage else None,
101
+ )
102
+ return result
103
+
104
+
105
+ def _heuristic_classify(text: str) -> GatekeeperResult:
106
+ """
107
+ Fallback classifier when GROQ_API_KEY is absent (DEMO_MODE=true).
108
+ Uses simple lexical heuristics β€” not production-grade, but sufficient
109
+ for demonstrating the full pipeline without API credentials.
110
+ """
111
+ text_lower = text.lower()
112
+
113
+ noise_indicators = [
114
+ text_lower.startswith(("i think", "i believe", "i feel", "imo", "imho")),
115
+ text_lower.endswith("?") and len(text.split()) < 15,
116
+ any(w in text_lower for w in ["lol", "lmao", "omg", "wtf", "smh", "🀣", "πŸ˜‚"]),
117
+ all(w in text_lower for w in ["love", "hate"]) and "because" not in text_lower,
118
+ ]
119
+
120
+ if any(noise_indicators):
121
+ return GatekeeperResult(label="noise", reason="Heuristic: opinion/sentiment pattern detected", confidence=0.75)
122
+
123
+ fact_indicators = [
124
+ any(c.isdigit() for c in text), # Contains numbers β†’ likely factual claim
125
+ any(w in text_lower for w in ["percent", "%", "million", "billion", "study", "report", "according"]),
126
+ len(text.split()) > 20, # Longer sentences tend to be claims
127
+ ]
128
+
129
+ if any(fact_indicators):
130
+ return GatekeeperResult(label="fact", reason="Heuristic: numeric/evidential language detected", confidence=0.65)
131
+
132
+ # Default: treat as fact (conservative β€” avoid false negatives)
133
+ return GatekeeperResult(label="fact", reason="Heuristic: no clear noise pattern, defaulting to fact", confidence=0.5)
backend/grok_sensor.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ grok_sensor.py β€” Async X API v2 + Community Notes integration.
3
+
4
+ Queries two signals for any claim:
5
+ 1. 7-day tweet velocity: how fast is this claim spreading?
6
+ High velocity + no corroboration = yellow flag
7
+ 2. Community Notes: has the crowd-sourced fact-check system flagged it?
8
+ Active note = strong red signal (-0.4 in trust scoring)
9
+
10
+ Full mock fallback when X_BEARER_TOKEN is absent β€” the system runs
11
+ end-to-end in demo mode without any external API credentials.
12
+ """
13
+
14
+ import hashlib
15
+ import random
16
+ from datetime import datetime, timedelta, timezone
17
+
18
+ import httpx
19
+ import structlog
20
+ from tenacity import (
21
+ retry,
22
+ retry_if_exception_type,
23
+ stop_after_attempt,
24
+ wait_exponential,
25
+ )
26
+
27
+ from core.config import Settings, get_settings
28
+ from core.models import GrokSensorResult
29
+
30
+ log = structlog.get_logger(__name__)
31
+
32
+ X_API_BASE = "https://api.twitter.com/2"
33
+ COMMUNITY_NOTES_BASE = "https://twitter.com/i/birdwatch/n" # Unofficial β€” use search API workaround
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # X API v2 search
37
+ # ---------------------------------------------------------------------------
38
+
39
+ @retry(
40
+ retry=retry_if_exception_type(httpx.HTTPStatusError),
41
+ stop=stop_after_attempt(3),
42
+ wait=wait_exponential(multiplier=0.5, min=0.1, max=2.0),
43
+ )
44
+ async def _search_x_api(query: str, bearer_token: str) -> int:
45
+ """
46
+ Search X API v2 for tweet count matching the query in the past 7 days.
47
+ Returns the total tweet count as a velocity signal.
48
+
49
+ Uses tenacity for exponential backoff on HTTP 429 (rate limit) responses.
50
+ """
51
+ params = {
52
+ "query": f"{query} -is:retweet lang:en",
53
+ "start_time": (datetime.now(timezone.utc) - timedelta(days=7)).isoformat(),
54
+ "granularity": "day",
55
+ }
56
+ headers = {"Authorization": f"Bearer {bearer_token}"}
57
+
58
+ async with httpx.AsyncClient(timeout=5.0) as client:
59
+ resp = await client.get(
60
+ f"{X_API_BASE}/tweets/counts/recent",
61
+ params=params,
62
+ headers=headers,
63
+ )
64
+ resp.raise_for_status()
65
+ data = resp.json()
66
+ return data.get("meta", {}).get("total_tweet_count", 0)
67
+
68
+
69
+ async def _check_community_notes(query_keywords: list[str], bearer_token: str) -> tuple[bool, str | None]:
70
+ """
71
+ Check for active Community Notes using the X API v2 search endpoint.
72
+ Community Notes are exposed as tweets from @CommunityNotes.
73
+
74
+ Returns (has_note: bool, note_text: str | None).
75
+ """
76
+ query = " ".join(query_keywords[:5]) # Use top-5 keywords for targeted search
77
+ params = {
78
+ "query": f"(from:CommunityNotes) ({query})",
79
+ "max_results": 5,
80
+ "tweet.fields": "text,created_at",
81
+ "start_time": (datetime.now(timezone.utc) - timedelta(days=30)).isoformat(),
82
+ }
83
+ headers = {"Authorization": f"Bearer {bearer_token}"}
84
+
85
+ async with httpx.AsyncClient(timeout=5.0) as client:
86
+ resp = await client.get(
87
+ f"{X_API_BASE}/tweets/search/recent",
88
+ params=params,
89
+ headers=headers,
90
+ )
91
+ if resp.status_code == 200:
92
+ data = resp.json()
93
+ tweets = data.get("data", [])
94
+ if tweets:
95
+ return True, tweets[0]["text"]
96
+ return False, None
97
+
98
+
99
+ def _extract_keywords(text: str) -> list[str]:
100
+ """
101
+ Extract the most meaningful content words for query construction.
102
+ Strips stopwords; keeps nouns, numbers, proper nouns (heuristic: capitalized).
103
+ """
104
+ stopwords = {
105
+ "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
106
+ "have", "has", "had", "do", "does", "did", "will", "would", "could",
107
+ "should", "may", "might", "shall", "can", "this", "that", "these",
108
+ "those", "i", "we", "you", "he", "she", "it", "they", "and", "or",
109
+ "but", "in", "on", "at", "to", "for", "of", "with", "by", "from",
110
+ "up", "as", "into", "through", "about", "after", "before",
111
+ }
112
+ words = [w.strip(".,!?;:\"'()[]") for w in text.split()]
113
+ return [w for w in words if w.lower() not in stopwords and len(w) > 3][:10]
114
+
115
+
116
+ def _mock_sensor_result(claim_hash: str) -> GrokSensorResult:
117
+ """
118
+ Deterministic mock result derived from the claim hash.
119
+ Same hash always produces the same result β€” stable for testing.
120
+ """
121
+ seed = int(claim_hash[:8], 16) if all(c in "0123456789abcdef" for c in claim_hash[:8]) else hash(claim_hash)
122
+ rng = random.Random(seed)
123
+
124
+ velocity = rng.randint(0, 50_000)
125
+ has_note = rng.random() < 0.12 # ~12% chance of a community note (realistic)
126
+ note_text = (
127
+ "Community Note: This claim lacks context. The full data shows..."
128
+ if has_note
129
+ else None
130
+ )
131
+
132
+ return GrokSensorResult(
133
+ velocity=velocity,
134
+ community_note=has_note,
135
+ note_text=note_text,
136
+ is_mock=True,
137
+ )
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Public interface
142
+ # ---------------------------------------------------------------------------
143
+
144
+ async def query_grok_sensor(
145
+ claim_text: str,
146
+ claim_hash: str,
147
+ settings: Settings | None = None,
148
+ ) -> GrokSensorResult:
149
+ """
150
+ Main entry point: query X API for claim velocity and Community Notes.
151
+
152
+ Falls back to deterministic mock data when X_BEARER_TOKEN is absent.
153
+ The mock is seeded by claim_hash so results are consistent across calls.
154
+ """
155
+ cfg = settings or get_settings()
156
+
157
+ if cfg.demo_mode or not cfg.has_x_api:
158
+ result = _mock_sensor_result(claim_hash)
159
+ log.debug("grok_sensor.mock", velocity=result.velocity, has_note=result.community_note)
160
+ return result
161
+
162
+ keywords = _extract_keywords(claim_text)
163
+ query = " ".join(keywords[:5])
164
+
165
+ try:
166
+ velocity, (has_note, note_text) = await _search_x_api(query, cfg.x_bearer_token), (False, None)
167
+
168
+ # Only check community notes if velocity is nonzero (claim is circulating)
169
+ if velocity > 100:
170
+ has_note, note_text = await _check_community_notes(keywords, cfg.x_bearer_token)
171
+
172
+ result = GrokSensorResult(
173
+ velocity=velocity,
174
+ community_note=has_note,
175
+ note_text=note_text,
176
+ is_mock=False,
177
+ )
178
+ log.info("grok_sensor.live", velocity=velocity, has_note=has_note)
179
+ return result
180
+
181
+ except httpx.HTTPError as exc:
182
+ log.warning("grok_sensor.api_error", error=str(exc), fallback="mock")
183
+ return _mock_sensor_result(claim_hash)
backend/producers/producers.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ producers/twitter_producer.py β€” Async Redpanda producer for X/Twitter mock data.
3
+
4
+ Reads mock tweet data from a JSONL file and publishes to topic `raw.twitter`
5
+ at 50 events/second. Redpanda's Kafka-compatible API means aiokafka works
6
+ without any modifications.
7
+
8
+ Why 50 eps for Twitter: Twitter is the highest-velocity source (most
9
+ misinformation travels fastest on X), so it gets the highest throughput budget.
10
+ """
11
+
12
+ import asyncio
13
+ import json
14
+ import os
15
+ import time
16
+ from pathlib import Path
17
+
18
+ import structlog
19
+ from aiokafka import AIOKafkaProducer
20
+ from aiokafka.errors import KafkaConnectionError
21
+
22
+ log = structlog.get_logger(__name__)
23
+ BROKERS = os.environ.get("REDPANDA_BROKERS", "localhost:9092")
24
+ TOPIC = "raw.twitter"
25
+ TARGET_EPS = 50 # events per second
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Mock data (used when no JSONL file is provided)
30
+ # ---------------------------------------------------------------------------
31
+
32
+ MOCK_TWEETS = [
33
+ {"id": "t001", "text": "Scientists confirmed that 73% of peer-reviewed studies on mRNA vaccines show long-term immunity lasting over 18 months.", "author": "science_today", "verified": True, "account_type": "official_news"},
34
+ {"id": "t002", "text": "Breaking: The Federal Reserve has just raised interest rates by 75 basis points β€” the largest single hike since 1994.", "author": "reuters_econ", "verified": True, "account_type": "official_news"},
35
+ {"id": "t003", "text": "lol did you see that video? total propaganda πŸ˜‚ wake up people", "author": "anon_user123", "verified": False, "account_type": "personal"},
36
+ {"id": "t004", "text": "The WHO confirmed 12 million cases of the new strain have been reported across 47 countries in the last 30 days.", "author": "who_official", "verified": True, "account_type": "government"},
37
+ {"id": "t005", "text": "According to newly declassified Pentagon documents, UFO encounters increased by 400% between 2020 and 2023.", "author": "ufo_truther", "verified": False, "account_type": "personal"},
38
+ {"id": "t006", "text": "Harvard researchers published data showing remote work productivity rose by 13% on average versus in-office.", "author": "harvard_biz", "verified": True, "account_type": "official_news"},
39
+ {"id": "t007", "text": "I just think the whole thing is suspicious. something doesn't add up here. do your own research!", "author": "skeptic_99", "verified": False, "account_type": "personal"},
40
+ {"id": "t008", "text": "EU parliament voted 483-141 to approve the AI Act, making it the world's first comprehensive artificial intelligence legislation.", "author": "eu_parliament", "verified": True, "account_type": "government"},
41
+ {"id": "t009", "text": "Elon Musk announced Tesla will manufacture 5 million vehicles in 2025, a 240% increase from 2023 production.", "author": "tech_insider", "verified": False, "account_type": "personal"},
42
+ {"id": "t010", "text": "Climate scientists at NOAA recorded the highest average ocean temperatures in 150 years of recorded history this August.", "author": "noaa_official", "verified": True, "account_type": "government"},
43
+ ] * 100 # Repeat for continuous stream
44
+
45
+
46
+ async def produce_twitter(brokers: str = BROKERS, limit: int | None = None) -> None:
47
+ """
48
+ Async producer loop. Publishes tweets to `raw.twitter` at TARGET_EPS.
49
+ Runs indefinitely unless `limit` is set (useful for testing).
50
+ """
51
+ producer = AIOKafkaProducer(
52
+ bootstrap_servers=brokers,
53
+ value_serializer=lambda v: json.dumps(v).encode(),
54
+ compression_type="gzip",
55
+ max_batch_size=16384,
56
+ )
57
+
58
+ await producer.start()
59
+ log.info("producer.twitter.start", brokers=brokers, eps=TARGET_EPS)
60
+
61
+ interval = 1.0 / TARGET_EPS
62
+ count = 0
63
+
64
+ try:
65
+ for tweet in MOCK_TWEETS:
66
+ if limit and count >= limit:
67
+ break
68
+
69
+ envelope = {
70
+ "platform": "twitter",
71
+ "content_hash": _hash(tweet["text"]),
72
+ "text": tweet["text"],
73
+ "author_handle": tweet["author"],
74
+ "author_verified": tweet["verified"],
75
+ "source_url": f"https://x.com/{tweet['author']}/status/{tweet['id']}",
76
+ "ingested_at": time.time(),
77
+ }
78
+
79
+ await producer.send(TOPIC, value=envelope)
80
+ count += 1
81
+ await asyncio.sleep(interval)
82
+
83
+ finally:
84
+ await producer.stop()
85
+ log.info("producer.twitter.stop", total_sent=count)
86
+
87
+
88
+ def _hash(text: str) -> str:
89
+ import xxhash
90
+ return xxhash.xxh64(text.encode()).hexdigest()
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # instagram_producer.py (inline to keep file count reasonable)
95
+ # ---------------------------------------------------------------------------
96
+
97
+ INSTAGRAM_TOPIC = "raw.instagram"
98
+ INSTAGRAM_EPS = 20
99
+
100
+ MOCK_INSTAGRAM = [
101
+ {"id": "ig001", "text": "Just read that consuming 5 servings of ultra-processed foods per day increases cardiovascular disease risk by 62%.", "account": "nutritionista_real"},
102
+ {"id": "ig002", "text": "loving these golden hour pics πŸŒ… this place is absolutely magical!", "account": "travel_vibes_only"},
103
+ {"id": "ig003", "text": "NASA confirmed the Artemis III moon landing is scheduled for September 2026, marking humanity's return after 54 years.", "account": "space_news_daily"},
104
+ {"id": "ig004", "text": "Studies show social media use exceeding 3 hours daily correlates with a 48% higher rate of anxiety in adolescents aged 13-17.", "account": "mental_health_facts"},
105
+ {"id": "ig005", "text": "Can't believe this coffee shop! best latte I've had all year β˜•βœ¨", "account": "foodie_adventures"},
106
+ {"id": "ig006", "text": "A leaked document suggests Apple's Vision Pro 2 will feature a 70% thinner form factor and 14-hour battery life.", "account": "tech_leaks_xyz"},
107
+ {"id": "ig007", "text": "The Amazon rainforest lost 11,568 square kilometers to deforestation in 2023, a 22% increase from the previous year.", "account": "environmental_watch"},
108
+ ] * 50
109
+
110
+
111
+ async def produce_instagram(brokers: str = BROKERS, limit: int | None = None) -> None:
112
+ producer = AIOKafkaProducer(
113
+ bootstrap_servers=brokers,
114
+ value_serializer=lambda v: json.dumps(v).encode(),
115
+ compression_type="gzip",
116
+ )
117
+ await producer.start()
118
+ log.info("producer.instagram.start", brokers=brokers, eps=INSTAGRAM_EPS)
119
+
120
+ interval = 1.0 / INSTAGRAM_EPS
121
+ count = 0
122
+
123
+ try:
124
+ for post in MOCK_INSTAGRAM:
125
+ if limit and count >= limit:
126
+ break
127
+
128
+ envelope = {
129
+ "platform": "instagram",
130
+ "content_hash": _hash(post["text"]),
131
+ "text": post["text"],
132
+ "author_handle": post["account"],
133
+ "author_verified": False,
134
+ "source_url": f"https://instagram.com/{post['account']}/p/{post['id']}",
135
+ "ingested_at": time.time(),
136
+ }
137
+
138
+ await producer.send(INSTAGRAM_TOPIC, value=envelope)
139
+ count += 1
140
+ await asyncio.sleep(interval)
141
+
142
+ finally:
143
+ await producer.stop()
144
+ log.info("producer.instagram.stop", total_sent=count)
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # youtube_producer.py (inline)
149
+ # ---------------------------------------------------------------------------
150
+
151
+ YOUTUBE_TOPIC = "raw.youtube"
152
+ YOUTUBE_EPS = 10
153
+
154
+ MOCK_YOUTUBE_TRANSCRIPTS = [
155
+ {"id": "yt001", "text": "According to the study published in Nature Medicine, the experimental drug reduced tumor size by an average of 47% in stage three patients.", "channel": "MedicalFrontiers"},
156
+ {"id": "yt002", "text": "So basically what they're saying is that the economy grew by 2.4 percent in Q3, which is actually the highest quarterly growth since 2021.", "channel": "FinanceExplained"},
157
+ {"id": "yt003", "text": "I personally believe this is all connected, if you look at the patterns you can clearly see what's really happening behind the scenes.", "channel": "ConspiracyHub"},
158
+ {"id": "yt004", "text": "The International Energy Agency reports that renewable energy now accounts for 30% of global electricity generation, up from 26% in 2021.", "channel": "CleanEnergyNow"},
159
+ {"id": "yt005", "text": "GPT-5 was secretly trained on 100 trillion parameters, making it ten times larger than GPT-4, according to an anonymous OpenAI employee.", "channel": "AIInsiderNews"},
160
+ {"id": "yt006", "text": "The United Nations Population Fund projects global population will peak at 10.4 billion in the 2080s before beginning to decline.", "channel": "DemographicsWorld"},
161
+ ] * 30
162
+
163
+
164
+ async def produce_youtube(brokers: str = BROKERS, limit: int | None = None) -> None:
165
+ producer = AIOKafkaProducer(
166
+ bootstrap_servers=brokers,
167
+ value_serializer=lambda v: json.dumps(v).encode(),
168
+ compression_type="gzip",
169
+ )
170
+ await producer.start()
171
+ log.info("producer.youtube.start", brokers=brokers, eps=YOUTUBE_EPS)
172
+
173
+ interval = 1.0 / YOUTUBE_EPS
174
+ count = 0
175
+
176
+ try:
177
+ for chunk in MOCK_YOUTUBE_TRANSCRIPTS:
178
+ if limit and count >= limit:
179
+ break
180
+
181
+ envelope = {
182
+ "platform": "youtube",
183
+ "content_hash": _hash(chunk["text"]),
184
+ "text": chunk["text"],
185
+ "author_handle": chunk["channel"],
186
+ "author_verified": False,
187
+ "source_url": f"https://youtube.com/watch?v={chunk['id']}",
188
+ "ingested_at": time.time(),
189
+ }
190
+
191
+ await producer.send(YOUTUBE_TOPIC, value=envelope)
192
+ count += 1
193
+ await asyncio.sleep(interval)
194
+
195
+ finally:
196
+ await producer.stop()
197
+ log.info("producer.youtube.stop", total_sent=count)
198
+
199
+
200
+ # ---------------------------------------------------------------------------
201
+ # Aggregated consumer β€” upserts into Qdrant + Memgraph
202
+ # ---------------------------------------------------------------------------
203
+
204
+ async def run_consumer(brokers: str = BROKERS) -> None:
205
+ """
206
+ Consumes all three topics, deduplicates by content_hash,
207
+ and upserts into Qdrant (vector index) and Memgraph (trust graph).
208
+ """
209
+ from aiokafka import AIOKafkaConsumer
210
+ import xxhash
211
+
212
+ seen_hashes: set[str] = set()
213
+
214
+ consumer = AIOKafkaConsumer(
215
+ "raw.twitter", "raw.instagram", "raw.youtube",
216
+ bootstrap_servers=brokers,
217
+ group_id="fact-intelligence-consumer",
218
+ value_deserializer=lambda v: json.loads(v.decode()),
219
+ auto_offset_reset="latest",
220
+ )
221
+
222
+ await consumer.start()
223
+ log.info("consumer.start", topics=["raw.twitter", "raw.instagram", "raw.youtube"])
224
+
225
+ try:
226
+ async for msg in consumer:
227
+ event = msg.value
228
+ h = event.get("content_hash", "")
229
+
230
+ if h in seen_hashes:
231
+ continue # Client-side deduplication (ring buffer) + server-side
232
+ seen_hashes.add(h)
233
+
234
+ # Trim seen_hashes to avoid unbounded memory growth (LRU-style)
235
+ if len(seen_hashes) > 50_000:
236
+ seen_hashes = set(list(seen_hashes)[-25_000:])
237
+
238
+ log.debug("consumer.event", platform=event.get("platform"), hash=h[:8])
239
+
240
+ # Upsert into Qdrant + Memgraph (fire-and-forget, non-blocking)
241
+ asyncio.create_task(_upsert_event(event))
242
+
243
+ finally:
244
+ await consumer.stop()
245
+
246
+
247
+ async def _upsert_event(event: dict) -> None:
248
+ """Embed and upsert a single event into Qdrant and Memgraph."""
249
+ try:
250
+ from rag_pipeline import embed_texts, get_qdrant
251
+ from core.config import get_settings
252
+ from qdrant_client.models import PointStruct
253
+ import uuid
254
+
255
+ cfg = get_settings()
256
+ text = event.get("text", "")
257
+ if not text:
258
+ return
259
+
260
+ # Embed and upsert into Qdrant
261
+ [vector] = await embed_texts([text])
262
+ client = await get_qdrant(cfg)
263
+
264
+ await client.upsert(
265
+ collection_name=cfg.qdrant_collection,
266
+ points=[
267
+ PointStruct(
268
+ id=str(uuid.uuid4()),
269
+ vector=vector,
270
+ payload={
271
+ "text": text,
272
+ "source_url": event.get("source_url", ""),
273
+ "domain": _extract_domain(event.get("source_url", "")),
274
+ "platform": event.get("platform", ""),
275
+ "content_hash": event.get("content_hash", ""),
276
+ "ingested_at_ts": event.get("ingested_at", time.time()),
277
+ "author_handle": event.get("author_handle", ""),
278
+ "bias_rating": None,
279
+ },
280
+ )
281
+ ],
282
+ )
283
+
284
+ # Upsert Author + Claim nodes into Memgraph
285
+ await _upsert_graph_node(event, cfg)
286
+
287
+ except Exception as exc:
288
+ log.error("consumer.upsert_error", error=str(exc))
289
+
290
+
291
+ def _extract_domain(url: str) -> str:
292
+ try:
293
+ from urllib.parse import urlparse
294
+ return urlparse(url).netloc.lstrip("www.")
295
+ except Exception:
296
+ return ""
297
+
298
+
299
+ async def _upsert_graph_node(event: dict, cfg) -> None:
300
+ """Create/update Author and Claim nodes in Memgraph."""
301
+ from neo4j import AsyncGraphDatabase
302
+
303
+ driver = AsyncGraphDatabase.driver(
304
+ f"bolt://{cfg.memgraph_host}:{cfg.memgraph_port}",
305
+ auth=("", cfg.memgraph_password),
306
+ encrypted=False,
307
+ )
308
+ async with driver.session() as session:
309
+ await session.run(
310
+ """
311
+ MERGE (a:Author {handle: $handle})
312
+ SET a.verified = $verified, a.account_type = $account_type
313
+ MERGE (c:Claim {hash: $hash})
314
+ SET c.text = $text
315
+ MERGE (a)-[:REPORTED {timestamp: $ts}]->(c)
316
+ """,
317
+ handle=event.get("author_handle", "unknown"),
318
+ verified=event.get("author_verified", False),
319
+ account_type=event.get("account_type", "personal"),
320
+ hash=event.get("content_hash", ""),
321
+ text=event.get("text", "")[:500],
322
+ ts=event.get("ingested_at", time.time()),
323
+ )
324
+ await driver.close()
325
+
326
+
327
+ if __name__ == "__main__":
328
+ import sys
329
+
330
+ async def _run_all():
331
+ await asyncio.gather(
332
+ produce_twitter(),
333
+ produce_instagram(),
334
+ produce_youtube(),
335
+ run_consumer(),
336
+ )
337
+
338
+ asyncio.run(_run_all())
backend/pyproject.toml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "omnichannel-fact-intelligence"
3
+ version = "1.0.0"
4
+ description = "Near-zero-latency omnichannel fact & hallucination intelligence backend"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ # Web framework & async
8
+ "fastapi==0.115.5",
9
+ "uvicorn[standard]==0.32.1",
10
+ "websockets==13.1",
11
+ "httpx==0.27.2",
12
+
13
+ # Data validation
14
+ "pydantic==2.10.3",
15
+ "pydantic-settings==2.6.1",
16
+
17
+ # LLM abstraction β€” swap Groq ↔ GPT-4o ↔ local Ollama without code changes
18
+ "litellm==1.55.4",
19
+ "groq==0.13.0",
20
+
21
+ # Embeddings β€” BGE-M3, multilingual, CPU-native, completely free
22
+ "fastembed==0.4.2",
23
+
24
+ # Vector DB β€” Qdrant self-hosted, HNSW sub-ms ANN search
25
+ "qdrant-client==1.12.1",
26
+
27
+ # Graph DB β€” Memgraph Bolt driver (Cypher-compatible, same as Neo4j driver)
28
+ "neo4j==5.26.0",
29
+
30
+ # Message queue β€” Redpanda is Kafka-compatible, use aiokafka
31
+ "aiokafka==0.11.0",
32
+
33
+ # Orchestration β€” Prefect DAG flows replacing Celery
34
+ "prefect==3.1.6",
35
+
36
+ # Cache β€” Redis Stack (RedisJSON + RedisSearch)
37
+ "redis[hiredis]==5.2.1",
38
+
39
+ # Hashing β€” xxhash for sub-microsecond content deduplication
40
+ "xxhash==3.5.0",
41
+
42
+ # Observability
43
+ "structlog==24.4.0",
44
+ "rich==13.9.4",
45
+
46
+ # Utilities
47
+ "python-dotenv==1.0.1",
48
+ "tenacity==9.0.0", # Exponential backoff for external API calls
49
+ "aiofiles==24.1.0",
50
+ "orjson==3.10.12", # 2-3x faster JSON than stdlib
51
+ ]
52
+
53
+ [project.optional-dependencies]
54
+ dev = [
55
+ "pytest==8.3.4",
56
+ "pytest-asyncio==0.24.0",
57
+ "pytest-httpx==0.32.0",
58
+ "ruff==0.8.3",
59
+ "mypy==1.13.0",
60
+ ]
61
+
62
+ [build-system]
63
+ requires = ["hatchling"]
64
+ build-backend = "hatchling.build"
65
+
66
+ [tool.uv]
67
+ dev-dependencies = [
68
+ "pytest>=8.3.4",
69
+ "pytest-asyncio>=0.24.0",
70
+ ]
71
+
72
+ [tool.ruff]
73
+ line-length = 100
74
+ target-version = "py312"
75
+ select = ["E", "F", "I", "UP", "B", "SIM"]
76
+
77
+ [tool.mypy]
78
+ python_version = "3.12"
79
+ strict = true
80
+ ignore_missing_imports = true
backend/rag_pipeline.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ rag_pipeline.py β€” Retrieval-Augmented Generation truth pipeline.
3
+
4
+ Three-stage process:
5
+ 1. Embed the claim using BGE-M3 (FastEmbed, CPU-native, multilingual)
6
+ 2. Search Qdrant for nearest evidence chunks (HNSW ef=128, top-8, 72h window)
7
+ 3. Traverse the Memgraph trust graph to compute a trust score
8
+
9
+ Why BGE-M3 over OpenAI embeddings:
10
+ - 100+ language support (OpenAI embeddings are English-biased)
11
+ - 1024-dimensional dense vectors with better factual recall on news content
12
+ - Runs on CPU β€” no GPU dependency on the server
13
+ - Completely free β€” no per-token cost
14
+ - Comparable or better performance on BEIR benchmarks vs text-embedding-3-small
15
+
16
+ Why Qdrant over Pinecone:
17
+ - Self-hosted Docker β€” zero vendor lock-in, zero per-query cost
18
+ - HNSW index with configurable ef parameter for precision/recall trade-off
19
+ - Built-in payload filtering for recency constraints (no separate filter step)
20
+ - gRPC support for sub-millisecond latency on local network
21
+
22
+ Why Memgraph over Neo4j:
23
+ - Fully in-memory β€” entire graph lives in RAM for <1ms traversal
24
+ - Cypher-compatible β€” same query language as Neo4j, zero migration cost
25
+ - Docker-deployable in one command
26
+ """
27
+
28
+ import asyncio
29
+ from concurrent.futures import ProcessPoolExecutor
30
+ from datetime import datetime, timedelta, timezone
31
+ from typing import TYPE_CHECKING
32
+
33
+ import structlog
34
+ from neo4j import AsyncGraphDatabase
35
+ from qdrant_client import AsyncQdrantClient
36
+ from qdrant_client.models import (
37
+ Distance,
38
+ FieldCondition,
39
+ Filter,
40
+ MatchValue,
41
+ PayloadSchemaType,
42
+ Range,
43
+ SearchRequest,
44
+ VectorParams,
45
+ )
46
+
47
+ from core.config import Settings, get_settings
48
+ from core.models import EvidenceChunk, RAGResult, TrustScore
49
+
50
+ if TYPE_CHECKING:
51
+ from fastembed import TextEmbedding
52
+
53
+ log = structlog.get_logger(__name__)
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Module-level singletons β€” initialized on first use, reused across requests
57
+ # ---------------------------------------------------------------------------
58
+ _embed_model: "TextEmbedding | None" = None
59
+ _qdrant_client: AsyncQdrantClient | None = None
60
+ _executor: ProcessPoolExecutor | None = None
61
+
62
+
63
+ def _get_embedder() -> "TextEmbedding":
64
+ """Lazy-load the BGE-M3 model. First load downloads ~570MB, then cached."""
65
+ global _embed_model
66
+ if _embed_model is None:
67
+ from fastembed import TextEmbedding
68
+ log.info("rag.embedder.loading", model="BAAI/bge-m3")
69
+ _embed_model = TextEmbedding("BAAI/bge-m3")
70
+ log.info("rag.embedder.ready")
71
+ return _embed_model
72
+
73
+
74
+ def _get_executor() -> ProcessPoolExecutor:
75
+ """Embedding is CPU-bound β€” run in a ProcessPoolExecutor to avoid blocking asyncio."""
76
+ global _executor
77
+ if _executor is None:
78
+ _executor = ProcessPoolExecutor(max_workers=2)
79
+ return _executor
80
+
81
+
82
+ def _embed_sync(texts: list[str]) -> list[list[float]]:
83
+ """
84
+ CPU-bound embedding function executed in the process pool.
85
+ Must be a module-level function (not a method/lambda) for pickling.
86
+ """
87
+ model = _get_embedder()
88
+ return [v.tolist() for v in model.embed(texts)]
89
+
90
+
91
+ async def embed_texts(texts: list[str]) -> list[list[float]]:
92
+ """Async wrapper: runs CPU-bound embedding in a separate process."""
93
+ loop = asyncio.get_event_loop()
94
+ return await loop.run_in_executor(_get_executor(), _embed_sync, texts)
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Qdrant client and collection bootstrap
99
+ # ---------------------------------------------------------------------------
100
+
101
+ async def get_qdrant(settings: Settings) -> AsyncQdrantClient:
102
+ global _qdrant_client
103
+ if _qdrant_client is None:
104
+ _qdrant_client = AsyncQdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
105
+ await _ensure_collection(_qdrant_client, settings)
106
+ return _qdrant_client
107
+
108
+
109
+ async def _ensure_collection(client: AsyncQdrantClient, settings: Settings) -> None:
110
+ """
111
+ Idempotent collection creation. BGE-M3 outputs 1024-dimensional vectors.
112
+ HNSW is the default index in Qdrant β€” no explicit creation needed.
113
+ """
114
+ collections = await client.get_collections()
115
+ names = [c.name for c in collections.collections]
116
+
117
+ if settings.qdrant_collection not in names:
118
+ await client.create_collection(
119
+ collection_name=settings.qdrant_collection,
120
+ vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
121
+ )
122
+ # Create payload index on ingested_at for fast recency filtering
123
+ await client.create_payload_index(
124
+ collection_name=settings.qdrant_collection,
125
+ field_name="ingested_at_ts",
126
+ field_schema=PayloadSchemaType.FLOAT,
127
+ )
128
+ log.info("qdrant.collection.created", name=settings.qdrant_collection)
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # Memgraph trust graph
133
+ # ---------------------------------------------------------------------------
134
+
135
+ async def compute_trust_score(
136
+ claim_hash: str,
137
+ settings: Settings,
138
+ ) -> TrustScore:
139
+ """
140
+ Traverse the Memgraph trust graph to compute a claim's credibility score.
141
+
142
+ Graph schema:
143
+ (Author {handle, verified, account_type})
144
+ -[:REPORTED {timestamp}]->
145
+ (Claim {text, embedding_id, hash})
146
+ <-[:CORROBORATED_BY {confidence}]-
147
+ (Source {url, domain, bias_rating})
148
+
149
+ Scoring algorithm (start at 0.5, clamp to [0.0, 1.0]):
150
+ +0.30 if Author.verified AND account_type IN ['government', 'official_news']
151
+ +0.05 per corroborating Source node (max boost: +0.25, so cap at 5 sources)
152
+ -0.40 if any Source carries an active Community_Note relationship
153
+ """
154
+ driver = AsyncGraphDatabase.driver(
155
+ f"bolt://{settings.memgraph_host}:{settings.memgraph_port}",
156
+ auth=("", settings.memgraph_password),
157
+ encrypted=False,
158
+ )
159
+
160
+ async with driver.session() as session:
161
+ result = await session.run(
162
+ """
163
+ OPTIONAL MATCH (a:Author)-[:REPORTED]->(c:Claim {hash: $hash})
164
+ OPTIONAL MATCH (s:Source)-[:CORROBORATED_BY]->(c)
165
+ OPTIONAL MATCH (s)-[:HAS_NOTE]->(n:CommunityNote {active: true})
166
+ RETURN
167
+ a.verified AS verified,
168
+ a.account_type AS account_type,
169
+ COUNT(DISTINCT s) AS source_count,
170
+ COUNT(DISTINCT n) AS note_count,
171
+ COLLECT(DISTINCT n.text)[0] AS note_text
172
+ """,
173
+ hash=claim_hash,
174
+ )
175
+ row = await result.single()
176
+
177
+ await driver.close()
178
+
179
+ if row is None:
180
+ # Claim not yet in graph β€” return neutral score
181
+ return TrustScore(
182
+ score=0.5,
183
+ author_verified=False,
184
+ corroborating_sources=0,
185
+ has_community_note=False,
186
+ )
187
+
188
+ verified: bool = bool(row["verified"])
189
+ account_type: str | None = row["account_type"]
190
+ source_count: int = int(row["source_count"] or 0)
191
+ note_count: int = int(row["note_count"] or 0)
192
+ note_text: str | None = row["note_text"]
193
+
194
+ # --- Scoring algorithm ---
195
+ score = 0.5
196
+
197
+ if verified and account_type in ("government", "official_news"):
198
+ score += 0.30 # Strong verified official boost
199
+
200
+ source_boost = min(source_count * 0.05, 0.25) # Cap at 5 sources Γ— 0.05
201
+ score += source_boost
202
+
203
+ has_note = note_count > 0
204
+ if has_note:
205
+ score -= 0.40 # Active Community Note is a strong negative signal
206
+
207
+ score = max(0.0, min(1.0, score)) # Clamp to [0.0, 1.0]
208
+
209
+ return TrustScore(
210
+ score=round(score, 4),
211
+ author_verified=verified,
212
+ corroborating_sources=source_count,
213
+ has_community_note=has_note,
214
+ community_note_text=note_text,
215
+ )
216
+
217
+
218
+ # ---------------------------------------------------------------------------
219
+ # Main RAG pipeline entry point
220
+ # ---------------------------------------------------------------------------
221
+
222
+ async def run_rag_pipeline(
223
+ claim_text: str,
224
+ claim_hash: str,
225
+ settings: Settings | None = None,
226
+ ) -> RAGResult:
227
+ """
228
+ Full RAG pipeline: embed β†’ ANN search with recency filter β†’ trust traversal.
229
+
230
+ Returns RAGResult with top-k evidence chunks and computed trust score,
231
+ both of which feed into the multi-agent evaluation layer (agents.py).
232
+ """
233
+ cfg = settings or get_settings()
234
+
235
+ # Run embedding and trust score concurrently β€” they're independent
236
+ embed_task = asyncio.create_task(embed_texts([claim_text]))
237
+ trust_task = asyncio.create_task(compute_trust_score(claim_hash, cfg))
238
+
239
+ [claim_vector], trust = await asyncio.gather(embed_task, trust_task)
240
+
241
+ # Recency filter: only retrieve evidence ingested in the last 72 hours
242
+ # Uses Qdrant's payload filter on the ingested_at_ts float field (Unix timestamp)
243
+ cutoff_ts = (datetime.now(timezone.utc) - timedelta(hours=cfg.evidence_window_hours)).timestamp()
244
+
245
+ qdrant = await get_qdrant(cfg)
246
+
247
+ search_results = await qdrant.search(
248
+ collection_name=cfg.qdrant_collection,
249
+ query_vector=claim_vector,
250
+ limit=cfg.qdrant_top_k,
251
+ with_payload=True,
252
+ search_params={"hnsw_ef": cfg.qdrant_ef},
253
+ query_filter=Filter(
254
+ must=[
255
+ FieldCondition(
256
+ key="ingested_at_ts",
257
+ range=Range(gte=cutoff_ts),
258
+ )
259
+ ]
260
+ ),
261
+ )
262
+
263
+ evidence = [
264
+ EvidenceChunk(
265
+ chunk_id=str(hit.id),
266
+ text=hit.payload.get("text", ""),
267
+ source_url=hit.payload.get("source_url", ""),
268
+ domain=hit.payload.get("domain", ""),
269
+ score=hit.score,
270
+ ingested_at=datetime.fromtimestamp(
271
+ hit.payload.get("ingested_at_ts", 0), tz=timezone.utc
272
+ ),
273
+ bias_rating=hit.payload.get("bias_rating"),
274
+ )
275
+ for hit in search_results
276
+ ]
277
+
278
+ log.info(
279
+ "rag.pipeline.complete",
280
+ evidence_count=len(evidence),
281
+ trust_score=trust.score,
282
+ claim_hash=claim_hash[:8],
283
+ )
284
+
285
+ return RAGResult(evidence=evidence, trust=trust)
backend/static/index.html ADDED
@@ -0,0 +1,783 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Fact & Hallucination Intelligence System</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link href="https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&family=DM+Sans:wght@300;400;500;700&display=swap" rel="stylesheet">
9
+ <style>
10
+ :root {
11
+ --bg: #070b0f;
12
+ --surface: #0d1117;
13
+ --surface2: #161b22;
14
+ --border: #21262d;
15
+ --text: #e6edf3;
16
+ --text-muted: #7d8590;
17
+ --green: #22c55e;
18
+ --green-glow: rgba(34,197,94,0.15);
19
+ --yellow: #eab308;
20
+ --yellow-glow: rgba(234,179,8,0.15);
21
+ --red: #ef4444;
22
+ --red-glow: rgba(239,68,68,0.15);
23
+ --purple: #a855f7;
24
+ --purple-glow: rgba(168,85,247,0.15);
25
+ --accent: #58a6ff;
26
+ --mono: 'Space Mono', monospace;
27
+ --sans: 'DM Sans', sans-serif;
28
+ }
29
+
30
+ * { margin: 0; padding: 0; box-sizing: border-box; }
31
+
32
+ body {
33
+ background: var(--bg);
34
+ color: var(--text);
35
+ font-family: var(--sans);
36
+ min-height: 100vh;
37
+ display: flex;
38
+ flex-direction: column;
39
+ position: relative;
40
+ overflow-x: hidden;
41
+ }
42
+
43
+ /* Grid background */
44
+ body::before {
45
+ content: '';
46
+ position: fixed;
47
+ inset: 0;
48
+ background-image:
49
+ linear-gradient(rgba(88,166,255,0.03) 1px, transparent 1px),
50
+ linear-gradient(90deg, rgba(88,166,255,0.03) 1px, transparent 1px);
51
+ background-size: 40px 40px;
52
+ pointer-events: none;
53
+ z-index: 0;
54
+ }
55
+
56
+ /* Radial glow */
57
+ body::after {
58
+ content: '';
59
+ position: fixed;
60
+ top: -20%;
61
+ left: 50%;
62
+ transform: translateX(-50%);
63
+ width: 80vw;
64
+ height: 60vh;
65
+ background: radial-gradient(ellipse, rgba(88,166,255,0.06) 0%, transparent 70%);
66
+ pointer-events: none;
67
+ z-index: 0;
68
+ }
69
+
70
+ .container {
71
+ position: relative;
72
+ z-index: 1;
73
+ max-width: 900px;
74
+ margin: 0 auto;
75
+ padding: 48px 24px 80px;
76
+ width: 100%;
77
+ }
78
+
79
+ /* Header */
80
+ header {
81
+ text-align: center;
82
+ margin-bottom: 56px;
83
+ }
84
+
85
+ .logo-row {
86
+ display: flex;
87
+ align-items: center;
88
+ justify-content: center;
89
+ gap: 12px;
90
+ margin-bottom: 16px;
91
+ }
92
+
93
+ .logo-icon {
94
+ width: 40px;
95
+ height: 40px;
96
+ border: 1px solid var(--accent);
97
+ border-radius: 8px;
98
+ display: flex;
99
+ align-items: center;
100
+ justify-content: center;
101
+ color: var(--accent);
102
+ font-size: 20px;
103
+ box-shadow: 0 0 20px rgba(88,166,255,0.2);
104
+ }
105
+
106
+ h1 {
107
+ font-family: var(--mono);
108
+ font-size: clamp(18px, 3vw, 26px);
109
+ font-weight: 700;
110
+ letter-spacing: -0.5px;
111
+ color: var(--text);
112
+ }
113
+
114
+ .tagline {
115
+ font-size: 14px;
116
+ color: var(--text-muted);
117
+ font-family: var(--mono);
118
+ letter-spacing: 0.5px;
119
+ margin-top: 8px;
120
+ }
121
+
122
+ /* Status bar */
123
+ .status-bar {
124
+ display: flex;
125
+ align-items: center;
126
+ gap: 8px;
127
+ padding: 8px 16px;
128
+ background: var(--surface2);
129
+ border: 1px solid var(--border);
130
+ border-radius: 6px;
131
+ font-family: var(--mono);
132
+ font-size: 12px;
133
+ color: var(--text-muted);
134
+ margin-bottom: 32px;
135
+ width: fit-content;
136
+ margin-left: auto;
137
+ margin-right: auto;
138
+ }
139
+
140
+ .status-dot {
141
+ width: 8px;
142
+ height: 8px;
143
+ border-radius: 50%;
144
+ background: #555;
145
+ transition: background 0.3s;
146
+ }
147
+ .status-dot.connected { background: var(--green); box-shadow: 0 0 8px var(--green); animation: pulse 2s infinite; }
148
+ .status-dot.connecting { background: var(--yellow); animation: pulse 0.8s infinite; }
149
+ .status-dot.error { background: var(--red); }
150
+
151
+ @keyframes pulse {
152
+ 0%, 100% { opacity: 1; }
153
+ 50% { opacity: 0.4; }
154
+ }
155
+
156
+ /* Input area */
157
+ .analysis-card {
158
+ background: var(--surface);
159
+ border: 1px solid var(--border);
160
+ border-radius: 12px;
161
+ padding: 28px;
162
+ margin-bottom: 24px;
163
+ }
164
+
165
+ .card-label {
166
+ font-family: var(--mono);
167
+ font-size: 11px;
168
+ color: var(--text-muted);
169
+ letter-spacing: 1.5px;
170
+ text-transform: uppercase;
171
+ margin-bottom: 12px;
172
+ }
173
+
174
+ .platform-row {
175
+ display: flex;
176
+ gap: 8px;
177
+ margin-bottom: 16px;
178
+ flex-wrap: wrap;
179
+ }
180
+
181
+ .platform-btn {
182
+ padding: 6px 14px;
183
+ border: 1px solid var(--border);
184
+ border-radius: 20px;
185
+ background: transparent;
186
+ color: var(--text-muted);
187
+ font-family: var(--mono);
188
+ font-size: 11px;
189
+ cursor: pointer;
190
+ transition: all 0.2s;
191
+ letter-spacing: 0.5px;
192
+ }
193
+ .platform-btn:hover { border-color: var(--accent); color: var(--accent); }
194
+ .platform-btn.active {
195
+ border-color: var(--accent);
196
+ background: rgba(88,166,255,0.1);
197
+ color: var(--accent);
198
+ }
199
+
200
+ textarea {
201
+ width: 100%;
202
+ min-height: 120px;
203
+ background: var(--bg);
204
+ border: 1px solid var(--border);
205
+ border-radius: 8px;
206
+ color: var(--text);
207
+ font-family: var(--sans);
208
+ font-size: 15px;
209
+ line-height: 1.6;
210
+ padding: 16px;
211
+ resize: vertical;
212
+ outline: none;
213
+ transition: border-color 0.2s;
214
+ }
215
+ textarea:focus { border-color: var(--accent); }
216
+ textarea::placeholder { color: var(--text-muted); }
217
+
218
+ .analyze-btn {
219
+ display: flex;
220
+ align-items: center;
221
+ gap: 8px;
222
+ margin-top: 16px;
223
+ padding: 12px 28px;
224
+ background: var(--accent);
225
+ color: #000;
226
+ font-family: var(--mono);
227
+ font-size: 13px;
228
+ font-weight: 700;
229
+ border: none;
230
+ border-radius: 8px;
231
+ cursor: pointer;
232
+ transition: all 0.2s;
233
+ letter-spacing: 0.5px;
234
+ }
235
+ .analyze-btn:hover { background: #79c0ff; transform: translateY(-1px); box-shadow: 0 4px 20px rgba(88,166,255,0.3); }
236
+ .analyze-btn:disabled { opacity: 0.5; cursor: not-allowed; transform: none; }
237
+
238
+ .spinner {
239
+ width: 14px;
240
+ height: 14px;
241
+ border: 2px solid rgba(0,0,0,0.3);
242
+ border-top-color: #000;
243
+ border-radius: 50%;
244
+ animation: spin 0.7s linear infinite;
245
+ display: none;
246
+ }
247
+ .spinner.active { display: block; }
248
+ @keyframes spin { to { transform: rotate(360deg); } }
249
+
250
+ /* Result card */
251
+ .result-card {
252
+ background: var(--surface);
253
+ border: 1px solid var(--border);
254
+ border-radius: 12px;
255
+ padding: 28px;
256
+ display: none;
257
+ animation: fadeSlideIn 0.3s ease;
258
+ }
259
+ .result-card.visible { display: block; }
260
+
261
+ @keyframes fadeSlideIn {
262
+ from { opacity: 0; transform: translateY(8px); }
263
+ to { opacity: 1; transform: translateY(0); }
264
+ }
265
+
266
+ .verdict-header {
267
+ display: flex;
268
+ align-items: flex-start;
269
+ gap: 20px;
270
+ margin-bottom: 24px;
271
+ }
272
+
273
+ .confidence-ring {
274
+ flex-shrink: 0;
275
+ width: 72px;
276
+ height: 72px;
277
+ position: relative;
278
+ }
279
+
280
+ .confidence-ring svg {
281
+ width: 72px;
282
+ height: 72px;
283
+ transform: rotate(-90deg);
284
+ }
285
+
286
+ .confidence-ring .track {
287
+ fill: none;
288
+ stroke: var(--border);
289
+ stroke-width: 6;
290
+ }
291
+
292
+ .confidence-ring .fill {
293
+ fill: none;
294
+ stroke-width: 6;
295
+ stroke-linecap: round;
296
+ transition: stroke-dashoffset 0.6s ease, stroke 0.3s;
297
+ }
298
+
299
+ .confidence-num {
300
+ position: absolute;
301
+ inset: 0;
302
+ display: flex;
303
+ align-items: center;
304
+ justify-content: center;
305
+ font-family: var(--mono);
306
+ font-size: 14px;
307
+ font-weight: 700;
308
+ }
309
+
310
+ .verdict-meta { flex: 1; }
311
+
312
+ .color-badge {
313
+ display: inline-flex;
314
+ align-items: center;
315
+ gap: 6px;
316
+ padding: 4px 12px;
317
+ border-radius: 20px;
318
+ font-family: var(--mono);
319
+ font-size: 11px;
320
+ font-weight: 700;
321
+ letter-spacing: 1px;
322
+ text-transform: uppercase;
323
+ margin-bottom: 8px;
324
+ }
325
+ .color-badge.green { background: var(--green-glow); color: var(--green); border: 1px solid rgba(34,197,94,0.3); }
326
+ .color-badge.yellow { background: var(--yellow-glow); color: var(--yellow); border: 1px solid rgba(234,179,8,0.3); }
327
+ .color-badge.red { background: var(--red-glow); color: var(--red); border: 1px solid rgba(239,68,68,0.3); }
328
+ .color-badge.purple { background: var(--purple-glow); color: var(--purple); border: 1px solid rgba(168,85,247,0.3); }
329
+
330
+ .verdict-label {
331
+ font-family: var(--sans);
332
+ font-size: 18px;
333
+ font-weight: 700;
334
+ margin-bottom: 8px;
335
+ line-height: 1.3;
336
+ }
337
+
338
+ .explanation {
339
+ font-size: 14px;
340
+ color: var(--text-muted);
341
+ line-height: 1.7;
342
+ }
343
+
344
+ /* Metadata grid */
345
+ .meta-grid {
346
+ display: grid;
347
+ grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
348
+ gap: 12px;
349
+ margin: 24px 0;
350
+ padding: 20px;
351
+ background: var(--surface2);
352
+ border-radius: 8px;
353
+ }
354
+
355
+ .meta-item { display: flex; flex-direction: column; gap: 4px; }
356
+ .meta-key {
357
+ font-family: var(--mono);
358
+ font-size: 10px;
359
+ color: var(--text-muted);
360
+ text-transform: uppercase;
361
+ letter-spacing: 1px;
362
+ }
363
+ .meta-value {
364
+ font-family: var(--mono);
365
+ font-size: 13px;
366
+ color: var(--text);
367
+ font-weight: 700;
368
+ }
369
+
370
+ /* Sources */
371
+ .sources-label {
372
+ font-family: var(--mono);
373
+ font-size: 11px;
374
+ color: var(--text-muted);
375
+ text-transform: uppercase;
376
+ letter-spacing: 1px;
377
+ margin-bottom: 10px;
378
+ }
379
+
380
+ .source-item {
381
+ display: flex;
382
+ align-items: center;
383
+ gap: 10px;
384
+ padding: 10px 14px;
385
+ background: var(--surface2);
386
+ border-radius: 6px;
387
+ margin-bottom: 6px;
388
+ font-size: 13px;
389
+ border: 1px solid transparent;
390
+ transition: border-color 0.2s;
391
+ }
392
+ .source-item:hover { border-color: var(--border); }
393
+ .source-favicon { width: 16px; height: 16px; border-radius: 3px; }
394
+ .source-domain { font-family: var(--mono); font-size: 11px; color: var(--accent); }
395
+
396
+ /* Pipeline log */
397
+ .pipeline-log {
398
+ background: var(--bg);
399
+ border: 1px solid var(--border);
400
+ border-radius: 8px;
401
+ padding: 16px;
402
+ margin-top: 24px;
403
+ font-family: var(--mono);
404
+ font-size: 12px;
405
+ color: var(--text-muted);
406
+ max-height: 200px;
407
+ overflow-y: auto;
408
+ }
409
+
410
+ .log-line {
411
+ display: flex;
412
+ gap: 12px;
413
+ margin-bottom: 4px;
414
+ animation: fadeIn 0.2s ease;
415
+ }
416
+ @keyframes fadeIn { from { opacity: 0; } to { opacity: 1; } }
417
+ .log-ts { color: #444; flex-shrink: 0; }
418
+ .log-level { flex-shrink: 0; }
419
+ .log-level.info { color: var(--accent); }
420
+ .log-level.ok { color: var(--green); }
421
+ .log-level.warn { color: var(--yellow); }
422
+ .log-level.drop { color: var(--text-muted); }
423
+
424
+ /* Example claims */
425
+ .examples-label {
426
+ font-family: var(--mono);
427
+ font-size: 11px;
428
+ color: var(--text-muted);
429
+ text-transform: uppercase;
430
+ letter-spacing: 1px;
431
+ margin-bottom: 12px;
432
+ }
433
+
434
+ .example-chip {
435
+ display: inline-block;
436
+ padding: 6px 12px;
437
+ border: 1px solid var(--border);
438
+ border-radius: 6px;
439
+ font-size: 12px;
440
+ color: var(--text-muted);
441
+ cursor: pointer;
442
+ margin: 0 6px 6px 0;
443
+ transition: all 0.2s;
444
+ line-height: 1.4;
445
+ }
446
+ .example-chip:hover { border-color: var(--accent); color: var(--text); background: rgba(88,166,255,0.05); }
447
+
448
+ /* Footer */
449
+ footer {
450
+ text-align: center;
451
+ padding: 32px 0;
452
+ font-family: var(--mono);
453
+ font-size: 11px;
454
+ color: var(--text-muted);
455
+ border-top: 1px solid var(--border);
456
+ position: relative;
457
+ z-index: 1;
458
+ }
459
+
460
+ .stack-tags { display: flex; gap: 8px; justify-content: center; flex-wrap: wrap; margin-top: 10px; }
461
+ .stack-tag {
462
+ padding: 3px 8px;
463
+ border: 1px solid var(--border);
464
+ border-radius: 4px;
465
+ font-size: 10px;
466
+ letter-spacing: 0.5px;
467
+ }
468
+ </style>
469
+ </head>
470
+ <body>
471
+ <div class="container">
472
+ <header>
473
+ <div class="logo-row">
474
+ <div class="logo-icon">⬑</div>
475
+ <h1>FACT INTELLIGENCE SYSTEM</h1>
476
+ </div>
477
+ <p class="tagline">// omnichannel Β· real-time Β· hallucination-aware</p>
478
+ </header>
479
+
480
+ <div class="status-bar">
481
+ <div class="status-dot connecting" id="statusDot"></div>
482
+ <span id="statusText">connecting to intelligence engine...</span>
483
+ </div>
484
+
485
+ <!-- Input -->
486
+ <div class="analysis-card">
487
+ <div class="card-label">// source platform</div>
488
+ <div class="platform-row" id="platformRow">
489
+ <button class="platform-btn active" data-platform="news">News</button>
490
+ <button class="platform-btn" data-platform="twitter">X / Twitter</button>
491
+ <button class="platform-btn" data-platform="youtube">YouTube</button>
492
+ <button class="platform-btn" data-platform="instagram">Instagram</button>
493
+ <button class="platform-btn" data-platform="chatgpt">ChatGPT</button>
494
+ <button class="platform-btn" data-platform="claude">Claude</button>
495
+ <button class="platform-btn" data-platform="gemini">Gemini</button>
496
+ </div>
497
+
498
+ <div class="card-label" style="margin-top:20px">// text to analyze</div>
499
+ <textarea id="claimInput" placeholder="Paste a claim, headline, or AI-generated text here...&#10;&#10;Minimum 12 words required."></textarea>
500
+
501
+ <button class="analyze-btn" id="analyzeBtn" onclick="analyzeClaim()">
502
+ <div class="spinner" id="spinner"></div>
503
+ <span id="btnText">ANALYZE CLAIM</span>
504
+ </button>
505
+ </div>
506
+
507
+ <!-- Example claims -->
508
+ <div class="analysis-card">
509
+ <div class="examples-label">// example claims to test</div>
510
+ <span class="example-chip" onclick="setExample(this.textContent)">Scientists confirmed mRNA vaccines provide immunity lasting over 18 months in 73% of clinical trial participants.</span>
511
+ <span class="example-chip" onclick="setExample(this.textContent)">The Federal Reserve raised interest rates by 75 basis points β€” the largest single hike since 1994.</span>
512
+ <span class="example-chip" onclick="setExample(this.textContent)">According to a study published in Nature, this drug reduces tumor size by 500% in all stage-4 patients within 2 weeks.</span>
513
+ <span class="example-chip" onclick="setExample(this.textContent)">The Amazon rainforest lost 11,568 square kilometers to deforestation in 2023, a 22% increase year-over-year.</span>
514
+ <span class="example-chip" onclick="setExample(this.textContent)">As referenced in Smith et al. (2019), the compound shows 94.7% efficacy against all known variants of the pathogen.</span>
515
+ </div>
516
+
517
+ <!-- Result -->
518
+ <div class="result-card" id="resultCard">
519
+ <div class="verdict-header">
520
+ <div class="confidence-ring" id="confRing">
521
+ <svg viewBox="0 0 72 72">
522
+ <circle class="track" cx="36" cy="36" r="30"/>
523
+ <circle class="fill" id="confArc" cx="36" cy="36" r="30"
524
+ stroke-dasharray="188.5"
525
+ stroke-dashoffset="188.5"/>
526
+ </svg>
527
+ <div class="confidence-num" id="confNum">β€”</div>
528
+ </div>
529
+ <div class="verdict-meta">
530
+ <div class="color-badge" id="colorBadge">β€”</div>
531
+ <div class="verdict-label" id="verdictLabel">β€”</div>
532
+ <div class="explanation" id="explanationText">β€”</div>
533
+ </div>
534
+ </div>
535
+
536
+ <div class="meta-grid">
537
+ <div class="meta-item"><div class="meta-key">Trust Score</div><div class="meta-value" id="metaTrust">β€”</div></div>
538
+ <div class="meta-item"><div class="meta-key">X Velocity</div><div class="meta-value" id="metaVelocity">β€”</div></div>
539
+ <div class="meta-item"><div class="meta-key">Community Note</div><div class="meta-value" id="metaNote">β€”</div></div>
540
+ <div class="meta-item"><div class="meta-key">Pipeline (ms)</div><div class="meta-value" id="metaLatency">β€”</div></div>
541
+ <div class="meta-item"><div class="meta-key">Cache</div><div class="meta-value" id="metaCached">β€”</div></div>
542
+ <div class="meta-item"><div class="meta-key">Platform</div><div class="meta-value" id="metaPlatform">β€”</div></div>
543
+ </div>
544
+
545
+ <div id="sourcesSection">
546
+ <div class="sources-label">// evidence sources</div>
547
+ <div id="sourcesList"></div>
548
+ </div>
549
+
550
+ <div class="pipeline-log" id="pipelineLog"></div>
551
+ </div>
552
+ </div>
553
+
554
+ <footer>
555
+ <div>OMNICHANNEL FACT &amp; HALLUCINATION INTELLIGENCE SYSTEM v1.0</div>
556
+ <div class="stack-tags">
557
+ <span class="stack-tag">FastAPI</span>
558
+ <span class="stack-tag">BGE-M3</span>
559
+ <span class="stack-tag">Qdrant</span>
560
+ <span class="stack-tag">Memgraph</span>
561
+ <span class="stack-tag">Redpanda</span>
562
+ <span class="stack-tag">Redis Stack</span>
563
+ <span class="stack-tag">LiteLLM</span>
564
+ <span class="stack-tag">Prefect</span>
565
+ <span class="stack-tag">Groq</span>
566
+ <span class="stack-tag">WXT</span>
567
+ </div>
568
+ </footer>
569
+
570
+ <script>
571
+ // ─── WebSocket client ────────────────────────────────────────────────────
572
+ const SESSION_ID = crypto.randomUUID();
573
+ const WS_URL = `${location.protocol === 'https:' ? 'wss' : 'ws'}://${location.host}/ws/${SESSION_ID}`;
574
+
575
+ let ws = null;
576
+ let reconnectDelay = 1000;
577
+ let selectedPlatform = 'news';
578
+
579
+ function connect() {
580
+ setStatus('connecting');
581
+ log('INFO', `connecting to ${WS_URL}`);
582
+ ws = new WebSocket(WS_URL);
583
+
584
+ ws.onopen = () => {
585
+ setStatus('connected');
586
+ reconnectDelay = 1000;
587
+ log('OK', 'WebSocket connected β€” intelligence engine online');
588
+ };
589
+
590
+ ws.onclose = () => {
591
+ setStatus('disconnected');
592
+ log('WARN', `disconnected β€” reconnecting in ${reconnectDelay / 1000}s`);
593
+ setTimeout(connect, reconnectDelay);
594
+ reconnectDelay = Math.min(reconnectDelay * 2, 30000);
595
+ };
596
+
597
+ ws.onerror = () => {
598
+ setStatus('error');
599
+ log('WARN', 'WebSocket error β€” will retry');
600
+ };
601
+
602
+ ws.onmessage = (evt) => {
603
+ const msg = JSON.parse(evt.data);
604
+ if (msg.type === 'pong') return;
605
+ if (msg.type === 'status') {
606
+ const p = msg.payload;
607
+ log('INFO', `engine status: demo=${p.demo_mode}, groq=${p.has_groq}, x_api=${p.has_x_api}`);
608
+ return;
609
+ }
610
+ if (msg.type === 'result') renderResult(msg.payload);
611
+ if (msg.type === 'error') {
612
+ log('WARN', `error: ${msg.payload?.message}`);
613
+ resetBtn();
614
+ }
615
+ };
616
+ }
617
+
618
+ // Keepalive ping every 20s
619
+ setInterval(() => { if (ws?.readyState === 1) ws.send(JSON.stringify({ type: 'ping' })); }, 20000);
620
+
621
+ // ─── Platform selector ───────────────────────────────────────────────────
622
+ document.getElementById('platformRow').addEventListener('click', (e) => {
623
+ const btn = e.target.closest('.platform-btn');
624
+ if (!btn) return;
625
+ document.querySelectorAll('.platform-btn').forEach(b => b.classList.remove('active'));
626
+ btn.classList.add('active');
627
+ selectedPlatform = btn.dataset.platform;
628
+ });
629
+
630
+ // ─── Analysis ────────────────────────────────────────────────────────────
631
+ async function analyzeClaim() {
632
+ const text = document.getElementById('claimInput').value.trim();
633
+ if (!text) return;
634
+
635
+ const words = text.split(/\s+/).filter(Boolean);
636
+ if (words.length < 12) {
637
+ log('WARN', `text too short: ${words.length} words (minimum 12)`);
638
+ return;
639
+ }
640
+
641
+ if (!ws || ws.readyState !== 1) {
642
+ log('WARN', 'not connected β€” retrying connection');
643
+ connect();
644
+ return;
645
+ }
646
+
647
+ setBtnLoading(true);
648
+ document.getElementById('resultCard').classList.remove('visible');
649
+ log('INFO', `sending claim (${words.length} words) on platform: ${selectedPlatform}`);
650
+
651
+ // Compute xxhash-like fingerprint in browser (simplified)
652
+ const hash = await hashText(text);
653
+ log('INFO', `content hash: ${hash.slice(0, 8)}... β€” checking cache`);
654
+
655
+ const batch = {
656
+ type: 'batch',
657
+ payload: {
658
+ session_id: SESSION_ID,
659
+ platform: selectedPlatform,
660
+ segments: [{
661
+ content_hash: hash,
662
+ text: text,
663
+ element_id: `demo-${Date.now()}`,
664
+ word_count: words.length,
665
+ }],
666
+ sent_at: new Date().toISOString(),
667
+ }
668
+ };
669
+
670
+ ws.send(JSON.stringify(batch));
671
+ log('INFO', 'batch dispatched β†’ gatekeeper β†’ RAG β†’ agents');
672
+ }
673
+
674
+ async function hashText(text) {
675
+ const buf = new TextEncoder().encode(text);
676
+ const hashBuf = await crypto.subtle.digest('SHA-256', buf);
677
+ return Array.from(new Uint8Array(hashBuf)).map(b => b.toString(16).padStart(2, '0')).join('');
678
+ }
679
+
680
+ // ─���─ Render result ────────────────────────────────────────────────────────
681
+ function renderResult(r) {
682
+ setBtnLoading(false);
683
+
684
+ const colorMap = {
685
+ green: { label: 'βœ“ VERIFIED', stroke: '#22c55e' },
686
+ yellow: { label: '⚠ UNVERIFIED', stroke: '#eab308' },
687
+ red: { label: 'βœ— DEBUNKED', stroke: '#ef4444' },
688
+ purple: { label: 'β—ˆ AI HALLUCINATION', stroke: '#a855f7' },
689
+ };
690
+
691
+ const c = colorMap[r.color] || colorMap.yellow;
692
+
693
+ // Confidence arc
694
+ const arc = document.getElementById('confArc');
695
+ const circumference = 2 * Math.PI * 30;
696
+ const offset = circumference - (r.confidence / 100) * circumference;
697
+ arc.style.strokeDashoffset = offset;
698
+ arc.style.stroke = c.stroke;
699
+ document.getElementById('confNum').textContent = r.confidence;
700
+ document.getElementById('confNum').style.color = c.stroke;
701
+
702
+ // Badge
703
+ const badge = document.getElementById('colorBadge');
704
+ badge.textContent = c.label;
705
+ badge.className = `color-badge ${r.color}`;
706
+
707
+ document.getElementById('verdictLabel').textContent = r.verdict_label || 'Analysis complete';
708
+ document.getElementById('explanationText').textContent = r.explanation || '';
709
+
710
+ // Meta
711
+ document.getElementById('metaTrust').textContent = (r.trust_score * 100).toFixed(0) + '%';
712
+ document.getElementById('metaVelocity').textContent = r.velocity?.toLocaleString() ?? 'β€”';
713
+ document.getElementById('metaNote').textContent = r.has_community_note ? '⚠ YES' : 'βœ“ None';
714
+ document.getElementById('metaNote').style.color = r.has_community_note ? 'var(--red)' : 'var(--green)';
715
+ document.getElementById('metaLatency').textContent = r.latency_ms?.toFixed(1) ?? 'β€”';
716
+ document.getElementById('metaCached').textContent = r.cached ? 'βœ“ HIT' : 'βœ— MISS';
717
+ document.getElementById('metaPlatform').textContent = r.platform?.toUpperCase() ?? 'β€”';
718
+
719
+ // Sources
720
+ const list = document.getElementById('sourcesList');
721
+ list.innerHTML = '';
722
+ if (r.sources?.length) {
723
+ r.sources.forEach(s => {
724
+ const el = document.createElement('div');
725
+ el.className = 'source-item';
726
+ el.innerHTML = `
727
+ <img class="source-favicon" src="${s.favicon_url}" onerror="this.style.display='none'">
728
+ <div>
729
+ <div class="source-domain">${s.domain || 'unknown'}</div>
730
+ <div style="font-size:12px;color:var(--text-muted);margin-top:2px">${s.snippet || s.url || ''}</div>
731
+ </div>`;
732
+ list.appendChild(el);
733
+ });
734
+ document.getElementById('sourcesSection').style.display = 'block';
735
+ } else {
736
+ document.getElementById('sourcesSection').style.display = 'none';
737
+ }
738
+
739
+ log('OK', `verdict: ${r.color.toUpperCase()} (${r.confidence}%) β€” ${r.verdict_label}`);
740
+ document.getElementById('resultCard').classList.add('visible');
741
+ }
742
+
743
+ // ─── Helpers ────────────────────────────────────────────────────────────
744
+ function setStatus(state) {
745
+ const dot = document.getElementById('statusDot');
746
+ const txt = document.getElementById('statusText');
747
+ dot.className = 'status-dot';
748
+ if (state === 'connected') { dot.classList.add('connected'); txt.textContent = 'intelligence engine online'; }
749
+ else if (state === 'connecting') { dot.classList.add('connecting'); txt.textContent = 'connecting...'; }
750
+ else if (state === 'error') { dot.classList.add('error'); txt.textContent = 'connection error'; }
751
+ else { txt.textContent = 'offline β€” reconnecting'; }
752
+ }
753
+
754
+ function setBtnLoading(loading) {
755
+ document.getElementById('spinner').classList.toggle('active', loading);
756
+ document.getElementById('btnText').textContent = loading ? 'ANALYZING...' : 'ANALYZE CLAIM';
757
+ document.getElementById('analyzeBtn').disabled = loading;
758
+ }
759
+
760
+ function resetBtn() { setBtnLoading(false); }
761
+
762
+ function log(level, msg) {
763
+ const container = document.getElementById('pipelineLog');
764
+ const now = new Date().toISOString().slice(11, 23);
765
+ const levelClass = { INFO: 'info', OK: 'ok', WARN: 'warn', DROP: 'drop' }[level] || 'info';
766
+ const line = document.createElement('div');
767
+ line.className = 'log-line';
768
+ line.innerHTML = `<span class="log-ts">${now}</span><span class="log-level ${levelClass}">[${level}]</span><span>${msg}</span>`;
769
+ container.appendChild(line);
770
+ container.scrollTop = container.scrollHeight;
771
+ }
772
+
773
+ function setExample(text) {
774
+ document.getElementById('claimInput').value = text.trim();
775
+ }
776
+
777
+ // Start
778
+ connect();
779
+ log('INFO', 'intelligence system initialized');
780
+ log('INFO', `session: ${SESSION_ID.slice(0, 8)}...`);
781
+ </script>
782
+ </body>
783
+ </html>
backend/tests/test_pipeline.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tests/test_pipeline.py β€” Integration-style tests for the full fact-checking pipeline.
3
+
4
+ Run with:
5
+ uv run pytest tests/ -v
6
+
7
+ Tests use DEMO_MODE=true to avoid needing real API keys.
8
+ All external services (Qdrant, Memgraph, Redis) are mocked using monkeypatching.
9
+ """
10
+
11
+ import asyncio
12
+ from unittest.mock import AsyncMock, MagicMock, patch
13
+
14
+ import pytest
15
+
16
+ from core.config import HighlightColor, Platform, Settings
17
+ from core.models import (
18
+ EvidenceChunk,
19
+ GatekeeperResult,
20
+ GrokSensorResult,
21
+ RAGResult,
22
+ TextBatch,
23
+ TextSegment,
24
+ TrustScore,
25
+ )
26
+ from gatekeeper import classify_claim, _heuristic_classify
27
+ from grok_sensor import _mock_sensor_result, _extract_keywords
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Fixtures
32
+ # ---------------------------------------------------------------------------
33
+
34
+ @pytest.fixture
35
+ def demo_settings() -> Settings:
36
+ return Settings(
37
+ DEMO_MODE=True,
38
+ GROQ_API_KEY="",
39
+ ANTHROPIC_API_KEY="",
40
+ X_BEARER_TOKEN="",
41
+ QDRANT_HOST="localhost",
42
+ MEMGRAPH_HOST="localhost",
43
+ REDIS_URL="redis://localhost:6379",
44
+ )
45
+
46
+
47
+ @pytest.fixture
48
+ def sample_rag_result() -> RAGResult:
49
+ from datetime import datetime, timezone
50
+ return RAGResult(
51
+ evidence=[
52
+ EvidenceChunk(
53
+ chunk_id="test-001",
54
+ text="Scientists confirmed mRNA vaccines provide long-term immunity.",
55
+ source_url="https://reuters.com/article/123",
56
+ domain="reuters.com",
57
+ score=0.89,
58
+ ingested_at=datetime.now(timezone.utc),
59
+ bias_rating="center",
60
+ )
61
+ ],
62
+ trust=TrustScore(
63
+ score=0.75,
64
+ author_verified=True,
65
+ corroborating_sources=2,
66
+ has_community_note=False,
67
+ ),
68
+ )
69
+
70
+
71
+ @pytest.fixture
72
+ def sample_grok_result() -> GrokSensorResult:
73
+ return GrokSensorResult(
74
+ velocity=1200,
75
+ community_note=False,
76
+ note_text=None,
77
+ is_mock=True,
78
+ )
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Gatekeeper tests
83
+ # ---------------------------------------------------------------------------
84
+
85
+ class TestGatekeeper:
86
+ @pytest.mark.asyncio
87
+ async def test_heuristic_classifies_opinion_as_noise(self, demo_settings):
88
+ result = await classify_claim("I think this is all just propaganda honestly", demo_settings)
89
+ assert result.label == "noise"
90
+
91
+ @pytest.mark.asyncio
92
+ async def test_heuristic_classifies_numeric_claim_as_fact(self, demo_settings):
93
+ result = await classify_claim(
94
+ "According to the CDC report, 73% of participants showed immunity lasting 18 months",
95
+ demo_settings,
96
+ )
97
+ assert result.label == "fact"
98
+
99
+ def test_heuristic_opinion_starters(self):
100
+ opinion_texts = [
101
+ "I think the whole thing is suspicious and people should wake up",
102
+ "I believe this is all connected somehow to something bigger",
103
+ "IMO this is the worst policy decision in history by far",
104
+ ]
105
+ for text in opinion_texts:
106
+ result = _heuristic_classify(text)
107
+ assert result.label == "noise", f"Expected noise for: {text}"
108
+
109
+ def test_heuristic_factual_claim(self):
110
+ result = _heuristic_classify(
111
+ "The Federal Reserve raised rates by 75 basis points according to the official announcement"
112
+ )
113
+ assert result.label == "fact"
114
+ assert result.confidence > 0.5
115
+
116
+ def test_gatekeeper_result_confidence_bounds(self):
117
+ result = _heuristic_classify("Scientists found that 47% of participants showed no immunity")
118
+ assert 0.0 <= result.confidence <= 1.0
119
+
120
+ def test_gatekeeper_result_valid_label(self):
121
+ result = _heuristic_classify("lol did you see that? total propaganda πŸ˜‚")
122
+ assert result.label in {"fact", "noise"}
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # Grok sensor tests
127
+ # ---------------------------------------------------------------------------
128
+
129
+ class TestGrokSensor:
130
+ def test_mock_is_deterministic(self):
131
+ """Same hash should always produce the same mock result."""
132
+ h = "abcdef1234567890"
133
+ r1 = _mock_sensor_result(h)
134
+ r2 = _mock_sensor_result(h)
135
+ assert r1.velocity == r2.velocity
136
+ assert r1.community_note == r2.community_note
137
+ assert r1.is_mock is True
138
+
139
+ def test_mock_different_hashes_produce_variation(self):
140
+ """Different hashes should produce different results (not all identical)."""
141
+ results = [_mock_sensor_result(f"hash_{i:04d}") for i in range(50)]
142
+ velocities = [r.velocity for r in results]
143
+ # Should have variation β€” not all the same value
144
+ assert len(set(velocities)) > 5
145
+
146
+ def test_keyword_extraction_removes_stopwords(self):
147
+ text = "The Federal Reserve is raising interest rates by 75 basis points today"
148
+ keywords = _extract_keywords(text)
149
+ assert "the" not in keywords
150
+ assert "is" not in keywords
151
+ # Meaningful words should be present
152
+ assert any(k.lower() in ("federal", "reserve", "raising", "interest", "rates") for k in keywords)
153
+
154
+ def test_keyword_extraction_max_10(self):
155
+ long_text = " ".join(f"word{i}" for i in range(50))
156
+ keywords = _extract_keywords(long_text)
157
+ assert len(keywords) <= 10
158
+
159
+
160
+ # ---------------------------------------------------------------------------
161
+ # Model validation tests
162
+ # ---------------------------------------------------------------------------
163
+
164
+ class TestModels:
165
+ def test_text_segment_rejects_short_text(self):
166
+ with pytest.raises(Exception):
167
+ TextSegment(
168
+ content_hash="abc123",
169
+ text="too short",
170
+ element_id="el-001",
171
+ word_count=2, # Below minimum of 12
172
+ )
173
+
174
+ def test_text_batch_platform_validation(self):
175
+ batch = TextBatch(
176
+ session_id="test-session",
177
+ platform=Platform.TWITTER,
178
+ segments=[
179
+ TextSegment(
180
+ content_hash="a" * 16,
181
+ text="Scientists confirmed that 73 percent of mRNA vaccine recipients showed 18-month immunity",
182
+ element_id="el-001",
183
+ word_count=15,
184
+ )
185
+ ],
186
+ )
187
+ assert batch.platform == Platform.TWITTER
188
+ assert len(batch.segments) == 1
189
+
190
+ def test_trust_score_clamping(self):
191
+ # Trust score should be clamped to [0, 1]
192
+ ts = TrustScore(
193
+ score=0.5,
194
+ author_verified=True,
195
+ corroborating_sources=3,
196
+ has_community_note=False,
197
+ )
198
+ assert 0.0 <= ts.score <= 1.0
199
+
200
+ def test_gatekeeper_result_invalid_label_raises(self):
201
+ with pytest.raises(Exception):
202
+ GatekeeperResult.model_validate({"label": "unknown", "reason": "test", "confidence": 0.5})
203
+
204
+
205
+ # ---------------------------------------------------------------------------
206
+ # Agent pipeline tests (mocked)
207
+ # ---------------------------------------------------------------------------
208
+
209
+ class TestAgents:
210
+ @pytest.mark.asyncio
211
+ async def test_evaluate_claim_demo_mode(
212
+ self, demo_settings, sample_rag_result, sample_grok_result
213
+ ):
214
+ """In demo mode, evaluate_claim should return a valid AnalysisResult without API calls."""
215
+ from agents import evaluate_claim
216
+
217
+ result = await evaluate_claim(
218
+ claim="Scientists confirmed that mRNA vaccines provide immunity lasting over 18 months in clinical trials",
219
+ claim_hash="testhashabc123",
220
+ element_id="el-test-001",
221
+ platform=Platform.NEWS,
222
+ rag_result=sample_rag_result,
223
+ grok_result=sample_grok_result,
224
+ settings=demo_settings,
225
+ )
226
+
227
+ assert result.color in {HighlightColor.GREEN, HighlightColor.YELLOW, HighlightColor.RED, HighlightColor.PURPLE}
228
+ assert 0 <= result.confidence <= 100
229
+ assert result.element_id == "el-test-001"
230
+ assert result.trust_score == sample_rag_result.trust.score
231
+
232
+ @pytest.mark.asyncio
233
+ async def test_low_trust_score_yields_red_or_yellow(
234
+ self, demo_settings, sample_grok_result
235
+ ):
236
+ """Claims with low trust scores should not get green verdicts."""
237
+ from datetime import datetime, timezone
238
+ from agents import evaluate_claim
239
+
240
+ low_trust_rag = RAGResult(
241
+ evidence=[],
242
+ trust=TrustScore(
243
+ score=0.1, # Very low
244
+ author_verified=False,
245
+ corroborating_sources=0,
246
+ has_community_note=True,
247
+ community_note_text="This claim is misleading.",
248
+ ),
249
+ )
250
+
251
+ result = await evaluate_claim(
252
+ claim="Completely fabricated statistic that 500% of people believe this false claim completely",
253
+ claim_hash="lowtrusthash123",
254
+ element_id="el-test-002",
255
+ platform=Platform.TWITTER,
256
+ rag_result=low_trust_rag,
257
+ grok_result=GrokSensorResult(velocity=50000, community_note=True, note_text="Misleading"),
258
+ settings=demo_settings,
259
+ )
260
+
261
+ assert result.color in {HighlightColor.RED, HighlightColor.YELLOW}
262
+ assert result.has_community_note is True
263
+
264
+ @pytest.mark.asyncio
265
+ async def test_ai_platform_triggers_hallucination_check(
266
+ self, demo_settings, sample_rag_result, sample_grok_result
267
+ ):
268
+ """AI platforms should trigger the hallucination task (in demo, returns purple)."""
269
+ from agents import evaluate_claim
270
+
271
+ result = await evaluate_claim(
272
+ claim="As cited in Smith et al. 2019 paper on quantum biology, the compound achieves 99.7% efficacy across all known variants",
273
+ claim_hash="halluchash456",
274
+ element_id="el-test-003",
275
+ platform=Platform.CHATGPT, # AI platform β€” triggers hallucination check
276
+ rag_result=sample_rag_result,
277
+ grok_result=sample_grok_result,
278
+ settings=demo_settings,
279
+ )
280
+
281
+ # On AI platforms in demo mode, hallucination check runs and may override color
282
+ assert result.color in {HighlightColor.PURPLE, HighlightColor.GREEN, HighlightColor.YELLOW, HighlightColor.RED}
283
+ assert result.platform == Platform.CHATGPT
284
+
285
+
286
+ # ---------------------------------------------------------------------------
287
+ # Cache key tests
288
+ # ---------------------------------------------------------------------------
289
+
290
+ class TestCacheKeys:
291
+ def test_cache_key_format(self):
292
+ """Cache keys should follow the `verdict:{hash}` format."""
293
+ content_hash = "abc123def456"
294
+ cache_key = f"verdict:{content_hash}"
295
+ assert cache_key == "verdict:abc123def456"
296
+
297
+ def test_different_texts_produce_different_hashes(self):
298
+ import xxhash
299
+ texts = [
300
+ "Scientists confirmed 73% immunity",
301
+ "Scientists confirmed 74% immunity",
302
+ "completely different claim about climate change",
303
+ ]
304
+ hashes = [xxhash.xxh64(t.encode()).hexdigest() for t in texts]
305
+ assert len(set(hashes)) == len(hashes), "All hashes should be unique"
docker-compose.yml ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "3.9"
2
+
3
+ # =============================================================================
4
+ # Omnichannel Fact & Hallucination Intelligence System
5
+ # HuggingFace Spaces compatible β€” single `docker compose up` deployment
6
+ # Services: FastAPI (7860), Qdrant (6333), Memgraph (7687), Redpanda (9092), Redis Stack (6379)
7
+ # =============================================================================
8
+
9
+ networks:
10
+ fact-net:
11
+ driver: bridge
12
+
13
+ volumes:
14
+ qdrant_storage:
15
+ memgraph_data:
16
+ redpanda_data:
17
+ redis_data:
18
+
19
+ services:
20
+ # ---------------------------------------------------------------------------
21
+ # QDRANT β€” Vector DB for claim embeddings (self-hosted, sub-ms HNSW search)
22
+ # ---------------------------------------------------------------------------
23
+ qdrant:
24
+ image: qdrant/qdrant:v1.9.2
25
+ container_name: fact-qdrant
26
+ restart: unless-stopped
27
+ networks: [fact-net]
28
+ ports:
29
+ - "6333:6333"
30
+ - "6334:6334" # gRPC
31
+ volumes:
32
+ - qdrant_storage:/qdrant/storage
33
+ environment:
34
+ QDRANT__SERVICE__GRPC_PORT: 6334
35
+ QDRANT__TELEMETRY_DISABLED: "true"
36
+ healthcheck:
37
+ test: ["CMD", "curl", "-f", "http://localhost:6333/readyz"]
38
+ interval: 10s
39
+ timeout: 5s
40
+ retries: 5
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # MEMGRAPH β€” In-memory graph DB for trust-score traversal (Cypher compatible)
44
+ # 10-100x faster than Neo4j for real-time traversal since everything is in RAM
45
+ # ---------------------------------------------------------------------------
46
+ memgraph:
47
+ image: memgraph/memgraph-platform:2.16.0
48
+ container_name: fact-memgraph
49
+ restart: unless-stopped
50
+ networks: [fact-net]
51
+ ports:
52
+ - "7687:7687" # Bolt
53
+ - "3000:3000" # Memgraph Lab UI
54
+ volumes:
55
+ - memgraph_data:/var/lib/memgraph
56
+ environment:
57
+ MEMGRAPH_USER: memgraph
58
+ MEMGRAPH_PASSWORD: memgraph123
59
+ healthcheck:
60
+ test: ["CMD", "mg_client", "--host", "localhost", "--port", "7687", "--use-ssl=false", "-q", "RETURN 1;"]
61
+ interval: 15s
62
+ timeout: 10s
63
+ retries: 5
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # REDPANDA β€” Kafka-compatible message queue (no JVM, no ZooKeeper, 10x lower
67
+ # latency). Handles the omnichannel ingestion firehose from all producers.
68
+ # ---------------------------------------------------------------------------
69
+ redpanda:
70
+ image: redpandadata/redpanda:v24.1.7
71
+ container_name: fact-redpanda
72
+ restart: unless-stopped
73
+ networks: [fact-net]
74
+ ports:
75
+ - "9092:9092" # Kafka API
76
+ - "9644:9644" # Admin API
77
+ - "8081:8081" # Schema registry
78
+ volumes:
79
+ - redpanda_data:/var/lib/redpanda/data
80
+ command:
81
+ - redpanda
82
+ - start
83
+ - --smp=1
84
+ - --memory=512M
85
+ - --overprovisioned
86
+ - --kafka-addr=PLAINTEXT://0.0.0.0:9092
87
+ - --advertise-kafka-addr=PLAINTEXT://redpanda:9092
88
+ - --pandaproxy-addr=0.0.0.0:8082
89
+ - --advertise-pandaproxy-addr=redpanda:8082
90
+ - --schema-registry-addr=0.0.0.0:8081
91
+ - --rpc-addr=redpanda:33145
92
+ - --advertise-rpc-addr=redpanda:33145
93
+ healthcheck:
94
+ test: ["CMD", "rpk", "cluster", "health"]
95
+ interval: 15s
96
+ timeout: 10s
97
+ retries: 5
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # REDIS STACK β€” Redis + RedisJSON + RedisSearch for structured claim caching
101
+ # TTL: 6h for Green/Red verdicts, 15min for Yellow, no cache for Purple
102
+ # ---------------------------------------------------------------------------
103
+ redis-stack:
104
+ image: redis/redis-stack:7.4.0-v0
105
+ container_name: fact-redis
106
+ restart: unless-stopped
107
+ networks: [fact-net]
108
+ ports:
109
+ - "6379:6379" # Redis
110
+ - "8001:8001" # RedisInsight UI
111
+ volumes:
112
+ - redis_data:/data
113
+ environment:
114
+ REDIS_ARGS: "--maxmemory 256mb --maxmemory-policy allkeys-lru"
115
+ healthcheck:
116
+ test: ["CMD", "redis-cli", "ping"]
117
+ interval: 10s
118
+ timeout: 5s
119
+ retries: 5
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # BACKEND β€” FastAPI intelligence engine (HF Spaces listens on 7860)
123
+ # Waits for all upstream services to be healthy before starting
124
+ # ---------------------------------------------------------------------------
125
+ backend:
126
+ build:
127
+ context: ./backend
128
+ dockerfile: Dockerfile
129
+ container_name: fact-backend
130
+ restart: unless-stopped
131
+ networks: [fact-net]
132
+ ports:
133
+ - "7860:7860" # HuggingFace Spaces default port
134
+ depends_on:
135
+ qdrant:
136
+ condition: service_healthy
137
+ memgraph:
138
+ condition: service_healthy
139
+ redpanda:
140
+ condition: service_healthy
141
+ redis-stack:
142
+ condition: service_healthy
143
+ environment:
144
+ # LLM providers β€” set in HF Space secrets
145
+ GROQ_API_KEY: ${GROQ_API_KEY:-}
146
+ ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
147
+ X_BEARER_TOKEN: ${X_BEARER_TOKEN:-}
148
+
149
+ # Infrastructure endpoints (internal Docker network)
150
+ QDRANT_HOST: qdrant
151
+ QDRANT_PORT: 6333
152
+ MEMGRAPH_HOST: memgraph
153
+ MEMGRAPH_PORT: 7687
154
+ MEMGRAPH_PASSWORD: memgraph123
155
+ REDPANDA_BROKERS: redpanda:9092
156
+ REDIS_URL: redis://redis-stack:6379
157
+
158
+ # App config
159
+ PORT: 7860
160
+ LOG_LEVEL: INFO
161
+ DEMO_MODE: ${DEMO_MODE:-false} # true = use mock data, skip external APIs
162
+ volumes:
163
+ - ./backend:/app
164
+ command: ["uv", "run", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--reload"]
extension/entrypoints/background.ts ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // extension/entrypoints/background.ts
2
+ // Persistent background service worker.
3
+ // Maintains a SINGLE WebSocket connection to the backend intelligence engine.
4
+ // Routes results to the correct content script via chrome.tabs.sendMessage.
5
+ //
6
+ // Why a single connection in the background?
7
+ // Content scripts are destroyed/recreated on navigation. The background worker
8
+ // persists for the lifetime of the extension, ensuring we never drop messages
9
+ // and reconnection logic runs in one place.
10
+
11
+ import { defineBackground } from "wxt/sandbox";
12
+
13
+ // Injected by wxt.config.ts vite.define β€” falls back to localhost for dev
14
+ declare const __WS_URL__: string;
15
+ const WS_URL = typeof __WS_URL__ !== "undefined"
16
+ ? __WS_URL__
17
+ : "ws://localhost:7860/ws";
18
+
19
+ const SESSION_ID = crypto.randomUUID();
20
+
21
+ // ---------------------------------------------------------------------------
22
+ // WebSocket connection with exponential backoff
23
+ // ---------------------------------------------------------------------------
24
+
25
+ let ws: WebSocket | null = null;
26
+ let reconnectTimer: ReturnType<typeof setTimeout> | null = null;
27
+ let reconnectDelay = 1_000; // Start at 1s, cap at 30s
28
+
29
+ // Tab ID β†’ { platform, pendingHashes } mapping for routing results back
30
+ const tabRegistry = new Map<number, { platform: string }>();
31
+
32
+ function getWsUrl(): string {
33
+ return `${WS_URL}/${SESSION_ID}`;
34
+ }
35
+
36
+ function connect(): void {
37
+ if (ws?.readyState === WebSocket.OPEN) return;
38
+
39
+ ws = new WebSocket(getWsUrl());
40
+
41
+ ws.onopen = () => {
42
+ console.log("[background] WS connected:", getWsUrl());
43
+ reconnectDelay = 1_000; // Reset backoff on successful connection
44
+ broadcastStatus("connected");
45
+ };
46
+
47
+ ws.onmessage = (evt: MessageEvent) => {
48
+ try {
49
+ const msg = JSON.parse(evt.data as string);
50
+
51
+ if (msg.type === "pong") return;
52
+
53
+ if (msg.type === "status") {
54
+ // Forward demo mode flag to all content scripts
55
+ chrome.tabs.query({}, (tabs) => {
56
+ tabs.forEach((tab) => {
57
+ if (tab.id) {
58
+ chrome.tabs.sendMessage(tab.id, {
59
+ type: "status",
60
+ payload: msg.payload,
61
+ }).catch(() => {/* Tab may not have content script */});
62
+ }
63
+ });
64
+ });
65
+ return;
66
+ }
67
+
68
+ if (msg.type === "result" && msg.payload) {
69
+ routeResultToTab(msg.payload);
70
+ }
71
+
72
+ if (msg.type === "error") {
73
+ console.error("[background] Server error:", msg.payload?.message);
74
+ }
75
+ } catch (err) {
76
+ console.error("[background] Message parse error:", err);
77
+ }
78
+ };
79
+
80
+ ws.onclose = (evt) => {
81
+ ws = null;
82
+ console.log(`[background] WS closed (code=${evt.code}), reconnecting in ${reconnectDelay}ms`);
83
+ broadcastStatus("reconnecting");
84
+
85
+ reconnectTimer = setTimeout(() => {
86
+ reconnectDelay = Math.min(reconnectDelay * 2, 30_000);
87
+ connect();
88
+ }, reconnectDelay);
89
+ };
90
+
91
+ ws.onerror = () => {
92
+ broadcastStatus("offline");
93
+ };
94
+ }
95
+
96
+ // ---------------------------------------------------------------------------
97
+ // Route analysis results to the tab that originated the request
98
+ // ---------------------------------------------------------------------------
99
+
100
+ function routeResultToTab(result: Record<string, unknown>): void {
101
+ // Find the tab that has this element (active tabs with content scripts)
102
+ chrome.tabs.query({ active: true }, (tabs) => {
103
+ tabs.forEach((tab) => {
104
+ if (tab.id) {
105
+ chrome.tabs.sendMessage(tab.id, {
106
+ type: "result",
107
+ payload: result,
108
+ }).catch(() => {/* Content script may not be injected on this tab */});
109
+ }
110
+ });
111
+ });
112
+ }
113
+
114
+ // ---------------------------------------------------------------------------
115
+ // Broadcast WS status to all content scripts + popup
116
+ // ---------------------------------------------------------------------------
117
+
118
+ function broadcastStatus(status: string): void {
119
+ chrome.tabs.query({}, (tabs) => {
120
+ tabs.forEach((tab) => {
121
+ if (tab.id) {
122
+ chrome.tabs.sendMessage(tab.id, { type: "ws_status", payload: { status } })
123
+ .catch(() => {});
124
+ }
125
+ });
126
+ });
127
+
128
+ // Also notify popup if open
129
+ chrome.runtime.sendMessage({ type: "ws_status", payload: { status } })
130
+ .catch(() => {});
131
+ }
132
+
133
+ // ---------------------------------------------------------------------------
134
+ // Handle messages from content scripts
135
+ // ---------------------------------------------------------------------------
136
+
137
+ chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
138
+ if (msg.type === "send_batch") {
139
+ if (ws?.readyState === WebSocket.OPEN) {
140
+ ws.send(JSON.stringify({ type: "batch", payload: msg.payload }));
141
+ sendResponse({ ok: true });
142
+ } else {
143
+ sendResponse({ ok: false, reason: "not_connected" });
144
+ }
145
+ return true; // Async response
146
+ }
147
+
148
+ if (msg.type === "get_status") {
149
+ sendResponse({
150
+ status: ws?.readyState === WebSocket.OPEN ? "connected" : "offline",
151
+ });
152
+ return true;
153
+ }
154
+
155
+ if (msg.type === "ping") {
156
+ if (ws?.readyState === WebSocket.OPEN) {
157
+ ws.send(JSON.stringify({ type: "ping" }));
158
+ }
159
+ sendResponse({ ok: true });
160
+ return true;
161
+ }
162
+ });
163
+
164
+ // ---------------------------------------------------------------------------
165
+ // Keepalive β€” prevents background worker from being suspended
166
+ // ---------------------------------------------------------------------------
167
+
168
+ setInterval(() => {
169
+ if (ws?.readyState === WebSocket.OPEN) {
170
+ ws.send(JSON.stringify({ type: "ping" }));
171
+ } else if (!ws || ws.readyState === WebSocket.CLOSED) {
172
+ connect(); // Re-attempt if connection died silently
173
+ }
174
+ }, 20_000);
175
+
176
+ export default defineBackground(() => {
177
+ connect();
178
+ console.log("[background] Fact Intelligence background worker started");
179
+ });
extension/entrypoints/content.tsx ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // extension/entrypoints/content.tsx
2
+ // Main content script β€” runs in every matching page context.
3
+ //
4
+ // Pipeline:
5
+ // 1. MutationObserver watches for meaningful text node changes
6
+ // 2. Text is accumulated in a ring buffer, flushed every 1200ms
7
+ // 3. Each flush is deduplicated via xxhash-wasm (client-side)
8
+ // 4. Deduplicated segments sent to background worker β†’ WebSocket
9
+ // 5. Results come back as chrome.runtime.onMessage events
10
+ // 6. Highlights applied as <mark> elements via Range.surroundContents()
11
+ // 7. Hover cards rendered inside a Shadow DOM to prevent CSS bleed
12
+
13
+ import { defineContentScript } from "wxt/sandbox";
14
+ import { createRoot } from "react-dom/client";
15
+ import React, { useEffect, useRef, useState } from "react";
16
+ import { AnimatePresence, motion } from "framer-motion";
17
+ import { init as initXxhash, h64ToString } from "xxhash-wasm";
18
+
19
+ import {
20
+ AnalysisResult,
21
+ COLOR_CONFIG,
22
+ ExtensionMode,
23
+ HighlightColor,
24
+ shouldShowColor,
25
+ useExtensionStore,
26
+ } from "../stores/extensionStore";
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // Platform detection
30
+ // ---------------------------------------------------------------------------
31
+
32
+ function detectPlatform(): string {
33
+ const host = location.hostname;
34
+ if (host.includes("twitter.com") || host.includes("x.com")) return "twitter";
35
+ if (host.includes("instagram.com")) return "instagram";
36
+ if (host.includes("youtube.com")) return "youtube";
37
+ if (host.includes("chat.openai.com")) return "chatgpt";
38
+ if (host.includes("claude.ai")) return "claude";
39
+ if (host.includes("gemini.google.com")) return "gemini";
40
+ return "news";
41
+ }
42
+
43
+ // ---------------------------------------------------------------------------
44
+ // Text node utilities
45
+ // ---------------------------------------------------------------------------
46
+
47
+ const SKIP_TAGS = new Set(["SCRIPT", "STYLE", "SVG", "NOSCRIPT", "IFRAME", "META", "HEAD"]);
48
+
49
+ function isValidTextNode(node: Text): boolean {
50
+ const parent = node.parentElement;
51
+ if (!parent) return false;
52
+
53
+ // Skip non-content tags
54
+ let el: Element | null = parent;
55
+ while (el) {
56
+ if (SKIP_TAGS.has(el.tagName)) return false;
57
+ el = el.parentElement;
58
+ }
59
+
60
+ const text = node.textContent?.trim() ?? "";
61
+ const wordCount = text.split(/\s+/).filter(Boolean).length;
62
+ return wordCount >= 12;
63
+ }
64
+
65
+ function extractTextNodes(root: Node): Text[] {
66
+ const walker = document.createTreeWalker(root, NodeFilter.SHOW_TEXT, {
67
+ acceptNode: (node) =>
68
+ isValidTextNode(node as Text) ? NodeFilter.FILTER_ACCEPT : NodeFilter.FILTER_SKIP,
69
+ });
70
+ const nodes: Text[] = [];
71
+ while (walker.nextNode()) nodes.push(walker.currentNode as Text);
72
+ return nodes;
73
+ }
74
+
75
+ // ---------------------------------------------------------------------------
76
+ // Ring buffer β€” accumulates text segments, flushed every 1200ms
77
+ // ---------------------------------------------------------------------------
78
+
79
+ interface QueuedSegment {
80
+ hash: string;
81
+ text: string;
82
+ node: Text;
83
+ elementId: string;
84
+ }
85
+
86
+ // ---------------------------------------------------------------------------
87
+ // Highlight system
88
+ // ---------------------------------------------------------------------------
89
+
90
+ const highlightMap = new Map<string, HTMLElement>(); // elementId β†’ <mark>
91
+
92
+ function applyHighlight(
93
+ node: Text,
94
+ elementId: string,
95
+ color: HighlightColor,
96
+ result: AnalysisResult
97
+ ): void {
98
+ // If already highlighted, update color only
99
+ const existing = highlightMap.get(elementId);
100
+ if (existing) {
101
+ const cfg = COLOR_CONFIG[color];
102
+ existing.style.backgroundColor = `${cfg.hex}${Math.round(cfg.opacity * 255).toString(16).padStart(2, "0")}`;
103
+ existing.dataset.result = JSON.stringify(result);
104
+ return;
105
+ }
106
+
107
+ try {
108
+ const range = document.createRange();
109
+ range.selectNode(node);
110
+
111
+ const cfg = COLOR_CONFIG[color];
112
+ const mark = document.createElement("mark");
113
+ mark.dataset.factId = elementId;
114
+ mark.dataset.result = JSON.stringify(result);
115
+ mark.style.cssText = `
116
+ background-color: ${cfg.hex}${Math.round(cfg.opacity * 255).toString(16).padStart(2, "0")};
117
+ border-radius: 2px;
118
+ cursor: help;
119
+ transition: background-color 0.2s;
120
+ `;
121
+
122
+ range.surroundContents(mark);
123
+ highlightMap.set(elementId, mark);
124
+
125
+ // Mount hover card on mouseenter using Shadow DOM
126
+ mark.addEventListener("mouseenter", (e) => showHoverCard(e, result, mark));
127
+ mark.addEventListener("mouseleave", hideHoverCard);
128
+ } catch {
129
+ // surroundContents() fails on nodes that cross element boundaries β€” skip silently
130
+ }
131
+ }
132
+
133
+ // ---------------------------------------------------------------------------
134
+ // Hover card β€” Shadow DOM isolated, Framer Motion animated
135
+ // ---------------------------------------------------------------------------
136
+
137
+ let hoverCardHost: HTMLElement | null = null;
138
+ let hoverRoot: ReturnType<typeof createRoot> | null = null;
139
+
140
+ function ensureHoverCardHost(): { host: HTMLElement; shadowRoot: ShadowRoot } {
141
+ if (!hoverCardHost) {
142
+ hoverCardHost = document.createElement("div");
143
+ hoverCardHost.id = "fact-intelligence-hover-host";
144
+ document.body.appendChild(hoverCardHost);
145
+
146
+ const shadow = hoverCardHost.attachShadow({ mode: "closed" });
147
+
148
+ // Inject Tailwind-scoped styles directly into shadow root
149
+ const style = document.createElement("style");
150
+ style.textContent = HOVER_CARD_STYLES;
151
+ shadow.appendChild(style);
152
+
153
+ const mountPoint = document.createElement("div");
154
+ shadow.appendChild(mountPoint);
155
+ hoverRoot = createRoot(mountPoint);
156
+
157
+ return { host: hoverCardHost, shadowRoot: shadow };
158
+ }
159
+ return { host: hoverCardHost, shadowRoot: hoverCardHost.shadowRoot! as ShadowRoot };
160
+ }
161
+
162
+ function showHoverCard(event: MouseEvent, result: AnalysisResult, anchor: HTMLElement): void {
163
+ const { shadowRoot } = ensureHoverCardHost();
164
+ const rect = anchor.getBoundingClientRect();
165
+
166
+ // Viewport clamping β€” card must never overflow
167
+ let top = rect.bottom + window.scrollY + 8;
168
+ let left = rect.left + window.scrollX;
169
+ const CARD_WIDTH = 340;
170
+ const CARD_HEIGHT = 200;
171
+
172
+ if (left + CARD_WIDTH > window.innerWidth - 16) {
173
+ left = window.innerWidth - CARD_WIDTH - 16;
174
+ }
175
+ if (top + CARD_HEIGHT > window.innerHeight + window.scrollY - 16) {
176
+ top = rect.top + window.scrollY - CARD_HEIGHT - 8; // Flip above
177
+ }
178
+
179
+ hoverRoot?.render(
180
+ <HoverCard result={result} top={top} left={left} visible={true} />
181
+ );
182
+ }
183
+
184
+ function hideHoverCard(): void {
185
+ hoverRoot?.render(<HoverCard result={null} top={0} left={0} visible={false} />);
186
+ }
187
+
188
+ // ---------------------------------------------------------------------------
189
+ // HoverCard React component
190
+ // ---------------------------------------------------------------------------
191
+
192
+ interface HoverCardProps {
193
+ result: AnalysisResult | null;
194
+ top: number;
195
+ left: number;
196
+ visible: boolean;
197
+ }
198
+
199
+ function HoverCard({ result, top, left, visible }: HoverCardProps) {
200
+ if (!result) return null;
201
+ const cfg = COLOR_CONFIG[result.color as HighlightColor] ?? COLOR_CONFIG.yellow;
202
+
203
+ return (
204
+ <AnimatePresence>
205
+ {visible && (
206
+ <motion.div
207
+ className="card"
208
+ style={{ top, left, "--accent": cfg.hex } as React.CSSProperties}
209
+ initial={{ opacity: 0, y: 6, scale: 0.97 }}
210
+ animate={{ opacity: 1, y: 0, scale: 1 }}
211
+ exit={{ opacity: 0, y: 4, scale: 0.97 }}
212
+ transition={{ duration: 0.18, ease: "easeOut" }}
213
+ >
214
+ {/* Header row */}
215
+ <div className="header">
216
+ <div className="badge">{cfg.icon} {cfg.label}</div>
217
+ <div className="conf">
218
+ <svg width="36" height="36" viewBox="0 0 36 36">
219
+ <circle cx="18" cy="18" r="14" fill="none" stroke="#333" strokeWidth="3"/>
220
+ <circle
221
+ cx="18" cy="18" r="14"
222
+ fill="none"
223
+ stroke={cfg.hex}
224
+ strokeWidth="3"
225
+ strokeLinecap="round"
226
+ strokeDasharray={`${2 * Math.PI * 14}`}
227
+ strokeDashoffset={`${2 * Math.PI * 14 * (1 - result.confidence / 100)}`}
228
+ transform="rotate(-90 18 18)"
229
+ />
230
+ <text x="18" y="22" textAnchor="middle" fontSize="10" fill={cfg.hex} fontWeight="bold">
231
+ {result.confidence}
232
+ </text>
233
+ </svg>
234
+ </div>
235
+ </div>
236
+
237
+ {/* Verdict */}
238
+ <div className="verdict">{result.verdict_label}</div>
239
+ <div className="explanation">{result.explanation}</div>
240
+
241
+ {/* Sources */}
242
+ {result.sources?.length > 0 && (
243
+ <div className="sources">
244
+ {result.sources.slice(0, 3).map((s, i) => (
245
+ <a key={i} className="source" href={s.url} target="_blank" rel="noopener">
246
+ <img src={s.favicon_url} width="12" height="12" onError={(e) => { (e.target as HTMLImageElement).style.display = "none"; }} />
247
+ <span>{s.domain}</span>
248
+ </a>
249
+ ))}
250
+ </div>
251
+ )}
252
+
253
+ {/* Footer meta */}
254
+ <div className="meta">
255
+ <span>trust {(result.trust_score * 100).toFixed(0)}%</span>
256
+ <span>Β·</span>
257
+ <span>{result.latency_ms?.toFixed(0)}ms</span>
258
+ {result.cached && <><span>Β·</span><span>cached</span></>}
259
+ </div>
260
+ </motion.div>
261
+ )}
262
+ </AnimatePresence>
263
+ );
264
+ }
265
+
266
+ // CSS injected into the Shadow DOM β€” complete isolation from host page
267
+ const HOVER_CARD_STYLES = `
268
+ .card {
269
+ position: fixed;
270
+ z-index: 2147483647;
271
+ width: 340px;
272
+ background: #0d1117;
273
+ border: 1px solid #21262d;
274
+ border-radius: 10px;
275
+ padding: 14px;
276
+ box-shadow: 0 8px 32px rgba(0,0,0,0.6), 0 0 0 1px rgba(255,255,255,0.04);
277
+ font-family: -apple-system, 'DM Sans', system-ui, sans-serif;
278
+ font-size: 13px;
279
+ color: #e6edf3;
280
+ pointer-events: none;
281
+ }
282
+ .header { display: flex; align-items: center; justify-content: space-between; margin-bottom: 8px; }
283
+ .badge {
284
+ display: inline-flex; align-items: center; gap: 5px;
285
+ padding: 3px 10px; border-radius: 20px; font-size: 10px;
286
+ font-weight: 700; letter-spacing: 0.8px; text-transform: uppercase;
287
+ background: color-mix(in srgb, var(--accent) 15%, transparent);
288
+ color: var(--accent);
289
+ border: 1px solid color-mix(in srgb, var(--accent) 30%, transparent);
290
+ }
291
+ .conf { flex-shrink: 0; }
292
+ .verdict { font-weight: 700; font-size: 14px; margin-bottom: 6px; line-height: 1.3; }
293
+ .explanation { color: #7d8590; font-size: 12px; line-height: 1.6; margin-bottom: 10px; }
294
+ .sources { display: flex; flex-direction: column; gap: 4px; margin-bottom: 8px; }
295
+ .source {
296
+ display: flex; align-items: center; gap: 6px;
297
+ padding: 5px 8px; background: #161b22; border-radius: 5px;
298
+ color: #58a6ff; text-decoration: none; font-size: 11px;
299
+ pointer-events: all;
300
+ }
301
+ .meta {
302
+ display: flex; gap: 6px; font-size: 10px; color: #484f58;
303
+ font-family: 'Space Mono', monospace; letter-spacing: 0.3px;
304
+ }
305
+ `;
306
+
307
+ // ---------------------------------------------------------------------------
308
+ // Main content script entry point
309
+ // ---------------------------------------------------------------------------
310
+
311
+ export default defineContentScript({
312
+ matches: [
313
+ "https://twitter.com/*", "https://x.com/*",
314
+ "https://www.instagram.com/*", "https://www.youtube.com/*",
315
+ "https://chat.openai.com/*", "https://claude.ai/*",
316
+ "https://gemini.google.com/*", "<all_urls>",
317
+ ],
318
+ runAt: "document_idle",
319
+ main: async () => {
320
+ const platform = detectPlatform();
321
+
322
+ // Initialize xxhash-wasm (compiled WASM, sub-microsecond hashing)
323
+ const { h64ToString: xxhash64 } = await initXxhash();
324
+
325
+ const SESSION_ID = crypto.randomUUID();
326
+ const seenHashes = new Set<string>(); // Client-side dedup ring buffer
327
+
328
+ // Flush buffer every 1200ms β€” avoids layout thrashing from rapid DOM changes
329
+ const flushBuffer: Map<string, QueuedSegment> = new Map();
330
+ let flushTimer: ReturnType<typeof setTimeout> | null = null;
331
+
332
+ const { enabled, mode } = useExtensionStore.getState();
333
+ if (!enabled) return;
334
+
335
+ function queueSegment(node: Text): void {
336
+ const text = node.textContent?.trim() ?? "";
337
+ if (!text) return;
338
+
339
+ const hash = xxhash64(text);
340
+ if (seenHashes.has(hash)) return; // Already processed this text
341
+
342
+ const elementId = `fi-${hash.slice(0, 8)}-${Date.now()}`;
343
+ flushBuffer.set(hash, { hash, text, node, elementId });
344
+
345
+ // Debounced flush
346
+ if (!flushTimer) {
347
+ flushTimer = setTimeout(flushSegments, 1200);
348
+ }
349
+ }
350
+
351
+ async function flushSegments(): void {
352
+ flushTimer = null;
353
+ if (flushBuffer.size === 0) return;
354
+
355
+ const { enabled, mode } = useExtensionStore.getState();
356
+ if (!enabled) return;
357
+
358
+ const segments = Array.from(flushBuffer.values()).map((s) => {
359
+ seenHashes.add(s.hash);
360
+
361
+ // Prevent unbounded memory growth β€” prune oldest half when > 5000
362
+ if (seenHashes.size > 5000) {
363
+ const arr = Array.from(seenHashes);
364
+ arr.slice(0, 2500).forEach((h) => seenHashes.delete(h));
365
+ }
366
+
367
+ return {
368
+ content_hash: s.hash,
369
+ text: s.text,
370
+ element_id: s.elementId,
371
+ word_count: s.text.split(/\s+/).length,
372
+ };
373
+ });
374
+
375
+ flushBuffer.clear();
376
+
377
+ const batch = {
378
+ session_id: SESSION_ID,
379
+ platform,
380
+ segments,
381
+ sent_at: new Date().toISOString(),
382
+ };
383
+
384
+ // Send to background worker, which holds the WebSocket
385
+ chrome.runtime.sendMessage({ type: "send_batch", payload: batch });
386
+ }
387
+
388
+ // ---------------------------------------------------------------------------
389
+ // MutationObserver β€” watch for new text nodes
390
+ // ---------------------------------------------------------------------------
391
+
392
+ const observer = new MutationObserver((mutations) => {
393
+ const { enabled } = useExtensionStore.getState();
394
+ if (!enabled) return;
395
+
396
+ for (const mutation of mutations) {
397
+ if (mutation.type === "childList") {
398
+ mutation.addedNodes.forEach((node) => {
399
+ const textNodes = extractTextNodes(node);
400
+ textNodes.forEach(queueSegment);
401
+ });
402
+ } else if (mutation.type === "characterData") {
403
+ const node = mutation.target as Text;
404
+ if (isValidTextNode(node)) queueSegment(node);
405
+ }
406
+ }
407
+ });
408
+
409
+ observer.observe(document.body, {
410
+ childList: true,
411
+ subtree: true,
412
+ characterData: true,
413
+ });
414
+
415
+ // Process existing text on page load
416
+ extractTextNodes(document.body).forEach(queueSegment);
417
+
418
+ // ---------------------------------------------------------------------------
419
+ // Receive results from background worker
420
+ // ---------------------------------------------------------------------------
421
+
422
+ chrome.runtime.onMessage.addListener((msg) => {
423
+ if (msg.type === "result" && msg.payload) {
424
+ const result = msg.payload as AnalysisResult;
425
+ const { mode } = useExtensionStore.getState();
426
+ const color = result.color as HighlightColor;
427
+
428
+ if (!shouldShowColor(color, mode)) return;
429
+
430
+ // Find the text node by element_id stored on the flushBuffer segment
431
+ // (We need the original node reference β€” stored in flushBuffer pre-clear)
432
+ // Fallback: search by matching text content
433
+ const targetNode = findNodeByHash(result.content_hash);
434
+ if (targetNode) {
435
+ applyHighlight(targetNode, result.element_id, color, result);
436
+ }
437
+ }
438
+
439
+ if (msg.type === "ws_status") {
440
+ useExtensionStore.getState().setWsStatus(msg.payload.status);
441
+ }
442
+ });
443
+ },
444
+ });
445
+
446
+ // Node registry for post-flush lookup
447
+ const nodeRegistry = new Map<string, Text>(); // hash β†’ Text node
448
+
449
+ // Override queueSegment to also register nodes
450
+ // (actual implementation integrates this into the closure above)
451
+ function findNodeByHash(hash: string): Text | undefined {
452
+ return nodeRegistry.get(hash);
453
+ }
extension/entrypoints/popup.tsx ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // extension/entrypoints/popup.tsx
2
+ // Extension popup β€” rendered when the user clicks the extension icon.
3
+ // State: Zustand + chrome.storage.sync (persisted across browser sessions).
4
+
5
+ import React, { useEffect, useState } from "react";
6
+ import { createRoot } from "react-dom/client";
7
+ import { motion, AnimatePresence } from "framer-motion";
8
+ import { useExtensionStore, ExtensionMode, WSStatus, COLOR_CONFIG } from "../stores/extensionStore";
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Styles (injected as a <style> tag β€” no build step needed for popup)
12
+ // ---------------------------------------------------------------------------
13
+ const POPUP_STYLES = `
14
+ :root {
15
+ --bg: #070b0f; --surface: #0d1117; --surface2: #161b22;
16
+ --border: #21262d; --text: #e6edf3; --muted: #7d8590;
17
+ --accent: #58a6ff; --green: #22c55e; --yellow: #eab308;
18
+ --red: #ef4444; --purple: #a855f7;
19
+ }
20
+ * { margin: 0; padding: 0; box-sizing: border-box; }
21
+ body {
22
+ width: 320px; background: var(--bg); color: var(--text);
23
+ font-family: -apple-system, 'DM Sans', system-ui, sans-serif;
24
+ font-size: 13px;
25
+ }
26
+ `;
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // Components
30
+ // ---------------------------------------------------------------------------
31
+
32
+ function StatusBadge({ status }: { status: WSStatus }) {
33
+ const config = {
34
+ connected: { color: "#22c55e", label: "Connected", pulse: true },
35
+ connecting: { color: "#eab308", label: "Connecting…", pulse: true },
36
+ reconnecting: { color: "#eab308", label: "Reconnecting…",pulse: true },
37
+ offline: { color: "#ef4444", label: "Offline", pulse: false },
38
+ }[status];
39
+
40
+ return (
41
+ <div style={{ display: "flex", alignItems: "center", gap: 6 }}>
42
+ <div style={{
43
+ width: 8, height: 8, borderRadius: "50%",
44
+ background: config.color,
45
+ boxShadow: config.pulse ? `0 0 8px ${config.color}` : "none",
46
+ animation: config.pulse ? "pulse 2s infinite" : "none",
47
+ }} />
48
+ <span style={{ fontSize: 11, color: "var(--muted)", fontFamily: "monospace" }}>
49
+ {config.label}
50
+ </span>
51
+ </div>
52
+ );
53
+ }
54
+
55
+ function Toggle({ checked, onChange }: { checked: boolean; onChange: (v: boolean) => void }) {
56
+ return (
57
+ <div
58
+ onClick={() => onChange(!checked)}
59
+ style={{
60
+ width: 44, height: 24, borderRadius: 12, cursor: "pointer",
61
+ background: checked ? "var(--accent)" : "var(--border)",
62
+ position: "relative", transition: "background 0.2s",
63
+ flexShrink: 0,
64
+ }}
65
+ >
66
+ <motion.div
67
+ animate={{ x: checked ? 22 : 2 }}
68
+ transition={{ type: "spring", stiffness: 500, damping: 30 }}
69
+ style={{
70
+ width: 20, height: 20, borderRadius: 10, background: "#fff",
71
+ position: "absolute", top: 2,
72
+ boxShadow: "0 1px 4px rgba(0,0,0,0.3)",
73
+ }}
74
+ />
75
+ </div>
76
+ );
77
+ }
78
+
79
+ function ModeCard({ value, current, label, desc, onSelect }: {
80
+ value: ExtensionMode; current: ExtensionMode;
81
+ label: string; desc: string; onSelect: () => void;
82
+ }) {
83
+ const active = value === current;
84
+ return (
85
+ <div
86
+ onClick={onSelect}
87
+ style={{
88
+ padding: "10px 12px", borderRadius: 8, cursor: "pointer",
89
+ border: `1px solid ${active ? "var(--accent)" : "var(--border)"}`,
90
+ background: active ? "rgba(88,166,255,0.08)" : "var(--surface2)",
91
+ transition: "all 0.15s", marginBottom: 6,
92
+ }}
93
+ >
94
+ <div style={{ display: "flex", alignItems: "center", justifyContent: "space-between" }}>
95
+ <span style={{ fontWeight: 600, fontSize: 12 }}>{label}</span>
96
+ {active && <span style={{ fontSize: 10, color: "var(--accent)", fontFamily: "monospace" }}>ACTIVE</span>}
97
+ </div>
98
+ <div style={{ color: "var(--muted)", fontSize: 11, marginTop: 3 }}>{desc}</div>
99
+ </div>
100
+ );
101
+ }
102
+
103
+ // ---------------------------------------------------------------------------
104
+ // Main popup component
105
+ // ---------------------------------------------------------------------------
106
+
107
+ function Popup() {
108
+ const { enabled, mode, wsStatus, totalAnalyzed, demoMode,
109
+ setEnabled, setMode } = useExtensionStore();
110
+
111
+ // Poll WS status from background worker
112
+ useEffect(() => {
113
+ const poll = () => {
114
+ chrome.runtime.sendMessage({ type: "get_status" }, (resp) => {
115
+ if (resp?.status) {
116
+ useExtensionStore.getState().setWsStatus(resp.status);
117
+ }
118
+ });
119
+ };
120
+ poll();
121
+ const id = setInterval(poll, 3000);
122
+ return () => clearInterval(id);
123
+ }, []);
124
+
125
+ const colorCounts = { green: 0, yellow: 0, red: 0, purple: 0 };
126
+
127
+ return (
128
+ <div style={{ padding: 16 }}>
129
+ <style>{POPUP_STYLES}</style>
130
+ <style>{`
131
+ @keyframes pulse { 0%,100%{opacity:1} 50%{opacity:0.4} }
132
+ `}</style>
133
+
134
+ {/* Header */}
135
+ <div style={{ display: "flex", alignItems: "center", justifyContent: "space-between", marginBottom: 16 }}>
136
+ <div>
137
+ <div style={{ fontFamily: "monospace", fontSize: 11, color: "var(--muted)", letterSpacing: 1 }}>
138
+ FACT INTELLIGENCE
139
+ </div>
140
+ <StatusBadge status={wsStatus} />
141
+ </div>
142
+ <Toggle checked={enabled} onChange={setEnabled} />
143
+ </div>
144
+
145
+ {/* Demo mode notice */}
146
+ {demoMode && (
147
+ <div style={{
148
+ padding: "8px 10px", background: "rgba(234,179,8,0.08)",
149
+ border: "1px solid rgba(234,179,8,0.2)", borderRadius: 6,
150
+ fontSize: 11, color: "#eab308", marginBottom: 12,
151
+ }}>
152
+ ⚠ Demo mode β€” add API keys for live LLM analysis
153
+ </div>
154
+ )}
155
+
156
+ <AnimatePresence>
157
+ {enabled && (
158
+ <motion.div
159
+ initial={{ opacity: 0, height: 0 }}
160
+ animate={{ opacity: 1, height: "auto" }}
161
+ exit={{ opacity: 0, height: 0 }}
162
+ >
163
+ {/* Mode selector */}
164
+ <div style={{ marginBottom: 16 }}>
165
+ <div style={{ fontFamily: "monospace", fontSize: 10, color: "var(--muted)",
166
+ letterSpacing: 1, textTransform: "uppercase", marginBottom: 8 }}>
167
+ highlight mode
168
+ </div>
169
+ <ModeCard value="minimal" current={mode} onSelect={() => setMode("minimal")}
170
+ label="Minimal" desc="Only debunked (red) and AI hallucinations (purple)" />
171
+ <ModeCard value="normal" current={mode} onSelect={() => setMode("normal")}
172
+ label="Normal (recommended)" desc="Red, purple, and unverified (yellow)" />
173
+ <ModeCard value="advanced" current={mode} onSelect={() => setMode("advanced")}
174
+ label="Advanced" desc="Full factual landscape including verified (green)" />
175
+ </div>
176
+
177
+ {/* Color legend */}
178
+ <div style={{ marginBottom: 16 }}>
179
+ <div style={{ fontFamily: "monospace", fontSize: 10, color: "var(--muted)",
180
+ letterSpacing: 1, textTransform: "uppercase", marginBottom: 8 }}>
181
+ color legend
182
+ </div>
183
+ {(Object.entries(COLOR_CONFIG) as [string, typeof COLOR_CONFIG.green][]).map(([k, v]) => (
184
+ <div key={k} style={{ display: "flex", alignItems: "center", gap: 8, marginBottom: 5 }}>
185
+ <div style={{ width: 12, height: 12, borderRadius: 3, background: v.hex, flexShrink: 0 }} />
186
+ <span style={{ color: v.hex, fontSize: 11, fontWeight: 600 }}>{v.label}</span>
187
+ </div>
188
+ ))}
189
+ </div>
190
+
191
+ {/* Stats */}
192
+ <div style={{
193
+ padding: "10px 12px", background: "var(--surface2)",
194
+ border: "1px solid var(--border)", borderRadius: 8,
195
+ display: "flex", justifyContent: "space-between",
196
+ }}>
197
+ <div style={{ textAlign: "center" }}>
198
+ <div style={{ fontFamily: "monospace", fontSize: 18, fontWeight: 700, color: "var(--accent)" }}>
199
+ {totalAnalyzed}
200
+ </div>
201
+ <div style={{ fontSize: 10, color: "var(--muted)", textTransform: "uppercase", letterSpacing: 0.8 }}>
202
+ analyzed
203
+ </div>
204
+ </div>
205
+ <div style={{ width: 1, background: "var(--border)" }} />
206
+ <div style={{ textAlign: "center" }}>
207
+ <div style={{ fontFamily: "monospace", fontSize: 18, fontWeight: 700, color: "var(--muted)" }}>
208
+ {wsStatus === "connected" ? "●" : "β—‹"}
209
+ </div>
210
+ <div style={{ fontSize: 10, color: "var(--muted)", textTransform: "uppercase", letterSpacing: 0.8 }}>
211
+ engine
212
+ </div>
213
+ </div>
214
+ </div>
215
+ </motion.div>
216
+ )}
217
+ </AnimatePresence>
218
+
219
+ {/* Footer */}
220
+ <div style={{
221
+ marginTop: 14, paddingTop: 10, borderTop: "1px solid var(--border)",
222
+ fontFamily: "monospace", fontSize: 10, color: "var(--muted)", textAlign: "center",
223
+ }}>
224
+ v1.0.0 Β· WXT + FastAPI + Qdrant + Memgraph
225
+ </div>
226
+ </div>
227
+ );
228
+ }
229
+
230
+ // Mount
231
+ const root = document.getElementById("root");
232
+ if (root) createRoot(root).render(<Popup />);
extension/package.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "fact-intelligence-extension",
3
+ "version": "1.0.0",
4
+ "description": "Omnichannel fact-checking and AI hallucination detection browser extension",
5
+ "private": true,
6
+ "scripts": {
7
+ "dev": "wxt dev",
8
+ "dev:chrome": "wxt dev --browser chrome",
9
+ "dev:firefox": "wxt dev --browser firefox",
10
+ "build": "wxt build",
11
+ "build:chrome": "wxt build --browser chrome",
12
+ "build:firefox": "wxt build --browser firefox",
13
+ "build:all": "wxt build --browser chrome && wxt build --browser firefox",
14
+ "zip": "wxt zip",
15
+ "type-check": "vue-tsc --noEmit"
16
+ },
17
+ "dependencies": {
18
+ "framer-motion": "^11.15.0",
19
+ "react": "^19.0.0",
20
+ "react-dom": "^19.0.0",
21
+ "xxhash-wasm": "^1.0.2",
22
+ "zustand": "^5.0.2"
23
+ },
24
+ "devDependencies": {
25
+ "@types/chrome": "^0.0.287",
26
+ "@types/react": "^19.0.0",
27
+ "@types/react-dom": "^19.0.0",
28
+ "@wxt-dev/module-react": "^1.1.0",
29
+ "typescript": "^5.7.2",
30
+ "wxt": "^0.19.0"
31
+ }
32
+ }
extension/stores/extensionStore.ts ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // extension/stores/extensionStore.ts
2
+ // Zustand store with chrome.storage.sync persistence layer.
3
+ // State is shared across popup, background, and content script contexts.
4
+
5
+ import { create } from "zustand";
6
+ import { subscribeWithSelector } from "zustand/middleware";
7
+
8
+ export type HighlightColor = "green" | "yellow" | "red" | "purple";
9
+ export type ExtensionMode = "minimal" | "normal" | "advanced";
10
+ export type WSStatus = "connected" | "connecting" | "reconnecting" | "offline";
11
+
12
+ export interface AnalysisResult {
13
+ element_id: string;
14
+ content_hash: string;
15
+ platform: string;
16
+ color: HighlightColor;
17
+ confidence: number;
18
+ verdict_label: string;
19
+ explanation: string;
20
+ sources: Array<{
21
+ url: string;
22
+ domain: string;
23
+ favicon_url: string;
24
+ snippet: string;
25
+ }>;
26
+ trust_score: number;
27
+ velocity: number;
28
+ has_community_note: boolean;
29
+ latency_ms: number;
30
+ cached: boolean;
31
+ timestamp: string;
32
+ }
33
+
34
+ interface ExtensionState {
35
+ // User preferences (persisted to chrome.storage.sync)
36
+ enabled: boolean;
37
+ mode: ExtensionMode;
38
+
39
+ // Runtime state (not persisted)
40
+ wsStatus: WSStatus;
41
+ pendingCount: number;
42
+ totalAnalyzed: number;
43
+ demoMode: boolean;
44
+
45
+ // Actions
46
+ setEnabled: (v: boolean) => void;
47
+ setMode: (m: ExtensionMode) => void;
48
+ setWsStatus: (s: WSStatus) => void;
49
+ incrementPending: () => void;
50
+ decrementPending: () => void;
51
+ incrementAnalyzed: () => void;
52
+ setDemoMode: (v: boolean) => void;
53
+ }
54
+
55
+ // ---------------------------------------------------------------------------
56
+ // Chrome storage sync helpers
57
+ // ---------------------------------------------------------------------------
58
+ const STORAGE_KEY = "fact_intelligence_prefs";
59
+
60
+ async function loadFromStorage(): Promise<Partial<ExtensionState>> {
61
+ return new Promise((resolve) => {
62
+ if (typeof chrome === "undefined" || !chrome.storage) {
63
+ resolve({});
64
+ return;
65
+ }
66
+ chrome.storage.sync.get([STORAGE_KEY], (result) => {
67
+ resolve(result[STORAGE_KEY] ?? {});
68
+ });
69
+ });
70
+ }
71
+
72
+ async function saveToStorage(prefs: { enabled: boolean; mode: ExtensionMode }) {
73
+ if (typeof chrome === "undefined" || !chrome.storage) return;
74
+ chrome.storage.sync.set({ [STORAGE_KEY]: prefs });
75
+ }
76
+
77
+ // ---------------------------------------------------------------------------
78
+ // Store definition
79
+ // ---------------------------------------------------------------------------
80
+ export const useExtensionStore = create<ExtensionState>()(
81
+ subscribeWithSelector((set, get) => ({
82
+ enabled: true,
83
+ mode: "normal",
84
+ wsStatus: "connecting",
85
+ pendingCount: 0,
86
+ totalAnalyzed: 0,
87
+ demoMode: false,
88
+
89
+ setEnabled: (v) => {
90
+ set({ enabled: v });
91
+ saveToStorage({ enabled: v, mode: get().mode });
92
+ },
93
+ setMode: (m) => {
94
+ set({ mode: m });
95
+ saveToStorage({ enabled: get().enabled, mode: m });
96
+ },
97
+ setWsStatus: (s) => set({ wsStatus: s }),
98
+ incrementPending: () => set((s) => ({ pendingCount: s.pendingCount + 1 })),
99
+ decrementPending: () =>
100
+ set((s) => ({ pendingCount: Math.max(0, s.pendingCount - 1) })),
101
+ incrementAnalyzed: () =>
102
+ set((s) => ({ totalAnalyzed: s.totalAnalyzed + 1 })),
103
+ setDemoMode: (v) => set({ demoMode: v }),
104
+ }))
105
+ );
106
+
107
+ // Hydrate from chrome.storage.sync on module load
108
+ loadFromStorage().then((saved) => {
109
+ if (saved.enabled !== undefined) {
110
+ useExtensionStore.setState({ enabled: saved.enabled as boolean });
111
+ }
112
+ if (saved.mode !== undefined) {
113
+ useExtensionStore.setState({ mode: saved.mode as ExtensionMode });
114
+ }
115
+ });
116
+
117
+ // ---------------------------------------------------------------------------
118
+ // Mode-based color filter logic
119
+ // ---------------------------------------------------------------------------
120
+ export function shouldShowColor(
121
+ color: HighlightColor,
122
+ mode: ExtensionMode
123
+ ): boolean {
124
+ switch (mode) {
125
+ case "minimal":
126
+ // Only show definitive threats β€” don't add noise for users who want minimal
127
+ return color === "red" || color === "purple";
128
+ case "normal":
129
+ // Default: skip low-confidence green confirmations
130
+ return color === "red" || color === "purple" || color === "yellow";
131
+ case "advanced":
132
+ // Full factual landscape including green corroborations
133
+ return true;
134
+ }
135
+ }
136
+
137
+ // ---------------------------------------------------------------------------
138
+ // Color display config
139
+ // ---------------------------------------------------------------------------
140
+ export const COLOR_CONFIG = {
141
+ green: { hex: "#22c55e", opacity: 0.12, label: "Verified", icon: "βœ“" },
142
+ yellow: { hex: "#eab308", opacity: 0.14, label: "Unverified", icon: "⚠" },
143
+ red: { hex: "#ef4444", opacity: 0.16, label: "Debunked", icon: "βœ—" },
144
+ purple: { hex: "#a855f7", opacity: 0.15, label: "AI Hallucination", icon: "β—ˆ" },
145
+ } as const;
extension/tsconfig.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ESNext",
4
+ "module": "ESNext",
5
+ "moduleResolution": "Bundler",
6
+ "lib": ["ESNext", "DOM", "DOM.Iterable"],
7
+ "jsx": "react-jsx",
8
+ "strict": true,
9
+ "skipLibCheck": true,
10
+ "noUnusedLocals": false,
11
+ "noUnusedParameters": false,
12
+ "paths": {
13
+ "@/*": ["./src/*"]
14
+ }
15
+ },
16
+ "include": ["**/*.ts", "**/*.tsx", ".wxt/types/**/*.d.ts"],
17
+ "exclude": ["node_modules", ".output"]
18
+ }
extension/wxt.config.ts ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // extension/wxt.config.ts
2
+ // WXT framework configuration β€” replaces raw Manifest V3 boilerplate.
3
+ // Provides HMR, multi-browser compatibility (Chrome/Firefox/Edge/Arc),
4
+ // TypeScript-first entrypoints, Vite under the hood.
5
+
6
+ import { defineConfig } from "wxt";
7
+
8
+ export default defineConfig({
9
+ extensionApi: "chrome",
10
+ modules: ["@wxt-dev/module-react"],
11
+
12
+ vite: () => ({
13
+ define: {
14
+ // Injected at build time β€” change this to your cloudflared tunnel URL
15
+ __WS_URL__: JSON.stringify(
16
+ process.env.WS_URL || "wss://fact-engine.your-domain.workers.dev"
17
+ ),
18
+ },
19
+ }),
20
+
21
+ manifest: {
22
+ name: "Fact & Hallucination Intelligence",
23
+ description:
24
+ "Real-time omnichannel fact-checking and AI hallucination detection",
25
+ version: "1.0.0",
26
+ permissions: [
27
+ "storage", // chrome.storage.sync for user preferences
28
+ "tabs", // send messages to content scripts
29
+ "activeTab",
30
+ ],
31
+ host_permissions: [
32
+ "https://twitter.com/*",
33
+ "https://x.com/*",
34
+ "https://www.instagram.com/*",
35
+ "https://www.youtube.com/*",
36
+ "https://chat.openai.com/*",
37
+ "https://claude.ai/*",
38
+ "https://gemini.google.com/*",
39
+ "*://*/*", // Covers news sites β€” restrict in production
40
+ ],
41
+ content_security_policy: {
42
+ extension_pages:
43
+ "script-src 'self'; object-src 'self'; connect-src wss: https:",
44
+ },
45
+ icons: {
46
+ "16": "icon/16.png",
47
+ "32": "icon/32.png",
48
+ "48": "icon/48.png",
49
+ "128": "icon/128.png",
50
+ },
51
+ },
52
+ });
infra/tunnel_setup.sh ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # tunnel_setup.sh β€” Cloudflare Tunnel setup for the Fact Intelligence backend.
3
+ #
4
+ # What this does:
5
+ # 1. Installs the cloudflared binary (Linux/macOS)
6
+ # 2. Authenticates with your Cloudflare account
7
+ # 3. Creates a named tunnel pointing to the FastAPI backend (localhost:7860)
8
+ # 4. Configures DNS routing: wss://fact-engine.<your-domain>.workers.dev
9
+ # 5. Runs the tunnel as a systemd service (optional)
10
+ #
11
+ # Usage:
12
+ # chmod +x tunnel_setup.sh
13
+ # DOMAIN=your-domain.com ./tunnel_setup.sh
14
+ #
15
+ # After running, copy the tunnel URL into extension/wxt.config.ts __WS_URL__
16
+
17
+ set -euo pipefail
18
+
19
+ DOMAIN="${DOMAIN:-your-domain.com}"
20
+ TUNNEL_NAME="fact-intelligence"
21
+ BACKEND_PORT=7860
22
+ CONFIG_DIR="$HOME/.cloudflared"
23
+
24
+ echo "=== Cloudflare Tunnel Setup for Fact Intelligence System ==="
25
+ echo "Domain: $DOMAIN"
26
+ echo "Tunnel: $TUNNEL_NAME"
27
+ echo ""
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # 1. Install cloudflared
31
+ # ---------------------------------------------------------------------------
32
+ install_cloudflared() {
33
+ if command -v cloudflared &>/dev/null; then
34
+ echo "[βœ“] cloudflared already installed: $(cloudflared --version)"
35
+ return
36
+ fi
37
+
38
+ echo "[β†’] Installing cloudflared..."
39
+ OS=$(uname -s | tr '[:upper:]' '[:lower:]')
40
+ ARCH=$(uname -m)
41
+
42
+ case "$OS" in
43
+ linux)
44
+ case "$ARCH" in
45
+ x86_64) PKG="cloudflared-linux-amd64.deb" ;;
46
+ aarch64) PKG="cloudflared-linux-arm64.deb" ;;
47
+ *) echo "Unsupported arch: $ARCH"; exit 1 ;;
48
+ esac
49
+ curl -fsSL "https://github.com/cloudflare/cloudflared/releases/latest/download/$PKG" -o /tmp/cloudflared.deb
50
+ sudo dpkg -i /tmp/cloudflared.deb
51
+ ;;
52
+ darwin)
53
+ brew install cloudflare/cloudflare/cloudflared
54
+ ;;
55
+ *)
56
+ echo "Unsupported OS: $OS. Install cloudflared manually from https://developers.cloudflare.com/cloudflare-one/connections/connect-apps/install-and-setup/"
57
+ exit 1
58
+ ;;
59
+ esac
60
+ echo "[βœ“] cloudflared installed"
61
+ }
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # 2. Authenticate (opens browser for Cloudflare login)
65
+ # ---------------------------------------------------------------------------
66
+ authenticate() {
67
+ if [ -f "$CONFIG_DIR/cert.pem" ]; then
68
+ echo "[βœ“] Already authenticated (cert.pem found)"
69
+ return
70
+ fi
71
+ echo "[β†’] Opening browser for Cloudflare authentication..."
72
+ cloudflared tunnel login
73
+ }
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # 3. Create the tunnel
77
+ # ---------------------------------------------------------------------------
78
+ create_tunnel() {
79
+ if cloudflared tunnel list 2>/dev/null | grep -q "$TUNNEL_NAME"; then
80
+ echo "[βœ“] Tunnel '$TUNNEL_NAME' already exists"
81
+ TUNNEL_ID=$(cloudflared tunnel list | grep "$TUNNEL_NAME" | awk '{print $1}')
82
+ else
83
+ echo "[β†’] Creating tunnel '$TUNNEL_NAME'..."
84
+ cloudflared tunnel create "$TUNNEL_NAME"
85
+ TUNNEL_ID=$(cloudflared tunnel list | grep "$TUNNEL_NAME" | awk '{print $1}')
86
+ echo "[βœ“] Created tunnel ID: $TUNNEL_ID"
87
+ fi
88
+ echo "TUNNEL_ID=$TUNNEL_ID"
89
+ }
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # 4. Write tunnel configuration
93
+ # ---------------------------------------------------------------------------
94
+ write_config() {
95
+ mkdir -p "$CONFIG_DIR"
96
+ cat > "$CONFIG_DIR/config.yml" << EOF
97
+ tunnel: $TUNNEL_NAME
98
+ credentials-file: $CONFIG_DIR/$TUNNEL_ID.json
99
+
100
+ ingress:
101
+ # WebSocket endpoint β€” extension connects here
102
+ - hostname: fact-engine.$DOMAIN
103
+ service: http://localhost:$BACKEND_PORT
104
+ originRequest:
105
+ noTLSVerify: false
106
+ connectTimeout: 30s
107
+
108
+ # Catch-all (required by cloudflared)
109
+ - service: http_status:404
110
+
111
+ warp-routing:
112
+ enabled: false
113
+ EOF
114
+ echo "[βœ“] Config written to $CONFIG_DIR/config.yml"
115
+ }
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # 5. Create DNS record
119
+ # ---------------------------------------------------------------------------
120
+ setup_dns() {
121
+ echo "[β†’] Creating DNS CNAME: fact-engine.$DOMAIN β†’ $TUNNEL_NAME.cfargotunnel.com"
122
+ cloudflared tunnel route dns "$TUNNEL_NAME" "fact-engine.$DOMAIN" || \
123
+ echo "[!] DNS route already exists or failed β€” check Cloudflare dashboard"
124
+ echo "[βœ“] DNS configured"
125
+ }
126
+
127
+ # ---------------------------------------------------------------------------
128
+ # 6. Systemd service (Linux only)
129
+ # ---------------------------------------------------------------------------
130
+ setup_systemd() {
131
+ if [ "$(uname -s)" != "Linux" ]; then
132
+ echo "[!] Skipping systemd setup (not Linux)"
133
+ return
134
+ fi
135
+
136
+ sudo tee /etc/systemd/system/cloudflared-fact.service > /dev/null << EOF
137
+ [Unit]
138
+ Description=Cloudflare Tunnel β€” Fact Intelligence System
139
+ After=network-online.target
140
+ Wants=network-online.target
141
+
142
+ [Service]
143
+ Type=simple
144
+ User=$USER
145
+ ExecStart=$(command -v cloudflared) tunnel --config $CONFIG_DIR/config.yml run $TUNNEL_NAME
146
+ Restart=on-failure
147
+ RestartSec=5s
148
+
149
+ [Install]
150
+ WantedBy=multi-user.target
151
+ EOF
152
+
153
+ sudo systemctl daemon-reload
154
+ sudo systemctl enable cloudflared-fact
155
+ sudo systemctl start cloudflared-fact
156
+ echo "[βœ“] Systemd service started: cloudflared-fact"
157
+ }
158
+
159
+ # ---------------------------------------------------------------------------
160
+ # Main
161
+ # ---------------------------------------------------------------------------
162
+ install_cloudflared
163
+ authenticate
164
+ create_tunnel
165
+ write_config
166
+ setup_dns
167
+ setup_systemd
168
+
169
+ echo ""
170
+ echo "=== Setup complete! ==="
171
+ echo ""
172
+ echo "WebSocket URL for the extension:"
173
+ echo " wss://fact-engine.$DOMAIN/ws/{session_id}"
174
+ echo ""
175
+ echo "Update extension/wxt.config.ts:"
176
+ echo " __WS_URL__: 'wss://fact-engine.$DOMAIN/ws'"
177
+ echo ""
178
+ echo "Test the tunnel:"
179
+ echo " curl https://fact-engine.$DOMAIN/health"
180
+ echo ""