plexdx committed on
Commit
c5175d5
·
verified ·
1 Parent(s): 1a8402e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +498 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import warnings
4
+ import feedparser
5
+ from datetime import datetime
6
+
7
+ warnings.filterwarnings('ignore')
8
+ print("βœ… Core imports done.")
9
+
10
+ from datasets import load_dataset
11
+
12
+ # ── Fallback generators (non-negotiable for reproducibility) ─────────────────
13
def generate_fallback_liar():
    """Build a synthetic stand-in for the LIAR dataset.

    Used when the Hugging Face download is unavailable. Ten hand-written
    (statement, label) pairs are repeated 50 times to give 500 rows.

    Returns:
        pandas.DataFrame with columns ``statement`` and ``label``.
    """
    seed_rows = (
        ("The unemployment rate is at a 50-year low.", "half-true"),
        ("Vaccines contain microchips for government tracking.", "pants-fire"),
        ("Climate change is causing more frequent hurricanes.", "mostly-true"),
        ("The stock market had its best year ever last year.", "false"),
        ("Water covers about 71% of Earth's surface.", "true"),
        ("The moon landing was filmed in a Hollywood studio.", "pants-fire"),
        ("Eating carrots improves night vision significantly.", "barely-true"),
        ("5G towers spread the COVID-19 virus.", "pants-fire"),
        ("Exercise reduces the risk of type 2 diabetes.", "true"),
        ("The Eiffel Tower grows taller in summer.", "mostly-true"),
    )
    repeats = 50  # 10 seed rows x 50 = 500 samples
    frame = pd.DataFrame(list(seed_rows) * repeats, columns=['statement', 'label'])
    print("⚠️ Using synthetic LIAR fallback (500 samples).")
    return frame
30
+
31
def generate_fallback_hallucination():
    """Build a synthetic hallucination dataset for offline use.

    Five hand-written (claim, is_hallucination) pairs are repeated 40 times
    to give 200 rows.

    Returns:
        pandas.DataFrame with columns ``claim`` and ``is_hallucination``.
    """
    seed_rows = (
        ("The Eiffel Tower is located in Berlin.", True),
        ("Python was created by Guido van Rossum.", False),
        ("Shakespeare wrote 'War and Peace'.", True),
        ("The speed of light is approximately 3Γ—10⁸ m/s.", False),
        ("The Great Wall of China is visible from space with the naked eye.", True),
    )
    repeats = 40  # 5 seed rows x 40 = 200 samples
    frame = pd.DataFrame(list(seed_rows) * repeats, columns=['claim', 'is_hallucination'])
    print("⚠️ Using synthetic hallucination fallback (200 samples).")
    return frame
43
+
44
+ # ── Load LIAR dataset ─────────────────────────────────────────────────────────
45
# Each dataset is loaded independently; any failure (network, missing
# dataset, schema change) switches that dataset to a synthetic fallback so
# the rest of the script can still run.
# NOTE(review): trust_remote_code=True presumably allows the dataset
# repository's loading script to execute locally -- confirm this is intended.

# LIAR (train split)
try:
    liar_raw = load_dataset("liar", trust_remote_code=True)
    liar_df = pd.DataFrame(
        {
            'statement': liar_raw['train']['statement'],
            'label': liar_raw['train']['label'],
        }
    )
    label_names = ['pants-fire','false','barely-true','half-true','mostly-true','true']
    # Integer class ids are replaced by their names; other values pass through.
    liar_df['label'] = liar_df['label'].apply(
        lambda x: label_names[x] if isinstance(x, int) else x
    )
    print(f"βœ… LIAR dataset loaded: {len(liar_df)} samples")
except Exception as e:
    print(f"LIAR load failed ({e}), using fallback.")
    liar_df = generate_fallback_liar()

# TruthfulQA ("generation" config, validation split)
# NOTE(review): on failure tqa_df receives the hallucination fallback, whose
# columns (claim, is_hallucination) differ from question/best_answer.
try:
    tqa_raw = load_dataset("truthful_qa", "generation", trust_remote_code=True)
    tqa_df = pd.DataFrame(
        {
            'question': tqa_raw['validation']['question'],
            'best_answer': tqa_raw['validation']['best_answer'],
        }
    )
    print(f"βœ… TruthfulQA loaded: {len(tqa_df)} samples")
except Exception as e:
    print(f"TruthfulQA load failed ({e}), using fallback.")
    tqa_df = generate_fallback_hallucination()

# HaluEval ("general_samples" config, "data" split)
try:
    halu_raw = load_dataset("pminervini/HaluEval", "general_samples", trust_remote_code=True)
    halu_df = pd.DataFrame(halu_raw['data'])
    print(f"βœ… HaluEval loaded: {len(halu_df)} samples")
except Exception as e:
    print(f"HaluEval load failed ({e}), using fallback.")
    halu_df = generate_fallback_hallucination()

# One-line size report per dataset.
print("\nπŸ“Š Dataset summary:")
print(f" LIAR: {len(liar_df)} rows, columns: {list(liar_df.columns)}")
print(f" TruthfulQA: {len(tqa_df)} rows")
print(f" HaluEval: {len(halu_df)} rows")
83
+
84
+ # ── Live RSS News Feed ────────────────────────────────────────────────────────
85
# Feeds polled for live headlines, keyed by the display name stored in the
# 'source' column.
RSS_FEEDS = {
    'BBC': 'http://feeds.bbci.co.uk/news/world/rss.xml',
    'Reuters': 'https://feeds.reuters.com/reuters/topNews',
    'AP': 'https://rsshub.app/apnews/topics/apf-topnews',
}

headlines = []
for source, url in RSS_FEEDS.items():
    try:
        feed = feedparser.parse(url)
    except Exception as e:
        print(f" ⚠️ {source} RSS failed: {e}")
        continue

    # feedparser normally does not raise on network or parse problems; it
    # returns an empty result and records the cause in `bozo_exception`.
    # Report that case explicitly so a dead feed is not silently skipped.
    if not feed.entries:
        reason = feed.get('bozo_exception', 'no entries returned')
        print(f" ⚠️ {source} RSS failed: {reason}")
        continue

    # Keep at most the first ten entries of each feed.
    for entry in feed.entries[:10]:
        pub = entry.get('published', str(datetime.now()))
        headlines.append({
            'headline': entry.get('title', ''),
            'summary': entry.get('summary', ''),
            'source': source,
            'published_at': pub,
            'link': entry.get('link', '')
        })

if not headlines:
    # Fallback static headlines for offline environments (3 rows x 5 = 15).
    headlines = [
        {'headline': 'Global temperatures hit record highs in 2024', 'summary': '', 'source': 'synthetic', 'published_at': '2024-01-01', 'link': ''},
        {'headline': 'AI models show improved reasoning capabilities', 'summary': '', 'source': 'synthetic', 'published_at': '2024-01-02', 'link': ''},
        {'headline': 'New vaccine approved for respiratory illness', 'summary': '', 'source': 'synthetic', 'published_at': '2024-01-03', 'link': ''},
    ] * 5
    print("⚠️ Using synthetic headlines (no network access).")

news_df = pd.DataFrame(headlines)
# Unparseable timestamps become NaT instead of aborting the whole load.
news_df['published_at'] = pd.to_datetime(news_df['published_at'], errors='coerce', utc=True)
print(f"βœ… Live news loaded: {len(news_df)} headlines from {news_df['source'].nunique()} sources")
120
+
121
+ from transformers import pipeline
122
+ from sentence_transformers import SentenceTransformer
123
+ import faiss
124
+ import re
125
+
126
+ # ── Load lightweight models ───────────────────────────────────────────────────
127
# Sentiment model: inputs are truncated to 512 tokens by the pipeline.
print("Loading sentiment pipeline...")
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
    max_length=512,
)

# NLI model wrapped in a zero-shot-classification pipeline, pinned to CPU.
print("Loading NLI pipeline (DeBERTa)...")
nli_pipeline = pipeline(
    "zero-shot-classification",
    model="cross-encoder/nli-deberta-v3-small",
    device=-1,  # CPU
)

# Sentence encoder shared by the FAISS index and the fake-news classifier.
print("Loading sentence embedding model...")
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("βœ… All models loaded.")

# Reference statements that claims are compared against.
TRUSTED_FACTS = [
    "Water boils at 100 degrees Celsius at sea level.",
    "The Earth orbits the Sun, not the other way around.",
    "The speed of light in a vacuum is approximately 299,792 kilometers per second.",
    "DNA carries genetic information in living organisms.",
    "The Great Wall of China is not visible from space with the naked eye.",
    "Humans and chimpanzees share approximately 98.7% of their DNA.",
    "The moon is approximately 384,400 kilometers from Earth.",
    "Mount Everest is the highest mountain above sea level at 8,849 meters.",
    "Vaccines work by stimulating the immune system to recognize pathogens.",
    "The human brain contains approximately 86 billion neurons.",
    "Carbon dioxide concentration in the atmosphere has increased since industrialization.",
    "The Eiffel Tower is located in Paris, France.",
    "Python was created by Guido van Rossum and first released in 1991.",
    "Shakespeare wrote Hamlet, Macbeth, and Romeo and Juliet.",
    "The United States has 50 states.",
    "Albert Einstein published the special theory of relativity in 1905.",
    "Antibiotics are not effective against viral infections.",
    "The Pacific Ocean is the largest ocean on Earth.",
    "The human body has 206 bones in adulthood.",
    "Climate change is driven primarily by human greenhouse gas emissions according to scientific consensus.",
]

# Embed every fact once and store the vectors in a flat L2 index whose
# width matches the encoder output.
fact_embeddings = embedder.encode(TRUSTED_FACTS, convert_to_numpy=True)
dim = fact_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(fact_embeddings.astype(np.float32))

print(f"βœ… FAISS index built with {faiss_index.ntotal} trusted facts (dim={dim})")
177
+
178
+ # ── Feature Extraction Functions ──────────────────────────────────────────────
179
+
180
# Credibility score per known source. Keys are either domains or the short
# display names used for the RSS feeds. Insertion order matters because the
# lookup returns the first matching key.
SOURCE_CREDIBILITY = {
    'bbc.co.uk': 0.92,
    'reuters.com': 0.94,
    'apnews.com': 0.93,
    'nytimes.com': 0.88,
    'theguardian.com': 0.87,
    'nature.com': 0.98,
    'who.int': 0.97,
    'cdc.gov': 0.97,
    'infowars.com': 0.05,
    'naturalnews.com': 0.08,
    'breitbart.com': 0.22,
    'synthetic': 0.50,
    'BBC': 0.92,
    'Reuters': 0.94,
    'AP': 0.93,
}

# Matches any DOI-shaped token ("10.<4+ digits>/<suffix>").
FAKE_DOI_PATTERN = re.compile(r'10\.\d{4,}/[a-zA-Z0-9./_-]+')

# Matches the four-digit years 1900-1929 and 2100-2999.
IMPOSSIBLE_YEAR = re.compile(r'\b(19[0-2]\d|2[1-9]\d{2})\b')

# "Institute of X Y" or "Foundation for X Research". With IGNORECASE the
# [A-Z] classes also accept lowercase letters.
INVENTED_INSTITUTIONS = re.compile(
    r'(Institute of [A-Z][a-z]+ [A-Z][a-z]+|Foundation for [A-Z][a-z]+ Research)',
    flags=re.IGNORECASE,
)
196
+
197
def get_sentiment_score(text: str) -> float:
    """Return a signed sentiment score for ``text``.

    The pipeline's confidence is returned as-is for a ``POSITIVE`` label and
    negated for any other label, so the result lies in [-1, 1].

    Args:
        text: Input text; only the first 512 characters are scored.

    Returns:
        Signed score, or 0.0 (neutral) when the text is empty or the model
        call fails for any reason.
    """
    # Nothing to score: report neutral instead of sending an empty string
    # to the model.
    if not text or not text.strip():
        return 0.0
    try:
        result = sentiment_pipeline(text[:512])[0]
        score = float(result['score'])
        return score if result['label'] == 'POSITIVE' else -score
    except Exception:
        # Best-effort feature: degrade to neutral, but let KeyboardInterrupt
        # and SystemExit propagate (a bare ``except:`` would swallow them).
        return 0.0
205
+
206
def get_source_credibility(source: str) -> float:
    """Look up a credibility score for a source name, domain or URL.

    Each key of ``SOURCE_CREDIBILITY`` is matched case-insensitively as a
    whole token: it must not be preceded or followed by a letter or digit.
    Plain substring matching would let short keys such as ``AP`` or ``BBC``
    match inside unrelated words (for example "japantimes.com").

    Args:
        source: Source label, domain or URL. May be empty or ``None``.

    Returns:
        The score of the first matching key in table order, or 0.5 when the
        source is empty or unknown.
    """
    if not source:
        return 0.5  # no source information: treat as uncertain
    haystack = source.lower()
    for name, score in SOURCE_CREDIBILITY.items():
        token = r'(?<![a-z0-9])' + re.escape(name.lower()) + r'(?![a-z0-9])'
        if re.search(token, haystack):
            return score
    return 0.5  # unknown source: uncertain
212
+
213
def get_citation_anomaly_score(text: str) -> float:
    """Score citation patterns that are treated as hallucination signals.

    Three independent checks each add to the score:

    * +0.3 when a DOI-shaped token is present (``FAKE_DOI_PATTERN``);
    * +0.3 when the text cites an implausible year: either a match of
      ``IMPOSSIBLE_YEAR`` or a 19xx/20xx year later than the current one
      (the static pattern alone misses near-future years such as "2031");
    * +0.4 when an institution name matches ``INVENTED_INSTITUTIONS``.

    Args:
        text: Text to inspect. May be empty or ``None``.

    Returns:
        Score in [0, 1]; 0.0 for empty input.
    """
    if not text:
        return 0.0

    # Local import so the block does not depend on the module import list.
    from datetime import date

    score = 0.0
    if FAKE_DOI_PATTERN.search(text):
        score += 0.3

    current_year = date.today().year
    cites_future_year = any(
        int(year) > current_year
        for year in re.findall(r'\b(?:19|20)\d{2}\b', text)
    )
    if IMPOSSIBLE_YEAR.search(text) or cites_future_year:
        score += 0.3

    if INVENTED_INSTITUTIONS.search(text):
        score += 0.4
    return min(score, 1.0)
223
+
224
def get_semantic_similarity(text: str, k: int = 3) -> float:
    """Return a distance-based similarity between ``text`` and trusted facts.

    The text is embedded and its ``k`` nearest neighbours are fetched from
    ``faiss_index`` (an ``IndexFlatL2``). The mean neighbour distance ``d``
    is mapped to ``1 / (1 + d)``, so identical text scores 1.0 and the score
    decays towards 0 as distance grows. This is not a cosine similarity.

    Args:
        text: Claim to compare against the trusted facts.
        k: Number of neighbours to average over. Clamped to the index size.

    Returns:
        Similarity in [0, 1], or 0.5 when it cannot be computed.
    """
    try:
        # Asking for more neighbours than the index holds makes FAISS pad the
        # result with label -1 and a sentinel distance, which would drag the
        # mean towards "infinitely far".
        k = max(1, min(k, faiss_index.ntotal))
        emb = embedder.encode([text], convert_to_numpy=True).astype(np.float32)
        distances, indices = faiss_index.search(emb, k)
        valid = distances[0][indices[0] >= 0]
        if valid.size == 0:
            return 0.5  # empty index: no evidence either way
        avg_dist = float(np.mean(valid))
        similarity = 1.0 / (1.0 + avg_dist)
        return float(np.clip(similarity, 0, 1))
    except Exception:
        # Best-effort feature: fall back to the uncertain midpoint.
        return 0.5
235
+
236
def get_nli_contradiction_score(claim: str, references: list) -> float:
    """Return the zero-shot score that ``claim`` receives for "contradiction".

    The claim is passed to ``nli_pipeline`` with the candidate labels
    "entailment", "neutral" and "contradiction" and the hypothesis template
    "This claim is related to: {}". The score assigned to "contradiction"
    is returned.

    NOTE(review): ``references`` is accepted for interface compatibility but
    is not used. The score does not compare the claim with the retrieved
    facts -- confirm whether per-reference NLI was intended.

    Args:
        claim: Text to classify.
        references: Retrieved reference sentences (currently ignored).

    Returns:
        Score in [0, 1], or 0.5 when the pipeline call fails.
    """
    try:
        result = nli_pipeline(
            claim,
            candidate_labels=["entailment", "neutral", "contradiction"],
            hypothesis_template="This claim is related to: {}",
        )
        # The pipeline returns parallel 'labels' and 'scores' lists.
        scores = dict(zip(result['labels'], result['scores']))
        return float(scores.get('contradiction', 0.0))
    except Exception:
        # Best-effort feature: fall back to the uncertain midpoint without
        # swallowing KeyboardInterrupt/SystemExit.
        return 0.5
249
+
250
def retrieve_reference_sentences(claim: str, k: int = 5) -> list:
    """Return the trusted facts nearest to ``claim`` in embedding space.

    Args:
        claim: Text to find supporting or contradicting facts for.
        k: Maximum number of facts to return.

    Returns:
        Up to ``k`` strings from ``TRUSTED_FACTS``, nearest first. If the
        embedding or index search fails, the first ``k`` facts are returned.
    """
    try:
        emb = embedder.encode([claim], convert_to_numpy=True).astype(np.float32)
        _, indices = faiss_index.search(emb, k)
        # FAISS pads missing neighbours with -1 when k exceeds the index size.
        # Without the lower bound, -1 would silently select the LAST fact.
        return [TRUSTED_FACTS[i] for i in indices[0] if 0 <= i < len(TRUSTED_FACTS)]
    except Exception:
        return TRUSTED_FACTS[:k]
258
+
259
+ print("βœ… Feature extraction functions defined.")
260
+
261
+ # ── Compute Features on a Sample ──────────────────────────────────────────────
262
# Hand-picked claims used to exercise every feature extractor once.
SAMPLE_TEXTS = [
    "The moon is made of cheese.",
    "Water boils at 100Β°C at sea level.",
    "Scientists discovered that 5G towers emit mind-control frequencies.",
    "The Eiffel Tower is 330 meters tall.",
    "According to a 2031 study from the Institute of Neural Enhancement, humans only use 10% of their brain.",
]

rows = []
for text in SAMPLE_TEXTS:
    refs = retrieve_reference_sentences(text)
    row = dict(
        # Texts longer than 60 characters are shortened for display.
        text=(text[:60] + '...') if len(text) > 60 else text,
        sentiment_score=get_sentiment_score(text),
        source_credibility=0.5,  # unknown source for these samples
        nli_contradiction_score=get_nli_contradiction_score(text, refs),
        citation_anomaly_score=get_citation_anomaly_score(text),
        semantic_similarity=get_semantic_similarity(text),
    )
    rows.append(row)

features_df = pd.DataFrame(rows)
print("βœ… Feature matrix computed:")
# Bare expression: displays the frame in a notebook, no effect in a script.
features_df
286
+
287
+ # ── A. Fake News Classifier (LIAR β†’ 3-class) ──────────────────────────────────
288
+ from sklearn.linear_model import LogisticRegression
289
+ from sklearn.preprocessing import LabelEncoder
290
+ from sklearn.model_selection import train_test_split
291
+ from sklearn.metrics import classification_report
292
+ import numpy as np
293
+
294
# The six LIAR labels are folded pairwise into three coarse classes.
LIAR_MAP = {
    'pants-fire': 'misinformation',
    'false': 'misinformation',
    'barely-true': 'uncertain',
    'half-true': 'uncertain',
    'mostly-true': 'credible',
    'true': 'credible',
}

# Work on at most 500 rows, sampled reproducibly.
liar_sample = liar_df.sample(min(500, len(liar_df)), random_state=42).copy()
# Labels missing from LIAR_MAP fall back to 'uncertain'.
liar_sample['label_3'] = liar_sample['label'].map(LIAR_MAP).fillna('uncertain')

# Sentence embeddings are the classifier features.
print("Encoding LIAR statements...")
X_liar = embedder.encode(liar_sample['statement'].tolist(), show_progress_bar=True)
y_liar = liar_sample['label_3'].values

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_liar,
    y_liar,
    test_size=0.2,
    random_state=42,
)

fake_news_clf = LogisticRegression(max_iter=500, random_state=42)
fake_news_clf.fit(X_train, y_train)

print("\nπŸ“Š Fake News Classifier Report:")
print(classification_report(y_test, fake_news_clf.predict(X_test)))
print("βœ… Fake news classifier trained.")
320
+
321
+ # ── B. Hallucination Scorer ───────────────────────────────────────────────────
322
+
323
def score_hallucination(claim: str) -> dict:
    """Estimate the hallucination risk of a single claim.

    Three signals are combined with fixed weights: the NLI contradiction
    score (0.50), one minus the semantic similarity to trusted facts (0.30)
    and the citation anomaly score (0.20).

    Args:
        claim: Text to score.

    Returns:
        dict with ``hallucination_risk`` (int, 0-100), ``contradiction_score``
        and ``semantic_similarity`` (rounded to 3 decimals) and
        ``evidence_snippets`` (up to three retrieved facts). On any failure a
        neutral payload (risk 50) is returned with an extra ``error`` key.
    """
    try:
        evidence = retrieve_reference_sentences(claim, k=5)
        contradiction = get_nli_contradiction_score(claim, evidence)
        closeness = get_semantic_similarity(claim)
        anomaly = get_citation_anomaly_score(claim)

        # Low similarity to trusted facts counts towards risk, hence 1 - closeness.
        parts = (
            0.50 * contradiction,
            0.30 * (1 - closeness),
            0.20 * anomaly,
        )
        raw_risk = parts[0] + parts[1] + parts[2]
        risk_percent = int(np.clip(raw_risk * 100, 0, 100))

        return {
            'hallucination_risk': risk_percent,
            'contradiction_score': round(contradiction, 3),
            'semantic_similarity': round(closeness, 3),
            'evidence_snippets': evidence[:3]
        }
    except Exception as exc:
        return {
            'hallucination_risk': 50,
            'contradiction_score': 0.5,
            'semantic_similarity': 0.5,
            'evidence_snippets': [],
            'error': str(exc),
        }
351
+
352
# Demo run: score one implausible and one trusted claim and print the risk.
test_claims = [
    "The moon is made of cheese.",
    "Water boils at 100 degrees Celsius at sea level.",
]
for claim in test_claims:
    result = score_hallucination(claim)
    # Only the first 50 characters of the claim are echoed.
    print(f" '{claim[:50]}...' β†’ risk: {result['hallucination_risk']}%")
print("βœ… Hallucination scorer working.")
361
+
362
+ # ── C. Event Volatility Forecaster ───────────────────────────────────────────
363
# statsforecast is optional: without it the forecaster uses its EWMA path.
HAS_STATSFORECAST = False
try:
    from statsforecast import StatsForecast
    from statsforecast.models import AutoARIMA
except ImportError:
    print("⚠️ statsforecast not available, using EWMA fallback.")
else:
    HAS_STATSFORECAST = True
370
+
371
def compute_volatility_series(df: pd.DataFrame, window: int = 7) -> pd.Series:
    """Return the rolling standard deviation of headline sentiment.

    Args:
        df: Frame with ``headline`` and ``published_at`` columns. A copy is
            sorted by ``published_at``; the input is left untouched.
        window: Rolling window length, capped at the number of rows.

    Returns:
        Series of rolling standard deviations, with undefined values
        replaced by 0.
    """
    ordered = df.copy().sort_values('published_at')
    scores = ordered['headline'].apply(get_sentiment_score)
    span = min(window, len(ordered))
    rolling_std = scores.rolling(window=span, min_periods=1).std()
    return rolling_std.fillna(0)
377
+
378
def forecast_volatility(series: pd.Series, horizon: int = 3) -> dict:
    """Forecast the next ``horizon`` periods of a volatility series.

    AutoARIMA is used when statsforecast is installed and the series has at
    least 10 points. Otherwise, or when AutoARIMA fails, the last EWMA value
    is extrapolated with a 2% increase per step.

    Args:
        series: Historical volatility values.
        horizon: Number of future periods to forecast.

    Returns:
        dict with ``method`` ('AutoARIMA' or 'EWMA'), ``forecast`` (list of
        floats) and ``trend`` ('rising' when the final forecast exceeds the
        historical mean, else 'stable'). An empty series or non-positive
        horizon gives a flat 'stable' EWMA result instead of raising.
    """
    # Nothing to extrapolate from, or nothing requested: return a neutral
    # result instead of failing on iloc[-1] or forecasted[-1].
    if len(series) == 0 or horizon <= 0:
        return {'method': 'EWMA', 'forecast': [0.0] * max(horizon, 0), 'trend': 'stable'}

    if len(series) >= 10 and HAS_STATSFORECAST:
        try:
            sf_df = pd.DataFrame({
                'unique_id': 'news_vol',
                # Synthetic daily index: only the ordering of points is used.
                'ds': pd.date_range(start='2024-01-01', periods=len(series), freq='D'),
                'y': series.values
            })
            sf = StatsForecast(models=[AutoARIMA()], freq='D')
            forecast = sf.forecast(df=sf_df, h=horizon)
            forecasted_values = forecast['AutoARIMA'].values.tolist()
            trend = 'rising' if forecasted_values[-1] > series.mean() else 'stable'
            return {'method': 'AutoARIMA', 'forecast': forecasted_values, 'trend': trend}
        except Exception as e:
            # Report the failure, then fall through to the EWMA path.
            print(f"⚠️ AutoARIMA forecast failed ({e}), using EWMA fallback.")

    # EWMA fallback
    ewma = series.ewm(span=min(5, len(series))).mean()
    last = ewma.iloc[-1]
    forecasted = [float(last * (1 + 0.02 * i)) for i in range(1, horizon + 1)]
    trend = 'rising' if forecasted[-1] > series.mean() else 'stable'
    return {'method': 'EWMA', 'forecast': forecasted, 'trend': trend}
401
+
402
# Build the volatility series from the loaded headlines and forecast it.
volatility_series = compute_volatility_series(news_df)
forecast_result = forecast_volatility(volatility_series)
print(
    f"βœ… Volatility forecast: {forecast_result['method']} β†’ trend: {forecast_result['trend']}"
)
405
+
406
+ # ── D. Final Risk Score Aggregator ────────────────────────────────────────────
407
# Weights of the four signals in the combined risk score (they sum to 1.0).
W_HALLUCINATION, W_FAKE_NEWS, W_CITATION, W_SIMILARITY = 0.40, 0.35, 0.15, 0.10

# Highlight colour for each status returned by the aggregator.
COLOR_MAP = dict(
    confirmed='rgba(52, 199, 89, 0.15)',        # green
    uncertain='rgba(255, 204, 0, 0.15)',        # yellow
    misinformation='rgba(255, 59, 48, 0.15)',   # red
    hallucination='rgba(175, 82, 222, 0.15)',   # purple
)
419
+
420
def get_fake_news_probability(text: str) -> tuple[str, float]:
    """Classify ``text`` with the fake-news classifier.

    Args:
        text: Statement to classify.

    Returns:
        ``(label, confidence)``: the most probable class and its predicted
        probability. This is the probability of the winning class, not the
        probability that the text is fake. ``('uncertain', 0.5)`` is returned
        when embedding or prediction fails.
    """
    try:
        emb = embedder.encode([text])
        proba = fake_news_clf.predict_proba(emb)[0]
        best = int(np.argmax(proba))
        # Convert NumPy scalars to plain Python types for serialisation.
        return str(fake_news_clf.classes_[best]), float(proba[best])
    except Exception:
        # Best-effort feature: degrade to the neutral answer without
        # swallowing KeyboardInterrupt/SystemExit.
        return 'uncertain', 0.5
431
+
432
def analyze_text(text: str, source: str = 'unknown') -> dict:
    """Run the full pipeline on ``text`` and return a risk payload.

    The hallucination risk, fake-news label, citation anomaly score and
    (1 - semantic similarity) are combined using the ``W_*`` weights. Source
    credibility is reported in the tooltip only and does not affect the
    score.

    Args:
        text: Text to analyse.
        source: Source name, domain or URL used for the credibility lookup.

    Returns:
        dict with keys ``text``, ``status``, ``color``, ``hallucination_risk``,
        ``fake_news_label``, ``combined_risk``, ``confidence``,
        ``volatility_index``, ``tooltip_message`` and ``evidence_snippets``.
        On any failure a neutral 'uncertain' payload is returned whose
        tooltip carries the error message.
    """
    try:
        # Individual signals
        halu_result = score_hallucination(text)
        halu_risk = halu_result['hallucination_risk']
        fake_label, _fake_conf = get_fake_news_probability(text)
        citation_score = get_citation_anomaly_score(text)
        similarity = get_semantic_similarity(text)
        credibility = get_source_credibility(source)

        # The classifier label maps to a fixed risk level; unknown labels
        # fall back to the midpoint.
        label_to_risk = {'misinformation': 0.9, 'uncertain': 0.5, 'credible': 0.1}
        fake_risk = label_to_risk.get(fake_label, 0.5)

        # Weighted sum, clamped to [0, 1]
        weighted = (
            W_HALLUCINATION * (halu_risk / 100) +
            W_FAKE_NEWS * fake_risk +
            W_CITATION * citation_score +
            W_SIMILARITY * (1 - similarity)
        )
        combined_risk = float(np.clip(weighted, 0, 1))

        # High combined risk is labelled 'hallucination' only when the
        # hallucination scorer itself is above 60, else 'misinformation'.
        status = 'misinformation'
        if combined_risk < 0.25:
            status = 'confirmed'
        elif combined_risk < 0.55:
            status = 'uncertain'
        elif halu_risk > 60:
            status = 'hallucination'

        # Distance from the uncertain midpoint, rescaled to [0, 1].
        confidence = abs(combined_risk - 0.5) * 2

        tooltip = (
            f"{status.title()} risk: {int(combined_risk*100)}%. "
            f"Hallucination: {halu_result['hallucination_risk']}%. "
            f"Source credibility: {int(credibility*100)}%."
        )

        return {
            'text': text,
            'status': status,
            'color': COLOR_MAP[status],
            'hallucination_risk': halu_risk,
            'fake_news_label': fake_label,
            'combined_risk': round(combined_risk, 3),
            'confidence': round(confidence, 3),
            'volatility_index': round(1 - similarity, 3),
            'tooltip_message': tooltip,
            'evidence_snippets': halu_result['evidence_snippets']
        }
    except Exception as exc:
        # Neutral payload so callers always receive the same keys.
        return {
            'text': text,
            'status': 'uncertain',
            'color': COLOR_MAP['uncertain'],
            'hallucination_risk': 50,
            'fake_news_label': 'uncertain',
            'combined_risk': 0.5,
            'confidence': 0.0,
            'volatility_index': 0.5,
            'tooltip_message': f'Analysis failed gracefully: {str(exc)}',
            'evidence_snippets': []
        }
495
+
496
# Smoke test: run the aggregator once at import time and print the outcome.
test = analyze_text("The moon is made of cheese.")
print(
    f"βœ… Aggregator test: status={test['status']}, risk={test['combined_risk']}"
)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ datasets
3
+ sentence-transformers
4
+ faiss-cpu
5
+ gradio
6
+ statsforecast
7
+ feedparser
8
+ scikit-learn