muneeb-1 committed on
Commit
63d7edb
·
verified ·
1 Parent(s): 2088823

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +546 -0
main.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import requests
5
+ from typing import Dict, List, Any, Optional
6
+ from fastapi import FastAPI, HTTPException, Body
7
+ from pydantic import BaseModel
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+ from newspaper import Article
11
+ from bs4 import BeautifulSoup
12
+ import easyocr
13
+ from PIL import Image
14
+ import google.generativeai as genai
15
+ from datetime import datetime
16
+ import logging
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+
19
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("TruthLens")

# Initialize FastAPI app
app = FastAPI(title="TruthLens Backend")

# Add CORS middleware.
# Allowed origins come from the comma-separated CORS_ALLOW_ORIGINS environment
# variable and default to "*" (any origin). A wildcard origin combined with
# credentials is not a valid CORS configuration, so credentialed requests are
# only enabled when an explicit origin list has been configured.
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
37
+
38
# Load Hugging Face model (RoBERTa-based fake news detector)
MODEL_NAME = "Pulk17/Fake-News-Detection"
tokenizer = None
model = None


def load_model():
    """Return the ``(tokenizer, model)`` pair, loading it on first use.

    The pair is cached in the module-level ``tokenizer`` and ``model``
    globals so later calls reuse the same objects. The model is switched to
    evaluation mode before it is published.

    Returns:
        Tuple of the Hugging Face tokenizer and sequence-classification model
        for ``MODEL_NAME``.
    """
    global tokenizer, model
    if tokenizer is None or model is None:
        logger.info("Loading Hugging Face model...")
        # Build both objects locally and publish them together so the globals
        # never hold a model that has not been put into eval mode yet.
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        loaded_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
        loaded_model.eval()
        tokenizer, model = loaded_tokenizer, loaded_model
    return tokenizer, model
52
+
53
# Environment variables
# These are read from the process environment only; this module does not load
# a .env file itself, so export them (or load the file) before starting.
GOOGLE_FACT_CHECK_API_KEY = os.environ.get('GOOGLE_FACT_CHECK_API_KEY', '')
HIVE_API_KEY = os.environ.get('HIVE_API_KEY', '')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', '')
# The Gemini model id is overridable so a renamed or retired hosted model can
# be swapped without a code change; the default keeps the original value.
GEMINI_MODEL_NAME = os.environ.get('GEMINI_MODEL_NAME', '') or 'gemini-pro'

# Initialize Gemini
# gemini_model stays None when no key is configured; callers check for that
# and fall back to non-LLM behaviour.
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel(GEMINI_MODEL_NAME)
else:
    gemini_model = None
65
+
66
+ # Request Models
67
class TextRequest(BaseModel):
    """Request body for the ``/detect-text`` endpoint."""

    # Raw news text to classify.
    text: str
69
+
70
class UrlRequest(BaseModel):
    """Request body for the ``/detect-url`` endpoint."""

    # Address of the article whose text should be extracted and classified.
    url: str
72
+
73
class ImageRequest(BaseModel):
    """Request body for the ``/detect-image`` endpoint."""

    # Address the image is downloaded from before OCR / AI-image detection.
    image_url: str
75
+
76
# Source Credibility Database
# Maps a lowercase registrable domain (no "www." prefix) to the outlet's
# display name.
CREDIBLE_SOURCES = {
    "apnews.com": "Associated Press",
    "reuters.com": "Reuters",
    "bbc.com": "BBC News",
    "bbc.co.uk": "BBC News",
    "nytimes.com": "The New York Times",
    "npr.org": "NPR",
    "pbs.org": "PBS NewsHour",
    "wsj.com": "The Wall Street Journal",
    "bloomberg.com": "Bloomberg",
    "theguardian.com": "The Guardian",
    "washingtonpost.com": "The Washington Post",
    "propublica.org": "ProPublica",
    "aljazeera.com": "Al Jazeera",
    "economist.com": "The Economist",
    "forbes.com": "Forbes",
}
94
+
95
# Known satire outlets, keyed by lowercase domain. An entry may carry a path
# after the domain (see the Borowitz Report) when only one section of a site
# is satire.
SATIRE_SOURCES = {
    "theonion.com": "The Onion",
    "babylonbee.com": "The Babylon Bee",
    "clickhole.com": "ClickHole",
    "newyorker.com/humor/borowitz-report": "The Borowitz Report",
    "thebeaverton.com": "The Beaverton",
    "cracked.com": "Cracked",
    "dailymash.co.uk": "The Daily Mash",
    "waterfordwhispersnews.com": "Waterford Whispers News",
}
105
+
106
+ # Helper functions
107
def get_fact_checks(text: str) -> List[Dict[str, str]]:
    """Get fact checks from the Google Fact Check Tools API.

    Best-effort lookup: any failure is logged and yields an empty list, so
    callers can always treat the result as optional enrichment.

    Args:
        text: Claim text; only the first 100 characters are sent as the query.

    Returns:
        Up to three dicts with the keys ``claim``, ``claimant``, ``rating``
        and ``url``. Empty when no API key is configured, the request fails,
        or nothing matches.
    """
    if not GOOGLE_FACT_CHECK_API_KEY:
        return []

    try:
        # Extract key claims (first 100 chars as query)
        query = text[:100]

        url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
        params = {
            "query": query,
            "key": GOOGLE_FACT_CHECK_API_KEY,
            "languageCode": "en",
        }

        response = requests.get(url, params=params, timeout=10)

        if response.status_code != 200:
            logger.warning("Fact check API error: %s", response.status_code)
            return []

        claims = response.json().get('claims') or []

        fact_checks = []
        for claim in claims[:3]:  # Top 3 fact checks
            # `or [{}]` also covers a present-but-empty review list, which
            # would otherwise raise IndexError and discard every result.
            review = (claim.get('claimReview') or [{}])[0]
            fact_checks.append({
                "claim": claim.get('text') or '',
                "claimant": claim.get('claimant') or '',
                "rating": review.get('textualRating') or 'Unknown',
                "url": review.get('url') or '',
            })

        return fact_checks

    except Exception as e:
        logger.error("Error getting fact checks: %s", e)
        return []
147
+
148
def extract_claims_with_gemini(text: str) -> List[str]:
    """Use Gemini to extract the key factual claim for a fact-check search.

    Args:
        text: Source text; only the first 1000 characters are sent to Gemini.

    Returns:
        A one-element list holding the extracted claim. When Gemini is not
        configured, returns nothing usable, or raises, the element is the
        first 100 characters of ``text`` instead.
    """
    fallback = [text[:100]]
    if not gemini_model:
        return fallback

    try:
        prompt = f"""
        Extract the single most important factual claim from the following text that can be used to search in a fact-check database.
        Output ONLY the extracted claim string, nothing else.

        Text: {text[:1000]}
        """
        response = gemini_model.generate_content(prompt)
        if response and hasattr(response, 'text'):
            claim = response.text.strip()
            return [claim] if claim else fallback
        return fallback
    except Exception as e:
        logger.error("Gemini claim extraction error: %s", e)
        return fallback
168
+
169
def generate_explanation_with_gemini(text: str, label: str, confidence: float, fact_checks: List[Dict]) -> str:
    """Use Gemini to explain the reasoning behind the detection result.

    Args:
        text: Analysed content; only the first 1000 characters are sent.
        label: Classification label shown to the user (e.g. "FAKE" / "REAL").
        confidence: Probability of ``label`` in the range 0..1.
        fact_checks: Fact-check dicts with ``claim`` and ``rating`` keys; may
            be empty.

    Returns:
        Gemini's explanation text, or a canned sentence when Gemini is not
        configured, returns no text, or raises.
    """
    if not gemini_model:
        return f"The news has been classified as {label} with {confidence:.2%} confidence."

    try:
        fact_check_context = ""
        if fact_checks:
            # .get() keeps a partially filled fact-check entry from aborting
            # the whole explanation with a KeyError.
            fact_check_lines = [
                f"- {fc.get('claim', '')} (Rating: {fc.get('rating', 'Unknown')})"
                for fc in fact_checks
            ]
            fact_check_context = "Relevant fact checks found:\n" + "\n".join(fact_check_lines)

        prompt = f"""
        Act as a professional fact-checker for an app called TruthLens.
        Analyze the following news text and the AI detection result.

        News Text: {text[:1000]}
        AI Classification: {label}
        Confidence: {confidence:.2%}
        {fact_check_context}

        Provide a concise, human-readable explanation (2-3 sentences) explaining why this news is likely {label}.
        Focus on style, source (if present), or specific fact-check evidence.
        """
        response = gemini_model.generate_content(prompt)
        if response and hasattr(response, 'text'):
            return response.text.strip()
        return f"The model identified this content as {label} with {confidence:.2%} confidence."
    except Exception as e:
        logger.error("Gemini explanation error: %s", e)
        return f"Analysis complete: The model identified this content as {label}."
198
+
199
def detect_ai_image(image_bytes: bytes) -> Dict[str, Any]:
    """Detect AI-generated content using the Hive Moderation API.

    Best-effort: a missing API key, a non-200 response, or any exception
    yields ``{"probability": 0.0, "generator": None}``.

    Args:
        image_bytes: Raw bytes of the image to analyse.

    Returns:
        Dict with ``probability`` (score that the image is AI-generated) and
        ``generator`` (name of the most likely generating tool, or ``None``).
    """
    no_result = {"probability": 0.0, "generator": None}
    if not HIVE_API_KEY:
        return no_result

    try:
        import base64

        url = "https://api.hivemoderation.com/v2/task/sync"

        headers = {
            "Authorization": f"Token {HIVE_API_KEY}",
            "Content-Type": "application/json"
        }

        # NOTE(review): the request shape (base64 "image" + "models") is kept
        # as written — confirm it against the current Hive API documentation.
        payload = {
            "image": base64.b64encode(image_bytes).decode('utf-8'),
            "models": ["ai_generated"]
        }

        response = requests.post(url, headers=headers, json=payload, timeout=30)

        if response.status_code != 200:
            logger.warning("Hive API error: %s", response.status_code)
            return no_result

        data = response.json()
        # `or [{}]` / `or {}` also cover present-but-empty or null containers.
        status = (data.get('status') or [{}])[0]
        output = ((status.get('response') or {}).get('output') or [{}])[0]

        classes = output.get('classes')
        if not classes:
            # Flat shape: score/class directly on the output entry.
            return {
                "probability": output.get('score', 0.0),
                "generator": output.get('class', None)
            }

        # Nested shape: a list of {"class": ..., "score": ...} entries.
        scores = {entry.get('class'): entry.get('score') or 0.0 for entry in classes}
        # presumably every class other than the two verdict classes (and
        # "none") names a generating tool — verify against Hive's class list.
        generators = {
            name: score
            for name, score in scores.items()
            if name not in (None, 'ai_generated', 'not_ai_generated', 'none')
        }
        return {
            "probability": scores.get('ai_generated', 0.0),
            "generator": max(generators, key=generators.get) if generators else None
        }

    except Exception as e:
        logger.error("Error detecting AI image: %s", e)
        return no_result
238
+
239
def calculate_risk_level(
    label: str,
    confidence: float,
    fact_checks: List[Dict],
    image_ai_result: Optional[Dict] = None
) -> str:
    """Calculate the overall risk level for an analysed item.

    Args:
        label: Classifier verdict, "FAKE" or "REAL".
        confidence: Probability of ``label`` in the range 0..1.
        fact_checks: Fact-check dicts; the ``rating`` text of each is
            inspected. Entries with a missing or ``None`` rating are ignored.
        image_ai_result: Optional AI-image result with a ``probability`` key.

    Returns:
        One of "low", "medium" or "high".
    """
    # Base risk on label and confidence: only a confident verdict moves the
    # risk away from "medium".
    if label == "FAKE" and confidence > 0.8:
        risk = "high"
    elif label == "REAL" and confidence > 0.8:
        risk = "low"
    else:
        risk = "medium"

    # Adjust based on fact checks: two or more negative ratings force "high".
    negative_markers = ('false', 'fake', 'satire')
    negative_ratings = 0
    for fact_check in fact_checks or []:
        rating = str(fact_check.get('rating') or '').lower()
        if any(marker in rating for marker in negative_markers):
            negative_ratings += 1
    if negative_ratings >= 2:
        risk = "high"

    # Adjust based on AI image detection: a likely AI-generated image raises
    # the risk by one step.
    if image_ai_result and (image_ai_result.get('probability') or 0.0) > 0.7:
        if risk == "low":
            risk = "medium"
        elif risk == "medium":
            risk = "high"

    return risk
271
+
272
def check_source_credibility(url: str) -> Dict[str, Any]:
    """Check if the URL belongs to a known credible or satire source.

    Matching is done on the URL's host, so a listed domain matches itself and
    any of its subdomains, but not an unrelated host or query string that
    merely contains the domain text. Satire entries that include a path only
    match URLs under that path.

    Args:
        url: Article URL, with or without a scheme.

    Returns:
        Dict with ``status`` ("satire", "credible" or "unknown"), ``name``,
        ``label`` and ``confidence``.
    """
    from urllib.parse import urlparse

    unknown = {"status": "unknown", "name": None, "label": None, "confidence": 0.0}

    def host_matches(host: str, domain: str) -> bool:
        return host == domain or host.endswith("." + domain)

    try:
        parsed = urlparse(url.strip())
        if not parsed.netloc:
            # Scheme-less input such as "bbc.com/news": re-parse so the first
            # segment is treated as the host instead of a path.
            parsed = urlparse("//" + url.strip())

        host = (parsed.hostname or "").lower()
        if not host:
            return unknown
        path = parsed.path.lower()

        # Check Satire first
        for satire_entry, name in SATIRE_SOURCES.items():
            satire_domain, _, satire_path = satire_entry.partition("/")
            if not host_matches(host, satire_domain):
                continue
            if satire_path and not path.startswith("/" + satire_path):
                continue
            return {"status": "satire", "name": name, "label": "FAKE", "confidence": 1.0}

        # Check Credible
        for credible_domain, name in CREDIBLE_SOURCES.items():
            if host_matches(host, credible_domain):
                return {"status": "credible", "name": name, "label": "REAL", "confidence": 0.95}

        return unknown
    except Exception as e:
        logger.error("Error checking credibility: %s", e)
        return unknown
294
+
295
def extract_article_text(url: str) -> str:
    """Extract article text from a URL.

    newspaper3k is tried first. If it raises or yields no text, the page is
    fetched directly and its visible text is taken with BeautifulSoup.

    Args:
        url: Address of the article page.

    Returns:
        The extracted text, or an empty string when both strategies fail.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text and article.text.strip():
            return article.text
        # An empty result is a failure for our purposes: try the fallback
        # instead of returning "" straight away.
        logger.warning("newspaper3k returned no text for %s", url)
    except Exception as e:
        logger.warning("Error extracting article with newspaper3k: %s", e)

    # Fallback to BeautifulSoup
    try:
        response = requests.get(url, timeout=30)
        # Do not classify the body of an HTTP error page as article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script and style elements
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Get text and collapse every run of whitespace to a single space;
        # the separator keeps words from adjacent elements apart.
        return ' '.join(soup.get_text(separator=' ').split())
    except Exception as e2:
        logger.error(f"Error with BeautifulSoup fallback: {e2}")
        return ""
325
+
326
@app.get("/")
async def root():
    """Health-check endpoint: return a static status payload."""
    return {"status": "healthy", "service": "TruthLens API"}
329
+
330
+ # Endpoints
331
+
332
@app.post("/detect-text")
async def detect_text(request: TextRequest, skip_extras: bool = False):
    """Detect fake news in text with optional fact-check and explanation.

    Args:
        request: Body carrying the text to classify.
        skip_extras: When true, return only label and confidence and skip the
            Gemini / fact-check calls.

    Returns:
        Dict with ``input_type``, ``label``, ``confidence`` and ``timestamp``;
        unless ``skip_extras`` is set it also carries ``text``,
        ``explanation``, ``fact_checks`` and ``risk_level``.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only, 500 on
            any unexpected failure.
    """
    text = request.text
    # Validate outside the try block so the 400 is not rewritten to a 500.
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="Text is required")

    try:
        # Load model
        tok, mdl = load_model()

        # Tokenize and predict
        # NOTE(review): inference and the outbound HTTP calls below are
        # blocking and run inside an async handler.
        inputs = tok(text, return_tensors="pt", truncation=True, max_length=512, padding=True)

        with torch.no_grad():
            outputs = mdl(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()

        # assumes class index 1 means REAL for this checkpoint — TODO confirm
        label = "REAL" if prediction == 1 else "FAKE"

        if skip_extras:
            return {
                "input_type": "text",
                "label": label,
                "confidence": confidence,
                "timestamp": datetime.now().isoformat()
            }

        # Enhanced Fact Checking with Gemini
        extracted_claims = extract_claims_with_gemini(text)
        fact_checks = get_fact_checks(extracted_claims[0])

        # Gemini Explanation
        explanation = generate_explanation_with_gemini(text, label, confidence, fact_checks)

        risk_level = calculate_risk_level(label, confidence, fact_checks)

        return {
            "input_type": "text",
            "text": text,
            "label": label,
            "confidence": confidence,
            "explanation": explanation,
            "fact_checks": fact_checks,
            "risk_level": risk_level,
            "timestamp": datetime.now().isoformat()
        }

    except HTTPException:
        # Deliberate HTTP errors keep their own status code.
        raise
    except Exception as e:
        logger.exception("Error in detect_text: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
386
+
387
@app.post("/detect-url")
async def detect_url(request: UrlRequest, skip_extras: bool = False):
    """Detect fake news in URL with optional fact-check and explanation.

    Args:
        request: Body carrying the article URL.
        skip_extras: When true, return only label, confidence and source
            metadata and skip the Gemini / fact-check calls.

    Returns:
        Dict with ``input_type``, ``label``, ``confidence``,
        ``source_metadata`` and ``timestamp``; unless ``skip_extras`` is set
        it also carries ``url``, ``text`` (first 500 characters),
        ``explanation``, ``fact_checks`` and ``risk_level``.

    Raises:
        HTTPException: 400 when the URL is empty or no article text can be
            extracted, 500 on any unexpected failure.
    """
    url = request.url
    if not url or not url.strip():
        raise HTTPException(status_code=400, detail="URL is required")

    try:
        # 1. Check Source Credibility First
        source_info = check_source_credibility(url)

        # NOTE(review): the URL is caller-supplied and fetched server-side.
        article_text = extract_article_text(url)
        if not article_text:
            raise HTTPException(status_code=400, detail="Failed to extract article text from URL")

        tok, mdl = load_model()

        inputs = tok(article_text, return_tensors="pt", truncation=True, max_length=512, padding=True)

        with torch.no_grad():
            outputs = mdl(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()

        # assumes class index 1 means REAL for this checkpoint — TODO confirm
        label = "REAL" if prediction == 1 else "FAKE"

        # Override with source credibility if it's definitive.
        # A credible source flagged FAKE keeps the model's label; the source
        # is only surfaced to the client through source_metadata.
        if source_info["status"] == "satire":
            label = "FAKE"
            confidence = 1.0

        if skip_extras:
            return {
                "input_type": "url",
                "label": label,
                "confidence": confidence,
                "source_metadata": source_info,
                "timestamp": datetime.now().isoformat()
            }

        # Enhanced Fact Checking with Gemini
        extracted_claims = extract_claims_with_gemini(article_text)
        fact_checks = get_fact_checks(extracted_claims[0])

        # Gemini Explanation
        source_name = source_info['name'] if source_info['name'] else 'Unknown'
        explanation = generate_explanation_with_gemini(
            f"Source: {source_name}. Content: {article_text}",
            label,
            confidence,
            fact_checks
        )

        risk_level = calculate_risk_level(label, confidence, fact_checks)

        return {
            "input_type": "url",
            "url": url,
            "source_metadata": source_info,
            "text": article_text[:500],
            "label": label,
            "confidence": confidence,
            "explanation": explanation,
            "fact_checks": fact_checks,
            "risk_level": risk_level,
            "timestamp": datetime.now().isoformat()
        }

    except HTTPException:
        # Keep the 400 raised above instead of rewriting it to a 500.
        raise
    except Exception as e:
        logger.exception("Error in detect_url: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
463
+
464
# Lazily created EasyOCR reader, shared by all requests.
_ocr_reader = None


def _get_ocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _ocr_reader
    if _ocr_reader is None:
        _ocr_reader = easyocr.Reader(['en'])
    return _ocr_reader


@app.post("/detect-image")
async def detect_image(request: ImageRequest, skip_extras: bool = False):
    """Detect fake news in an image via OCR text and AI-image detection.

    The image is downloaded, its text is read with OCR, and the image is
    sent to the AI-image detector. If OCR finds text, the text classifier
    decides the label; otherwise the label comes from the AI-image score.

    Args:
        request: Body carrying the image URL.
        skip_extras: When true, return only label, confidence and the
            AI-image result and skip the Gemini / fact-check calls.

    Returns:
        Dict with ``input_type``, ``label``, ``confidence``,
        ``image_ai_result`` and ``timestamp``; unless ``skip_extras`` is set
        it also carries ``image_url``, ``text``, ``explanation``,
        ``fact_checks`` and ``risk_level``.

    Raises:
        HTTPException: 400 when the image URL is empty, 500 on any
            unexpected failure (including a failed download).
    """
    image_url = request.image_url
    if not image_url or not image_url.strip():
        raise HTTPException(status_code=400, detail="Image URL is required")

    try:
        logger.info(f"Processing image: {image_url}")

        # NOTE(review): the URL is caller-supplied and fetched server-side.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        image_bytes = response.content

        # EasyOCR can read from bytes directly
        ocr_results = _get_ocr_reader().readtext(image_bytes)
        extracted_text = ' '.join([result[1] for result in ocr_results])
        has_text = bool(extracted_text.strip())

        image_ai_result = detect_ai_image(image_bytes)

        if has_text:
            tok, mdl = load_model()
            inputs = tok(extracted_text, return_tensors="pt", truncation=True, max_length=512, padding=True)

            with torch.no_grad():
                outputs = mdl(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()
                confidence = probabilities[0][prediction].item()

            # assumes class index 1 means REAL for this checkpoint — TODO confirm
            label = "REAL" if prediction == 1 else "FAKE"
        else:
            # No text to classify: fall back to the AI-image score alone.
            ai_probability = image_ai_result['probability']
            label = "FAKE" if ai_probability > 0.7 else "REAL"
            confidence = ai_probability if label == "FAKE" else (1 - ai_probability)

        if skip_extras:
            return {
                "input_type": "image",
                "label": label,
                "confidence": confidence,
                "image_ai_result": image_ai_result,
                "timestamp": datetime.now().isoformat()
            }

        # Enhanced Fact Checking with Gemini (only when there is text to check)
        if has_text:
            extracted_claims = extract_claims_with_gemini(extracted_text)
            fact_checks = get_fact_checks(extracted_claims[0])
        else:
            fact_checks = []

        # Gemini Explanation
        explanation = generate_explanation_with_gemini(
            extracted_text if extracted_text else "No text found in image",
            label,
            confidence,
            fact_checks
        )

        risk_level = calculate_risk_level(label, confidence, fact_checks, image_ai_result)

        return {
            "input_type": "image",
            "image_url": image_url,
            "text": extracted_text[:500] if extracted_text else None,
            "label": label,
            "confidence": confidence,
            "explanation": explanation,
            "fact_checks": fact_checks,
            "image_ai_result": image_ai_result,
            "risk_level": risk_level,
            "timestamp": datetime.now().isoformat()
        }

    except HTTPException:
        # Deliberate HTTP errors keep their own status code.
        raise
    except Exception as e:
        logger.exception("Error in detect_image: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
543
+
544
if __name__ == "__main__":
    import uvicorn

    # Serve on all interfaces. The port defaults to 8000 and can be
    # overridden with the PORT environment variable.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "8000")))