muneeb-1 committed on
Commit
6282a53
·
verified ·
1 Parent(s): af6eb96

Upload 2 files

Browse files
Files changed (2) hide show
  1. main.py +410 -0
  2. requirements.txt +14 -0
main.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import requests
5
+ from typing import Dict, List, Any, Optional
6
+ from fastapi import FastAPI, HTTPException, Body
7
+ from pydantic import BaseModel
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+ from newspaper import Article
11
+ from bs4 import BeautifulSoup
12
+ import easyocr
13
+ from PIL import Image
14
+ import google.generativeai as genai
15
+ from datetime import datetime
16
+
17
# FastAPI application instance.
app = FastAPI(title="TruthLens Backend")

# Hugging Face fake-news classifier (RoBERTa-based checkpoint).
MODEL_NAME = "Pulk17/Fake-News-Detection"
# Populated lazily by load_model() on first use.
tokenizer = None
model = None

def load_model():
    """Load the Hugging Face tokenizer/model on first call and cache them.

    Returns the (tokenizer, model) pair; subsequent calls reuse the cached
    globals so the checkpoint is only downloaded/loaded once per process.
    """
    global tokenizer, model
    if model is None or tokenizer is None:
        print("Loading Hugging Face model...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
        # Inference only: disable dropout/batch-norm training behavior.
        model.eval()
    return tokenizer, model
34
+
35
# API credentials — read from the process environment (or a .env file
# loaded before startup). Empty string means "feature disabled".
GOOGLE_FACT_CHECK_API_KEY = os.getenv('GOOGLE_FACT_CHECK_API_KEY', '')
HIVE_API_KEY = os.getenv('HIVE_API_KEY', '')
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', '')

# Configure Gemini only when a key is present; helpers treat a None
# gemini_model as "Gemini unavailable" and fall back gracefully.
gemini_model = None
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel('gemini-pro')
47
+
48
+ # Request Models
49
class TextRequest(BaseModel):
    """Payload for /detect-text: the raw news text to analyse."""

    text: str
51
+
52
class UrlRequest(BaseModel):
    """Payload for /detect-url: the article URL to fetch and analyse."""

    url: str
54
+
55
class ImageRequest(BaseModel):
    """Payload for /detect-image: the URL of the image to analyse."""

    image_url: str
57
+
58
+ # Helper functions
59
def get_fact_checks(text: str) -> List[Dict[str, str]]:
    """Search the Google Fact Check Tools API for claims matching *text*.

    Uses the first 100 characters of *text* as the query. Returns up to
    three {claim, claimant, rating, url} dicts; returns [] when the API
    key is missing, the request fails, or no claims are found.
    """
    if not GOOGLE_FACT_CHECK_API_KEY:
        return []

    try:
        # Extract key claims (first 100 chars as query)
        query = text[:100]

        url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
        params = {
            "query": query,
            "key": GOOGLE_FACT_CHECK_API_KEY,
            "languageCode": "en",
        }

        response = requests.get(url, params=params, timeout=10)
        if response.status_code != 200:
            print(f"Fact check API error: {response.status_code}")
            return []

        claims = response.json().get('claims', [])

        fact_checks = []
        for claim in claims[:3]:  # Top 3 fact checks
            # Bug fix: a claim may carry an *empty* 'claimReview' list, in
            # which case the original `claim.get('claimReview', [{}])[0]`
            # raised IndexError (the default only applies when the key is
            # absent). Normalize to a single review dict first.
            review = (claim.get('claimReview') or [{}])[0]
            fact_checks.append({
                "claim": claim.get('text', ''),
                "claimant": claim.get('claimant', ''),
                "rating": review.get('textualRating', 'Unknown'),
                "url": review.get('url', ''),
            })

        return fact_checks

    except Exception as e:
        print(f"Error getting fact checks: {e}")
        return []
99
+
100
def extract_claims_with_gemini(text: str) -> List[str]:
    """Distill the single most search-worthy factual claim from *text*.

    Uses Gemini when configured; falls back to the first 100 characters
    of the input when Gemini is unavailable, returns nothing, or errors.
    Always returns a one-element list.
    """
    fallback = [text[:100]]
    if not gemini_model:
        return fallback

    try:
        prompt = f"""
Extract the single most important factual claim from the following text that can be used to search in a fact-check database.
Output ONLY the extracted claim string, nothing else.

Text: {text[:1000]}
"""
        claim = gemini_model.generate_content(prompt).text.strip()
        return [claim] if claim else fallback
    except Exception as e:
        print(f"Gemini claim extraction error: {e}")
        return fallback
118
+
119
def generate_explanation_with_gemini(text: str, label: str, confidence: float, fact_checks: List[Dict]) -> str:
    """Produce a short human-readable rationale for the detection result.

    Returns a templated sentence when Gemini is unconfigured, a Gemini
    generated explanation when available, and a generic completion
    message when the API call fails.
    """
    if not gemini_model:
        return f"The news has been classified as {label} with {confidence:.2%} confidence."

    try:
        fact_check_context = ""
        if fact_checks:
            evidence = [f"- {fc['claim']} (Rating: {fc['rating']})" for fc in fact_checks]
            fact_check_context = "Relevant fact checks found:\n" + "\n".join(evidence)

        prompt = f"""
Act as a professional fact-checker for an app called TruthLens.
Analyze the following news text and the AI detection result.

News Text: {text[:1000]}
AI Classification: {label}
Confidence: {confidence:.2%}
{fact_check_context}

Provide a concise, human-readable explanation (2-3 sentences) explaining why this news is likely {label}.
Focus on style, source (if present), or specific fact-check evidence.
"""
        return gemini_model.generate_content(prompt).text.strip()
    except Exception as e:
        print(f"Gemini explanation error: {e}")
        return f"Analysis complete: The model identified this content as {label}."
146
+
147
def detect_ai_image(image_bytes: bytes) -> Dict[str, Any]:
    """Score *image_bytes* for AI generation via the Hive Moderation API.

    Returns {"probability": float, "generator": str | None}. A neutral
    result (probability 0.0, generator None) is returned when the API key
    is missing, the request fails, or the response cannot be parsed.
    """
    if not HIVE_API_KEY:
        return {"probability": 0.0, "generator": None}

    try:
        import base64  # local import: only needed on this code path

        url = "https://api.hivemoderation.com/v2/task/sync"
        headers = {
            "Authorization": f"Token {HIVE_API_KEY}",
            "Content-Type": "application/json",
        }
        payload = {
            "image": base64.b64encode(image_bytes).decode('utf-8'),
            "models": ["ai_generated"],
        }

        response = requests.post(url, headers=headers, json=payload, timeout=30)
        if response.status_code != 200:
            print(f"Hive API error: {response.status_code}")
            return {"probability": 0.0, "generator": None}

        data = response.json()
        # Bug fix: the original chained `.get(..., [{}])[0]` lookups raised
        # IndexError when 'status' or 'output' was present but an empty
        # list (the default only applies when the key is absent).
        status = data.get('status') or [{}]
        output = status[0].get('response', {}).get('output') or [{}]
        ai_generated = output[0]

        return {
            "probability": ai_generated.get('score', 0.0),
            "generator": ai_generated.get('class', None),
        }

    except Exception as e:
        print(f"Error detecting AI image: {e}")
        return {"probability": 0.0, "generator": None}
186
+
187
def calculate_risk_level(
    label: str,
    confidence: float,
    fact_checks: List[Dict],
    image_ai_result: Optional[Dict] = None
) -> str:
    """Combine classifier output, fact checks and image analysis into a tier.

    Returns one of "low", "medium" or "high".
    """
    # Base tier from the classifier's verdict and confidence.
    if label == "FAKE":
        risk = "high" if confidence > 0.8 else "medium"
    elif label == "REAL" and confidence > 0.8:
        risk = "low"
    else:
        risk = "medium"

    # Two or more debunking ratings from fact checkers force "high".
    debunk_markers = ("false", "fake", "satire")
    debunked = sum(
        1 for fc in fact_checks
        if any(marker in fc['rating'].lower() for marker in debunk_markers)
    )
    if debunked >= 2:
        risk = "high"

    # A likely AI-generated image bumps the risk one tier.
    if image_ai_result and image_ai_result['probability'] > 0.7:
        risk = {"low": "medium", "medium": "high"}.get(risk, risk)

    return risk
219
+
220
def extract_article_text(url: str) -> str:
    """Fetch and extract the main article text from *url*.

    Tries newspaper3k first; on failure falls back to a plain requests +
    BeautifulSoup scrape with script/style tags stripped. Returns "" when
    both strategies fail.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"Error extracting article with newspaper3k: {e}")

    # Fallback: raw fetch + tag stripping with BeautifulSoup.
    try:
        response = requests.get(url, timeout=30)
        # Bug fix: fail fast on HTTP errors so a 404/500 error page is not
        # mistaken for article content (the original parsed error pages).
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script and style elements
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text()

        # Collapse whitespace: strip each line, split on spaces, rejoin.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        return ' '.join(chunk for chunk in chunks if chunk)
    except Exception as e2:
        print(f"Error with BeautifulSoup fallback: {e2}")
        return ""
250
+
251
+ # Endpoints
252
+
253
@app.post("/detect-text")
async def detect_text(request: TextRequest):
    """Classify raw text as REAL/FAKE and enrich with fact checks.

    Returns a dict with label, confidence, explanation, fact checks and a
    risk tier. Raises 400 on empty input and 500 on internal errors.
    """
    text = request.text
    if not text:
        # Bug fix: validate *outside* the broad try block — previously this
        # 400 was caught by `except Exception` and re-raised as a 500.
        raise HTTPException(status_code=400, detail="Text is required")

    try:
        tok, mdl = load_model()

        # Tokenize and run the classifier; no gradients needed at inference.
        inputs = tok(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            outputs = mdl(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()

        # NOTE(review): assumes class index 1 == REAL for this checkpoint —
        # confirm against the model card's id2label mapping.
        label = "REAL" if prediction == 1 else "FAKE"

        # Distill a searchable claim via Gemini, then query fact checkers.
        extracted_claims = extract_claims_with_gemini(text)
        fact_checks = get_fact_checks(extracted_claims[0])

        explanation = generate_explanation_with_gemini(text, label, confidence, fact_checks)
        risk_level = calculate_risk_level(label, confidence, fact_checks)

        return {
            "input_type": "text",
            "text": text,
            "label": label,
            "confidence": confidence,
            "explanation": explanation,
            "fact_checks": fact_checks,
            "risk_level": risk_level,
            "timestamp": datetime.now().isoformat(),
        }
    except HTTPException:
        # Propagate deliberate HTTP errors untouched.
        raise
    except Exception as e:
        print(f"Error in detect_text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
298
+
299
@app.post("/detect-url")
async def detect_url(request: UrlRequest):
    """Fetch an article URL, classify its text, and enrich the result.

    Raises 400 on a missing URL or failed extraction, 500 on internal errors.
    """
    url = request.url
    if not url:
        # Bug fix: validate outside the broad try block — previously this
        # 400 was caught by `except Exception` and re-raised as a 500.
        raise HTTPException(status_code=400, detail="URL is required")

    try:
        article_text = extract_article_text(url)
        if not article_text:
            raise HTTPException(status_code=400, detail="Failed to extract article text from URL")

        tok, mdl = load_model()

        inputs = tok(article_text, return_tensors="pt", truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            outputs = mdl(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()

        # NOTE(review): assumes class index 1 == REAL — confirm via model card.
        label = "REAL" if prediction == 1 else "FAKE"

        extracted_claims = extract_claims_with_gemini(article_text)
        fact_checks = get_fact_checks(extracted_claims[0])
        explanation = generate_explanation_with_gemini(article_text, label, confidence, fact_checks)
        risk_level = calculate_risk_level(label, confidence, fact_checks)

        return {
            "input_type": "url",
            "url": url,
            "text": article_text[:500],
            "label": label,
            "confidence": confidence,
            "explanation": explanation,
            "fact_checks": fact_checks,
            "risk_level": risk_level,
            "timestamp": datetime.now().isoformat(),
        }
    except HTTPException:
        # Bug fix: re-raise HTTP errors (e.g. the 400 above) instead of
        # letting the generic handler convert them into 500s.
        raise
    except Exception as e:
        print(f"Error in detect_url: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
347
+
348
# Cache the EasyOCR reader: constructing it loads the detection and
# recognition models and is far too slow to repeat on every request.
_OCR_READER: Optional[Any] = None

def _get_ocr_reader():
    """Create the English EasyOCR reader once and reuse it."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['en'])
    return _OCR_READER

@app.post("/detect-image")
async def detect_image(request: ImageRequest):
    """OCR an image URL, classify any embedded text, and check AI generation.

    Falls back to the AI-image signal alone when the image has no readable
    text. Raises 400 on a missing URL, 500 on internal errors.
    """
    image_url = request.image_url
    if not image_url:
        # Bug fix: validate outside the broad try block — previously this
        # 400 was caught by `except Exception` and re-raised as a 500.
        raise HTTPException(status_code=400, detail="Image URL is required")

    try:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        image_bytes = response.content

        # EasyOCR can read from bytes directly
        ocr_results = _get_ocr_reader().readtext(image_bytes)
        extracted_text = ' '.join(result[1] for result in ocr_results)

        image_ai_result = detect_ai_image(image_bytes)

        if extracted_text.strip():
            tok, mdl = load_model()
            inputs = tok(extracted_text, return_tensors="pt", truncation=True, max_length=512, padding=True)
            with torch.no_grad():
                outputs = mdl(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()
                confidence = probabilities[0][prediction].item()

            # NOTE(review): assumes class index 1 == REAL — confirm via model card.
            label = "REAL" if prediction == 1 else "FAKE"
            extracted_claims = extract_claims_with_gemini(extracted_text)
            fact_checks = get_fact_checks(extracted_claims[0])
        else:
            # No readable text: decide from the AI-generation probability.
            label = "FAKE" if image_ai_result['probability'] > 0.7 else "REAL"
            confidence = image_ai_result['probability'] if label == "FAKE" else (1 - image_ai_result['probability'])
            fact_checks = []

        explanation = generate_explanation_with_gemini(
            extracted_text if extracted_text else "No text found in image",
            label, confidence, fact_checks,
        )
        risk_level = calculate_risk_level(label, confidence, fact_checks, image_ai_result)

        return {
            "input_type": "image",
            "image_url": image_url,
            "text": extracted_text[:500] if extracted_text else None,
            "label": label,
            "confidence": confidence,
            "explanation": explanation,
            "fact_checks": fact_checks,
            "image_ai_result": image_ai_result,
            "risk_level": risk_level,
            "timestamp": datetime.now().isoformat(),
        }
    except HTTPException:
        # Bug fix: re-raise HTTP errors instead of converting them to 500s.
        raise
    except Exception as e:
        print(f"Error in detect_image: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
407
+
408
if __name__ == "__main__":
    # Run a local development server when the module is executed directly.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.109.0
2
+ uvicorn==0.27.0
3
+ requests==2.31.0
4
+ torch==2.1.2
5
+ transformers==4.36.2
6
+ newspaper3k==0.2.8
7
+ beautifulsoup4==4.12.2
8
+ easyocr==1.7.1
9
+ Pillow==10.2.0
10
+ python-multipart==0.0.6
11
+ pydantic==2.5.3
12
+ lxml_html_clean
13
+ opencv-python-headless
14
+ google-generativeai==0.3.2