DevNumb committed on
Commit
d10d8b8
Β·
verified Β·
1 Parent(s): fdf37a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -124
app.py CHANGED
@@ -6,9 +6,7 @@ import torch
6
  import torch.nn.functional as F
7
  import re
8
  import logging
9
- import os
10
  import numpy as np
11
- from sklearn.metrics.pairwise import cosine_similarity
12
 
13
  # Set up logging
14
  logging.basicConfig(level=logging.INFO)
@@ -16,53 +14,81 @@ logger = logging.getLogger(__name__)
16
 
17
  class FakeNewsDetector:
18
  def __init__(self):
19
- logger.info("Loading sentence transformer model directly...")
20
 
21
  try:
22
- # Load model and tokenizer directly
23
  self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
24
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
25
  self.model = AutoModel.from_pretrained(self.model_name)
 
26
  logger.info("Model loaded successfully!")
27
  except Exception as e:
28
  logger.error(f"Error loading model: {e}")
29
  raise
30
 
31
- # Fake news patterns
32
  self.fake_news_patterns = [
33
- "conspiracy theory", "false claim", "misinformation", "debunked",
34
- "hoax", "unverified", "clickbait", "fake news", "deep state",
35
- "mainstream media lies", "cover up", "they don't want you to know",
36
- "secret truth", "hidden facts", "wake up people", "government lying",
37
- "media conspiracy", "false flag", "planned pandemic"
 
 
 
 
 
 
 
 
 
38
  ]
39
 
 
 
 
 
 
40
  # Credible sources
41
  self.credible_sources = [
42
  'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
43
  'theguardian.com', 'washingtonpost.com', 'npr.org',
44
  'wsj.com', 'ft.com', 'bloomberg.com', 'abcnews.go.com',
45
- 'cbsnews.com', 'nbcnews.com', 'cnn.com'
46
  ]
47
 
48
- # Fake news indicators
49
  self.fake_indicators = [
50
  "exclusive reveal", "shocking truth", "they don't want you to know",
51
- "mainstream media won't report this", "breaking secret news",
52
- "you won't believe", "this will shock you", "do your own research",
53
- "the truth they're hiding", "wake up sheeple", "open your eyes"
 
54
  ]
55
 
56
  # Sensational words
57
  self.sensational_words = [
58
- 'shocking', 'amazing', 'unbelievable', 'incredible', 'astounding',
59
  'mind-blowing', 'explosive', 'bombshell', 'earth-shattering',
60
- 'revolutionary', 'game-changing', 'miracle'
61
  ]
62
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def mean_pooling(self, model_output, attention_mask):
64
  """Apply mean pooling to get sentence embeddings"""
65
- token_embeddings = model_output[0] # First element contains all token embeddings
66
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
67
  sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
68
  sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
@@ -73,37 +99,37 @@ class FakeNewsDetector:
73
  try:
74
  if not text or len(text.strip()) == 0:
75
  return None
76
-
77
- # Tokenize sentences
78
  encoded_input = self.tokenizer(
79
  text,
80
  padding=True,
81
  truncation=True,
82
- max_length=512,
83
  return_tensors='pt'
84
  )
85
 
86
- # Compute token embeddings
87
  with torch.no_grad():
88
  model_output = self.model(**encoded_input)
89
 
90
- # Perform pooling
91
  sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
92
 
93
- # Normalize embeddings
94
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
95
 
96
  return sentence_embeddings
97
 
98
  except Exception as e:
99
- logger.error(f"Error getting sentence embedding: {e}")
100
  return None
101
 
102
  def extract_content(self, url: str):
103
  """Extract content from URL"""
104
  try:
105
  headers = {
106
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
107
  }
108
 
109
  if not url.startswith(('http://', 'https://')):
@@ -125,16 +151,9 @@ class FakeNewsDetector:
125
  # Try multiple content selectors
126
  content_text = ""
127
  content_selectors = [
128
- 'article',
129
- '.article-content',
130
- '.post-content',
131
- '.story-content',
132
- '.entry-content',
133
- 'main',
134
- '[role="main"]',
135
- '.news-content',
136
- '.story-body',
137
- '.content'
138
  ]
139
 
140
  for selector in content_selectors:
@@ -143,7 +162,7 @@ class FakeNewsDetector:
143
  content_parts = []
144
  for elem in elements:
145
  text = elem.get_text().strip()
146
- if len(text) > 100: # Only substantial content
147
  content_parts.append(text)
148
  if content_parts:
149
  content_text = ' '.join(content_parts)
@@ -171,9 +190,7 @@ class FakeNewsDetector:
171
 
172
  def clean_text(self, text: str):
173
  """Clean and normalize text"""
174
- # Remove extra whitespace
175
  text = re.sub(r'\s+', ' ', text)
176
- # Remove special characters but keep basic punctuation
177
  text = re.sub(r'[^\w\s.,!?;:()-]', '', text)
178
  return text.strip()
179
 
@@ -181,7 +198,7 @@ class FakeNewsDetector:
181
  """Analyze text content for fake news indicators"""
182
  text_lower = text.lower()
183
 
184
- # Count various indicators
185
  sensational_score = sum(1 for word in self.sensational_words if word in text_lower)
186
  fake_indicator_count = sum(1 for indicator in self.fake_indicators if indicator in text_lower)
187
 
@@ -189,11 +206,11 @@ class FakeNewsDetector:
189
  exclamation_count = text.count('!')
190
  question_count = text.count('?')
191
 
192
- # Check for all-caps words
193
  capital_words = len(re.findall(r'\b[A-Z]{3,}\b', text))
194
 
195
- # Check for emotional language
196
- emotional_words = ['outrageous', 'disgusting', 'horrible', 'terrible', 'awful']
197
  emotional_count = sum(1 for word in emotional_words if word in text_lower)
198
 
199
  return {
@@ -207,42 +224,60 @@ class FakeNewsDetector:
207
  }
208
 
209
  def check_source_credibility(self, url: str):
210
- """Check if the source is known to be credible"""
211
  url_lower = url.lower()
212
 
213
  # Check credible sources
214
  for credible_source in self.credible_sources:
215
  if credible_source in url_lower:
216
- return 0.8 # High credibility
217
 
218
- # Penalize known unreliable domains
219
- unreliable_domains = ['.blogspot.', '.wordpress.', '.tumblr.', 'medium.com']
220
  for domain in unreliable_domains:
221
  if domain in url_lower:
222
- return 0.2 # Low credibility
223
 
224
- return 0.5 # Neutral credibility
225
 
226
  def semantic_similarity_analysis(self, text: str):
227
- """Analyze semantic similarity with known fake news patterns"""
228
  try:
229
  if not text or len(text) < 50:
230
  return 0.0
231
-
232
- # Get embedding for input text
233
- text_embedding = self.get_sentence_embedding(text)
234
- if text_embedding is None:
235
- return 0.0
236
 
237
- # Get embeddings for fake news patterns
238
- pattern_embeddings = self.get_sentence_embedding(self.fake_news_patterns)
239
- if pattern_embeddings is None:
240
  return 0.0
241
 
242
- # Calculate cosine similarity
243
- similarities = F.cosine_similarity(text_embedding, pattern_embeddings)
244
- max_similarity = float(torch.max(similarities).item())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
 
246
  return max_similarity
247
 
248
  except Exception as e:
@@ -250,7 +285,7 @@ class FakeNewsDetector:
250
  return 0.0
251
 
252
  def detect_fake_news(self, url: str):
253
- """Main fake news detection function"""
254
  logger.info(f"Analyzing URL: {url}")
255
 
256
  # Extract content
@@ -272,7 +307,7 @@ class FakeNewsDetector:
272
  return {
273
  'status': '⚠️ Insufficient Content',
274
  'confidence': 0.0,
275
- 'message': 'Not enough text content found to analyze. The article may be behind a paywall or require JavaScript.',
276
  'title': title,
277
  'color': 'orange'
278
  }
@@ -282,62 +317,74 @@ class FakeNewsDetector:
282
  content_analysis = self.analyze_content(full_text)
283
  semantic_similarity = self.semantic_similarity_analysis(full_text)
284
 
285
- # Calculate fake news score
286
  fake_score = 0.0
287
 
288
- # Source credibility (inverse weight)
289
- source_factor = (1 - source_credibility) * 0.25
290
  fake_score += source_factor
291
 
292
- # Semantic similarity with fake patterns
293
- semantic_factor = semantic_similarity * 0.35
294
  fake_score += semantic_factor
295
 
296
- # Content analysis
297
  content_factor = (
298
- content_analysis['sensational_score'] * 0.05 +
299
- content_analysis['fake_indicator_count'] * 0.15 +
300
- min(content_analysis['exclamation_count'] * 0.02, 0.1) +
301
- min(content_analysis['capital_words'] * 0.01, 0.05) +
302
- min(content_analysis['emotional_count'] * 0.03, 0.05)
303
- ) * 0.4
 
 
304
  fake_score += content_factor
305
 
306
  fake_score = min(fake_score, 1.0)
307
 
308
- # Determine result
309
- if fake_score > 0.7:
 
 
 
 
 
310
  status = "🚨 Likely Fake News"
311
  color = "red"
312
- elif fake_score > 0.5:
313
  status = "⚠️ Suspicious Content"
314
  color = "orange"
315
- elif fake_score > 0.3:
316
  status = "πŸ€” Potentially Misleading"
317
  color = "yellow"
318
  else:
319
  status = "βœ… Likely Credible"
320
  color = "green"
321
 
322
- # Create detailed analysis message
323
  message = f"""
324
  **πŸ“Š Detailed Analysis Results:**
325
 
 
 
326
  **Source Analysis:**
327
- - Source Credibility Score: {source_credibility:.2f}/1.0
 
328
 
329
- **Semantic Analysis:**
330
- - Similarity to Fake News Patterns: {semantic_similarity:.2f}/1.0
 
331
 
332
  **Content Analysis:**
333
- - Sensational Language Score: {content_analysis['sensational_score']}
334
- - Fake News Indicators Found: {content_analysis['fake_indicator_count']}
335
  - Exclamation Marks: {content_analysis['exclamation_count']}
336
  - ALL-CAPS Words: {content_analysis['capital_words']}
337
  - Emotional Language: {content_analysis['emotional_count']}
 
338
 
339
  **Content Preview:**
340
- {content[:400]}...
341
  """.strip()
342
 
343
  return {
@@ -354,26 +401,24 @@ detector = FakeNewsDetector()
354
  def analyze_url(url):
355
  """Gradio interface function"""
356
  if not url.strip():
357
- return "Please enter a URL", "0%", "No analysis performed.", "No title", "gray"
358
 
359
  try:
360
  result = detector.detect_fake_news(url)
361
-
362
  confidence_percent = f"{result['confidence'] * 100:.1f}%"
363
 
364
  return (
365
  result['status'],
366
  confidence_percent,
367
  result['message'],
368
- result['title'],
369
- result['color']
370
  )
371
 
372
  except Exception as e:
373
  logger.error(f"Analysis error: {e}")
374
- return "❌ Analysis Error", "0%", f"An error occurred: {str(e)}", "Error", "red"
375
 
376
- # Create Gradio interface
377
  with gr.Blocks(
378
  theme=gr.themes.Soft(),
379
  title="Fake News Detector",
@@ -381,19 +426,14 @@ with gr.Blocks(
381
  .gradio-container {
382
  max-width: 900px !important;
383
  }
384
- .result-box {
385
- padding: 10px;
386
- border-radius: 5px;
387
- margin: 5px 0;
388
- }
389
  """
390
  ) as demo:
391
 
392
  gr.Markdown("""
393
- # πŸ•΅οΈ Fake News Detector
394
- **Analyze news articles for potential fake news using AI and Semantic Analysis**
395
 
396
- *This tool uses sentence transformers to analyze content patterns and detect potential misinformation*
397
  """)
398
 
399
  with gr.Row():
@@ -401,8 +441,7 @@ with gr.Blocks(
401
  url_input = gr.Textbox(
402
  label="πŸ“° Enter News Article URL",
403
  placeholder="https://example.com/news-article",
404
- lines=1,
405
- max_lines=1
406
  )
407
  analyze_btn = gr.Button("πŸ” Analyze Article", variant="primary", size="lg")
408
 
@@ -413,7 +452,7 @@ with gr.Blocks(
413
  interactive=False
414
  )
415
  confidence_score = gr.Textbox(
416
- label="πŸ“ˆ Confidence Score",
417
  interactive=False
418
  )
419
  article_title = gr.Textbox(
@@ -421,17 +460,14 @@ with gr.Blocks(
421
  interactive=False
422
  )
423
 
424
- details_output = gr.Markdown(
425
- label="πŸ“Š Detailed Analysis"
426
- )
427
 
428
- # Examples
429
  gr.Examples(
430
  label="πŸ’‘ Try these examples:",
431
  examples=[
432
- ["https://www.bbc.com/news"],
433
- ["https://www.reuters.com/"],
434
- ["https://apnews.com/"]
435
  ],
436
  inputs=url_input
437
  )
@@ -440,18 +476,15 @@ with gr.Blocks(
440
  ---
441
 
442
  **πŸ” How it works:**
443
- 1. **Content Extraction**: Extracts text from the provided URL
444
- 2. **Semantic Analysis**: Uses sentence transformers to analyze similarity with known fake news patterns
445
- 3. **Source Verification**: Checks the domain against known credible sources
446
- 4. **Pattern Detection**: Identifies sensational language and fake news indicators
447
- 5. **Confidence Scoring**: Provides a comprehensive confidence score
448
-
449
- **⚠️ Disclaimer**: This is an AI-powered educational tool. Always verify information through multiple credible sources and fact-checking organizations.
450
 
451
- *Built with ❀️ using Transformers from Hugging Face*
452
  """)
453
 
454
- # Set up the analysis button
455
  analyze_btn.click(
456
  fn=analyze_url,
457
  inputs=url_input,
@@ -459,8 +492,4 @@ with gr.Blocks(
459
  )
460
 
461
  if __name__ == "__main__":
462
- demo.launch(
463
- server_name="0.0.0.0",
464
- server_port=7860,
465
- share=False
466
- )
 
6
  import torch.nn.functional as F
7
  import re
8
  import logging
 
9
  import numpy as np
 
10
 
11
  # Set up logging
12
  logging.basicConfig(level=logging.INFO)
 
14
 
15
  class FakeNewsDetector:
16
  def __init__(self):
17
+ logger.info("Loading sentence transformer model...")
18
 
19
  try:
20
+ # Load model and tokenizer
21
  self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
22
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
23
  self.model = AutoModel.from_pretrained(self.model_name)
24
+ self.model.eval()
25
  logger.info("Model loaded successfully!")
26
  except Exception as e:
27
  logger.error(f"Error loading model: {e}")
28
  raise
29
 
30
+ # Fake news pattern phrases (each as separate string for embedding)
31
  self.fake_news_patterns = [
32
+ "this is a conspiracy theory and false information",
33
+ "this article contains debunked misinformation and hoaxes",
34
+ "unverified clickbait and fake news content",
35
+ "deep state conspiracy and government cover up",
36
+ "mainstream media lies and hidden truth",
37
+ "they don't want you to know this secret",
38
+ "wake up people this is a false flag",
39
+ "shocking exclusive truth they are hiding",
40
+ "do your own research mainstream media won't report",
41
+ "this will shock you unbelievable secret revealed",
42
+ "breaking secret news the truth exposed",
43
+ "government lying about planned pandemic",
44
+ "media conspiracy and propaganda machine",
45
+ "open your eyes wake up sheeple"
46
  ]
47
 
48
+ # Pre-compute embeddings for fake news patterns
49
+ logger.info("Pre-computing pattern embeddings...")
50
+ self.pattern_embeddings = self._compute_pattern_embeddings()
51
+ logger.info("Pattern embeddings ready!")
52
+
53
  # Credible sources
54
  self.credible_sources = [
55
  'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
56
  'theguardian.com', 'washingtonpost.com', 'npr.org',
57
  'wsj.com', 'ft.com', 'bloomberg.com', 'abcnews.go.com',
58
+ 'cbsnews.com', 'nbcnews.com', 'cnn.com', 'axios.com'
59
  ]
60
 
61
+ # Fake news text indicators
62
  self.fake_indicators = [
63
  "exclusive reveal", "shocking truth", "they don't want you to know",
64
+ "mainstream media won't report", "breaking secret", "you won't believe",
65
+ "this will shock you", "do your own research", "wake up sheeple",
66
+ "the truth they're hiding", "open your eyes", "government doesn't want",
67
+ "big pharma conspiracy", "new world order", "illuminati confirmed"
68
  ]
69
 
70
  # Sensational words
71
  self.sensational_words = [
72
+ 'shocking', 'unbelievable', 'incredible', 'astounding',
73
  'mind-blowing', 'explosive', 'bombshell', 'earth-shattering',
74
+ 'miracle', 'secret', 'hidden', 'exposed', 'revealed'
75
  ]
76
 
77
+ def _compute_pattern_embeddings(self):
78
+ """Pre-compute embeddings for all fake news patterns"""
79
+ embeddings = []
80
+ for pattern in self.fake_news_patterns:
81
+ embedding = self.get_sentence_embedding(pattern)
82
+ if embedding is not None:
83
+ embeddings.append(embedding)
84
+
85
+ if embeddings:
86
+ return torch.cat(embeddings, dim=0)
87
+ return None
88
+
89
  def mean_pooling(self, model_output, attention_mask):
90
  """Apply mean pooling to get sentence embeddings"""
91
+ token_embeddings = model_output[0]
92
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
93
  sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
94
  sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
99
  try:
100
  if not text or len(text.strip()) == 0:
101
  return None
102
+
103
+ # Tokenize
104
  encoded_input = self.tokenizer(
105
  text,
106
  padding=True,
107
  truncation=True,
108
+ max_length=384,
109
  return_tensors='pt'
110
  )
111
 
112
+ # Compute embeddings
113
  with torch.no_grad():
114
  model_output = self.model(**encoded_input)
115
 
116
+ # Mean pooling
117
  sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
118
 
119
+ # Normalize
120
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
121
 
122
  return sentence_embeddings
123
 
124
  except Exception as e:
125
+ logger.error(f"Error getting embedding: {e}")
126
  return None
127
 
128
  def extract_content(self, url: str):
129
  """Extract content from URL"""
130
  try:
131
  headers = {
132
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
133
  }
134
 
135
  if not url.startswith(('http://', 'https://')):
 
151
  # Try multiple content selectors
152
  content_text = ""
153
  content_selectors = [
154
+ 'article', '.article-content', '.post-content',
155
+ '.story-content', '.entry-content', 'main',
156
+ '[role="main"]', '.news-content', '.story-body'
 
 
 
 
 
 
 
157
  ]
158
 
159
  for selector in content_selectors:
 
162
  content_parts = []
163
  for elem in elements:
164
  text = elem.get_text().strip()
165
+ if len(text) > 100:
166
  content_parts.append(text)
167
  if content_parts:
168
  content_text = ' '.join(content_parts)
 
190
 
191
  def clean_text(self, text: str):
192
  """Clean and normalize text"""
 
193
  text = re.sub(r'\s+', ' ', text)
 
194
  text = re.sub(r'[^\w\s.,!?;:()-]', '', text)
195
  return text.strip()
196
 
 
198
  """Analyze text content for fake news indicators"""
199
  text_lower = text.lower()
200
 
201
+ # Count indicators
202
  sensational_score = sum(1 for word in self.sensational_words if word in text_lower)
203
  fake_indicator_count = sum(1 for indicator in self.fake_indicators if indicator in text_lower)
204
 
 
206
  exclamation_count = text.count('!')
207
  question_count = text.count('?')
208
 
209
+ # All-caps words
210
  capital_words = len(re.findall(r'\b[A-Z]{3,}\b', text))
211
 
212
+ # Emotional language
213
+ emotional_words = ['outrageous', 'disgusting', 'horrible', 'terrible', 'corrupt', 'evil']
214
  emotional_count = sum(1 for word in emotional_words if word in text_lower)
215
 
216
  return {
 
224
  }
225
 
226
  def check_source_credibility(self, url: str):
227
+ """Check source credibility"""
228
  url_lower = url.lower()
229
 
230
  # Check credible sources
231
  for credible_source in self.credible_sources:
232
  if credible_source in url_lower:
233
+ return 0.85
234
 
235
+ # Penalize unreliable domains
236
+ unreliable_domains = ['.blogspot.', '.wordpress.', '.tumblr.']
237
  for domain in unreliable_domains:
238
  if domain in url_lower:
239
+ return 0.15
240
 
241
+ return 0.5
242
 
243
  def semantic_similarity_analysis(self, text: str):
244
+ """Analyze semantic similarity with fake news patterns - FIXED"""
245
  try:
246
  if not text or len(text) < 50:
247
  return 0.0
 
 
 
 
 
248
 
249
+ if self.pattern_embeddings is None:
250
+ logger.warning("Pattern embeddings not available")
 
251
  return 0.0
252
 
253
+ # Split text into chunks for better analysis
254
+ max_chunk_length = 500
255
+ chunks = [text[i:i+max_chunk_length] for i in range(0, min(len(text), 2000), max_chunk_length)]
256
+
257
+ max_similarity = 0.0
258
+
259
+ for chunk in chunks:
260
+ if len(chunk.strip()) < 30:
261
+ continue
262
+
263
+ # Get embedding for chunk
264
+ chunk_embedding = self.get_sentence_embedding(chunk)
265
+ if chunk_embedding is None:
266
+ continue
267
+
268
+ # Calculate cosine similarity with all pattern embeddings
269
+ # chunk_embedding: [1, 384], pattern_embeddings: [N, 384]
270
+ similarities = F.cosine_similarity(
271
+ chunk_embedding.unsqueeze(1), # [1, 1, 384]
272
+ self.pattern_embeddings.unsqueeze(0), # [1, N, 384]
273
+ dim=2
274
+ ).squeeze() # [N]
275
+
276
+ # Get maximum similarity
277
+ chunk_max_sim = float(torch.max(similarities).item())
278
+ max_similarity = max(max_similarity, chunk_max_sim)
279
 
280
+ logger.info(f"Semantic similarity: {max_similarity:.4f}")
281
  return max_similarity
282
 
283
  except Exception as e:
 
285
  return 0.0
286
 
287
  def detect_fake_news(self, url: str):
288
+ """Main detection function - FIXED SCORING"""
289
  logger.info(f"Analyzing URL: {url}")
290
 
291
  # Extract content
 
307
  return {
308
  'status': '⚠️ Insufficient Content',
309
  'confidence': 0.0,
310
+ 'message': 'Not enough text content to analyze.',
311
  'title': title,
312
  'color': 'orange'
313
  }
 
317
  content_analysis = self.analyze_content(full_text)
318
  semantic_similarity = self.semantic_similarity_analysis(full_text)
319
 
320
+ # Calculate fake news score - FIXED WEIGHTS
321
  fake_score = 0.0
322
 
323
+ # Source credibility (40% weight)
324
+ source_factor = (1 - source_credibility) * 0.4
325
  fake_score += source_factor
326
 
327
+ # Semantic similarity (40% weight) - INCREASED IMPORTANCE
328
+ semantic_factor = semantic_similarity * 0.4
329
  fake_score += semantic_factor
330
 
331
+ # Content indicators (20% weight)
332
  content_factor = (
333
+ min(content_analysis['sensational_score'] * 0.08, 0.4) +
334
+ min(content_analysis['fake_indicator_count'] * 0.2, 0.6) +
335
+ min(content_analysis['exclamation_count'] * 0.03, 0.15) +
336
+ min(content_analysis['capital_words'] * 0.02, 0.1) +
337
+ min(content_analysis['emotional_count'] * 0.05, 0.15)
338
+ )
339
+ # Normalize content factor to 0-1 range
340
+ content_factor = min(content_factor, 1.0) * 0.2
341
  fake_score += content_factor
342
 
343
  fake_score = min(fake_score, 1.0)
344
 
345
+ logger.info(f"Final fake score: {fake_score:.4f}")
346
+ logger.info(f" - Source factor: {source_factor:.4f}")
347
+ logger.info(f" - Semantic factor: {semantic_factor:.4f}")
348
+ logger.info(f" - Content factor: {content_factor:.4f}")
349
+
350
+ # Determine result - ADJUSTED THRESHOLDS
351
+ if fake_score > 0.65:
352
  status = "🚨 Likely Fake News"
353
  color = "red"
354
+ elif fake_score > 0.45:
355
  status = "⚠️ Suspicious Content"
356
  color = "orange"
357
+ elif fake_score > 0.30:
358
  status = "πŸ€” Potentially Misleading"
359
  color = "yellow"
360
  else:
361
  status = "βœ… Likely Credible"
362
  color = "green"
363
 
364
+ # Detailed message
365
  message = f"""
366
  **πŸ“Š Detailed Analysis Results:**
367
 
368
+ **Overall Fake News Score: {fake_score:.2f}/1.0**
369
+
370
  **Source Analysis:**
371
+ - Source Credibility: {source_credibility:.2f}/1.0
372
+ - Source Factor Impact: {source_factor:.3f}
373
 
374
+ **AI Semantic Analysis:**
375
+ - Similarity to Fake News Patterns: {semantic_similarity:.3f}/1.0
376
+ - Semantic Factor Impact: {semantic_factor:.3f}
377
 
378
  **Content Analysis:**
379
+ - Sensational Language: {content_analysis['sensational_score']} instances
380
+ - Fake News Indicators: {content_analysis['fake_indicator_count']} found
381
  - Exclamation Marks: {content_analysis['exclamation_count']}
382
  - ALL-CAPS Words: {content_analysis['capital_words']}
383
  - Emotional Language: {content_analysis['emotional_count']}
384
+ - Content Factor Impact: {content_factor:.3f}
385
 
386
  **Content Preview:**
387
+ {content[:350]}...
388
  """.strip()
389
 
390
  return {
 
401
  def analyze_url(url):
402
  """Gradio interface function"""
403
  if not url.strip():
404
+ return "Please enter a URL", "0%", "No analysis performed.", "No title"
405
 
406
  try:
407
  result = detector.detect_fake_news(url)
 
408
  confidence_percent = f"{result['confidence'] * 100:.1f}%"
409
 
410
  return (
411
  result['status'],
412
  confidence_percent,
413
  result['message'],
414
+ result['title']
 
415
  )
416
 
417
  except Exception as e:
418
  logger.error(f"Analysis error: {e}")
419
+ return "❌ Analysis Error", "0%", f"Error: {str(e)}", "Error"
420
 
421
+ # Gradio interface
422
  with gr.Blocks(
423
  theme=gr.themes.Soft(),
424
  title="Fake News Detector",
 
426
  .gradio-container {
427
  max-width: 900px !important;
428
  }
 
 
 
 
 
429
  """
430
  ) as demo:
431
 
432
  gr.Markdown("""
433
+ # πŸ•΅οΈ AI-Powered Fake News Detector
434
+ **Analyze news articles using Sentence Transformers and Semantic Analysis**
435
 
436
+ *Uses advanced NLP to detect misinformation patterns*
437
  """)
438
 
439
  with gr.Row():
 
441
  url_input = gr.Textbox(
442
  label="πŸ“° Enter News Article URL",
443
  placeholder="https://example.com/news-article",
444
+ lines=1
 
445
  )
446
  analyze_btn = gr.Button("πŸ” Analyze Article", variant="primary", size="lg")
447
 
 
452
  interactive=False
453
  )
454
  confidence_score = gr.Textbox(
455
+ label="πŸ“ˆ Fake News Score",
456
  interactive=False
457
  )
458
  article_title = gr.Textbox(
 
460
  interactive=False
461
  )
462
 
463
+ details_output = gr.Markdown(label="πŸ“Š Detailed Analysis")
 
 
464
 
 
465
  gr.Examples(
466
  label="πŸ’‘ Try these examples:",
467
  examples=[
468
+ ["https://www.reuters.com/world/"],
469
+ ["https://apnews.com/"],
470
+ ["https://www.bbc.com/news"]
471
  ],
472
  inputs=url_input
473
  )
 
476
  ---
477
 
478
  **πŸ” How it works:**
479
+ 1. **Content Extraction**: Scrapes article text
480
+ 2. **Semantic Analysis**: Compares text embeddings with fake news patterns using cosine similarity
481
+ 3. **Source Verification**: Checks domain credibility
482
+ 4. **Pattern Detection**: Identifies misleading language indicators
483
+ 5. **Confidence Scoring**: Weighted score from all analyses
 
 
484
 
485
+ **⚠️ Disclaimer**: Educational tool only. Always verify through multiple credible sources.
486
  """)
487
 
 
488
  analyze_btn.click(
489
  fn=analyze_url,
490
  inputs=url_input,
 
492
  )
493
 
494
  if __name__ == "__main__":
495
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False)