# app.py — Fake News Detector (Hugging Face Space, commit 01cd108)
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import re
import logging
# Set up module-level logging so model loading and per-request steps are traceable.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class FakeNewsDetector:
    """URL-based fake-news analyzer.

    Combines a transformer text-classification model with two cheap
    heuristics: a source-credibility lookup and sensationalist-phrase
    counting. Heavy work (model download/load) happens once, here.
    """

    def __init__(self):
        """Load the classification pipeline and the heuristic word lists.

        Raises:
            Exception: re-raised from the pipeline constructor when the
                model cannot be downloaded or initialized.
        """
        logger.info("Loading RoBERTa Fake News Detection model...")
        model_id = "jy46604790/Fake-News-Bert-Detect"
        try:
            # pipeline() handles download, caching, tokenization and inference.
            self.classifier = pipeline(
                "text-classification",
                model=model_id,
                tokenizer=model_id,
                device=-1,        # CPU only (free-tier hardware)
                max_length=512,   # encoder hard limit
                truncation=True,  # silently trim over-long inputs
            )
            logger.info("βœ… Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

        # Domains treated as trustworthy outlets (scored 0.9 by
        # check_source_credibility).
        self.credible_sources = [
            'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
            'theguardian.com', 'washingtonpost.com', 'npr.org',
            'wsj.com', 'ft.com', 'bloomberg.com', 'abcnews.go.com',
            'cbsnews.com', 'nbcnews.com', 'cnn.com', 'axios.com',
        ]
        # Phrases commonly used by sensationalist / fake articles,
        # matched case-insensitively by analyze_content.
        self.fake_indicators = [
            "exclusive reveal", "shocking truth", "they don't want you to know",
            "mainstream media won't report", "breaking secret", "you won't believe",
            "wake up sheeple", "open your eyes",
        ]
def extract_content(self, url: str):
    """Fetch *url* and pull out the page title and article body text.

    Returns:
        dict: ``{'success': True, 'title', 'content', 'url'}`` on success
        (``url`` may have an ``https://`` scheme prepended), or
        ``{'success': False, 'error'}`` on any failure.
    """
    try:
        # Assume HTTPS when the caller omitted the scheme.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        response = requests.get(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            },
            timeout=15,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop boilerplate/navigation markup before extracting text.
        for junk in soup(["script", "style", "nav", "footer", "header", "aside"]):
            junk.decompose()

        title_tag = soup.find('title')
        title_text = title_tag.get_text().strip() if title_tag else "No title found"

        # Likely article containers, checked most-specific first; the first
        # selector yielding substantial text (>100 chars per element) wins.
        article_text = ""
        for selector in ('article', '.article-content', '.post-content',
                         '.story-content', '.entry-content', 'main',
                         '[role="main"]', '.news-content', '.story-body'):
            matches = soup.select(selector)
            if not matches:
                continue
            chunks = [t for t in (m.get_text().strip() for m in matches)
                      if len(t) > 100]
            if chunks:
                article_text = ' '.join(chunks)
                break

        # Fall back to the whole <body> when the selectors found too little.
        if not article_text or len(article_text) < 200:
            body_tag = soup.find('body')
            if body_tag:
                article_text = body_tag.get_text()

        return {
            'success': True,
            'title': title_text,
            'content': self.clean_text(article_text),
            'url': url,
        }
    except Exception as e:
        logger.error(f"Content extraction error: {e}")
        return {'success': False, 'error': str(e)}
def clean_text(self, text: str):
    """Normalize extracted text.

    Collapses all whitespace runs to single spaces, removes every
    character outside word chars / whitespace / common punctuation
    (so quotes, #, @, emoji etc. are dropped), and trims the ends.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    printable = re.sub(r'[^\w\s.,!?;:()-]', '', collapsed)
    return printable.strip()
def truncate_text(self, text: str, max_words: int = 350):
    """Cap *text* at *max_words* whitespace-separated words.

    Keeps the model input comfortably inside its 512-token limit; the
    original text is returned untouched when already short enough.
    """
    words = text.split()
    if len(words) <= max_words:
        return text
    # Same module logger as the file-level `logger` binding.
    logging.getLogger(__name__).info(
        f"Text truncated from {len(words)} to {max_words} words"
    )
    return ' '.join(words[:max_words])
def analyze_content(self, text: str):
"""Analyze text for fake news indicators"""
text_lower = text.lower()
fake_indicator_count = sum(1 for indicator in self.fake_indicators if indicator in text_lower)
exclamation_count = text.count('!')
capital_words = len(re.findall(r'\b[A-Z]{3,}\b', text))
return {
'fake_indicator_count': fake_indicator_count,
'exclamation_count': exclamation_count,
'capital_words': capital_words
}
def check_source_credibility(self, url: str):
"""Check source credibility"""
url_lower = url.lower()
for credible_source in self.credible_sources:
if credible_source in url_lower:
return 0.9
unreliable = ['.blogspot.', '.wordpress.', '.tumblr.']
for domain in unreliable:
if domain in url_lower:
return 0.1
return 0.5
def detect_fake_news(self, url: str):
    """Main detection function.

    Fetches the article at *url*, classifies it with the transformer
    model, blends in source-credibility and content heuristics, and
    returns a dict with 'status', 'confidence', 'message', 'title'.
    Never raises: extraction and model failures are reported in-band.
    """
    logger.info(f"Analyzing: {url}")
    # Extract content
    content_data = self.extract_content(url)
    if not content_data['success']:
        return {
            'status': '❌ Extraction Failed',
            'confidence': 0.0,
            'message': f"Could not extract content: {content_data.get('error', 'Unknown error')}",
            'title': 'Error'
        }
    title = content_data['title']
    content = content_data['content']
    # Too little text to classify reliably (often a paywall or JS-only page).
    if len(content.strip()) < 100:
        return {
            'status': '⚠️ Insufficient Content',
            'confidence': 0.0,
            'message': 'Not enough content to analyze. May be behind paywall.',
            'title': title
        }
    # Prepare text for model (title + truncated content)
    full_text = f"{title}. {content}"
    # Truncate text to safe length for the model
    truncated_text = self.truncate_text(full_text, max_words=350)
    logger.info(f"Text length: {len(truncated_text)} characters")
    # Use RoBERTa model with error handling
    try:
        result = self.classifier(truncated_text)[0]
        label = result['label']
        score = result['score']
        # Debug: Log the raw output
        logger.info(f"Raw model output: {result}")
        # Parse label correctly - check both possible label formats.
        # NOTE(review): this assumes LABEL_1 == fake / LABEL_0 == real for
        # this checkpoint — verify against the model card before trusting it.
        if label in ['LABEL_1', 'FAKE', 'Fake']:
            is_fake = True
            model_confidence = score
        elif label in ['LABEL_0', 'REAL', 'Real']:
            is_fake = False
            model_confidence = score
        else:
            # If label format is unexpected, use score threshold
            is_fake = score > 0.5
            model_confidence = score if is_fake else (1 - score)
        logger.info(f"Interpreted: is_fake={is_fake}, confidence={model_confidence:.3f}")
    except Exception as e:
        logger.error(f"Model error: {e}")
        # Fall back to heuristics-only scoring when inference fails.
        return self.fallback_analysis(title, content, url, str(e))
    # Additional analysis (heuristics run on the untruncated text).
    source_credibility = self.check_source_credibility(url)
    content_analysis = self.analyze_content(full_text)
    # Calculate combined score based on model prediction.
    # Weights are hand-tuned; both branches yield a value in roughly [0, 1].
    if is_fake:
        # For fake news: model confidence + source suspicion + content indicators
        combined_score = (
            model_confidence * 0.7 +
            (1 - source_credibility) * 0.2 +
            min(content_analysis['fake_indicator_count'] * 0.1, 0.1)
        )
    else:
        # For real news: model confidence + source credibility
        combined_score = (
            model_confidence * 0.8 +
            source_credibility * 0.2
        )
    # Determine status based on clear thresholds
    if is_fake:
        if combined_score > 0.8:
            status = "🚨 Highly Likely Fake News"
        elif combined_score > 0.6:
            status = "⚠️ Likely Fake News"
        elif combined_score > 0.4:
            status = "πŸ€” Possibly Fake News"
        else:
            status = "πŸ“° Uncertain (Leaning Fake)"
    else:
        if combined_score > 0.8:
            status = "βœ… Highly Likely Real News"
        elif combined_score > 0.6:
            status = "πŸ“° Likely Real News"
        elif combined_score > 0.4:
            status = "πŸ€” Possibly Real News"
        else:
            status = "❓ Uncertain (Leaning Real)"
    # Detailed markdown report shown in the UI's details panel.
    message = f"""
**πŸ“Š Analysis Results:**
**RoBERTa Model Prediction:**
- Classification: **{'FAKE NEWS' if is_fake else 'REAL NEWS'}**
- Model Confidence: **{model_confidence * 100:.1f}%**
- Raw Output: `{label}` (score: {score:.3f})
**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
- Domain: {url.split('/')[2] if len(url.split('/')) > 2 else 'Unknown'}
**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}
**Final Score: {combined_score * 100:.1f}%**
**Content Preview:**
{content[:300]}...
---
**Note:** This is an AI prediction. Always verify from multiple sources.
""".strip()
    return {
        'status': status,
        'confidence': combined_score,
        'message': message,
        'title': title
    }
def fallback_analysis(self, title: str, content: str, url: str, error: str):
    """Fallback analysis when model fails.

    Heuristics-only scoring (source credibility + content indicators);
    returns the same result-dict shape as detect_fake_news so the UI
    handler needs no special casing. *error* is surfaced in the report.
    """
    source_credibility = self.check_source_credibility(url)
    content_analysis = self.analyze_content(f"{title}. {content}")
    # Simple heuristic based on source and content; higher = more suspicious.
    fake_score = (
        (1 - source_credibility) * 0.6 +
        min(content_analysis['fake_indicator_count'] * 0.2, 0.4)
    )
    if fake_score > 0.7:
        status = "⚠️ Suspicious (Fallback Analysis)"
    elif fake_score > 0.4:
        status = "πŸ€” Uncertain (Fallback Analysis)"
    else:
        status = "πŸ“° Probably Real (Fallback Analysis)"
    # Markdown report mirroring detect_fake_news' message format.
    message = f"""
**πŸ“Š Fallback Analysis (Model Error):**
**Model Error:** {error}
**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}
**Fallback Score: {fake_score * 100:.1f}%**
**Preview:**
{content[:300]}...
---
*Using fallback analysis due to model error*
""".strip()
    return {
        'status': status,
        'confidence': fake_score,
        'message': message,
        'title': title
    }
# Initialize a single module-level detector: the model is loaded once at
# import time so every request reuses it.
logger.info("Initializing Fake News Detector...")
detector = FakeNewsDetector()
logger.info("Ready!")
def analyze_url(url):
    """Gradio handler: run the detector on *url*.

    Always returns a 4-tuple matching the output widgets:
    (status, confidence percentage string, details markdown, title).

    Fix: the original called ``url.strip()`` unconditionally, which raises
    ``AttributeError`` when the client sends ``None`` (e.g. a cleared
    textbox); guard for falsy values before stripping.
    """
    if not url or not url.strip():
        return "⚠️ Please enter a URL", "0%", "No URL provided", "No title"
    try:
        result = detector.detect_fake_news(url)
        confidence_percent = f"{result['confidence'] * 100:.1f}%"
        return (
            result['status'],
            confidence_percent,
            result['message'],
            result['title']
        )
    except Exception as e:
        # Last-resort guard so the UI always receives a well-formed tuple.
        logger.error(f"Error: {e}")
        return "❌ Error", "0%", f"Error: {str(e)}", "Error"
# Gradio Interface: layout and event wiring for the web UI.
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Fake News Detector"
) as demo:
    # Header banner
    gr.Markdown("""
# πŸ•΅οΈ Fake News Detector
**AI-Powered News Verification using RoBERTa**
*Analyzes news articles using a transformer model trained on 40,000+ articles*
""")
    with gr.Row():
        # Left column: URL input and the analyze trigger.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="πŸ“° Enter News Article URL",
                placeholder="https://example.com/news-article",
                lines=1
            )
            analyze_btn = gr.Button(
                "πŸ” Analyze Article",
                variant="primary",
                size="lg"
            )
        # Right column: read-only summary outputs.
        with gr.Column(scale=1):
            with gr.Group():
                result_status = gr.Textbox(
                    label="🎯 Result",
                    interactive=False
                )
                confidence_score = gr.Textbox(
                    label="πŸ“ˆ Confidence",
                    interactive=False
                )
                article_title = gr.Textbox(
                    label="πŸ“ Article Title",
                    interactive=False
                )
    # NOTE(review): source indentation was lost; this widget is assumed to sit
    # at Blocks level (full width, below the row), not inside a column — confirm.
    details_output = gr.Markdown(label="πŸ“Š Detailed Analysis")
    gr.Examples(
        label="πŸ’‘ Try these examples:",
        examples=[
            ["https://www.bbc.com/news"],
            ["https://www.reuters.com/"],
            ["https://apnews.com/"]
        ],
        inputs=url_input
    )
    # Footer: usage notes and disclaimer.
    gr.Markdown("""
---
**How it works:**
1. **Extracts** article text from URL
2. **Truncates** to model-safe length (350 words)
3. **Analyzes** using RoBERTa transformer
4. **Checks** source credibility and content patterns
5. **Provides** confidence score
**Model:** `jy46604790/Fake-News-Bert-Detect` (RoBERTa-based)
**⚠️ Disclaimer:** Educational tool only. Always verify information through multiple credible sources.
""")
    # Button click -> analyze_url; output order matches the handler's 4-tuple.
    analyze_btn.click(
        fn=analyze_url,
        inputs=url_input,
        outputs=[result_status, confidence_score, details_output, article_title]
    )
# Script entry point: serve the UI when run directly.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (needed inside a container)
        server_port=7860        # standard Hugging Face Spaces port
    )