# app.py — Fake News Detector (Hugging Face Space, commit 01cd108)
import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import re
import logging
# Set up module-level logging so model loading and per-request steps are traceable.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class FakeNewsDetector:
    """URL-based fake-news analyzer.

    Combines a transformer text-classification model with two cheap
    heuristics: a source-credibility lookup and sensationalist-phrase
    counting. Heavy work (model download/load) happens once, here.
    """

    def __init__(self):
        """Load the classification pipeline and the heuristic word lists.

        Raises:
            Exception: re-raised from the pipeline constructor when the
                model cannot be downloaded or initialized.
        """
        logger.info("Loading RoBERTa Fake News Detection model...")
        model_id = "jy46604790/Fake-News-Bert-Detect"
        try:
            # pipeline() handles download, caching, tokenization and inference.
            self.classifier = pipeline(
                "text-classification",
                model=model_id,
                tokenizer=model_id,
                device=-1,        # CPU only (free-tier hardware)
                max_length=512,   # encoder hard limit
                truncation=True,  # silently trim over-long inputs
            )
            logger.info("βœ… Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

        # Domains treated as trustworthy outlets (scored 0.9 by
        # check_source_credibility).
        self.credible_sources = [
            'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
            'theguardian.com', 'washingtonpost.com', 'npr.org',
            'wsj.com', 'ft.com', 'bloomberg.com', 'abcnews.go.com',
            'cbsnews.com', 'nbcnews.com', 'cnn.com', 'axios.com',
        ]
        # Phrases commonly used by sensationalist / fake articles,
        # matched case-insensitively by analyze_content.
        self.fake_indicators = [
            "exclusive reveal", "shocking truth", "they don't want you to know",
            "mainstream media won't report", "breaking secret", "you won't believe",
            "wake up sheeple", "open your eyes",
        ]
def extract_content(self, url: str):
    """Fetch *url* and pull out the page title and article body text.

    Returns:
        dict: ``{'success': True, 'title', 'content', 'url'}`` on success
        (``url`` may have an ``https://`` scheme prepended), or
        ``{'success': False, 'error'}`` on any failure.
    """
    try:
        # Assume HTTPS when the caller omitted the scheme.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        response = requests.get(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            },
            timeout=15,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop boilerplate/navigation markup before extracting text.
        for junk in soup(["script", "style", "nav", "footer", "header", "aside"]):
            junk.decompose()

        title_tag = soup.find('title')
        title_text = title_tag.get_text().strip() if title_tag else "No title found"

        # Likely article containers, checked most-specific first; the first
        # selector yielding substantial text (>100 chars per element) wins.
        article_text = ""
        for selector in ('article', '.article-content', '.post-content',
                         '.story-content', '.entry-content', 'main',
                         '[role="main"]', '.news-content', '.story-body'):
            matches = soup.select(selector)
            if not matches:
                continue
            chunks = [t for t in (m.get_text().strip() for m in matches)
                      if len(t) > 100]
            if chunks:
                article_text = ' '.join(chunks)
                break

        # Fall back to the whole <body> when the selectors found too little.
        if not article_text or len(article_text) < 200:
            body_tag = soup.find('body')
            if body_tag:
                article_text = body_tag.get_text()

        return {
            'success': True,
            'title': title_text,
            'content': self.clean_text(article_text),
            'url': url,
        }
    except Exception as e:
        logger.error(f"Content extraction error: {e}")
        return {'success': False, 'error': str(e)}
def clean_text(self, text: str):
    """Normalize extracted text.

    Collapses all whitespace runs to single spaces, removes every
    character outside word chars / whitespace / common punctuation
    (so quotes, #, @, emoji etc. are dropped), and trims the ends.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    printable = re.sub(r'[^\w\s.,!?;:()-]', '', collapsed)
    return printable.strip()
def truncate_text(self, text: str, max_words: int = 350):
    """Cap *text* at *max_words* whitespace-separated words.

    Keeps the model input comfortably inside its 512-token limit; the
    original text is returned untouched when already short enough.
    """
    words = text.split()
    if len(words) <= max_words:
        return text
    # Same module logger as the file-level `logger` binding.
    logging.getLogger(__name__).info(
        f"Text truncated from {len(words)} to {max_words} words"
    )
    return ' '.join(words[:max_words])
def analyze_content(self, text: str):
"""Analyze text for fake news indicators"""
text_lower = text.lower()
fake_indicator_count = sum(1 for indicator in self.fake_indicators if indicator in text_lower)
exclamation_count = text.count('!')
capital_words = len(re.findall(r'\b[A-Z]{3,}\b', text))
return {
'fake_indicator_count': fake_indicator_count,
'exclamation_count': exclamation_count,
'capital_words': capital_words
}
def check_source_credibility(self, url: str):
"""Check source credibility"""
url_lower = url.lower()
for credible_source in self.credible_sources:
if credible_source in url_lower:
return 0.9
unreliable = ['.blogspot.', '.wordpress.', '.tumblr.']
for domain in unreliable:
if domain in url_lower:
return 0.1
return 0.5
def detect_fake_news(self, url: str):
    """Main detection function.

    Fetches the article at *url*, classifies it with the transformer
    model, blends in source-credibility and content heuristics, and
    returns a dict with 'status', 'confidence', 'message', 'title'.
    Never raises: extraction and model failures are reported in-band.
    """
    logger.info(f"Analyzing: {url}")
    # Extract content
    content_data = self.extract_content(url)
    if not content_data['success']:
        return {
            'status': '❌ Extraction Failed',
            'confidence': 0.0,
            'message': f"Could not extract content: {content_data.get('error', 'Unknown error')}",
            'title': 'Error'
        }
    title = content_data['title']
    content = content_data['content']
    # Too little text to classify reliably (often a paywall or JS-only page).
    if len(content.strip()) < 100:
        return {
            'status': '⚠️ Insufficient Content',
            'confidence': 0.0,
            'message': 'Not enough content to analyze. May be behind paywall.',
            'title': title
        }
    # Prepare text for model (title + truncated content)
    full_text = f"{title}. {content}"
    # Truncate text to safe length for the model
    truncated_text = self.truncate_text(full_text, max_words=350)
    logger.info(f"Text length: {len(truncated_text)} characters")
    # Use RoBERTa model with error handling
    try:
        result = self.classifier(truncated_text)[0]
        label = result['label']
        score = result['score']
        # Debug: Log the raw output
        logger.info(f"Raw model output: {result}")
        # Parse label correctly - check both possible label formats.
        # NOTE(review): this assumes LABEL_1 == fake / LABEL_0 == real for
        # this checkpoint — verify against the model card before trusting it.
        if label in ['LABEL_1', 'FAKE', 'Fake']:
            is_fake = True
            model_confidence = score
        elif label in ['LABEL_0', 'REAL', 'Real']:
            is_fake = False
            model_confidence = score
        else:
            # If label format is unexpected, use score threshold
            is_fake = score > 0.5
            model_confidence = score if is_fake else (1 - score)
        logger.info(f"Interpreted: is_fake={is_fake}, confidence={model_confidence:.3f}")
    except Exception as e:
        logger.error(f"Model error: {e}")
        # Fall back to heuristics-only scoring when inference fails.
        return self.fallback_analysis(title, content, url, str(e))
    # Additional analysis (heuristics run on the untruncated text).
    source_credibility = self.check_source_credibility(url)
    content_analysis = self.analyze_content(full_text)
    # Calculate combined score based on model prediction.
    # Weights are hand-tuned; both branches yield a value in roughly [0, 1].
    if is_fake:
        # For fake news: model confidence + source suspicion + content indicators
        combined_score = (
            model_confidence * 0.7 +
            (1 - source_credibility) * 0.2 +
            min(content_analysis['fake_indicator_count'] * 0.1, 0.1)
        )
    else:
        # For real news: model confidence + source credibility
        combined_score = (
            model_confidence * 0.8 +
            source_credibility * 0.2
        )
    # Determine status based on clear thresholds
    if is_fake:
        if combined_score > 0.8:
            status = "🚨 Highly Likely Fake News"
        elif combined_score > 0.6:
            status = "⚠️ Likely Fake News"
        elif combined_score > 0.4:
            status = "πŸ€” Possibly Fake News"
        else:
            status = "πŸ“° Uncertain (Leaning Fake)"
    else:
        if combined_score > 0.8:
            status = "βœ… Highly Likely Real News"
        elif combined_score > 0.6:
            status = "πŸ“° Likely Real News"
        elif combined_score > 0.4:
            status = "πŸ€” Possibly Real News"
        else:
            status = "❓ Uncertain (Leaning Real)"
    # Detailed markdown report shown in the UI's details panel.
    message = f"""
**πŸ“Š Analysis Results:**
**RoBERTa Model Prediction:**
- Classification: **{'FAKE NEWS' if is_fake else 'REAL NEWS'}**
- Model Confidence: **{model_confidence * 100:.1f}%**
- Raw Output: `{label}` (score: {score:.3f})
**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
- Domain: {url.split('/')[2] if len(url.split('/')) > 2 else 'Unknown'}
**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}
**Final Score: {combined_score * 100:.1f}%**
**Content Preview:**
{content[:300]}...
---
**Note:** This is an AI prediction. Always verify from multiple sources.
""".strip()
    return {
        'status': status,
        'confidence': combined_score,
        'message': message,
        'title': title
    }
def fallback_analysis(self, title: str, content: str, url: str, error: str):
    """Fallback analysis when model fails.

    Heuristics-only scoring (source credibility + content indicators);
    returns the same result-dict shape as detect_fake_news so the UI
    handler needs no special casing. *error* is surfaced in the report.
    """
    source_credibility = self.check_source_credibility(url)
    content_analysis = self.analyze_content(f"{title}. {content}")
    # Simple heuristic based on source and content; higher = more suspicious.
    fake_score = (
        (1 - source_credibility) * 0.6 +
        min(content_analysis['fake_indicator_count'] * 0.2, 0.4)
    )
    if fake_score > 0.7:
        status = "⚠️ Suspicious (Fallback Analysis)"
    elif fake_score > 0.4:
        status = "πŸ€” Uncertain (Fallback Analysis)"
    else:
        status = "πŸ“° Probably Real (Fallback Analysis)"
    # Markdown report mirroring detect_fake_news' message format.
    message = f"""
**πŸ“Š Fallback Analysis (Model Error):**
**Model Error:** {error}
**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}
**Fallback Score: {fake_score * 100:.1f}%**
**Preview:**
{content[:300]}...
---
*Using fallback analysis due to model error*
""".strip()
    return {
        'status': status,
        'confidence': fake_score,
        'message': message,
        'title': title
    }
# Initialize a single module-level detector: the model is loaded once at
# import time so every request reuses it.
logger.info("Initializing Fake News Detector...")
detector = FakeNewsDetector()
logger.info("Ready!")
def analyze_url(url):
    """Gradio handler: run the detector on *url*.

    Always returns a 4-tuple matching the output widgets:
    (status, confidence percentage string, details markdown, title).

    Fix: the original called ``url.strip()`` unconditionally, which raises
    ``AttributeError`` when the client sends ``None`` (e.g. a cleared
    textbox); guard for falsy values before stripping.
    """
    if not url or not url.strip():
        return "⚠️ Please enter a URL", "0%", "No URL provided", "No title"
    try:
        result = detector.detect_fake_news(url)
        confidence_percent = f"{result['confidence'] * 100:.1f}%"
        return (
            result['status'],
            confidence_percent,
            result['message'],
            result['title']
        )
    except Exception as e:
        # Last-resort guard so the UI always receives a well-formed tuple.
        logger.error(f"Error: {e}")
        return "❌ Error", "0%", f"Error: {str(e)}", "Error"
# Gradio Interface: layout and event wiring for the web UI.
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Fake News Detector"
) as demo:
    # Header banner
    gr.Markdown("""
# πŸ•΅οΈ Fake News Detector
**AI-Powered News Verification using RoBERTa**
*Analyzes news articles using a transformer model trained on 40,000+ articles*
""")
    with gr.Row():
        # Left column: URL input and the analyze trigger.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="πŸ“° Enter News Article URL",
                placeholder="https://example.com/news-article",
                lines=1
            )
            analyze_btn = gr.Button(
                "πŸ” Analyze Article",
                variant="primary",
                size="lg"
            )
        # Right column: read-only summary outputs.
        with gr.Column(scale=1):
            with gr.Group():
                result_status = gr.Textbox(
                    label="🎯 Result",
                    interactive=False
                )
                confidence_score = gr.Textbox(
                    label="πŸ“ˆ Confidence",
                    interactive=False
                )
                article_title = gr.Textbox(
                    label="πŸ“ Article Title",
                    interactive=False
                )
    # NOTE(review): source indentation was lost; this widget is assumed to sit
    # at Blocks level (full width, below the row), not inside a column — confirm.
    details_output = gr.Markdown(label="πŸ“Š Detailed Analysis")
    gr.Examples(
        label="πŸ’‘ Try these examples:",
        examples=[
            ["https://www.bbc.com/news"],
            ["https://www.reuters.com/"],
            ["https://apnews.com/"]
        ],
        inputs=url_input
    )
    # Footer: usage notes and disclaimer.
    gr.Markdown("""
---
**How it works:**
1. **Extracts** article text from URL
2. **Truncates** to model-safe length (350 words)
3. **Analyzes** using RoBERTa transformer
4. **Checks** source credibility and content patterns
5. **Provides** confidence score
**Model:** `jy46604790/Fake-News-Bert-Detect` (RoBERTa-based)
**⚠️ Disclaimer:** Educational tool only. Always verify information through multiple credible sources.
""")
    # Button click -> analyze_url; output order matches the handler's 4-tuple.
    analyze_btn.click(
        fn=analyze_url,
        inputs=url_input,
        outputs=[result_status, confidence_score, details_output, article_title]
    )
# Script entry point: serve the UI when run directly.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (needed inside a container)
        server_port=7860        # standard Hugging Face Spaces port
    )