"""Gradio app that scores a news-article URL as likely fake or real.

Pipeline: fetch the page, extract title/body text, classify with a
RoBERTa fake-news model, then blend the model score with source
credibility and simple content heuristics into a final confidence.
"""

import logging
import re
from urllib.parse import urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class FakeNewsDetector:
    """Wraps the transformer classifier plus URL/content heuristics."""

    def __init__(self):
        logger.info("Loading RoBERTa Fake News Detection model...")
        try:
            # Using the most popular and proven model
            MODEL = "jy46604790/Fake-News-Bert-Detect"
            # Simple pipeline approach - handles everything automatically
            self.classifier = pipeline(
                "text-classification",
                model=MODEL,
                tokenizer=MODEL,
                device=-1,        # CPU mode for free tier
                max_length=512,   # Explicitly set max length
                truncation=True,  # Enable truncation
            )
            logger.info("✅ Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

        # Credible sources (registrable domains; matched as hostname suffixes)
        self.credible_sources = [
            'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
            'theguardian.com', 'washingtonpost.com', 'npr.org', 'wsj.com',
            'ft.com', 'bloomberg.com', 'abcnews.go.com', 'cbsnews.com',
            'nbcnews.com', 'cnn.com', 'axios.com'
        ]

        # Fake news indicators (lower-cased phrases searched in article text)
        self.fake_indicators = [
            "exclusive reveal", "shocking truth", "they don't want you to know",
            "mainstream media won't report", "breaking secret",
            "you won't believe", "wake up sheeple", "open your eyes"
        ]

    def extract_content(self, url: str):
        """Fetch *url* and return a dict with title, cleaned body text and
        the scheme-normalized URL; on failure returns {'success': False, 'error': ...}."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            # Default to https when the user omits the scheme
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove unwanted elements
            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                element.decompose()

            # Extract title
            title = soup.find('title')
            title_text = title.get_text().strip() if title else "No title found"

            # Try multiple content selectors, most specific first
            content_text = ""
            content_selectors = [
                'article', '.article-content', '.post-content', '.story-content',
                '.entry-content', 'main', '[role="main"]', '.news-content',
                '.story-body'
            ]
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    content_parts = []
                    for elem in elements:
                        text = elem.get_text().strip()
                        # Skip tiny fragments (nav stubs, captions, etc.)
                        if len(text) > 100:
                            content_parts.append(text)
                    if content_parts:
                        content_text = ' '.join(content_parts)
                        break

            # Fallback to whole <body> when selectors found too little
            if not content_text or len(content_text) < 200:
                body = soup.find('body')
                if body:
                    content_text = body.get_text()

            # Clean text
            content_text = self.clean_text(content_text)

            return {
                'success': True,
                'title': title_text,
                'content': content_text,
                'url': url,  # normalized URL (scheme guaranteed)
            }
        except Exception as e:
            logger.error(f"Content extraction error: {e}")
            return {'success': False, 'error': str(e)}

    def clean_text(self, text: str):
        """Collapse whitespace and strip characters outside a basic
        word/punctuation set."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()-]', '', text)
        return text.strip()

    def truncate_text(self, text: str, max_words: int = 350):
        """Truncate *text* to at most *max_words* whitespace-separated words
        so the input stays safely under the model's 512-token limit."""
        words = text.split()
        if len(words) > max_words:
            truncated = ' '.join(words[:max_words])
            logger.info(f"Text truncated from {len(words)} to {max_words} words")
            return truncated
        return text

    def analyze_content(self, text: str):
        """Return simple sensationalism counters used as weak fake-news signals."""
        text_lower = text.lower()
        fake_indicator_count = sum(
            1 for indicator in self.fake_indicators if indicator in text_lower
        )
        exclamation_count = text.count('!')
        # Words in ALL CAPS of length >= 3 (shouting style)
        capital_words = len(re.findall(r'\b[A-Z]{3,}\b', text))
        return {
            'fake_indicator_count': fake_indicator_count,
            'exclamation_count': exclamation_count,
            'capital_words': capital_words,
        }

    def check_source_credibility(self, url: str):
        """Score the URL's host: 0.9 for known credible outlets, 0.1 for
        free-blog platforms, 0.5 otherwise.

        Matches the parsed hostname as an exact domain or subdomain suffix,
        so e.g. "cnn.com.evil.net" is NOT treated as credible (a plain
        substring test would be spoofable).
        """
        # urlparse needs a scheme to populate netloc; add one if missing
        parse_target = url if '://' in url else 'https://' + url
        host = (urlparse(parse_target).netloc or '').lower()
        # Drop any port component
        host = host.split(':')[0]

        for credible_source in self.credible_sources:
            if host == credible_source or host.endswith('.' + credible_source):
                return 0.9

        unreliable = ['.blogspot.', '.wordpress.', '.tumblr.']
        for domain in unreliable:
            if domain in host:
                return 0.1

        return 0.5

    def detect_fake_news(self, url: str):
        """Main detection function: extract, classify, blend scores, and
        return {'status', 'confidence', 'message', 'title'}."""
        logger.info(f"Analyzing: {url}")

        # Extract content
        content_data = self.extract_content(url)
        if not content_data['success']:
            return {
                'status': '❌ Extraction Failed',
                'confidence': 0.0,
                'message': f"Could not extract content: {content_data.get('error', 'Unknown error')}",
                'title': 'Error',
            }

        # Use the scheme-normalized URL for all downstream checks/display
        url = content_data['url']
        title = content_data['title']
        content = content_data['content']

        if len(content.strip()) < 100:
            return {
                'status': '⚠️ Insufficient Content',
                'confidence': 0.0,
                'message': 'Not enough content to analyze. May be behind paywall.',
                'title': title,
            }

        # Prepare text for model (title + truncated content)
        full_text = f"{title}. {content}"
        # Truncate text to safe length for the model
        truncated_text = self.truncate_text(full_text, max_words=350)
        logger.info(f"Text length: {len(truncated_text)} characters")

        # Use RoBERTa model with error handling
        try:
            result = self.classifier(truncated_text)[0]
            label = result['label']
            score = result['score']

            # Debug: Log the raw output
            logger.info(f"Raw model output: {result}")

            # NOTE(review): the model card for jy46604790/Fake-News-Bert-Detect
            # reportedly defines LABEL_0 = fake and LABEL_1 = real, which is the
            # OPPOSITE of the mapping below — verify against the model card
            # before trusting these classifications.
            if label in ['LABEL_1', 'FAKE', 'Fake']:
                is_fake = True
                model_confidence = score
            elif label in ['LABEL_0', 'REAL', 'Real']:
                is_fake = False
                model_confidence = score
            else:
                # If label format is unexpected, use score threshold
                is_fake = score > 0.5
                model_confidence = score if is_fake else (1 - score)

            logger.info(f"Interpreted: is_fake={is_fake}, confidence={model_confidence:.3f}")
        except Exception as e:
            logger.error(f"Model error: {e}")
            # Fallback to content analysis only
            return self.fallback_analysis(title, content, url, str(e))

        # Additional analysis
        source_credibility = self.check_source_credibility(url)
        content_analysis = self.analyze_content(full_text)

        # Calculate combined score based on model prediction
        if is_fake:
            # For fake news: model confidence + source suspicion + content indicators
            combined_score = (
                model_confidence * 0.7 +
                (1 - source_credibility) * 0.2 +
                min(content_analysis['fake_indicator_count'] * 0.1, 0.1)
            )
        else:
            # For real news: model confidence + source credibility
            combined_score = (
                model_confidence * 0.8 +
                source_credibility * 0.2
            )

        # Determine status based on clear thresholds
        if is_fake:
            if combined_score > 0.8:
                status = "🚨 Highly Likely Fake News"
            elif combined_score > 0.6:
                status = "⚠️ Likely Fake News"
            elif combined_score > 0.4:
                status = "🤔 Possibly Fake News"
            else:
                status = "📰 Uncertain (Leaning Fake)"
        else:
            if combined_score > 0.8:
                status = "✅ Highly Likely Real News"
            elif combined_score > 0.6:
                status = "📰 Likely Real News"
            elif combined_score > 0.4:
                status = "🤔 Possibly Real News"
            else:
                status = "❓ Uncertain (Leaning Real)"

        # Detailed message
        domain = urlparse(url).netloc or 'Unknown'
        message = f"""
**📊 Analysis Results:**

**RoBERTa Model Prediction:**
- Classification: **{'FAKE NEWS' if is_fake else 'REAL NEWS'}**
- Model Confidence: **{model_confidence * 100:.1f}%**
- Raw Output: `{label}` (score: {score:.3f})

**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
- Domain: {domain}

**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}

**Final Score: {combined_score * 100:.1f}%**

**Content Preview:**
{content[:300]}...

---
**Note:** This is an AI prediction. Always verify from multiple sources.
""".strip()

        return {
            'status': status,
            'confidence': combined_score,
            'message': message,
            'title': title,
        }

    def fallback_analysis(self, title: str, content: str, url: str, error: str):
        """Heuristic-only scoring used when the model call raises."""
        source_credibility = self.check_source_credibility(url)
        content_analysis = self.analyze_content(f"{title}. {content}")

        # Simple heuristic based on source and content
        fake_score = (
            (1 - source_credibility) * 0.6 +
            min(content_analysis['fake_indicator_count'] * 0.2, 0.4)
        )

        if fake_score > 0.7:
            status = "⚠️ Suspicious (Fallback Analysis)"
        elif fake_score > 0.4:
            status = "🤔 Uncertain (Fallback Analysis)"
        else:
            status = "📰 Probably Real (Fallback Analysis)"

        message = f"""
**📊 Fallback Analysis (Model Error):**

**Model Error:** {error}

**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0

**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}

**Fallback Score: {fake_score * 100:.1f}%**

**Preview:**
{content[:300]}...

---
*Using fallback analysis due to model error*
""".strip()

        return {
            'status': status,
            'confidence': fake_score,
            'message': message,
            'title': title,
        }


# Initialize detector (loads the model once at startup)
logger.info("Initializing Fake News Detector...")
detector = FakeNewsDetector()
logger.info("Ready!")


def analyze_url(url):
    """Gradio interface function: returns (status, confidence %, details, title)."""
    url = url.strip() if url else ""
    if not url:
        return "⚠️ Please enter a URL", "0%", "No URL provided", "No title"
    try:
        result = detector.detect_fake_news(url)
        confidence_percent = f"{result['confidence'] * 100:.1f}%"
        return (
            result['status'],
            confidence_percent,
            result['message'],
            result['title'],
        )
    except Exception as e:
        logger.error(f"Error: {e}")
        return "❌ Error", "0%", f"Error: {str(e)}", "Error"


# Gradio Interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Fake News Detector"
) as demo:
    gr.Markdown("""
# 🕵️ Fake News Detector
**AI-Powered News Verification using RoBERTa**
*Analyzes news articles using a transformer model trained on 40,000+ articles*
""")

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="📰 Enter News Article URL",
                placeholder="https://example.com/news-article",
                lines=1
            )
            analyze_btn = gr.Button(
                "🔍 Analyze Article",
                variant="primary",
                size="lg"
            )
        with gr.Column(scale=1):
            with gr.Group():
                result_status = gr.Textbox(
                    label="🎯 Result",
                    interactive=False
                )
                confidence_score = gr.Textbox(
                    label="📈 Confidence",
                    interactive=False
                )
                article_title = gr.Textbox(
                    label="📝 Article Title",
                    interactive=False
                )

    details_output = gr.Markdown(label="📊 Detailed Analysis")

    gr.Examples(
        label="💡 Try these examples:",
        examples=[
            ["https://www.bbc.com/news"],
            ["https://www.reuters.com/"],
            ["https://apnews.com/"]
        ],
        inputs=url_input
    )

    gr.Markdown("""
---
**How it works:**
1. **Extracts** article text from URL
2. **Truncates** to model-safe length (350 words)
3. **Analyzes** using RoBERTa transformer
4. **Checks** source credibility and content patterns
5. **Provides** confidence score

**Model:** `jy46604790/Fake-News-Bert-Detect` (RoBERTa-based)

**⚠️ Disclaimer:** Educational tool only. Always verify information through multiple credible sources.
""")

    analyze_btn.click(
        fn=analyze_url,
        inputs=url_input,
        outputs=[result_status, confidence_score, details_output, article_title]
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860
    )