Spaces:
Running
Running
import logging
import re
from urllib.parse import urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Set up logging
# Module-level logger at INFO so model download/load progress is visible
# in the Spaces console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class FakeNewsDetector:
    """URL-based fake-news detector.

    Blends three signals into one confidence score:
      1. A transformer text-classification model run on title + body text.
      2. A hostname credibility lookup (whitelist of news outlets, blacklist
         of free blogging platforms).
      3. Simple content heuristics (clickbait phrases, '!' count, ALL-CAPS
         word count).
    """

    # Domains treated as highly credible. Matched as hostname suffixes in
    # check_source_credibility (e.g. "www.bbc.com" matches "bbc.com").
    credible_sources = [
        'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
        'theguardian.com', 'washingtonpost.com', 'npr.org',
        'wsj.com', 'ft.com', 'bloomberg.com', 'abcnews.go.com',
        'cbsnews.com', 'nbcnews.com', 'cnn.com', 'axios.com'
    ]

    # Clickbait / conspiracy phrases that commonly flag unreliable articles.
    fake_indicators = [
        "exclusive reveal", "shocking truth", "they don't want you to know",
        "mainstream media won't report", "breaking secret", "you won't believe",
        "wake up sheeple", "open your eyes"
    ]

    def __init__(self):
        """Load the Hugging Face text-classification pipeline (CPU only).

        Raises:
            Exception: re-raised unchanged if the model cannot be loaded, so
                the app fails fast at startup instead of at first request.
        """
        logger.info("Loading RoBERTa Fake News Detection model...")
        try:
            MODEL = "jy46604790/Fake-News-Bert-Detect"
            # The pipeline handles tokenization, truncation and softmax for us.
            self.classifier = pipeline(
                "text-classification",
                model=MODEL,
                tokenizer=MODEL,
                device=-1,        # CPU mode for free tier
                max_length=512,   # model's maximum token window
                truncation=True,  # clip over-long inputs instead of erroring
            )
            logger.info("β Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def extract_content(self, url: str) -> dict:
        """Fetch *url* and extract the article title and body text.

        Returns:
            dict: on success ``{'success': True, 'title', 'content', 'url'}``
            (url normalized to include a scheme); on any network/parsing
            failure ``{'success': False, 'error': str}``.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Drop boilerplate that would pollute the extracted article text.
            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                element.decompose()
            title = soup.find('title')
            title_text = title.get_text().strip() if title else "No title found"
            # Try progressively more generic article containers; first selector
            # that yields substantial text wins.
            content_text = ""
            content_selectors = [
                'article', '.article-content', '.post-content',
                '.story-content', '.entry-content', 'main',
                '[role="main"]', '.news-content', '.story-body'
            ]
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    # Keep only fragments > 100 chars to skip menus/captions.
                    content_parts = [
                        text
                        for text in (elem.get_text().strip() for elem in elements)
                        if len(text) > 100
                    ]
                    if content_parts:
                        content_text = ' '.join(content_parts)
                        break
            # Fallback: whole <body> when the selectors found nothing useful.
            if not content_text or len(content_text) < 200:
                body = soup.find('body')
                if body:
                    content_text = body.get_text()
            content_text = self.clean_text(content_text)
            return {
                'success': True,
                'title': title_text,
                'content': content_text,
                'url': url
            }
        except Exception as e:
            logger.error(f"Content extraction error: {e}")
            return {'success': False, 'error': str(e)}

    def clean_text(self, text: str) -> str:
        """Collapse whitespace and strip characters outside a basic-punctuation set."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()-]', '', text)
        return text.strip()

    def truncate_text(self, text: str, max_words: int = 350) -> str:
        """Return *text* clipped to at most *max_words* whitespace-separated words.

        350 words comfortably fits the model's 512-token window for English.
        """
        words = text.split()
        if len(words) > max_words:
            truncated = ' '.join(words[:max_words])
            logger.info(f"Text truncated from {len(words)} to {max_words} words")
            return truncated
        return text

    def analyze_content(self, text: str) -> dict:
        """Count heuristic sensationalism signals in *text*.

        Returns:
            dict: counts for 'fake_indicator_count' (clickbait phrases),
            'exclamation_count', and 'capital_words' (runs of 3+ capitals).
        """
        text_lower = text.lower()
        fake_indicator_count = sum(1 for indicator in self.fake_indicators if indicator in text_lower)
        exclamation_count = text.count('!')
        capital_words = len(re.findall(r'\b[A-Z]{3,}\b', text))
        return {
            'fake_indicator_count': fake_indicator_count,
            'exclamation_count': exclamation_count,
            'capital_words': capital_words
        }

    def check_source_credibility(self, url: str) -> float:
        """Score the URL's host: 0.9 known outlet, 0.1 blog platform, 0.5 unknown.

        Matches the parsed hostname as a domain suffix, so a credible domain
        appearing in the path or inside a spoof host such as
        "cnn.com.evil.example" no longer scores 0.9 (the previous substring
        check on the whole URL did).
        """
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url  # urlparse needs a scheme to find the host
        host = (urlparse(url).hostname or '').lower()
        for credible_source in self.credible_sources:
            if host == credible_source or host.endswith('.' + credible_source):
                return 0.9
        # Free blogging platforms: anyone can publish, so low trust.
        unreliable = ['.blogspot.', '.wordpress.', '.tumblr.']
        for domain in unreliable:
            if domain in host:
                return 0.1
        return 0.5

    def detect_fake_news(self, url: str) -> dict:
        """Run the full detection pipeline on *url*.

        Returns:
            dict: 'status' (human-readable verdict), 'confidence' (0-1),
            'message' (markdown detail report), 'title'. Never raises:
            extraction failures return an error result and model failures
            degrade to :meth:`fallback_analysis`.
        """
        logger.info(f"Analyzing: {url}")
        # Extract content
        content_data = self.extract_content(url)
        if not content_data['success']:
            return {
                'status': 'β Extraction Failed',
                'confidence': 0.0,
                'message': f"Could not extract content: {content_data.get('error', 'Unknown error')}",
                'title': 'Error'
            }
        title = content_data['title']
        content = content_data['content']
        if len(content.strip()) < 100:
            return {
                'status': 'β οΈ Insufficient Content',
                'confidence': 0.0,
                'message': 'Not enough content to analyze. May be behind paywall.',
                'title': title
            }
        # Prepare text for model (title + truncated content).
        full_text = f"{title}. {content}"
        truncated_text = self.truncate_text(full_text, max_words=350)
        logger.info(f"Text length: {len(truncated_text)} characters")
        try:
            result = self.classifier(truncated_text)[0]
            label = result['label']
            score = result['score']
            logger.info(f"Raw model output: {result}")
            # NOTE(review): the mapping below assumes LABEL_1 == fake for this
            # checkpoint — confirm against the model card before trusting it.
            if label in ['LABEL_1', 'FAKE', 'Fake']:
                is_fake = True
                model_confidence = score
            elif label in ['LABEL_0', 'REAL', 'Real']:
                is_fake = False
                model_confidence = score
            else:
                # Unknown label scheme: check the label text itself. (The old
                # `is_fake = score > 0.5` test always classified as fake,
                # because a binary pipeline's winning score is always >= 0.5.)
                is_fake = 'fake' in label.lower()
                model_confidence = score
            logger.info(f"Interpreted: is_fake={is_fake}, confidence={model_confidence:.3f}")
        except Exception as e:
            logger.error(f"Model error: {e}")
            # Fallback to heuristic-only analysis.
            return self.fallback_analysis(title, content, url, str(e))
        # Additional analysis signals.
        source_credibility = self.check_source_credibility(url)
        content_analysis = self.analyze_content(full_text)
        # Blend model output with heuristics; weights sum to at most 1.0.
        if is_fake:
            # Fake: model confidence + source suspicion + content indicators.
            combined_score = (
                model_confidence * 0.7 +
                (1 - source_credibility) * 0.2 +
                min(content_analysis['fake_indicator_count'] * 0.1, 0.1)
            )
        else:
            # Real: model confidence + source credibility.
            combined_score = (
                model_confidence * 0.8 +
                source_credibility * 0.2
            )
        # Map the combined score onto a human-readable verdict.
        if is_fake:
            if combined_score > 0.8:
                status = "π¨ Highly Likely Fake News"
            elif combined_score > 0.6:
                status = "β οΈ Likely Fake News"
            elif combined_score > 0.4:
                status = "π€ Possibly Fake News"
            else:
                status = "π° Uncertain (Leaning Fake)"
        else:
            if combined_score > 0.8:
                status = "β Highly Likely Real News"
            elif combined_score > 0.6:
                status = "π° Likely Real News"
            elif combined_score > 0.4:
                status = "π€ Possibly Real News"
            else:
                status = "β Uncertain (Leaning Real)"
        # Proper hostname extraction (url.split('/')[2] showed "Unknown" for
        # scheme-less inputs like "bbc.com/news").
        parse_target = url if url.startswith(('http://', 'https://')) else 'https://' + url
        domain = urlparse(parse_target).hostname or 'Unknown'
        # Detailed markdown report for the UI.
        message = f"""
**π Analysis Results:**
**RoBERTa Model Prediction:**
- Classification: **{'FAKE NEWS' if is_fake else 'REAL NEWS'}**
- Model Confidence: **{model_confidence * 100:.1f}%**
- Raw Output: `{label}` (score: {score:.3f})
**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
- Domain: {domain}
**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}
**Final Score: {combined_score * 100:.1f}%**
**Content Preview:**
{content[:300]}...
---
**Note:** This is an AI prediction. Always verify from multiple sources.
""".strip()
        return {
            'status': status,
            'confidence': combined_score,
            'message': message,
            'title': title
        }

    def fallback_analysis(self, title: str, content: str, url: str, error: str) -> dict:
        """Heuristic-only scoring used when the model call fails.

        Combines source suspicion (weight 0.6) with clickbait-phrase count
        (capped contribution 0.4); *error* is echoed into the report.
        """
        source_credibility = self.check_source_credibility(url)
        content_analysis = self.analyze_content(f"{title}. {content}")
        fake_score = (
            (1 - source_credibility) * 0.6 +
            min(content_analysis['fake_indicator_count'] * 0.2, 0.4)
        )
        if fake_score > 0.7:
            status = "β οΈ Suspicious (Fallback Analysis)"
        elif fake_score > 0.4:
            status = "π€ Uncertain (Fallback Analysis)"
        else:
            status = "π° Probably Real (Fallback Analysis)"
        message = f"""
**π Fallback Analysis (Model Error):**
**Model Error:** {error}
**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}
**Fallback Score: {fake_score * 100:.1f}%**
**Preview:**
{content[:300]}...
---
*Using fallback analysis due to model error*
""".strip()
        return {
            'status': status,
            'confidence': fake_score,
            'message': message,
            'title': title
        }
# Initialize detector
# Module-level singleton: the transformer model is loaded once at import
# time so the Gradio callback can reuse it across requests.
logger.info("Initializing Fake News Detector...")
detector = FakeNewsDetector()
logger.info("Ready!")
def analyze_url(url):
    """Gradio callback: analyze *url* and return (status, confidence, details, title)."""
    # Guard clause: nothing to do for a blank input.
    if not url.strip():
        return "β οΈ Please enter a URL", "0%", "No URL provided", "No title"
    try:
        verdict = detector.detect_fake_news(url)
        pct = f"{verdict['confidence'] * 100:.1f}%"
        return verdict['status'], pct, verdict['message'], verdict['title']
    except Exception as exc:
        logger.error(f"Error: {exc}")
        return "β Error", "0%", f"Error: {str(exc)}", "Error"
# Gradio Interface
# Declarative UI: one URL input + analyze button on the left, three
# read-only result fields on the right, and a markdown pane for details.
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Fake News Detector"
) as demo:
    gr.Markdown("""
# π΅οΈ Fake News Detector
**AI-Powered News Verification using RoBERTa**
*Analyzes news articles using a transformer model trained on 40,000+ articles*
""")
    with gr.Row():
        # Left column: input controls.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="π° Enter News Article URL",
                placeholder="https://example.com/news-article",
                lines=1
            )
            analyze_btn = gr.Button(
                "π Analyze Article",
                variant="primary",
                size="lg"
            )
        # Right column: read-only result summary.
        with gr.Column(scale=1):
            with gr.Group():
                result_status = gr.Textbox(
                    label="π― Result",
                    interactive=False
                )
                confidence_score = gr.Textbox(
                    label="π Confidence",
                    interactive=False
                )
                article_title = gr.Textbox(
                    label="π Article Title",
                    interactive=False
                )
    # Full markdown report from analyze_url.
    details_output = gr.Markdown(label="π Detailed Analysis")
    gr.Examples(
        label="π‘ Try these examples:",
        examples=[
            ["https://www.bbc.com/news"],
            ["https://www.reuters.com/"],
            ["https://apnews.com/"]
        ],
        inputs=url_input
    )
    gr.Markdown("""
---
**How it works:**
1. **Extracts** article text from URL
2. **Truncates** to model-safe length (350 words)
3. **Analyzes** using RoBERTa transformer
4. **Checks** source credibility and content patterns
5. **Provides** confidence score
**Model:** `jy46604790/Fake-News-Bert-Detect` (RoBERTa-based)
**β οΈ Disclaimer:** Educational tool only. Always verify information through multiple credible sources.
""")
    # Wire the button to the callback; output order must match the
    # 4-tuple returned by analyze_url (status, confidence, details, title).
    analyze_btn.click(
        fn=analyze_url,
        inputs=url_input,
        outputs=[result_status, confidence_score, details_output, article_title]
    )
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 — presumably the Hugging Face
    # Spaces convention, given the "Spaces" header on this file.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860
    )