"""Gradio app that scores a news-article URL as likely fake or real.

Pipeline: fetch the page, extract title/body text, classify with a
RoBERTa fake-news model, then blend the model score with source
credibility and simple content heuristics into a final confidence.
"""

import logging
import re
from urllib.parse import urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class FakeNewsDetector:
    """Wraps the transformer classifier plus URL/content heuristics."""

    def __init__(self):
        logger.info("Loading RoBERTa Fake News Detection model...")
        try:
            # Using the most popular and proven model
            MODEL = "jy46604790/Fake-News-Bert-Detect"
            # Simple pipeline approach - handles everything automatically
            self.classifier = pipeline(
                "text-classification",
                model=MODEL,
                tokenizer=MODEL,
                device=-1,        # CPU mode for free tier
                max_length=512,   # Explicitly set max length
                truncation=True,  # Enable truncation
            )
            logger.info("✅ Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

        # Credible sources (registrable domains; matched as hostname suffixes)
        self.credible_sources = [
            'reuters.com', 'apnews.com', 'bbc.com', 'nytimes.com',
            'theguardian.com', 'washingtonpost.com', 'npr.org', 'wsj.com',
            'ft.com', 'bloomberg.com', 'abcnews.go.com', 'cbsnews.com',
            'nbcnews.com', 'cnn.com', 'axios.com'
        ]

        # Fake news indicators (lower-cased phrases searched in article text)
        self.fake_indicators = [
            "exclusive reveal", "shocking truth", "they don't want you to know",
            "mainstream media won't report", "breaking secret",
            "you won't believe", "wake up sheeple", "open your eyes"
        ]

    def extract_content(self, url: str):
        """Fetch *url* and return a dict with title, cleaned body text and
        the scheme-normalized URL; on failure returns {'success': False, 'error': ...}."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            # Default to https when the user omits the scheme
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove unwanted elements
            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                element.decompose()

            # Extract title
            title = soup.find('title')
            title_text = title.get_text().strip() if title else "No title found"

            # Try multiple content selectors, most specific first
            content_text = ""
            content_selectors = [
                'article', '.article-content', '.post-content', '.story-content',
                '.entry-content', 'main', '[role="main"]', '.news-content',
                '.story-body'
            ]
            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    content_parts = []
                    for elem in elements:
                        text = elem.get_text().strip()
                        # Skip tiny fragments (nav stubs, captions, etc.)
                        if len(text) > 100:
                            content_parts.append(text)
                    if content_parts:
                        content_text = ' '.join(content_parts)
                        break

            # Fallback to whole <body> when selectors found too little
            if not content_text or len(content_text) < 200:
                body = soup.find('body')
                if body:
                    content_text = body.get_text()

            # Clean text
            content_text = self.clean_text(content_text)

            return {
                'success': True,
                'title': title_text,
                'content': content_text,
                'url': url,  # normalized URL (scheme guaranteed)
            }
        except Exception as e:
            logger.error(f"Content extraction error: {e}")
            return {'success': False, 'error': str(e)}

    def clean_text(self, text: str):
        """Collapse whitespace and strip characters outside a basic
        word/punctuation set."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()-]', '', text)
        return text.strip()

    def truncate_text(self, text: str, max_words: int = 350):
        """Truncate *text* to at most *max_words* whitespace-separated words
        so the input stays safely under the model's 512-token limit."""
        words = text.split()
        if len(words) > max_words:
            truncated = ' '.join(words[:max_words])
            logger.info(f"Text truncated from {len(words)} to {max_words} words")
            return truncated
        return text

    def analyze_content(self, text: str):
        """Return simple sensationalism counters used as weak fake-news signals."""
        text_lower = text.lower()
        fake_indicator_count = sum(
            1 for indicator in self.fake_indicators if indicator in text_lower
        )
        exclamation_count = text.count('!')
        # Words in ALL CAPS of length >= 3 (shouting style)
        capital_words = len(re.findall(r'\b[A-Z]{3,}\b', text))
        return {
            'fake_indicator_count': fake_indicator_count,
            'exclamation_count': exclamation_count,
            'capital_words': capital_words,
        }

    def check_source_credibility(self, url: str):
        """Score the URL's host: 0.9 for known credible outlets, 0.1 for
        free-blog platforms, 0.5 otherwise.

        Matches the parsed hostname as an exact domain or subdomain suffix,
        so e.g. "cnn.com.evil.net" is NOT treated as credible (a plain
        substring test would be spoofable).
        """
        # urlparse needs a scheme to populate netloc; add one if missing
        parse_target = url if '://' in url else 'https://' + url
        host = (urlparse(parse_target).netloc or '').lower()
        # Drop any port component
        host = host.split(':')[0]

        for credible_source in self.credible_sources:
            if host == credible_source or host.endswith('.' + credible_source):
                return 0.9

        unreliable = ['.blogspot.', '.wordpress.', '.tumblr.']
        for domain in unreliable:
            if domain in host:
                return 0.1

        return 0.5

    def detect_fake_news(self, url: str):
        """Main detection function: extract, classify, blend scores, and
        return {'status', 'confidence', 'message', 'title'}."""
        logger.info(f"Analyzing: {url}")

        # Extract content
        content_data = self.extract_content(url)
        if not content_data['success']:
            return {
                'status': '❌ Extraction Failed',
                'confidence': 0.0,
                'message': f"Could not extract content: {content_data.get('error', 'Unknown error')}",
                'title': 'Error',
            }

        # Use the scheme-normalized URL for all downstream checks/display
        url = content_data['url']
        title = content_data['title']
        content = content_data['content']

        if len(content.strip()) < 100:
            return {
                'status': '⚠️ Insufficient Content',
                'confidence': 0.0,
                'message': 'Not enough content to analyze. May be behind paywall.',
                'title': title,
            }

        # Prepare text for model (title + truncated content)
        full_text = f"{title}. {content}"
        # Truncate text to safe length for the model
        truncated_text = self.truncate_text(full_text, max_words=350)
        logger.info(f"Text length: {len(truncated_text)} characters")

        # Use RoBERTa model with error handling
        try:
            result = self.classifier(truncated_text)[0]
            label = result['label']
            score = result['score']

            # Debug: Log the raw output
            logger.info(f"Raw model output: {result}")

            # NOTE(review): the model card for jy46604790/Fake-News-Bert-Detect
            # reportedly defines LABEL_0 = fake and LABEL_1 = real, which is the
            # OPPOSITE of the mapping below — verify against the model card
            # before trusting these classifications.
            if label in ['LABEL_1', 'FAKE', 'Fake']:
                is_fake = True
                model_confidence = score
            elif label in ['LABEL_0', 'REAL', 'Real']:
                is_fake = False
                model_confidence = score
            else:
                # If label format is unexpected, use score threshold
                is_fake = score > 0.5
                model_confidence = score if is_fake else (1 - score)

            logger.info(f"Interpreted: is_fake={is_fake}, confidence={model_confidence:.3f}")
        except Exception as e:
            logger.error(f"Model error: {e}")
            # Fallback to content analysis only
            return self.fallback_analysis(title, content, url, str(e))

        # Additional analysis
        source_credibility = self.check_source_credibility(url)
        content_analysis = self.analyze_content(full_text)

        # Calculate combined score based on model prediction
        if is_fake:
            # For fake news: model confidence + source suspicion + content indicators
            combined_score = (
                model_confidence * 0.7 +
                (1 - source_credibility) * 0.2 +
                min(content_analysis['fake_indicator_count'] * 0.1, 0.1)
            )
        else:
            # For real news: model confidence + source credibility
            combined_score = (
                model_confidence * 0.8 +
                source_credibility * 0.2
            )

        # Determine status based on clear thresholds
        if is_fake:
            if combined_score > 0.8:
                status = "🚨 Highly Likely Fake News"
            elif combined_score > 0.6:
                status = "⚠️ Likely Fake News"
            elif combined_score > 0.4:
                status = "🤔 Possibly Fake News"
            else:
                status = "📰 Uncertain (Leaning Fake)"
        else:
            if combined_score > 0.8:
                status = "✅ Highly Likely Real News"
            elif combined_score > 0.6:
                status = "📰 Likely Real News"
            elif combined_score > 0.4:
                status = "🤔 Possibly Real News"
            else:
                status = "❓ Uncertain (Leaning Real)"

        # Detailed message
        domain = urlparse(url).netloc or 'Unknown'
        message = f"""
**📊 Analysis Results:**

**RoBERTa Model Prediction:**
- Classification: **{'FAKE NEWS' if is_fake else 'REAL NEWS'}**
- Model Confidence: **{model_confidence * 100:.1f}%**
- Raw Output: `{label}` (score: {score:.3f})

**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0
- Domain: {domain}

**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}

**Final Score: {combined_score * 100:.1f}%**

**Content Preview:**
{content[:300]}...

---
**Note:** This is an AI prediction. Always verify from multiple sources.
""".strip()

        return {
            'status': status,
            'confidence': combined_score,
            'message': message,
            'title': title,
        }

    def fallback_analysis(self, title: str, content: str, url: str, error: str):
        """Heuristic-only scoring used when the model call raises."""
        source_credibility = self.check_source_credibility(url)
        content_analysis = self.analyze_content(f"{title}. {content}")

        # Simple heuristic based on source and content
        fake_score = (
            (1 - source_credibility) * 0.6 +
            min(content_analysis['fake_indicator_count'] * 0.2, 0.4)
        )

        if fake_score > 0.7:
            status = "⚠️ Suspicious (Fallback Analysis)"
        elif fake_score > 0.4:
            status = "🤔 Uncertain (Fallback Analysis)"
        else:
            status = "📰 Probably Real (Fallback Analysis)"

        message = f"""
**📊 Fallback Analysis (Model Error):**

**Model Error:** {error}

**Source Analysis:**
- Source Credibility: {source_credibility:.2f}/1.0

**Content Indicators:**
- Fake News Keywords: {content_analysis['fake_indicator_count']}
- Exclamation Marks: {content_analysis['exclamation_count']}
- ALL-CAPS Words: {content_analysis['capital_words']}

**Fallback Score: {fake_score * 100:.1f}%**

**Preview:**
{content[:300]}...

---
*Using fallback analysis due to model error*
""".strip()

        return {
            'status': status,
            'confidence': fake_score,
            'message': message,
            'title': title,
        }


# Initialize detector (loads the model once at startup)
logger.info("Initializing Fake News Detector...")
detector = FakeNewsDetector()
logger.info("Ready!")


def analyze_url(url):
    """Gradio interface function: returns (status, confidence %, details, title)."""
    url = url.strip() if url else ""
    if not url:
        return "⚠️ Please enter a URL", "0%", "No URL provided", "No title"
    try:
        result = detector.detect_fake_news(url)
        confidence_percent = f"{result['confidence'] * 100:.1f}%"
        return (
            result['status'],
            confidence_percent,
            result['message'],
            result['title'],
        )
    except Exception as e:
        logger.error(f"Error: {e}")
        return "❌ Error", "0%", f"Error: {str(e)}", "Error"


# Gradio Interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Fake News Detector"
) as demo:
    gr.Markdown("""
# 🕵️ Fake News Detector
**AI-Powered News Verification using RoBERTa**
*Analyzes news articles using a transformer model trained on 40,000+ articles*
""")

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="📰 Enter News Article URL",
                placeholder="https://example.com/news-article",
                lines=1
            )
            analyze_btn = gr.Button(
                "🔍 Analyze Article",
                variant="primary",
                size="lg"
            )
        with gr.Column(scale=1):
            with gr.Group():
                result_status = gr.Textbox(
                    label="🎯 Result",
                    interactive=False
                )
                confidence_score = gr.Textbox(
                    label="📈 Confidence",
                    interactive=False
                )
                article_title = gr.Textbox(
                    label="📝 Article Title",
                    interactive=False
                )

    details_output = gr.Markdown(label="📊 Detailed Analysis")

    gr.Examples(
        label="💡 Try these examples:",
        examples=[
            ["https://www.bbc.com/news"],
            ["https://www.reuters.com/"],
            ["https://apnews.com/"]
        ],
        inputs=url_input
    )

    gr.Markdown("""
---
**How it works:**
1. **Extracts** article text from URL
2. **Truncates** to model-safe length (350 words)
3. **Analyzes** using RoBERTa transformer
4. **Checks** source credibility and content patterns
5. **Provides** confidence score

**Model:** `jy46604790/Fake-News-Bert-Detect` (RoBERTa-based)

**⚠️ Disclaimer:** Educational tool only. Always verify information through multiple credible sources.
""")

    analyze_btn.click(
        fn=analyze_url,
        inputs=url_input,
        outputs=[result_status, confidence_score, details_output, article_title]
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860
    )