import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict, Any
import numpy as np
from transformers import pipeline
import urllib.parse
from sklearn.feature_extraction.text import TfidfVectorizer
import tldextract
from deep_translator import GoogleTranslator
from playsound import playsound
import soundfile as sf
from transformers import AutoModel, AutoTokenizer


def search_news(company_name: str, num_articles: int = 2) -> List[str]:
    """Scrape the Google News search results page and return up to num_articles article URLs."""
    # Quote the company name so multi-word names form a valid query string
    search_url = f"https://www.google.com/search?q={urllib.parse.quote(company_name)}+news&tbm=nws"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        article_links = []
        # Each result card on the news tab uses the .SoaBEf class
        for article in soup.select('.SoaBEf'):
            link_element = article.select_one('a')
            if link_element and 'href' in link_element.attrs:
                href = link_element['href']
                if href.startswith('/url?q='):
                    # Redirect-style links: strip the wrapper and decode the target URL
                    url = href.split('/url?q=')[1].split('&')[0]
                    url = urllib.parse.unquote(url)
                    article_links.append(url)
                elif href.startswith('http'):
                    article_links.append(href)
            if len(article_links) >= num_articles:
                break
        return article_links
    except Exception as e:
        print(f"Error fetching news articles: {e}")
        return []


def extract_article_content(url: str) -> Dict[str, Any]:
    """Fetch a page and pull out its title, body text, and publication date."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        title_element = soup.find("h1")
        title = title_element.get_text().strip() if title_element else "No title found"
        # Prefer semantic containers; fall back to common content class names
        content_element = (soup.find("article") or soup.find("main")
                           or soup.find("div", class_=["content", "article", "story"]))
        content = (" ".join(p.get_text().strip() for p in content_element.find_all("p"))
                   if content_element else "No content found")
        date_element = soup.find("time")
        date = date_element["datetime"] if date_element and "datetime" in date_element.attrs else None
        return {
            'url': url,
            'title': title,
            'content': content,
            'date': date
        }
    except Exception as e:
        print(f"Error extracting content from {url}: {e}")
        return {
            'url': url,
            'title': "Error extracting content",
            'content': "Error extracting content",
            'date': None
        }


def get_company_news(company_name: str) -> List[Dict[str, Any]]:
    """
    Fetch exactly 10 news articles for a given company.
    If fewer than 10 articles are retrieved initially, retry fetching more.
""" max_articles = 10 articles = [] retries = 3 # Number of retries to fetch missing articles for attempt in range(retries): # Fetch article URLs article_urls = search_news(company_name, num_articles=max_articles - len(articles)) # Process each URL to extract content for url in article_urls: try: article_data = extract_article_content(url) # Avoid duplicates by checking the URL if article_data['url'] not in [a['url'] for a in articles]: articles.append(article_data) except Exception as e: print(f"Error extracting from {url}: {e}") # Break if we have enough articles if len(articles) >= max_articles: break # If still fewer than 10 articles, fill with placeholders while len(articles) < max_articles: articles.append({ 'url': 'N/A', 'title': 'No Title Available', 'content': 'No Content Available', 'date': None }) return articles def summarize_article(content: str, max_length: int = 50) -> str: summarizer = pipeline("summarization") max_input_length = summarizer.model.config.max_position_embeddings # Get model's max input length # Ensure content does not exceed max input length truncated_content = content[:max_input_length] summary = summarizer(truncated_content, max_length=max_length, min_length=0, do_sample=False) return summary[0]['summary_text'] def analyze_sentiment(text: str) -> Dict[str, Any]: """ Analyze sentiment of the given text. Args: text: The text to analyze. Returns: Dictionary containing sentiment category and score. """ try: # Initialize sentiment analyzer sentiment_analyzer = pipeline("sentiment-analysis", truncation=True) # Truncate text manually to avoid exceeding token limits max_token_limit = 512 # Most transformer models have a 512-token limit words = text.split() if len(words) > max_token_limit: text = ' '.join(words[:max_token_limit]) # Perform sentiment analysis result = sentiment_analyzer(text) # Determine sentiment category based on label and score sentiment_category = "Positive" if result[0]['label'] == "POSITIVE" else "Negative" score = result[0]['score'] # Add neutral category for borderline cases if 0.4 <= score <= 0.6: sentiment_category = "Neutral" return { 'sentiment': sentiment_category, 'score': score } except Exception as e: print(f"Error in sentiment analysis: {e}") return { 'sentiment': "Unknown", 'score': 0.0 } def extract_key_topics(text: str, num_topics: int = 5) -> List[str]: if len(text.split()) < 10: return ["Not enough text to extract topics"] vectorizer = TfidfVectorizer(stop_words='english', max_features=100) tfidf_matrix = vectorizer.fit_transform([text]) feature_names = vectorizer.get_feature_names_out() tfidf_scores = tfidf_matrix.toarray()[0] sorted_indices = np.argsort(tfidf_scores)[::-1] top_topics = [feature_names[idx] for idx in sorted_indices[:num_topics]] return top_topics def perform_comparative_analysis(articles: List[Dict[str, Any]]) -> Dict[str, Any]: sentiment_counts = { 'Positive': len([a for a in articles if a['sentiment']['sentiment'] == 'Positive']), 'Neutral': len([a for a in articles if a['sentiment']['sentiment'] == 'Neutral']), 'Negative': len([a for a in articles if a['sentiment']['sentiment'] == 'Negative']) } all_topics = [topic for article in articles for topic in article['topics']] topic_frequency = {} for topic in all_topics: topic_frequency[topic] = topic_frequency.get(topic, 0) + 1 common_topics = sorted(topic_frequency.items(), key=lambda x: x[1], reverse=True) sentiment_by_source = {} for article in articles: source = extract_source_from_url(article['url']) if source not in sentiment_by_source: 
            sentiment_by_source[source] = []
        sentiment_by_source[source].append(article['sentiment']['sentiment'])
    return {
        'sentiment_distribution': sentiment_counts,
        'common_topics': common_topics[:10],
        'sentiment_by_source': sentiment_by_source
    }


def extract_source_from_url(url: str) -> str:
    """Extract the registered domain (e.g. 'bbc' from 'www.bbc.co.uk') as the source name."""
    extracted_info = tldextract.extract(url)
    return extracted_info.domain


def get_combined_summary(articles: List[Dict[str, Any]], max_length: int = 100) -> str:
    """
    Generate a combined summary from multiple news articles.

    Args:
        articles: List of article dictionaries containing content
        max_length: Maximum length of the final summary

    Returns:
        A comprehensive summary combining insights from all articles
    """
    # Combine all article contents with titles as context
    combined_content = ""
    for article in articles:
        # Use .get() with default values to handle missing keys
        title = article.get('title', 'No Title')
        content = article.get('content', 'Content not available')
        combined_content += f"Article: {title}\n{content}\n\n"
    # Initialize the summarizer
    summarizer = pipeline("summarization")
    # Character-level truncation keeps the input within the model's token limit
    max_input_length = summarizer.model.config.max_position_embeddings
    truncated_content = combined_content[:max_input_length]
    # Generate the combined summary
    summary = summarizer(truncated_content, max_length=max_length, min_length=30, do_sample=False)
    # Handle different return formats from the pipeline
    if isinstance(summary, list):
        return summary[0]['summary_text']
    return summary['summary_text']


def generate_hindi_summary(combined_summary: str) -> str:
    """
    Translate the combined summary to Hindi using deep-translator.

    Args:
        combined_summary: The English combined summary

    Returns:
        The Hindi translation of the combined summary
    """
    try:
        translator = GoogleTranslator(source='auto', target='hi')
        return translator.translate(text=combined_summary)
    except Exception as e:
        print(f"Error in translation: {e}")
        return "Translation failed"


def generate_hindi_speech(hindi_summary: str):
    """
    Convert the Hindi summary to speech using AI4Bharat's VITS-Rasa-13 model and play it.

    Args:
        hindi_summary: Hindi text summary to synthesize (max 500 characters)
    """
    try:
        # Load pre-trained model (requires a CUDA-enabled GPU)
        model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to("cuda")
        tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)
        # Tokenize the text and generate speech
        inputs = tokenizer(text=hindi_summary, return_tensors="pt").to("cuda")
        # Use default Indian voice profile (speaker_id=16 for male, 17 for female)
        outputs = model(inputs['input_ids'], speaker_id=16, emotion_id=0)
        # Move the waveform to CPU, convert to numpy, and save as a temporary file
        audio_data = outputs.waveform.squeeze().cpu().numpy()
        sf.write("temp_hindi_speech.wav", audio_data, model.config.sampling_rate)
        # Play the audio using playsound
        playsound("temp_hindi_speech.wav")
    except Exception as e:
        print(f"Error in speech generation or playback: {e}")
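

# A minimal end-to-end driver, added here as a usage sketch only. It assumes
# the functions above are used as-is; the company name "Tesla" is an
# illustrative placeholder, not part of the original module. Note that
# perform_comparative_analysis expects each article dict to carry 'sentiment'
# and 'topics' keys, which the loop below populates.
if __name__ == "__main__":
    company = "Tesla"  # placeholder company for demonstration
    articles = get_company_news(company)

    # Enrich each article with the per-article analyses. Placeholder entries
    # (url == 'N/A') are processed too; filter them out first if undesired.
    for article in articles:
        article['summary'] = summarize_article(article['content'])
        article['sentiment'] = analyze_sentiment(article['content'])
        article['topics'] = extract_key_topics(article['content'])

    # Cross-article comparison over the keys populated above
    analysis = perform_comparative_analysis(articles)
    print(f"Sentiment distribution: {analysis['sentiment_distribution']}")
    print(f"Top topics: {analysis['common_topics'][:5]}")

    # Combined English summary -> Hindi translation -> spoken audio
    combined = get_combined_summary(articles)
    hindi = generate_hindi_summary(combined)
    generate_hindi_speech(hindi)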