Spaces:

Jman666
/

News-Article-Extraction

Build error

App Files Files Community

Jman666 committed on Mar 24, 2025

Commit

b09895f

verified ·

1 Parent(s): 028f10f

Update app.py

Browse files

Files changed (1) hide show

app.py +268 -388

app.py CHANGED Viewed

@@ -1,404 +1,284 @@
-import streamlit as st
 import requests
-import json
-import os
-import base64
-from typing import Dict, Any, List
-import time
-# Define API URL - Configure for different environments
-API_BASE_URL = os.environ.get("https://jman666-api-news-summarization.hf.space", "http://localhost:8000")
-# Set page config
-st.set_page_config(
-    page_title="News Sentiment Analysis",
-    page_icon="📰",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
-# Custom CSS
-st.markdown("""
-<style>
-    .main {
-        padding: 2rem;
-    }
-    .sentiment-positive {
-        color: green;
-        font-weight: bold;
-    }
-    .sentiment-negative {
-        color: red;
-        font-weight: bold;
-    }
-    .sentiment-neutral {
-        color: gray;
-        font-weight: bold;
-    }
-    .article-card {
-        padding: 1rem;
-        border-radius: 5px;
-        margin-bottom: 1rem;
-        background-color: #f5f5f5;
     }
-    .topic-tag {
-        display: inline-block;
-        padding: 0.25rem 0.5rem;
-        margin-right: 0.5rem;
-        margin-bottom: 0.5rem;
-        border-radius: 15px;
-        font-size: 0.8rem;
-        background-color: #e1e1e1;
-    }
-    .header-row {
-        display: flex;
-        justify-content: space-between;
-        align-items: center;
-        margin-bottom: 1rem;
-    }
-</style>
-""", unsafe_allow_html=True)
-def fetch_company_news(company_name: str) -> Dict[str, Any]:
     """
-    Fetch news analysis for a given company from the API
     Args:
-        company_name: Name of the company
     Returns:
-        Dictionary containing processed news data
     """
     try:
-        response = requests.post(
-            f"{API_BASE_URL}/api/news",
-            json={"company_name": company_name},
-            timeout=30
-        )
-        if response.status_code == 200:
-            return response.json()
-        else:
-            st.error(f"Error: {response.status_code} - {response.text}")
-            return None
     except Exception as e:
-        st.error(f"Error connecting to API: {str(e)}")
-        # For demo/testing, provide fallback sample data
-        return get_sample_data(company_name)
-def get_sample_data(company_name: str) -> Dict[str, Any]:
     """
-    Generate sample data for demonstration when API is not available
     Args:
-        company_name: Name of the company
     Returns:
-        Sample data dictionary
     """
-    return {
-        "Company": company_name,
-        "Articles": [
-            {
-                "Title": f"{company_name} Reports Strong Quarterly Growth",
-                "Summary": f"{company_name} has reported exceptional performance in recent quarters, exceeding analyst expectations.",
-                "Sentiment": "Positive",
-                "Topics": ["Financial Performance", "Market Growth", "Investor Relations"],
-                "Source": "Business News",
-                "Published_Date": "2025-03-15",
-                "URL": f"https://example.com/{company_name.lower()}/1"
-            },
-            {
-                "Title": f"{company_name} Faces Regulatory Scrutiny",
-                "Summary": f"Regulatory concerns continue to impact {company_name}'s operations and strategic plans.",
-                "Sentiment": "Negative",
-                "Topics": ["Regulations", "Compliance", "Legal Issues"],
-                "Source": "Financial Times",
-                "Published_Date": "2025-03-10",
-                "URL": f"https://example.com/{company_name.lower()}/2"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            },
-            {
-                "Title": f"{company_name} Announces Changes to Leadership Team",
-                "Summary": f"{company_name} has announced changes that could impact its operations in the coming months.",
-                "Sentiment": "Neutral",
-                "Topics": ["Leadership", "Corporate Governance", "Organization Structure"],
-                "Source": "Market Watch",
-                "Published_Date": "2025-03-05",
-                "URL": f"https://example.com/{company_name.lower()}/3"
-            }
-        ],
-        "Comparative_Sentiment_Score": {
-            "Sentiment_Distribution": {
-                "Positive": 1,
-                "Negative": 1,
-                "Neutral": 1
-            },
-            "Coverage_Differences": [
-                {
-                    "Comparison": "Positive articles focus on Financial Performance, Market Growth, while negative articles emphasize Regulations, Compliance, Legal Issues.",
-                    "Impact": "This suggests a contrast in perception across different aspects of the company."
-                },
-                {
-                    "Comparison": "Coverage varies in depth and focus across different sources.",
-                    "Impact": "This highlights the importance of consulting multiple sources for a comprehensive understanding."
-                }
-            ],
-            "Topic_Overlap": {
-                "Common_Topics": ["Corporate Strategy", "Market Position"],
-                "Unique_Topics": ["Financial Performance", "Regulations", "Leadership"]
-            },
-            "Final_Sentiment_Analysis": "Current news coverage is mixed or neutral, reflecting a complex situation."
-        },
-        "Final_Sentiment_Analysis": "Current news coverage is mixed or neutral, reflecting a complex situation.",
-        "Audio": "sample_audio.mp3"
-    }
-def display_sentiment_badge(sentiment: str) -> None:
-    """Display a colored badge for the sentiment"""
-    if sentiment == "Positive":
-        st.markdown(f'<span class="sentiment-positive">Positive</span>', unsafe_allow_html=True)
-    elif sentiment == "Negative":
-        st.markdown(f'<span class="sentiment-negative">Negative</span>', unsafe_allow_html=True)
-    else:
-        st.markdown(f'<span class="sentiment-neutral">Neutral</span>', unsafe_allow_html=True)
-def display_topics(topics: List[str]) -> None:
-    """Display topic tags"""
-    html = ""
-    for topic in topics:
-        html += f'<span class="topic-tag">{topic}</span>'
-    st.markdown(html, unsafe_allow_html=True)
-def display_article_card(article: Dict[str, Any], index: int) -> None:
-    """Display an article in a card format"""
-    with st.container():
-        st.markdown(f'<div class="article-card">', unsafe_allow_html=True)
-        # Title and sentiment
-        col1, col2 = st.columns([3, 1])
-        with col1:
-            st.markdown(f"### {article['Title']}")
-        with col2:
-            st.markdown("**Sentiment:**")
-            display_sentiment_badge(article['Sentiment'])
-        # Summary
-        st.markdown("**Summary:**")
-        st.write(article['Summary'])
-        # Topics
-        st.markdown("**Topics:**")
-        display_topics(article['Topics'])
-        # Source and date
-        col1, col2 = st.columns(2)
-        with col1:
-            st.markdown(f"**Source:** {article['Source']}")
-        with col2:
-            st.markdown(f"**Published:** {article['Published_Date']}")
-        # URL
-        st.markdown(f"[Read full article]({article['URL']})")
-        st.markdown('</div>', unsafe_allow_html=True)
-def display_comparative_analysis(analysis: Dict[str, Any]) -> None:
-    """Display the comparative analysis section"""
-    st.subheader("Sentiment Distribution")
-    # Display sentiment distribution as a bar chart
-    sentiments = analysis["Sentiment_Distribution"]
-    st.bar_chart(sentiments)
-    # Coverage differences
-    st.subheader("Coverage Analysis")
-    for item in analysis["Coverage_Differences"]:
-        st.markdown(f"**Observation:** {item['Comparison']}")
-        st.markdown(f"*Impact:* {item['Impact']}")
-        st.markdown("---")
-    # Topic overlap
-    st.subheader("Topic Analysis")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.markdown("**Common Topics Across Articles:**")
-        for topic in analysis["Topic_Overlap"]["Common_Topics"]:
-            st.markdown(f"- {topic}")
-    with col2:
-        st.markdown("**Unique Topics:**")
-        for topic in analysis["Topic_Overlap"]["Unique_Topics"]:
-            st.markdown(f"- {topic}")
-    # Final sentiment
-    st.subheader("Overall Sentiment Analysis")
-    st.info(analysis["Final_Sentiment_Analysis"])
-def main():
-    st.title("📰 Company News Sentiment Analysis")
-    st.markdown("""
-    This application extracts key details from news articles related to a given company,
-    performs sentiment analysis, conducts a comparative analysis, and generates a text-to-speech
-    output in Hindi.
-    """)
-    # Company selection
-    st.header("Enter Company Name")
-    # Example companies for dropdown
-    example_companies = [
-        "Tesla",
-        "Apple",
-        "Google",
-        "Microsoft",
-        "Amazon",
-        "Facebook",
-        "Netflix",
-        "Other (specify)"
-    ]
-    company_option = st.selectbox(
-        "Select a company or choose 'Other' to specify:",
-        example_companies
-    )
-    company_name = ""
-    if company_option == "Other (specify)":
-        company_name = st.text_input("Enter company name:")
     else:
-        company_name = company_option
-    # Process button
-    if st.button("Analyze News") and company_name:
-        with st.spinner(f"Analyzing news for {company_name}..."):
-            # Display a progress bar to show work is happening
-            progress_bar = st.progress(0)
-            for i in range(100):
-                time.sleep(0.05)  # Simulate work
-                progress_bar.progress(i + 1)
-            # Fetch data from API
-            result = fetch_company_news(company_name)
-            if result:
-                # Display results
-                st.header(f"News Analysis for {result['Company']}")
-                # Summary tabs
-                tab1, tab2, tab3 = st.tabs(["Articles", "Comparative Analysis", "Audio Summary"])
-                with tab1:
-                    st.subheader("Articles Analysis")
-                    for i, article in enumerate(result["Articles"]):
-                        display_article_card(article, i)
-                with tab2:
-                    st.subheader("Comparative Sentiment Analysis")
-                    display_comparative_analysis(result["Comparative_Sentiment_Score"])
-                with tab3:
-                    st.subheader("Audio Summary (Hindi)")
-                    st.markdown("Listen to the audio summary of the news analysis in Hindi:")
-                    # In a real implementation, you would provide the actual audio file
-                    # For demonstration, we'll show a placeholder
-                    st.audio("https://upload.wikimedia.org/wikipedia/commons/5/5b/Hindi_svar.ogg", format="audio/ogg")
-                    st.markdown("**Note:** This is a placeholder audio. In the actual implementation, the audio would be a Hindi text-to-speech conversion of the news summary.")
-    # Information section
-    st.sidebar.title("About")
-    st.sidebar.info("""
-    This application performs news extraction, sentiment analysis, and text-to-speech conversion
-    for company news articles.
-    **Features:**
-    - Extract news from multiple sources
-    - Analyze sentiment (positive, negative, neutral)
-    - Identify key topics in articles
-    - Compare sentiment across articles
-    - Generate Hindi audio summary
-    **Technologies Used:**
-    - Natural Language Processing
-    - Sentiment Analysis
-    - Text-to-Speech Conversion
-    - Web Scraping
-    """)
-    st.sidebar.title("Instructions")
-    st.sidebar.markdown("""
-    1. Select a company from the dropdown or enter a custom company name
-    2. Click "Analyze News" to start the analysis
-    3. View the results in the three tabs:
-       - Articles: Individual article analysis
-       - Comparative Analysis: Cross-article insights
-       - Audio Summary: Hindi speech summary
-    """)
-if __name__ == "__main__":
-    main()

+# utils
+# NOTE: "!pip install <package>" is IPython/Jupyter shell syntax. In a plain
+# Python module it is a SyntaxError, so the file cannot even be imported.
+# Declare the dependencies in requirements.txt instead, one per line:
+#   deep-translator
+#   googletrans
+#   tldextract
+#   playsound
+#   gtts
+#   streamlit
+#   fastapi
+#   pandas
+#   matplotlib
+#   pydantic
+#   requests
+# The imports below additionally need these packages, which the original
+# install list did not mention:
+#   beautifulsoup4, numpy, transformers, scikit-learn, torch, soundfile
 import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+from typing import List, Dict, Any
+import numpy as np
+from transformers import pipeline, AutoProcessor, AutoModel
+import urllib.parse
+from sklearn.feature_extraction.text import TfidfVectorizer
+import tldextract
+import torch
+import soundfile as sf
+from googletrans import Translator
+from playsound import playsound
+from transformers import AutoModel, AutoTokenizer
+import soundfile as sf
+import numpy as np
+from gtts import gTTS
+from deep_translator import GoogleTranslator
+def search_news(company_name: str, num_articles: int = 2) -> List[str]:
+    """
+    Scrape Google News search results for links to articles about a company.
+    Args:
+        company_name: Company to search for; URL-encoded before it is used.
+        num_articles: Number of collected links after which scanning stops.
+    Returns:
+        Article URLs in result order, or an empty list when the request or
+        the parsing fails (the error is printed, not raised).
+    """
+    # quote_plus keeps names containing spaces, '&' or '#' inside the q= value
+    # instead of letting them split or truncate the query string.
+    query = urllib.parse.quote_plus(f"{company_name} news")
+    search_url = f"https://www.google.com/search?q={query}&tbm=nws"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
     }
+    try:
+        # Without a timeout a stalled connection would block the caller forever.
+        response = requests.get(search_url, headers=headers, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+        article_links = []
+        # '.SoaBEf' is the CSS class this scraper expects on each result card.
+        for article in soup.select('.SoaBEf'):
+            link_element = article.select_one('a')
+            if not link_element or 'href' not in link_element.attrs:
+                continue
+            href = link_element['href']
+            if href.startswith('/url?q='):
+                # Redirect-style link: the real target is the q= parameter.
+                url = href.split('/url?q=')[1].split('&')[0]
+                article_links.append(urllib.parse.unquote(url))
+            elif href.startswith('http'):
+                article_links.append(href)
+            if len(article_links) >= num_articles:
+                break
+        return article_links
+    except Exception as e:
+        # Best-effort scraper: report the problem and return no links.
+        print(f"Error fetching news articles: {e}")
+        return []
+def extract_article_content(url: str) -> Dict[str, Any]:
+    """
+    Download an article page and pull out its title, body text and date.
+    Args:
+        url: Address of the article page.
+    Returns:
+        Dict with 'url', 'title', 'content' and 'date'. Missing pieces fall
+        back to "No title found", "No content found" and None. On any error
+        both 'title' and 'content' are "Error extracting content".
+    """
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+    try:
+        # Without a timeout one unresponsive site would hang the whole run.
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+        heading = soup.find("h1")
+        title = heading.get_text().strip() if heading else "No title found"
+        # Try the most specific container first, then progressively looser ones.
+        content_element = soup.find("article") or soup.find("main") or soup.find("div", class_=["content", "article", "story"])
+        paragraphs = content_element.find_all("p") if content_element else []
+        # A container without paragraph text is treated like a missing one,
+        # so callers never receive an empty string as content.
+        content = " ".join(p.get_text().strip() for p in paragraphs) or "No content found"
+        date_element = soup.find("time")
+        date = date_element["datetime"] if date_element and "datetime" in date_element.attrs else None
+        return {
+            'url': url,
+            'title': title,
+            'content': content,
+            'date': date
+        }
+    except Exception as e:
+        print(f"Error extracting content from {url}: {e}")
+        return {
+            'url': url,
+            'title': "Error extracting content",
+            'content': "Error extracting content",
+            'date': None
+        }
+def get_company_news(company_name: str) -> List[Dict[str, Any]]:
+    """
+    Find news articles for a company and extract the content of each one.
+    Args:
+        company_name: Company to search news for.
+    Returns:
+        One extracted-article dict per URL found, using at most the first
+        10 URLs returned by search_news.
+    """
+    collected = []
+    for link in search_news(company_name)[:10]:
+        try:
+            collected.append(extract_article_content(link))
+        except Exception as e:
+            print(f"Error extracting from {link}: {e}")
+    return collected
+def summarize_article(content: str, max_length: int = 50) -> str:
+    """
+    Summarize an article with the default Hugging Face summarization pipeline.
+    Args:
+        content: Article text to summarize.
+        max_length: Maximum length of the generated summary.
+    Returns:
+        The summary text, or "" when content is empty or only whitespace.
+    """
+    # There is nothing to summarize, so skip loading the model.
+    if not content or not content.strip():
+        return ""
+    summarizer = pipeline("summarization")
+    # max_position_embeddings counts tokens, not characters, so slicing the
+    # string by it threw away most of the article. truncation=True lets the
+    # tokenizer cut the input at the model's real limit instead.
+    summary = summarizer(content, max_length=max_length, min_length=0, do_sample=False, truncation=True)
+    return summary[0]['summary_text']
+def analyze_sentiment(text: str) -> Dict[str, Any]:
     """
+    Classify text as Positive, Negative or Neutral with a sentiment pipeline.
     Args:
+        text: Text to classify.
     Returns:
+        Dict with 'sentiment' (category name) and 'score' (pipeline score);
+        {'sentiment': "Unknown", 'score': 0.0} if anything fails.
     """
     try:
+        classifier = pipeline("sentiment-analysis", truncation=True)
+        # Keep at most the first 512 whitespace-separated words.
+        word_cap = 512
+        pieces = text.split()
+        if len(pieces) > word_cap:
+            text = ' '.join(pieces[:word_cap])
+        prediction = classifier(text)[0]
+        verdict = "Positive" if prediction['label'] == "POSITIVE" else "Negative"
+        confidence = prediction['score']
+        # Scores in the 0.4-0.6 band are reported as Neutral whatever the label.
+        if 0.4 <= confidence <= 0.6:
+            verdict = "Neutral"
+        return {'sentiment': verdict, 'score': confidence}
     except Exception as e:
+        print(f"Error in sentiment analysis: {e}")
+        return {'sentiment': "Unknown", 'score': 0.0}
+def extract_key_topics(text: str, num_topics: int = 5) -> List[str]:
+    """
+    Pick the highest-scoring TF-IDF terms of a text as its key topics.
+    Args:
+        text: Text to extract topics from.
+        num_topics: Maximum number of terms to return.
+    Returns:
+        Up to num_topics terms, highest score first, or the single entry
+        "Not enough text to extract topics" when the text has fewer than
+        10 words or contains no usable terms.
+    """
+    placeholder = ["Not enough text to extract topics"]
+    if len(text.split()) < 10:
+        return placeholder
+    vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
+    try:
+        tfidf_matrix = vectorizer.fit_transform([text])
+    except ValueError:
+        # Raised when no term survives tokenization and stop-word removal
+        # (empty vocabulary), e.g. text made only of stop words.
+        return placeholder
+    feature_names = vectorizer.get_feature_names_out()
+    tfidf_scores = tfidf_matrix.toarray()[0]
+    sorted_indices = np.argsort(tfidf_scores)[::-1]
+    # str() converts numpy.str_ to plain str so the result matches List[str].
+    return [str(feature_names[idx]) for idx in sorted_indices[:num_topics]]
+def perform_comparative_analysis(articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """
+    Aggregate sentiment and topic statistics across analyzed articles.
+    Args:
+        articles: Article dicts; each is read for 'sentiment' (a dict holding
+            a 'sentiment' label), 'topics' and 'url'.
+    Returns:
+        Dict with 'sentiment_distribution' (Positive/Neutral/Negative counts),
+        'common_topics' (up to 10 (topic, count) pairs, most frequent first)
+        and 'sentiment_by_source' (labels grouped by the URL's domain).
+    """
+    labels = [entry['sentiment']['sentiment'] for entry in articles]
+    distribution = {name: labels.count(name) for name in ('Positive', 'Neutral', 'Negative')}
+    tally = {}
+    for entry in articles:
+        for topic in entry['topics']:
+            tally[topic] = tally.get(topic, 0) + 1
+    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
+    by_source = {}
+    for entry in articles:
+        by_source.setdefault(extract_source_from_url(entry['url']), []).append(entry['sentiment']['sentiment'])
+    return {
+        'sentiment_distribution': distribution,
+        'common_topics': ranked[:10],
+        'sentiment_by_source': by_source
+    }
+def extract_source_from_url(url: str) -> str:
+    """Return the domain component that tldextract reports for url."""
+    return tldextract.extract(url).domain
+from typing import List, Dict, Any
+from transformers import pipeline
+def get_combined_summary(articles, max_length: int = 100) -> str:
     """
+    Summarize several news articles into one short text.
     Args:
+        articles: List of article dicts; 'title' and 'content' are read, with
+            placeholders substituted for missing keys
+        max_length: Maximum length of the final summary
     Returns:
+        One summary covering the combined article text, or "" when there are
+        no articles
     """
+    # There is nothing to summarize, so skip loading the model.
+    if not articles:
+        return ""
+    # Each article is prefixed with its title to give the model context.
+    combined_content = "".join(
+        f"Article: {article.get('title', 'No Title')}\n{article.get('content', 'Content not available')}\n\n"
+        for article in articles
+    )
+    summarizer = pipeline("summarization")
+    # max_position_embeddings counts tokens, not characters, so slicing the
+    # string by it kept only a fraction of the first article. truncation=True
+    # lets the tokenizer cut the input at the model's real limit instead.
+    summary = summarizer(combined_content, max_length=max_length, min_length=30, do_sample=False, truncation=True)
+    # Handle different return formats from the pipeline
+    if isinstance(summary, list):
+        return summary[0]['summary_text']
     else:
+        return summary['summary_text']
+def generate_hindi_summary(combined_summary: str) -> str:
+    """
+    Translate the combined summary into Hindi with deep-translator.
+    Args:
+        combined_summary: Summary text to translate (source language is
+            auto-detected)
+    Returns:
+        The Hindi translation, or "Translation failed" if translating raises
+    """
+    try:
+        return GoogleTranslator(source='auto', target='hi').translate(text=combined_summary)
+    except Exception as e:
+        print(f"Error in translation: {e}")
+        return "Translation failed"
+def generate_hindi_speech(hindi_summary: str):
+    """
+    Convert Hindi summary to speech using AI4Bharat's VITS-Rasa-13 model and play it
+    The audio is written to "temp_hindi_speech.wav" in the working directory
+    and then played with playsound. Failures are printed, not raised.
+    Args:
+        hindi_summary: Hindi text summary to synthesize (the original note
+            says max 500 characters; that limit is not enforced here)
+    """
+    try:
+        # Use the GPU when one is available and fall back to CPU otherwise.
+        # Hard-coding "cuda" made every call fail on CPU-only hosts.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to(device)
+        tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)
+        # Process text and generate speech
+        inputs = tokenizer(text=hindi_summary, return_tensors="pt").to(device)
+        # Inference only: no_grad avoids building an autograd graph and keeps
+        # the waveform convertible with .numpy().
+        with torch.no_grad():
+            # Use default Indian voice profile (speaker_id=16 for male, 17 for female)
+            outputs = model(inputs['input_ids'], speaker_id=16, emotion_id=0)
+        # Convert to numpy array and save as temporary file
+        audio_data = outputs.waveform.squeeze().cpu().numpy()
+        sf.write("temp_hindi_speech.wav", audio_data, model.config.sampling_rate)
+        # Play the audio using playsound
+        playsound("temp_hindi_speech.wav")
+    except Exception as e:
+        print(f"Error in speech generation or playback: {e}")