Spaces:

Nishitha03
/

News-Scraper

Sleeping

App Files Files Community

Nishitha03 commited on Oct 8, 2025

Commit

eb34004

verified ·

1 Parent(s): 6e39ec1

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +626 -869

src/streamlit_app.py CHANGED Viewed

@@ -1,297 +1,240 @@
 import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
-import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
 from matplotlib.ticker import MaxNLocator
 import os
 import time
 import json
 import requests
 import spacy
-from tqdm import tqdm
-import warnings
-import pandas as pd
-# import pussy
-# Suppress warnings for cleaner output
 warnings.filterwarnings('ignore')
-config_dir = os.environ.get("STREAMLIT_CONFIG_DIR", "/tmp/.streamlit")
-os.makedirs(config_dir, exist_ok=True)
-# Set page configuration
 st.set_page_config(
-    page_title="Sentiment Analysis of RSS Articles",
     page_icon="📰",
     layout="wide",
     initial_sidebar_state="expanded"
 )
-# Custom CSS for styling
 def load_css():
     st.markdown("""
     <style>
         .main-header {
-            font-size: 3rem !important;
             font-weight: 700 !important;
             text-align: center !important;
-            padding: 2rem 0 !important;
         }
         .sub-header {
-            font-size: 2rem !important;
             font-weight: 600 !important;
             padding: 1rem 0 !important;
         }
-        .newspaper-card {
-            background-color: #f8f9fa;
             border-radius: 10px;
             padding: 20px;
-            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
             text-align: center;
-            margin-bottom: 20px;
         }
-        .newspaper-title {
-            font-size: 1.5rem;
-            font-weight: 600;
-            margin-bottom: 10px;
         }
-        .entry-page {
-            display: flex;
-            flex-direction: column;
-            justify-content: center;
-            align-items: center;
-            height: 100vh;
-            position: fixed;
-            top: 0;
-            left: 0;
-            right: 0;
-            bottom: 0;
         }
-        .entry-container {
-            text-align: center;
-            background-color: #f8f9fa;
-            padding: 3rem;
-            border-radius: 20px;
-            box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
-            max-width: 800px;
         }
-        .button-container {
-            margin-top: 2rem;
         }
-        .footer {
-            text-align: center;
-            padding: 1rem;
-            color: #6c757d;
-            margin-top: 2rem;
-            border-top: 1px solid #dee2e6;
         }
     </style>
     """, unsafe_allow_html=True)
-# Constants
-INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
-# India GeoJSON loading function
 @st.cache_data
 def load_india_geojson():
     """Load India GeoJSON data for mapping"""
     try:
-        response = requests.get(INDIA_GEOJSON_URL)
         return json.loads(response.text)
     except Exception as e:
-        st.error(f"Failed to load GeoJSON: {e}")
-        st.info("Trying fallback method...")
-        try:
-            # Fallback: pip install geopandas
-            import geopandas as gpd
-            india = gpd.read_file(INDIA_GEOJSON_URL)
-            return json.loads(india.to_json())
-        except:
-            st.error("Error: Could not load India GeoJSON. Please ensure internet connection.")
-            return None
-# Load spaCy model (with caching)
 @st.cache_resource
 def load_spacy_model():
     try:
         return spacy.load("en_core_web_sm")
     except OSError:
-        st.info("Downloading spaCy model... This may take a moment.")
         import subprocess
         subprocess.call(["python", "-m", "spacy", "download", "en_core_web_sm"])
         return spacy.load("en_core_web_sm")
-# State mapping dictionary
 def get_state_mapping():
     return {
-        # Standard state names
-        'andhra pradesh': 'Andhra Pradesh',
-        'arunachal pradesh': 'Arunachal Pradesh',
-        'assam': 'Assam',
-        'bihar': 'Bihar',
-        'chhattisgarh': 'Chhattisgarh',
-        'goa': 'Goa',
-        'gujarat': 'Gujarat',
-        'haryana': 'Haryana',
-        'himachal pradesh': 'Himachal Pradesh',
-        'jharkhand': 'Jharkhand',
-        'karnataka': 'Karnataka',
-        'kerala': 'Kerala',
-        'madhya pradesh': 'Madhya Pradesh',
-        'maharashtra': 'Maharashtra',
-        'manipur': 'Manipur',
-        'meghalaya': 'Meghalaya',
-        'mizoram': 'Mizoram',
-        'nagaland': 'Nagaland',
-        'odisha': 'Odisha',
-        'punjab': 'Punjab',
-        'rajasthan': 'Rajasthan',
-        'sikkim': 'Sikkim',
-        'tamil nadu': 'Tamil Nadu',
-        'telangana': 'Telangana',
-        'tripura': 'Tripura',
-        'uttar pradesh': 'Uttar Pradesh',
-        'uttarakhand': 'Uttarakhand',
-        'west bengal': 'West Bengal',
-        # Union Territories
-        'delhi': 'Delhi',
-        'new delhi': 'Delhi',
-        'jammu and kashmir': 'Jammu and Kashmir',
-        'j&k': 'Jammu and Kashmir',
-        'ladakh': 'Ladakh',
-        'chandigarh': 'Chandigarh',
-        'puducherry': 'Puducherry',
-        'pondicherry': 'Puducherry',
-        'andaman and nicobar': 'Andaman and Nicobar Islands',
-        'dadra and nagar haveli': 'Dadra and Nagar Haveli and Daman and Diu',
-        'daman and diu': 'Dadra and Nagar Haveli and Daman and Diu',
-        'lakshadweep': 'Lakshadweep',
-        # Major cities mapped to their states
-        'mumbai': 'Maharashtra',
-        'kolkata': 'West Bengal',
-        'chennai': 'Tamil Nadu',
-        'bangalore': 'Karnataka',
-        'bengaluru': 'Karnataka',
-        'hyderabad': 'Telangana',
-        'ahmedabad': 'Gujarat',
-        'lucknow': 'Uttar Pradesh',
-        'jaipur': 'Rajasthan',
-        'srinagar': 'Jammu and Kashmir',
-        'varanasi': 'Uttar Pradesh',
-        'kochi': 'Kerala',
-        'pune': 'Maharashtra',
-        'agra': 'Uttar Pradesh',
-        'bhopal': 'Madhya Pradesh',
-        'patna': 'Bihar',
     }
-# Function to extract locations from descriptions
 @st.cache_data
 def extract_locations_from_descriptions(df, description_column='desc'):
-    """
-    Extract state names from description column using spaCy
-    """
-    with st.spinner("Extracting location data from articles..."):
-        # Load spaCy model
-        nlp = load_spacy_model()
-        # Get state mapping dictionary
-        state_mapping = get_state_mapping()
-        # Process descriptions to extract locations
-        locations = []
-        # Use a progress bar if processing a large dataset
-        progress_text = "Extracting locations..."
-        progress_bar = st.progress(0)
-        for idx, row in enumerate(df.iterrows()):
-            # Update progress every 100 rows
-            if idx % 100 == 0:
-                progress_bar.progress(min(idx / len(df), 1.0))
-            row = row[1]  # Get the actual row data (second element of the tuple)
-            if pd.isna(row[description_column]):
-                locations.append(None)
-                continue
-            description = str(row[description_column]).lower()
-            doc = nlp(description)
-            # Extract location entities
-            found_locations = []
-            for ent in doc.ents:
-                if ent.label_ in ["GPE", "LOC"]:
-                    loc_name = ent.text.lower()
-                    if loc_name in state_mapping:
-                        found_locations.append(state_mapping[loc_name])
-            # Direct string matching for state names
-            for state_var, standard_name in state_mapping.items():
-                if state_var in description and standard_name not in found_locations:
-                    found_locations.append(standard_name)
-            # Store the first found location, or None if none found
-            locations.append(found_locations[0] if found_locations else None)
-        # Complete progress
-        progress_bar.progress(1.0)
-        # Add locations to dataframe
-        df = df.copy()  # Create a copy to avoid modifying the original
-        df['extracted_location'] = locations
-        st.success(f"Locations extracted. Found locations in {df['extracted_location'].notna().sum()} of {len(df)} articles.")
-        return df
-# Function to analyze sentiment by state
 def analyze_sentiment_by_state(df, sentiment_column='sentiment_score'):
-    """
-    Analyze sentiment by state and prepare data for visualization
-    """
-    # Filter to only rows with extracted locations and valid sentiment
     df_with_locations = df.dropna(subset=['extracted_location', sentiment_column])
     if len(df_with_locations) == 0:
-        st.warning("No locations found with valid sentiment values. Cannot create map.")
         return None
-    # Group by state and calculate average sentiment
-    sentiment_by_state = df_with_locations.groupby('extracted_location')[sentiment_column].agg(
-        avg_sentiment=('mean'),
-        count=('count')
-    ).reset_index()
     return sentiment_by_state
-# Function to create India sentiment map
-def create_india_sentiment_map(sentiment_data, geojson_data, newspaper_name):
-    """
-    Create a choropleth map of India showing sentiment by state
-    """
-    # Ensure state names match between GeoJSON and our data
-    state_property = 'NAME_1'  # This is the property name in the GeoJSON
-    # Determine color scale range based on data
     min_sentiment = sentiment_data['avg_sentiment'].min()
     max_sentiment = sentiment_data['avg_sentiment'].max()
-    # Use symmetrical range if sentiment ranges from negative to positive
     if min_sentiment < 0 and max_sentiment > 0:
         abs_max = max(abs(min_sentiment), abs(max_sentiment))
         color_range = [-abs_max, abs_max]
     else:
-        # Add small buffer to range
         color_range = [min_sentiment - 0.1, max_sentiment + 0.1]
-    # Create the choropleth map
     fig = px.choropleth_mapbox(
         sentiment_data,
         geojson=geojson_data,
         locations='extracted_location',
-        featureidkey=f"properties.{state_property}",
         color='avg_sentiment',
         color_continuous_scale="RdBu",
         range_color=color_range,
@@ -301,86 +244,133 @@ def create_india_sentiment_map(sentiment_data, geojson_data, newspaper_name):
         opacity=0.7,
         hover_data=['count'],
         labels={
-            'avg_sentiment': 'Average Sentiment',
             'extracted_location': 'State',
-            'count': 'Article Count'
         }
     )
-    # Customize the layout
     fig.update_layout(
-        title=dict(
-            text=f'{newspaper_name} - Sentiment Analysis by Indian States',
-            font=dict(size=24, color='#2c3e50'),
-            x=0.5,
-            y=0.95
-        ),
-        height=800,
         margin={"r":0,"t":50,"l":0,"b":0}
     )
-    # Add text annotation explaining the color scale
-    fig.add_annotation(
-        x=0.5, y=0.02,
-        xref="paper", yref="paper",
-        text="Color scale: Red (Negative) to Blue (Positive)",
-        showarrow=False,
-        font=dict(size=14)
     )
     return fig
-# Function to plot sentiment trends by year (from original code)
-def plot_sentiment_trends_by_year(df, newspaper_name):
-    # Set the style to a clean, modern look
-    plt.style.use('seaborn-v0_8-whitegrid')
-    # Custom font settings
-    plt.rcParams['font.family'] = 'sans-serif'
-    plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
-    plt.rcParams['font.size'] = 11
-    plt.rcParams['axes.titlesize'] = 16
-    plt.rcParams['axes.labelsize'] = 12
-    # Convert date to datetime and extract year
-    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
-    # Ensure only known sentiments are used
     valid_sentiments = {"positive", "negative", "neutral"}
-    df['sentiment'] = df['sentiment_value'].apply(lambda x: x.lower() if isinstance(x, str) and x.lower() in valid_sentiments else "neutral")
-    # Count the number of articles per sentiment per year
     sentiment_counts = df.groupby(['year', 'sentiment']).size().reset_index(name='count')
-    # Calculate total articles per year
     year_totals = sentiment_counts.groupby('year')['count'].sum().reset_index(name='total')
-    # Merge the counts with totals to calculate percentages
     sentiment_counts = sentiment_counts.merge(year_totals, on='year')
     sentiment_counts['percentage'] = sentiment_counts['count'] / sentiment_counts['total'] * 100
-    # Pivot the data for easier plotting
-    sentiment_pivot = sentiment_counts.pivot(index='year', columns='sentiment', values='percentage').fillna(0)
-    # Ensure all sentiment columns exist
     for sentiment in ['negative', 'neutral', 'positive']:
         if sentiment not in sentiment_pivot.columns:
             sentiment_pivot[sentiment] = 0
-    # Sort by year (ascending for timeline)
-    sentiment_pivot = sentiment_pivot.sort_index()
-    # Create the figure and axis
-    fig, ax = plt.subplots(figsize=(12, 7))
-    # Define custom colors
     colors = {
-        'negative': '#5D3FD3',  # rich purple
-        'neutral': '#9D4EDD',   # lavender purple
-        'positive': '#00897B'   # teal green
     }
-    # Plot lines for each sentiment
     for sentiment in ['negative', 'neutral', 'positive']:
         ax.plot(
             sentiment_pivot.index,
@@ -389,664 +379,431 @@ def plot_sentiment_trends_by_year(df, newspaper_name):
             linewidth=2.5,
             label=sentiment.capitalize(),
             color=colors[sentiment],
-            markersize=8,
-            markeredgecolor='white',
-            markeredgewidth=1.5
         )
-    # Add article counts as annotations
-    for year in sentiment_pivot.index:
-        total = year_totals.loc[year_totals['year'] == year, 'total'].values[0]
-        ax.annotate(
-            f"{total:,}",
-            xy=(year, sentiment_pivot.loc[year, 'negative'] - 5),
-            xytext=(0, -25),
-            textcoords='offset points',
-            ha='center',
-            fontsize=9,
-            color='gray'
-        )
-    # Add a text indicating what the numbers represent
-    ax.text(
-        sentiment_pivot.index[0],
-        -12,
-        "Article Count",
-        fontsize=9,
-        color='gray',
-        ha='center'
-    )
-    # Set x-axis to only show years (integers)
-    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
-    # Set y-axis limits and labels
-    ax.set_ylim(0, max(100, sentiment_pivot.max().max() * 1.1))
     ax.set_ylabel('Percentage (%)', fontweight='bold')
     ax.set_xlabel('Year', fontweight='bold')
-    # Add title
-    ax.set_title(f'{newspaper_name} - Sentiment Trends by Year', fontweight='bold', pad=20)
-    # Customize legend
-    legend = ax.legend(
-        loc='upper right',
-        frameon=True,
-        framealpha=0.95,
-        edgecolor='lightgray',
-        title='Sentiment'
-    )
-    legend.get_title().set_fontweight('bold')
-    # Remove spines for cleaner look
-    ax.spines['top'].set_visible(False)
-    ax.spines['right'].set_visible(False)
-    ax.spines['left'].set_linewidth(0.5)
-    ax.spines['bottom'].set_linewidth(0.5)
-    # Add grid lines
-    ax.grid(axis='y', linestyle='--', alpha=0.3, color='gray')
-    # Add subtle background color
-    fig.patch.set_facecolor('#F8F9FA')
-    ax.set_facecolor('#F8F9FA')
-    # Add percentage labels at the end of each line
-    last_year = sentiment_pivot.index[-1]
-    for sentiment in ['negative', 'neutral', 'positive']:
-        if last_year in sentiment_pivot.index:  # Check if the last_year exists in the index
-            last_value = sentiment_pivot.loc[last_year, sentiment]
-            ax.annotate(
-                f"{last_value:.1f}%",
-                xy=(last_year, last_value),
-                xytext=(5, 0),
-                textcoords='offset points',
-                fontweight='bold',
-                color=colors[sentiment]
-            )
-    # Add a data source footer
-    plt.figtext(
-        0.01, 0.01,
-        f"Data source: Analysis of {df.shape[0]:,} articles",
-        fontsize=8,
-        color='gray'
-    )
-    # Add horizontal line at 50% for reference
-    ax.axhline(y=50, color='gray', linestyle='-', alpha=0.2)
-    ax.text(sentiment_pivot.index[0], 51, "50%", fontsize=8, color='gray')
-    # Adjust layout
-    plt.tight_layout(pad=2.0)
-    return fig
-# Function to plot article volume by year (from original code)
-def plot_article_volume_by_year(df, newspaper_name):
-    # Set the style to a clean, modern look
-    plt.style.use('seaborn-v0_8-whitegrid')
-    # Custom font settings
-    plt.rcParams['font.family'] = 'sans-serif'
-    plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
-    # Convert date to datetime and extract year
-    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
-    # Count articles per year
-    article_counts = df.groupby('year').size().reset_index(name='count')
-    # Create the figure and axis
-    fig, ax = plt.subplots(figsize=(12, 5))
-    # Plot line for article count
-    ax.plot(
-        article_counts['year'],
-        article_counts['count'],
-        marker='o',
-        linewidth=2.5,
-        color='#3949AB',
-        markersize=8,
-        markeredgecolor='white',
-        markeredgewidth=1.5
-    )
-    # Fill area under the line
-    ax.fill_between(
-        article_counts['year'],
-        article_counts['count'],
-        alpha=0.2,
-        color='#3949AB'
-    )
-    # Set x-axis to only show years (integers)
-    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
-    # Add count labels above each point
-    for year, count in zip(article_counts['year'], article_counts['count']):
-        ax.annotate(
-            f"{count:,}",
-            xy=(year, count),
-            xytext=(0, 10),
-            textcoords='offset points',
-            ha='center',
-            fontweight='bold',
-            fontsize=10
-        )
-    # Set axis labels
-    ax.set_ylabel('Number of Articles', fontweight='bold')
-    ax.set_xlabel('Year', fontweight='bold')
-    # Add title
-    ax.set_title(f'{newspaper_name} - Article Volume by Year', fontweight='bold', pad=20)
-    # Remove spines for cleaner look
-    ax.spines['top'].set_visible(False)
-    ax.spines['right'].set_visible(False)
-    # Add grid lines
-    ax.grid(axis='y', linestyle='--', alpha=0.3, color='gray')
-    # Add subtle background color
-    fig.patch.set_facecolor('#F8F9FA')
-    ax.set_facecolor('#F8F9FA')
-    # Adjust layout
     plt.tight_layout()
     return fig
-# Function to create a comparison bar chart of newspapers
-def create_newspaper_comparison(dataframes, newspaper_names):
-    # Prepare data for the comparison
-    comparison_data = []
-    for i, df in enumerate(dataframes):
-        if df is not None:
-            # Ensure sentiment column exists and is properly formatted
-            if 'sentiment_value' in df.columns:
-                df['sentiment'] = df['sentiment_value'].apply(
-                    lambda x: x.lower() if isinstance(x, str) and x.lower() in ["positive", "negative", "neutral"] else "neutral"
-                )
-            # Count articles by sentiment
-            sentiment_counts = df['sentiment'].value_counts().to_dict()
-            # Add counts to comparison data
-            for sentiment in ['positive', 'negative', 'neutral']:
-                comparison_data.append({
-                    'Newspaper': newspaper_names[i],
-                    'Sentiment': sentiment.capitalize(),
-                    'Count': sentiment_counts.get(sentiment, 0)
-                })
-    # Create DataFrame from comparison data
-    comparison_df = pd.DataFrame(comparison_data)
-    # Create grouped bar chart
-    fig = px.bar(
-        comparison_df,
-        x='Newspaper',
-        y='Count',
-        color='Sentiment',
-        barmode='group',
-        title='Sentiment Distribution Across Newspapers',
         color_discrete_map={
-            'Positive': '#00897B',
-            'Neutral': '#9D4EDD',
-            'Negative': '#5D3FD3'
         }
     )
-    fig.update_layout(
-        height=500,
-        legend_title='Sentiment',
-        xaxis_title='',
-        yaxis_title='Number of Articles'
-    )
     return fig
-# Function to create a top locations bar chart
-def create_top_locations_chart(df, newspaper_name):
-    """Create a bar chart of the top mentioned locations"""
-    if 'extracted_location' not in df.columns or df['extracted_location'].isna().all():
-        # Return an empty figure
-        fig = go.Figure()
-        fig.add_annotation(
-            text="No location data available",
-            showarrow=False,
-            font=dict(size=20)
-        )
-        fig.update_layout(height=400)
-        return fig
-    # Count articles by location
-    location_counts = df['extracted_location'].value_counts().reset_index()
-    location_counts.columns = ['Location', 'Article Count']
-    # Get top 15 locations
-    top_locations = location_counts.head(15)
-    # Create bar chart
-    fig = px.bar(
-        top_locations,
-        y='Location',
-        x='Article Count',
-        title=f'Top 15 Locations Mentioned in {newspaper_name} Articles',
-        orientation='h',
-        color='Article Count',
-        color_continuous_scale='Viridis'
-    )
-    fig.update_layout(
-        height=500,
-        yaxis={'categoryorder':'total ascending'}
-    )
-    return fig
-def create_top_politicians_chart(df, newspaper_name):
-    """Create a bar chart of the top mentioned locations"""
-    if 'Politician' not in df.columns or df['Politician'].isna().all():
-        print(df.head())
-        # Return an empty figure
-        fig = go.Figure()
-        fig.add_annotation(
-            text="No politician data  available",
-            showarrow=False,
-            font=dict(size=20)
-        )
-        fig.update_layout(height=400)
-        return fig
-    # Get top 15 locations
-    top_locations = df
-    # Create bar chart
-    fig = px.bar(
-        top_locations,
-        y='Politician',
-        x='Mentions',
-        title=f'Top 10 Politicians Mentioned in {newspaper_name} Articles',
-        orientation='h',
-        color='Mentions',
-        color_continuous_scale='Viridis'
-    )
-    fig.update_layout(
-        height=500,
-        yaxis={'categoryorder':'total ascending'}
-    )
-    return fig
-# Function to load data
-@st.cache_data
-def load_data(newspaper_name):
-    try:
-        # Try to load CSV file for the newspaper
-        file_path = f"data/{newspaper_name.lower().replace(' ', '_')}_articles.csv"
-        df = pd.read_csv(file_path)
-        # Check if required columns exist
-        required_columns = ['date', 'sentiment_value']
-        for col in required_columns:
-            if col not in df.columns:
-                st.error(f"Required column '{col}' not found in {file_path}")
-                return None
-        # Add sentiment score column if not exists
-        if 'sentiment_score' not in df.columns:
-            # Create a numeric sentiment score based on sentiment_value
-            sentiment_map = {
-                'positive': 1.0,
-                'negative': -1.0,
-                'neutral': 0.0
-            }
-            df['sentiment_score'] = df['sentiment_value'].str.lower().map(sentiment_map).fillna(0)
-        return df
-    except Exception as e:
-        st.error(f"Error loading data for {newspaper_name}: {str(e)}")
-        return None
-# Entry page
-def show_entry_page():
-    st.markdown('<div class="entry-page">', unsafe_allow_html=True)
-    st.markdown('<div class="entry-container">', unsafe_allow_html=True)
-    st.markdown('<h1 class="main-header">Sentiment Analysis of RSS Articles</h1>', unsafe_allow_html=True)
-    st.markdown("""
-    <p style="font-size: 1.2rem; margin-bottom: 2rem;">
-        Analyze the sentiments of news articles from various RSS feeds across different newspapers.
-        Discover trends, patterns, and insights through interactive visualizations.
-    </p>
-    """, unsafe_allow_html=True)
-    st.markdown('<div class="button-container">', unsafe_allow_html=True)
-    if st.button("Explore Analysis", key="entry_explore", use_container_width=True):
-        st.session_state.show_entry = False
-    st.markdown('</div>', unsafe_allow_html=True)
-    st.markdown('</div>', unsafe_allow_html=True)
-    st.markdown('</div>', unsafe_allow_html=True)
-# Home page with newspaper cards
-def show_home_page():
-    st.markdown('<h1 class="main-header">RSS Articles Sentiment Analysis Dashboard</h1>', unsafe_allow_html=True)
-    # List of newspapers
-    newspapers = ["Print", "Scroll", "Sentinel", "NDTV"]
-    # Load data for all newspapers
-    dataframes = []
-    for newspaper in newspapers:
-        df = load_data(newspaper)
-        dataframes.append(df)
-    # Show comparison chart of all newspapers
-    st.markdown('<h2 class="sub-header">Newspaper Sentiment Comparison</h2>', unsafe_allow_html=True)
-    comparison_fig = create_newspaper_comparison(dataframes, newspapers)
-    st.plotly_chart(comparison_fig, use_container_width=True)
-    # Create a 2x2 grid for newspaper cards
-    col1, col2 = st.columns(2)
-    col3, col4 = st.columns(2)
-    cols = [col1, col2, col3, col4]
-    # Create a card for each newspaper
-    for i, newspaper in enumerate(newspapers):
-        df = dataframes[i]
-        with cols[i]:
-            st.markdown(f'<div class="newspaper-card">', unsafe_allow_html=True)
-            st.markdown(f'<div class="newspaper-title">{newspaper}</div>', unsafe_allow_html=True)
-            # Only show counts if data is available
-            if df is not None:
-                # Count articles by sentiment
-                if 'sentiment_value' in df.columns:
-                    sentiment_counts = df['sentiment_value'].str.lower().value_counts()
-                    # Create three columns for sentiment counts
-                    pos_col, neu_col, neg_col = st.columns(3)
-                    with pos_col:
-                        st.metric("Positive", sentiment_counts.get('positive', 0))
-                    with neu_col:
-                        st.metric("Neutral", sentiment_counts.get('neutral', 0))
-                    with neg_col:
-                        st.metric("Negative", sentiment_counts.get('negative', 0))
-                else:
-                    st.write("Sentiment data not available")
-            else:
-                st.write("Data not available")
-            # Add button to view detailed analysis
-            if st.button(f"View Analysis", key=f"view_{newspaper}"):
-                st.session_state.current_newspaper = newspaper
-                st.session_state.show_newspaper_analysis = True
-                st.rerun()
-            st.markdown('</div>', unsafe_allow_html=True)
-# Function to process all newspapers with location extraction
-@st.cache_data
-def preprocess_newspapers_with_locations(newspapers):
-    # Load GeoJSON for India map
-    india_geojson = load_india_geojson()
-    if india_geojson is None:
-        st.error("Could not load India GeoJSON. Please check your internet connection.")
-        return {}
-    processed_data = {}
-    for newspaper in newspapers:
-        # Load the raw data
-        df = load_data(newspaper)
-        if df is not None and 'desc' in df.columns:
-            # Extract locations if not already done
-            if 'extracted_location' not in df.columns:
-                df = extract_locations_from_descriptions(df, 'desc')
-            # Analyze sentiment by state
-            sentiment_by_state = analyze_sentiment_by_state(df)
-            processed_data[newspaper] = {
-                'df': df,
-                'sentiment_by_state': sentiment_by_state,
-                'india_geojson': india_geojson
-            }
-        else:
-            if df is not None:
-                processed_data[newspaper] = {
-                    'df': df,
-                    'error': "Description column 'desc' not found"
-                }
-            else:
-                processed_data[newspaper] = {
-                    'error': f"Could not load data for {newspaper}"
-                }
-    return processed_data
-# Newspaper analysis page
-# Newspaper analysis page
-def show_newspaper_analysis():
-    # Add back button
-    if st.button("← Back to Home"):
-        st.session_state.show_newspaper_analysis = False
-        st.rerun()
-    newspaper = st.session_state.current_newspaper
-    st.markdown(f'<h1 class="main-header">{newspaper} - Sentiment Analysis</h1>', unsafe_allow_html=True)
-    # Load data for this newspaper
-    df = load_data(newspaper)
-    if df is not None:
-        # Get or preprocess location data
-        if 'processed_data' not in st.session_state:
-            with st.spinner("Processing newspaper data..."):
-                st.session_state.processed_data = preprocess_newspapers_with_locations(["Print", "Scroll", "Sentinel", "NDTV"])
-        processed_data = st.session_state.processed_data.get(newspaper, {})
-        # Display article count and date range
-        article_count = len(df)
-        # Convert date column to datetime to get min and max dates
-        df['date'] = pd.to_datetime(df['date'], errors='coerce')
-        min_date = df['date'].min().strftime('%d %b, %Y') if not pd.isna(df['date'].min()) else "Unknown"
-        max_date = df['date'].max().strftime('%d %b, %Y') if not pd.isna(df['date'].max()) else "Unknown"
-        # Create metrics row
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.metric("Total Articles", f"{article_count:,}")
-        with col2:
-            st.metric("First Article", min_date)
-        with col3:
-            st.metric("Latest Article", max_date)
-        # Show sentiment trends by year
-        st.markdown('<h2 class="sub-header">Sentiment Trends Over Time</h2>', unsafe_allow_html=True)
-        try:
-            sentiment_trend_fig = plot_sentiment_trends_by_year(df, newspaper)
-            st.pyplot(sentiment_trend_fig)
-        except Exception as e:
-            st.error(f"Error generating sentiment trends chart: {str(e)}")
-        # Show article volume by year
-        st.markdown('<h2 class="sub-header">Article Volume by Year</h2>', unsafe_allow_html=True)
-        try:
-            volume_fig = plot_article_volume_by_year(df, newspaper)
-            st.pyplot(volume_fig)
-        except Exception as e:
-            st.error(f"Error generating article volume chart: {str(e)}")
-        # Create two columns for location analysis
-        col1, col2 = st.columns(2)
-        with col1:
-            # Top mentioned locations
-            st.markdown('<h2 class="sub-header">Top Mentioned Locations</h2>', unsafe_allow_html=True)
-            if 'extracted_location' in df.columns:
-                top_locations_fig = create_top_locations_chart(df, newspaper)
-                st.plotly_chart(top_locations_fig, use_container_width=True)
-            else:
-                if 'desc' in df.columns:
-                    st.info("Location data not yet extracted. Click the button below to extract locations.")
-                    if st.button("Extract Locations", key=f"extract_{newspaper}"):
-                        with st.spinner("Extracting locations..."):
-                            df = extract_locations_from_descriptions(df)
-                            # Update the processed data
-                            processed_data['df'] = df
-                            sentiment_by_state = analyze_sentiment_by_state(df)
-                            processed_data['sentiment_by_state'] = sentiment_by_state
-                            st.session_state.processed_data[newspaper] = processed_data
-                            st.experimental_rerun()
-                else:
-                    st.warning("Description column not found. Cannot extract locations.")
-            # Top mentioned politicians - Now placed below the locations graph in the same column
-            st.markdown('<h2 class="sub-header">Top Mentioned Politicians</h2>', unsafe_allow_html=True)
-            if 'desc' in df.columns:
-                # Check if rss_personalities is defined, if not you'll need to define it
-                if 'rss_personalities' not in locals() and 'rss_personalities' not in globals():
-                    # Define your list of politicians here or import it
-                    rss_personalities = ["Narendra Modi", "Amit Shah", "Rajnath Singh", "Mohan Bhagwat", "Yogi Adityanath", "Nitin Gadkari"]
-                top_politicians = pussy.count_politicians_in_descriptions(df, rss_personalities).head(10)
-                top_politicians_fig = create_top_politicians_chart(top_politicians, newspaper)
-                st.plotly_chart(top_politicians_fig, use_container_width=True)
-            else:
-                st.warning("Description column not found. Cannot analyze politicians.")
-        with col2:
-            # Sentiment by state map
-            st.markdown('<h2 class="sub-header">Sentiment by State</h2>', unsafe_allow_html=True)
-            sentiment_by_state = processed_data.get('sentiment_by_state')
-            india_geojson = processed_data.get('india_geojson')
-            if sentiment_by_state is not None and india_geojson is not None and not sentiment_by_state.empty:
-                try:
-                    map_fig = create_india_sentiment_map(sentiment_by_state, india_geojson, newspaper)
-                    st.plotly_chart(map_fig, use_container_width=True)
-                except Exception as e:
-                    st.error(f"Error creating sentiment map: {str(e)}")
-            else:
-                if 'error' in processed_data:
-                    st.warning(processed_data['error'])
                 else:
-                    st.info("Sentiment data not available. Extract locations first.")
-        # Add section for detailed article analysis
-        st.markdown('<h2 class="sub-header">Article Analysis</h2>', unsafe_allow_html=True)
-        # Add filters for article display
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            # Sentiment filter
-            sentiment_options = ["All"] + sorted(df['sentiment_value'].unique().tolist())
-            selected_sentiment = st.selectbox("Filter by Sentiment", sentiment_options)
-        with col2:
-            # Year filter
-            year_options = ["All"] + sorted(df['date'].dt.year.dropna().unique().astype(int).tolist())
-            selected_year = st.selectbox("Filter by Year", year_options)
-        with col3:
-            # Location filter (if available)
-            location_options = ["All"]
-            if 'extracted_location' in df.columns:
-                location_options += sorted(df['extracted_location'].dropna().unique().tolist())
-            selected_location = st.selectbox("Filter by Location", location_options)
-        # Apply filters
-        filtered_df = df.copy()
-        if selected_sentiment != "All":
-            filtered_df = filtered_df[filtered_df['sentiment_value'] == selected_sentiment]
-        if selected_year != "All":
-            filtered_df = filtered_df[filtered_df['date'].dt.year == selected_year]
-        if selected_location != "All" and 'extracted_location' in filtered_df.columns:
-            filtered_df = filtered_df[filtered_df['extracted_location'] == selected_location]
-        # Show article count after filtering
-        st.write(f"Displaying {len(filtered_df)} articles based on your filters.")
-        # Display articles in an expandable format
-        if not filtered_df.empty:
-            for index, row in filtered_df.head(50).iterrows():
-                title = row.get('title', 'Untitled')
-                date = row['date'].strftime('%d %b, %Y') if pd.notna(row['date']) else 'Unknown date'
-                sentiment = row.get('sentiment_value', 'Unknown sentiment')
-                description = row.get('desc', 'No description available')
-                link = row.get('link', 'No link available')
-                # Format sentiment with color
-                sentiment_color = {
-                    'positive': 'green',
-                    'neutral': 'gray',
-                    'negative': 'red'
-                }.get(sentiment.lower(), 'gray')
-                # Create expandable card for each article
-                with st.expander(f"{title} - {date}"):
-                    st.markdown(f"**Sentiment:** <span style='color:{sentiment_color}'>{sentiment.capitalize()}</span>", unsafe_allow_html=True)
-                    if 'extracted_location' in row and pd.notna(row['extracted_location']):
-                        st.markdown(f"**Location:** {row['extracted_location']}")
-                    st.markdown("**Description:**")
-                    st.markdown(f"{description}")
-                    st.markdown(f"**Link:**  {link}")
-            if len(filtered_df) > 50:
-                st.info(f"Showing 50 out of {len(filtered_df)} articles. Apply more filters to narrow down results.")
-        else:
-            st.info("No articles match your selected filters.")
-    else:
-        st.error(f"Could not load data for {newspaper}. Please check if the data file exists.")
-# Main function
 def main():
-    # Inject the custom stylesheet via load_css() before any page content is drawn
     load_css()
-    # Seed the three navigation keys in st.session_state, only when they are not set yet
-    if 'show_entry' not in st.session_state:
-        st.session_state.show_entry = True
-    if 'show_newspaper_analysis' not in st.session_state:
-        st.session_state.show_newspaper_analysis = False
-    if 'current_newspaper' not in st.session_state:
-        st.session_state.current_newspaper = None
-    # Route on the session flags: entry page first, then newspaper analysis, otherwise the home page
-    if st.session_state.show_entry:
-        show_entry_page()
-    elif st.session_state.show_newspaper_analysis:
-        show_newspaper_analysis()
-    else:
         show_home_page()
-    # Footer: opening div, caption and closing div are emitted as three separate markdown calls
-    st.markdown('<div class="footer">', unsafe_allow_html=True)
-    st.markdown('RSS Sentiment Analysis Dashboard - Developed with Streamlit', unsafe_allow_html=True)
-    st.markdown('</div>', unsafe_allow_html=True)
 if __name__ == "__main__":
-    main()

+"""
+Unified News Scraper & Sentiment Analysis Application
+Combines scraping, processing, and visualization in one interface
+"""
 import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
 import plotly.express as px
 import plotly.graph_objects as go
 from matplotlib.ticker import MaxNLocator
+import subprocess
+import sys
 import os
+from pathlib import Path
 import time
+from datetime import datetime
+import warnings
 import json
 import requests
 import spacy
 warnings.filterwarnings('ignore')
+# Constants: URL of the GeoJSON document requested by load_india_geojson()
+INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
+# Page config: browser-tab title and icon, wide layout, sidebar expanded on first load
 st.set_page_config(
+    page_title="News Scraper & Analysis Platform",
     page_icon="📰",
     layout="wide",
     initial_sidebar_state="expanded"
 )
+# Custom CSS
 def load_css():
     st.markdown("""
     <style>
         .main-header {
+            font-size: 2.8rem !important;
             font-weight: 700 !important;
             text-align: center !important;
+            padding: 1.5rem 0 !important;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            -webkit-background-clip: text;
+            -webkit-text-fill-color: transparent;
         }
         .sub-header {
+            font-size: 1.8rem !important;
             font-weight: 600 !important;
             padding: 1rem 0 !important;
+            color: #2c3e50;
         }
+        .feature-card {
+            background: white;
+            border-radius: 15px;
+            padding: 25px;
+            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+            margin: 10px 0;
+            transition: transform 0.3s;
+        }
+        .feature-card:hover {
+            transform: translateY(-5px);
+            box-shadow: 0 8px 12px rgba(0, 0, 0, 0.15);
+        }
+        .metric-card {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             border-radius: 10px;
             padding: 20px;
+            color: white;
             text-align: center;
         }
+        .status-running {
+            background-color: #fff3cd;
+            border-left: 4px solid #ffc107;
+            padding: 15px;
+            border-radius: 5px;
         }
+        .status-success {
+            background-color: #d4edda;
+            border-left: 4px solid #28a745;
+            padding: 15px;
+            border-radius: 5px;
         }
+        .status-error {
+            background-color: #f8d7da;
+            border-left: 4px solid #dc3545;
+            padding: 15px;
+            border-radius: 5px;
         }
+        .stTabs [data-baseweb="tab-list"] {
+            gap: 24px;
         }
+        .stTabs [data-baseweb="tab"] {
+            padding: 10px 20px;
+            background-color: #f8f9fa;
+            border-radius: 8px 8px 0 0;
         }
     </style>
     """, unsafe_allow_html=True)
+# Initialize session state
def init_session_state():
    """Populate ``st.session_state`` with the app's keys when they are absent.

    Keys that already exist are left untouched, so values survive reruns.
    """
    initial_values = (
        ('scraped_data', {}),
        ('scraping_active', False),
        ('processing_status', {}),
        ('selected_dataset', None),
    )
    for state_key, initial in initial_values:
        if state_key in st.session_state:
            continue
        st.session_state[state_key] = initial
+# Setup directories
def setup_directories():
    """Create the ``output``, ``data`` and ``temp`` folders in the working directory.

    Existing folders are accepted silently (``exist_ok=True``).
    """
    for folder in map(Path, ('output', 'data', 'temp')):
        folder.mkdir(exist_ok=True)
+# Load India GeoJSON
 @st.cache_data
 def load_india_geojson():
     """Load India GeoJSON data for mapping"""
     try:
+        response = requests.get(INDIA_GEOJSON_URL, timeout=10)
         return json.loads(response.text)
     except Exception as e:
+        st.warning(f"Could not load India map: {e}")
+        return None
+# Load spaCy model
 @st.cache_resource
 def load_spacy_model():
     try:
         return spacy.load("en_core_web_sm")
     except OSError:
+        st.info("Downloading spaCy model...")
         import subprocess
         subprocess.call(["python", "-m", "spacy", "download", "en_core_web_sm"])
         return spacy.load("en_core_web_sm")
+# State mapping
 def get_state_mapping():
     return {
+        'andhra pradesh': 'Andhra Pradesh', 'arunachal pradesh': 'Arunachal Pradesh',
+        'assam': 'Assam', 'bihar': 'Bihar', 'chhattisgarh': 'Chhattisgarh',
+        'goa': 'Goa', 'gujarat': 'Gujarat', 'haryana': 'Haryana',
+        'himachal pradesh': 'Himachal Pradesh', 'jharkhand': 'Jharkhand',
+        'karnataka': 'Karnataka', 'kerala': 'Kerala', 'madhya pradesh': 'Madhya Pradesh',
+        'maharashtra': 'Maharashtra', 'manipur': 'Manipur', 'meghalaya': 'Meghalaya',
+        'mizoram': 'Mizoram', 'nagaland': 'Nagaland', 'odisha': 'Odisha',
+        'punjab': 'Punjab', 'rajasthan': 'Rajasthan', 'sikkim': 'Sikkim',
+        'tamil nadu': 'Tamil Nadu', 'telangana': 'Telangana', 'tripura': 'Tripura',
+        'uttar pradesh': 'Uttar Pradesh', 'uttarakhand': 'Uttarakhand',
+        'west bengal': 'West Bengal', 'delhi': 'Delhi', 'new delhi': 'Delhi',
+        'jammu and kashmir': 'Jammu and Kashmir', 'j&k': 'Jammu and Kashmir',
+        'ladakh': 'Ladakh', 'chandigarh': 'Chandigarh', 'puducherry': 'Puducherry',
+        'mumbai': 'Maharashtra', 'kolkata': 'West Bengal', 'chennai': 'Tamil Nadu',
+        'bangalore': 'Karnataka', 'bengaluru': 'Karnataka', 'hyderabad': 'Telangana',
+        'ahmedabad': 'Gujarat', 'pune': 'Maharashtra', 'jaipur': 'Rajasthan',
     }
+# Extract locations from text
 @st.cache_data
 def extract_locations_from_descriptions(df, description_column='desc'):
+    """Extract state names from description using spaCy"""
+    nlp = load_spacy_model()
+    state_mapping = get_state_mapping()
+    locations = []
+    progress_bar = st.progress(0)
+    for idx, (_, row) in enumerate(df.iterrows()):
+        if idx % 100 == 0:
+            progress_bar.progress(min(idx / len(df), 1.0))
+        if pd.isna(row.get(description_column, None)):
+            locations.append(None)
+            continue
+        description = str(row[description_column]).lower()
+        doc = nlp(description)
+        found_locations = []
+        for ent in doc.ents:
+            if ent.label_ in ["GPE", "LOC"]:
+                loc_name = ent.text.lower()
+                if loc_name in state_mapping:
+                    found_locations.append(state_mapping[loc_name])
+        for state_var, standard_name in state_mapping.items():
+            if state_var in description and standard_name not in found_locations:
+                found_locations.append(standard_name)
+        locations.append(found_locations[0] if found_locations else None)
+    progress_bar.progress(1.0)
+    df = df.copy()
+    df['extracted_location'] = locations
+    return df
+# Analyze sentiment by state
 def analyze_sentiment_by_state(df, sentiment_column='sentiment_score'):
+    """Analyze sentiment by state"""
     df_with_locations = df.dropna(subset=['extracted_location', sentiment_column])
     if len(df_with_locations) == 0:
         return None
+    sentiment_by_state = df_with_locations.groupby('extracted_location')[sentiment_column].agg([
+        ('avg_sentiment', 'mean'),
+        ('count', 'count')
+    ]).reset_index()
     return sentiment_by_state
+# Create India sentiment map
def create_india_sentiment_map(sentiment_data, geojson_data, title):
    """Create choropleth map of India showing sentiment by state.

    Args:
        sentiment_data: DataFrame with ``extracted_location``,
            ``avg_sentiment`` and ``count`` columns.
        geojson_data: Parsed GeoJSON whose features carry the state name in
            ``properties.NAME_1``.
        title: Figure title.

    Returns:
        A Plotly figure, or ``None`` when either input is missing or there is
        no sentiment value to colour by.
    """
    if sentiment_data is None or geojson_data is None:
        return None
    if len(sentiment_data) == 0:
        return None

    min_sentiment = sentiment_data['avg_sentiment'].min()
    max_sentiment = sentiment_data['avg_sentiment'].max()
    if pd.isna(min_sentiment) or pd.isna(max_sentiment):
        # A NaN colour range would produce a blank scale.
        return None

    if min_sentiment < 0 and max_sentiment > 0:
        # Symmetric range keeps zero at the midpoint of the diverging scale.
        abs_max = max(abs(min_sentiment), abs(max_sentiment))
        color_range = [-abs_max, abs_max]
    else:
        color_range = [min_sentiment - 0.1, max_sentiment + 0.1]

    # NOTE(review): location values must equal the GeoJSON NAME_1 spellings
    # exactly; confirm the names used by the GeoJSON source match the
    # canonical names produced by the location extraction.
    fig = px.choropleth_mapbox(
        sentiment_data,
        geojson=geojson_data,
        locations='extracted_location',
        featureidkey="properties.NAME_1",
        color='avg_sentiment',
        color_continuous_scale="RdBu",
        range_color=color_range,
        # Token-free base map; the default style needs a Mapbox access token.
        mapbox_style="carto-positron",
        # Frame the view on India; the default is centred on (0, 0).
        center={"lat": 22.5, "lon": 80.0},
        zoom=3,
        opacity=0.7,
        hover_data=['count'],
        labels={
            'avg_sentiment': 'Avg Sentiment',
            'extracted_location': 'State',
            'count': 'Articles'
        }
    )
    fig.update_layout(
        title=dict(text=title, font=dict(size=20), x=0.5),
        height=600,
        margin={"r": 0, "t": 50, "l": 0, "b": 0}
    )
    return fig
+# Top locations chart
def create_top_locations_chart(df, title):
    """Build a horizontal bar chart of the 15 most frequent extracted locations.

    Returns ``None`` when ``df`` has no ``extracted_location`` column or the
    column holds only missing values.
    """
    has_column = 'extracted_location' in df.columns
    if not has_column or df['extracted_location'].isna().all():
        return None

    top_counts = df['extracted_location'].value_counts().head(15)
    location_counts = top_counts.reset_index()
    location_counts.columns = ['Location', 'Count']

    chart_options = dict(
        y='Location',
        x='Count',
        title=title,
        orientation='h',
        color='Count',
        color_continuous_scale='Viridis',
    )
    fig = px.bar(location_counts, **chart_options)
    # Longest bar on top.
    fig.update_layout(height=500, yaxis={'categoryorder': 'total ascending'})
    return fig
+# Discover datasets
# A short TTL lets CSV files written after the first call (for example by a
# scraper run in the same session) show up; an unbounded cache on a function
# with no arguments would return the first listing forever.
@st.cache_data(ttl=30)
def discover_datasets():
    """Find CSV datasets in the ``data`` and ``output`` directories.

    Returns:
        dict mapping a display name to the CSV path (as ``str``). The display
        name is the file stem with ``_articles`` removed, underscores turned
        into spaces, and title-cased. When both directories contain a file
        yielding the same display name, the one in ``output`` wins.
    """
    datasets = {}
    for directory in (Path('data'), Path('output')):
        if not directory.is_dir():
            continue
        # Sorted so the order shown in the UI is stable between runs.
        for csv_file in sorted(directory.glob('*.csv')):
            name = csv_file.stem.replace('_articles', '').replace('_', ' ').title()
            datasets[name] = str(csv_file)
    return datasets
+# Load data
@st.cache_data
def load_data(file_path):
    """Read an articles CSV and normalise its date and sentiment columns.

    Normalisation performed:
      * ``date``: parsed with ``pd.to_datetime(errors='coerce')`` from the
        ``date`` column if present, otherwise from the first column whose
        name contains "date".
      * ``sentiment_value``: if absent, copied from the first column whose
        name contains "sentiment" (the numeric ``sentiment_score`` column is
        never used as a label source).
      * ``sentiment_score``: if absent, derived from ``sentiment_value`` as
        positive=1.0, negative=-1.0, anything else=0.0.

    Args:
        file_path: Path of the CSV file.

    Returns:
        The normalised DataFrame, or ``None`` if the file could not be
        loaded (an error is shown in the UI).
    """
    try:
        df = pd.read_csv(file_path)

        # Prefer an exact "date" column; otherwise a column such as
        # "update_date" appearing earlier would overwrite it.
        if 'date' in df.columns:
            date_source = 'date'
        else:
            date_cols = [col for col in df.columns if 'date' in col.lower()]
            date_source = date_cols[0] if date_cols else None
        if date_source is not None:
            df['date'] = pd.to_datetime(df[date_source], errors='coerce')

        if 'sentiment_value' not in df.columns:
            label_cols = [
                col for col in df.columns
                if 'sentiment' in col.lower() and col != 'sentiment_score'
            ]
            if label_cols:
                df['sentiment_value'] = df[label_cols[0]]

        if 'sentiment_score' not in df.columns and 'sentiment_value' in df.columns:
            sentiment_map = {'positive': 1.0, 'negative': -1.0, 'neutral': 0.0}
            # astype(str) keeps this working when the label column was parsed
            # as numbers or is entirely empty (the .str accessor would raise).
            labels = df['sentiment_value'].astype(str).str.strip().str.lower()
            df['sentiment_score'] = labels.map(sentiment_map).fillna(0)
        return df
    except Exception as e:
        # UI boundary: report the problem instead of crashing the page.
        st.error(f"Error loading data: {str(e)}")
        return None
+# Run scraper
def run_scraper_async(source, topic, workers, interval):
    """Start ``main.py`` as a child process and return its ``Popen`` handle.

    The child runs with the current interpreter. Its stderr is merged into
    stdout so a reader that only consumes ``process.stdout`` sees everything
    and the child can never block on a full, unread stderr pipe.

    Args:
        source: Value passed as ``--source``.
        topic: Value passed as ``--topic``.
        workers: Value passed as ``--workers`` (converted with ``str``).
        interval: Value passed as ``--interval`` (converted with ``str``).

    Returns:
        The running ``subprocess.Popen`` object (text mode, line-buffered),
        or ``None`` if the process could not be started.
    """
    cmd = [
        # -u: unbuffered child output, so lines arrive while the scraper is
        # still running rather than in one block at exit.
        sys.executable, "-u", "main.py",
        "--source", str(source),
        "--topic", str(topic),
        "--workers", str(workers),
        "--interval", str(interval)
    ]
    try:
        return subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )
    except (OSError, ValueError):
        # Launch failure (e.g. interpreter not executable, invalid argument);
        # callers check for None.
        return None
+# Plotting functions
def plot_sentiment_trends(df, title):
    """Plot the yearly share of negative, neutral and positive articles.

    Labels in ``sentiment_value`` that are not one of positive/negative/
    neutral (case-insensitive) are counted as neutral. Rows without a valid
    date are ignored. The input DataFrame is not modified.

    Args:
        df: DataFrame with a datetime ``date`` column and a
            ``sentiment_value`` column.
        title: Chart title.

    Returns:
        A matplotlib ``Figure``, or ``None`` when a required column is
        missing or no row has a valid date.
    """
    if 'date' not in df.columns or 'sentiment_value' not in df.columns:
        return None

    sentiment_order = ('negative', 'neutral', 'positive')
    labels = df['sentiment_value'].apply(
        lambda x: x.lower() if isinstance(x, str) and x.lower() in sentiment_order else "neutral"
    )
    # Work on a separate frame so no helper columns leak into the caller's data.
    data = pd.DataFrame({'year': df['date'].dt.year, 'sentiment': labels})
    data = data.dropna(subset=['year'])
    if data.empty:
        return None
    data['year'] = data['year'].astype(int)

    counts = data.groupby(['year', 'sentiment']).size().unstack(fill_value=0)
    sentiment_pivot = (
        counts.div(counts.sum(axis=1), axis=0)
        .mul(100)
        .reindex(columns=list(sentiment_order), fill_value=0)
    )

    colors = {
        'negative': '#e74c3c',
        'neutral': '#95a5a6',
        'positive': '#2ecc71'
    }
    # Scoped style: avoids changing the global matplotlib style for other figures.
    with plt.style.context('seaborn-v0_8-whitegrid'):
        fig, ax = plt.subplots(figsize=(12, 6))
        for sentiment in sentiment_order:
            ax.plot(
                sentiment_pivot.index,
                sentiment_pivot[sentiment],  # y-values: percentage for this label
                marker='o',
                linewidth=2.5,
                label=sentiment.capitalize(),
                color=colors[sentiment],
                markersize=7
            )
        ax.set_ylabel('Percentage (%)', fontweight='bold')
        ax.set_xlabel('Year', fontweight='bold')
        ax.set_title(title, fontweight='bold', pad=15)
        # Years are integers; avoid ticks such as 2020.5.
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.legend(loc='best', frameon=True)
        ax.grid(axis='y', linestyle='--', alpha=0.3)
        fig.tight_layout()
    return fig
def create_sentiment_pie(df, title):
    """Build a pie chart of article counts per sentiment label.

    Labels are compared case-insensitively and shown title-cased; missing
    labels are excluded.

    Returns:
        A Plotly figure, or ``None`` when ``df`` has no ``sentiment_value``
        column or the column holds no labels.
    """
    if 'sentiment_value' not in df.columns:
        return None

    # astype(str) keeps the .str accessor usable for non-string columns.
    sentiment_counts = (
        df['sentiment_value'].dropna().astype(str).str.lower().value_counts()
    )
    if sentiment_counts.empty:
        return None

    labels = [s.title() for s in sentiment_counts.index]
    fig = px.pie(
        values=sentiment_counts.values,
        names=labels,
        # color_discrete_map is keyed on the values of `color`; without it
        # the fixed green/grey/red assignment below is not applied.
        color=labels,
        title=title,
        color_discrete_map={
            'Positive': '#2ecc71',
            'Neutral': '#95a5a6',
            'Negative': '#e74c3c'
        }
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    return fig
+# MAIN APP PAGES
def show_home_page():
    """Render the landing page: title, banner, feature cards and dataset counts.

    Each card is emitted as one complete HTML fragment. Opening a ``<div>``
    in one ``st.markdown`` call and closing it in another does not wrap the
    elements rendered in between, so the card CSS would never apply.
    """
    import html

    st.markdown('<h1 class="main-header">📰 News Scraper & Analysis Platform</h1>',
                unsafe_allow_html=True)
    st.markdown("""
    <div style="text-align: center; padding: 20px; background-color: #f8f9fa;
                border-radius: 10px; margin: 20px 0;">
        <h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
        <p>Scrape articles from major Indian news sources and analyze sentiment trends</p>
    </div>
    """, unsafe_allow_html=True)

    # Feature cards
    features = (
        ("🔍 Scrape", "Collect articles from TOI, NDTV, WION, and Scroll.in"),
        ("📊 Analyze", "Automatic sentiment classification and trend analysis"),
        ("📈 Visualize", "Interactive charts and geographic sentiment mapping"),
    )
    for column, (heading, blurb) in zip(st.columns(len(features)), features):
        with column:
            st.markdown(
                f'<div class="feature-card"><h3>{heading}</h3><p>{blurb}</p></div>',
                unsafe_allow_html=True,
            )

    # Quick stats for at most four datasets
    datasets = discover_datasets()
    if not datasets:
        return
    st.markdown("---")
    st.markdown("### 📊 Available Datasets")
    preview = list(datasets.items())[:4]
    for column, (name, path) in zip(st.columns(len(preview)), preview):
        with column:
            df = load_data(path)
            if df is None:
                continue
            # The name comes from a file name, so escape it before embedding in HTML.
            st.markdown(
                '<div class="metric-card">'
                f'<div>{html.escape(name)}</div>'
                f'<div style="font-size: 1.6rem; font-weight: 700;">{len(df):,} articles</div>'
                '</div>',
                unsafe_allow_html=True,
            )
def show_scraper_page():
    """Render the scraper form and, on submit, run the scraper and stream its output.

    The page blocks while the child process runs: each output line replaces
    the status text and advances the progress bar (capped at 95% until the
    process ends). On completion a success or failure message is shown; on
    failure the last 20 log lines are available in an expander.
    """
    import html

    st.markdown('<h2 class="sub-header">🔍 Article Scraper</h2>', unsafe_allow_html=True)
    col1, col2 = st.columns([2, 1])
    with col1:
        st.markdown("### Configuration")
        source_labels = {
            'toi': '📰 Times of India',
            'ndtv': '📺 NDTV',
            'wion': '🌍 WION',
            'scroll': '📜 Scroll.in'
        }
        source = st.selectbox(
            "News Source",
            options=['toi', 'ndtv', 'wion', 'scroll'],
            format_func=lambda x: source_labels[x]
        )
        topic = st.text_input("Topic", placeholder="e.g., Climate Change, Technology")
        col_a, col_b = st.columns(2)
        with col_a:
            workers = st.slider("Workers", 1, 10, 4)
        with col_b:
            interval = st.slider("Save Interval (s)", 60, 600, 300, step=60)
    with col2:
        st.markdown("### Quick Guide")
        st.info("""
        **Steps:**
        1. Select news source
        2. Enter search topic
        3. Configure settings
        4. Click Start
        5. Monitor progress
        """)
    st.markdown("---")

    if not st.button("🚀 Start Scraping", disabled=not topic, type="primary"):
        return

    with st.spinner("Initializing scraper..."):
        # One complete HTML fragment so the status-running style really wraps
        # the text; the topic is user input and must be escaped.
        st.markdown(
            '<div class="status-running">'
            f"⏳ Scraping <b>{html.escape(source.upper())}</b> "
            f"for <b>'{html.escape(topic)}'</b>..."
            '</div>',
            unsafe_allow_html=True,
        )
        progress_bar = st.progress(0)
        status_text = st.empty()

        process = run_scraper_async(source, topic, workers, interval)
        if process is None:
            # Previously a failed launch left the page without any feedback.
            st.error("❌ Could not start the scraper process.")
            return

        output_lines = []
        progress = 0
        # Iterating the pipe blocks until a line arrives and stops at EOF,
        # so there is no busy-waiting while the child is quiet.
        for raw_line in process.stdout:
            line = raw_line.strip()
            if not line:
                continue
            output_lines.append(line)
            status_text.text(line)
            progress = min(progress + 1, 95)
            progress_bar.progress(progress / 100)

        # If the process exposes a separate stderr pipe, include it in the logs.
        if process.stderr is not None:
            output_lines.extend(
                err_line.strip() for err_line in process.stderr if err_line.strip()
            )
        process.wait()
        progress_bar.progress(100)

        if process.returncode == 0:
            st.success("✅ Scraping completed successfully!")
            st.balloons()
        else:
            st.error("❌ Scraping failed. Check logs.")
            with st.expander("View Logs"):
                st.code("\n".join(output_lines[-20:]))
def _render_overview_metrics(df):
    """Show total articles, covered year range and positive/negative shares."""
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("📄 Total Articles", f"{len(df):,}")
    with col2:
        if 'date' in df.columns:
            valid_years = df['date'].dt.year.dropna()
            if valid_years.empty:
                st.metric("📅 Years", "N/A")
            else:
                # int(): years are floats when any date is missing ("2019.0").
                years = f"{int(valid_years.min())}-{int(valid_years.max())}"
                st.metric("📅 Years", years)
    if 'sentiment_value' in df.columns:
        # astype(str) keeps the share denominators (all rows) and avoids
        # .str failing on non-string columns.
        labels = df['sentiment_value'].astype(str).str.lower()
        with col3:
            pos_pct = (labels == 'positive').mean() * 100
            st.metric("😊 Positive", f"{pos_pct:.1f}%")
        with col4:
            neg_pct = (labels == 'negative').mean() * 100
            st.metric("😞 Negative", f"{neg_pct:.1f}%")


def _render_distribution_tab(df):
    """Show the sentiment pie chart next to a per-label count breakdown."""
    col_a, col_b = st.columns([2, 1])
    with col_a:
        pie_fig = create_sentiment_pie(df, "Sentiment Distribution")
        if pie_fig is not None:
            st.plotly_chart(pie_fig, use_container_width=True)
    with col_b:
        if 'sentiment_value' in df.columns:
            st.markdown("### Breakdown")
            counts = df['sentiment_value'].value_counts()
            for sentiment, count in counts.items():
                st.metric(str(sentiment).title(), f"{count:,}")


def _render_geographic_tab(df, selected, dataset_path):
    """Offer location extraction, or show the state map and top-location chart."""
    st.markdown("### 🗺️ Geographic Sentiment Analysis")

    if 'extracted_location' not in df.columns:
        desc_col = next((c for c in ('desc', 'description') if c in df.columns), None)
        if desc_col is None:
            st.info("No description column found. Cannot extract locations.")
            return
        if st.button("🔍 Extract Locations from Articles"):
            with st.spinner("Extracting locations... This may take a few minutes."):
                df = extract_locations_from_descriptions(df, desc_col)
                # Save updated dataframe
                df.to_csv(dataset_path, index=False)
                # Drop cached loads so the rerun reads the file just written;
                # otherwise the cached frame without locations is served again.
                load_data.clear()
                st.success("✅ Locations extracted successfully!")
                st.rerun()
        return

    col_left, col_right = st.columns([3, 2])
    with col_left:
        st.markdown("#### Sentiment by State")
        india_geojson = load_india_geojson()
        if india_geojson:
            sentiment_by_state = analyze_sentiment_by_state(df)
            if sentiment_by_state is not None and not sentiment_by_state.empty:
                map_fig = create_india_sentiment_map(
                    sentiment_by_state,
                    india_geojson,
                    f"{selected} - Sentiment by Indian States"
                )
                if map_fig is not None:
                    st.plotly_chart(map_fig, use_container_width=True)
                with st.expander("📊 State-wise Statistics"):
                    st.dataframe(
                        sentiment_by_state.sort_values('count', ascending=False),
                        use_container_width=True,
                        hide_index=True
                    )
            else:
                st.warning("No location data with valid sentiment found.")
        else:
            st.error("Could not load India map data.")
    with col_right:
        st.markdown("#### Top Mentioned Locations")
        top_loc_fig = create_top_locations_chart(df, "Top 15 Locations")
        if top_loc_fig is not None:
            st.plotly_chart(top_loc_fig, use_container_width=True)
        # Location coverage stats
        total_articles = len(df)
        articles_with_location = df['extracted_location'].notna().sum()
        # Guard against an empty dataset (division by zero).
        coverage = (articles_with_location / total_articles) * 100 if total_articles else 0.0
        st.metric("Location Coverage", f"{coverage:.1f}%")
        st.caption(f"{articles_with_location:,} out of {total_articles:,} articles have location data")


def _render_articles_tab(df):
    """Show filter controls and the matching articles as expanders."""
    has_sentiment = 'sentiment_value' in df.columns
    has_date = 'date' in df.columns

    col_a, col_b, col_c = st.columns(3)
    with col_a:
        # dropna + str: sorting a mix of NaN and strings raises TypeError.
        sentiment_options = (
            sorted(df['sentiment_value'].dropna().astype(str).unique().tolist())
            if has_sentiment else []
        )
        sentiment_filter = st.selectbox("Sentiment", ["All"] + sentiment_options)
    with col_b:
        if has_date:
            years = sorted(int(y) for y in df['date'].dt.year.dropna().unique())
            year_filter = st.selectbox("Year", ["All"] + years)
        else:
            year_filter = "All"
    with col_c:
        num_articles = st.slider("Display", 5, 50, 10)

    # Apply filters
    filtered_df = df
    if sentiment_filter != "All":
        filtered_df = filtered_df[filtered_df['sentiment_value'].astype(str) == sentiment_filter]
    if year_filter != "All" and has_date:
        filtered_df = filtered_df[filtered_df['date'].dt.year == year_filter]
    st.write(f"Showing {min(num_articles, len(filtered_df))} of {len(filtered_df)} articles")

    sentiment_emojis = {'positive': '😊', 'negative': '😞', 'neutral': '😐'}
    for _, row in filtered_df.head(num_articles).iterrows():
        with st.expander(f"📰 {row.get('title', 'Untitled')}"):
            col_x, col_y = st.columns([3, 1])
            with col_x:
                st.write(row.get('desc', row.get('description', 'No description')))
                link = row.get('link')
                # Skip missing links (NaN) instead of rendering "[Read more](nan)".
                if isinstance(link, str) and link:
                    st.markdown(f"[Read more →]({link})")
            with col_y:
                sentiment = row.get('sentiment_value', 'Unknown')
                # A missing label is a float NaN, which has no .lower().
                sentiment = 'Unknown' if pd.isna(sentiment) else str(sentiment)
                sentiment_emoji = sentiment_emojis.get(sentiment.lower(), '❓')
                st.metric("Sentiment", f"{sentiment_emoji} {sentiment.title()}")
                article_date = row.get('date')
                # NaT.strftime raises; only format real timestamps.
                if pd.notna(article_date) and hasattr(article_date, 'strftime'):
                    st.caption(f"📅 {article_date.strftime('%d %b %Y')}")


def show_analysis_page():
    """Render the sentiment dashboard for a user-selected dataset.

    Shows a dataset selector, overview metrics and four tabs: yearly trends,
    label distribution, geographic analysis and an article browser. Returns
    early with a warning when no dataset is available, or silently when the
    selected dataset could not be loaded (the loader reports the error).
    """
    st.markdown('<h2 class="sub-header">📊 Sentiment Analysis Dashboard</h2>',
                unsafe_allow_html=True)
    datasets = discover_datasets()
    if not datasets:
        st.warning("⚠️ No datasets available. Please scrape some articles first!")
        return

    # Dataset selector
    selected = st.selectbox("Select Dataset", options=list(datasets.keys()))
    if not selected:
        return
    df = load_data(datasets[selected])
    if df is None:
        return

    _render_overview_metrics(df)
    st.markdown("---")

    tab1, tab2, tab3, tab4 = st.tabs(["📈 Trends", "🥧 Distribution", "🗺️ Geographic", "📝 Articles"])
    with tab1:
        fig = plot_sentiment_trends(df, f"{selected} - Sentiment Trends")
        if fig is not None:
            st.pyplot(fig)
        else:
            st.info("Insufficient data for trend analysis")
    with tab2:
        _render_distribution_tab(df)
    with tab3:
        _render_geographic_tab(df, selected, datasets[selected])
    with tab4:
        _render_articles_tab(df)
+def show_about_page():
+    st.markdown('<h2 class="sub-header">ℹ️ About This Platform</h2>',
+                unsafe_allow_html=True)
+    st.markdown("""
+    ## 🎯 Overview
+    This platform provides a complete pipeline for news article collection and sentiment analysis,
+    specifically designed for Indian news sources.
+    ### ✨ Key Features
+    - **Multi-Source Scraping**: Collect articles from TOI, NDTV, WION, and Scroll.in
+    - **Real-Time Monitoring**: Track scraping progress live
+    - **Automatic Analysis**: Sentiment classification and scoring
+    - **Interactive Visualizations**: Trends, distributions, and comparisons
+    - **Data Export**: Download processed datasets
+    ### 🔧 Technical Stack
+    - **Frontend**: Streamlit
+    - **Data Processing**: Pandas, NumPy
+    - **Visualization**: Plotly, Matplotlib
+    - **NLP**: spaCy, Transformers
+    - **Scraping**: BeautifulSoup, Requests
+    ### 📖 How to Use
+    1. **Scrape**: Navigate to the Scraper page and configure your search
+    2. **Wait**: Monitor the real-time progress
+    3. **Analyze**: Go to Analysis page and select your dataset
+    4. **Export**: Download processed data for further use
+    ### 🤝 Support
+    For issues or questions, please refer to the documentation or contact support.
+    ---
+    **Version**: 1.0.0
+    **Last Updated**: October 2025
+    """)
+# MAIN APP
 def main():
     load_css()
+    init_session_state()
+    setup_directories()
+    # Sidebar navigation
+    with st.sidebar:
+        st.image("https://via.placeholder.com/150x50?text=News+Scraper", use_container_width=True)
+        st.markdown("---")
+        page = st.radio(
+            "Navigation",
+            ["🏠 Home", "🔍 Scraper", "📊 Analysis", "ℹ️ About"],
+            label_visibility="collapsed"
+        )
+        st.markdown("---")
+        # Quick stats in sidebar
+        datasets = discover_datasets()
+        if datasets:
+            st.markdown("### 📊 Quick Stats")
+            total_articles = 0
+            for path in datasets.values():
+                df = load_data(path)
+                if df is not None:
+                    total_articles += len(df)
+            st.metric("Total Articles", f"{total_articles:,}")
+            st.metric("Datasets", len(datasets))
+    # Route to pages
+    if page == "🏠 Home":
         show_home_page()
+    elif page == "🔍 Scraper":
+        show_scraper_page()
+    elif page == "📊 Analysis":
+        show_analysis_page()
+    else:
+        show_about_page()
 if __name__ == "__main__":  # script entry point: skipped when the module is imported
+    main()