Spaces:
Sleeping
Sleeping
| """ | |
| Unified News Scraper & Sentiment Analysis Application | |
| Combines scraping, processing, and visualization in one interface | |
| Modified for Hugging Face Spaces - uses /tmp directory | |
| """ | |
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from matplotlib.ticker import MaxNLocator | |
| import subprocess | |
| import sys | |
| import os | |
| from pathlib import Path | |
| import time | |
| from datetime import datetime | |
| import warnings | |
| import json | |
| import requests | |
| import spacy | |
| warnings.filterwarnings('ignore') | |
# Constants
INDIA_GEOJSON_URL = (
    'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
)
BASE_DIR = Path('/tmp/news_scraper')

# Page config: collected in one mapping, then applied in a single call.
_PAGE_SETTINGS = {
    'page_title': "News Scraper & Analysis Platform",
    'page_icon': "π°",
    'layout': "wide",
    'initial_sidebar_state': "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
| # Custom CSS | |
def load_css():
    """Inject the application's custom stylesheet into the current page."""
    # The CSS is passed through st.markdown as raw HTML, hence unsafe_allow_html.
    stylesheet = """
<style>
.main-header {
font-size: 2.8rem !important;
font-weight: 700 !important;
text-align: center !important;
padding: 1.5rem 0 !important;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.sub-header {
font-size: 1.8rem !important;
font-weight: 600 !important;
padding: 1rem 0 !important;
color: #2c3e50;
}
.feature-card {
background: white;
border-radius: 15px;
padding: 25px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
margin: 10px 0;
transition: transform 0.3s;
}
.feature-card:hover {
transform: translateY(-5px);
box-shadow: 0 8px 12px rgba(0, 0, 0, 0.15);
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 10px;
padding: 20px;
color: white;
text-align: center;
}
.status-running {
background-color: #fff3cd;
border-left: 4px solid #ffc107;
padding: 15px;
border-radius: 5px;
}
.status-success {
background-color: #d4edda;
border-left: 4px solid #28a745;
padding: 15px;
border-radius: 5px;
}
.status-error {
background-color: #f8d7da;
border-left: 4px solid #dc3545;
padding: 15px;
border-radius: 5px;
}
</style>
"""
    st.markdown(stylesheet, unsafe_allow_html=True)
| # Initialize session state | |
def init_session_state():
    """Seed ``st.session_state`` with defaults for every key not already set."""
    initial_values = (
        ('scraped_data', {}),
        ('scraping_active', False),
        ('processing_status', {}),
        ('selected_dataset', None),
        ('base_dir', BASE_DIR),
    )
    for state_key, default in initial_values:
        # Existing entries are left alone so values survive script re-runs.
        if state_key in st.session_state:
            continue
        st.session_state[state_key] = default
| # Setup directories | |
def setup_directories():
    """Create the ``output``, ``data`` and ``temp`` folders under ``BASE_DIR``.

    Returns:
        True when every folder exists afterwards; False when creation failed
        (the failure is also reported through ``st.error``).
    """
    required_folders = ('output', 'data', 'temp')
    try:
        for folder in required_folders:
            (BASE_DIR / folder).mkdir(parents=True, exist_ok=True)
    except Exception as exc:
        st.error(f"β Directory setup error: {exc}")
        return False
    return True
| # Load India GeoJSON | |
def load_india_geojson():
    """Download and parse the India state-boundary GeoJSON used for mapping.

    Returns:
        The decoded GeoJSON document, or None when the download or parsing
        fails (a warning is shown through ``st.warning`` in that case).
    """
    try:
        response = requests.get(INDIA_GEOJSON_URL, timeout=10)
        # Fail on 4xx/5xx instead of trying to parse an error page as GeoJSON.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        st.warning(f"Could not load India map: {e}")
        return None
| # Load spaCy model | |
def load_spacy_model():
    """Return the ``en_core_web_sm`` spaCy pipeline, downloading it if needed.

    If the first load raises ``OSError`` the model is downloaded with
    ``python -m spacy download`` and the load is attempted once more.
    """
    model_name = "en_core_web_sm"
    try:
        pipeline = spacy.load(model_name)
    except OSError:
        st.info("Downloading spaCy model...")
        download_command = [sys.executable, "-m", "spacy", "download", model_name]
        subprocess.call(download_command)
        pipeline = spacy.load(model_name)
    return pipeline
| # State mapping | |
def get_state_mapping():
    """Return a fresh dict mapping lowercase place aliases to state names.

    Insertion order is: states (with Delhi), the "new delhi" alias, Jammu and
    Kashmir plus its abbreviation, three union territories, then major cities
    mapped to their state. Callers that iterate the mapping see this order.
    """
    leading_states = (
        'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh',
        'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jharkhand',
        'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
        'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab', 'Rajasthan',
        'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh',
        'Uttarakhand', 'West Bengal', 'Delhi',
    )
    # Names whose alias is simply the lowercase form of the standard name.
    mapping = {state.lower(): state for state in leading_states}

    mapping['new delhi'] = 'Delhi'
    mapping['jammu and kashmir'] = 'Jammu and Kashmir'
    mapping['j&k'] = 'Jammu and Kashmir'

    for territory in ('Ladakh', 'Chandigarh', 'Puducherry'):
        mapping[territory.lower()] = territory

    # Cities resolve to the state they belong to.
    city_to_state = (
        ('mumbai', 'Maharashtra'), ('kolkata', 'West Bengal'),
        ('chennai', 'Tamil Nadu'), ('bangalore', 'Karnataka'),
        ('bengaluru', 'Karnataka'), ('hyderabad', 'Telangana'),
        ('ahmedabad', 'Gujarat'), ('pune', 'Maharashtra'),
        ('jaipur', 'Rajasthan'),
    )
    for city, state in city_to_state:
        mapping[city] = state
    return mapping
| # Extract locations from text | |
def extract_locations_from_descriptions(df, description_column='desc'):
    """Tag each row with the first Indian state found in its description.

    Two passes are made over every description: spaCy entities labelled GPE or
    LOC are looked up in the state mapping, then every alias in the mapping is
    searched for as a whole word. The first hit (entities first) is kept.

    Args:
        df: DataFrame holding the article descriptions.
        description_column: Name of the column that contains the text.

    Returns:
        A copy of ``df`` with an added ``extracted_location`` column holding a
        standard state name, or None where nothing matched or the text is
        missing. A Streamlit progress bar is updated while rows are processed.
    """
    import re  # local import: ``re`` is not among the module-level imports

    nlp = load_spacy_model()
    state_mapping = get_state_mapping()

    # Compile once, outside the row loop. The look-arounds make aliases match
    # whole words only, so 'goa' no longer matches inside "goal" or "goat".
    alias_patterns = [
        (re.compile(r"(?<!\w)" + re.escape(alias) + r"(?!\w)"), standard_name)
        for alias, standard_name in state_mapping.items()
    ]

    total_rows = len(df)
    locations = []
    progress_bar = st.progress(0)
    for idx, (_, row) in enumerate(df.iterrows()):
        if idx % 100 == 0:
            progress_bar.progress(min(idx / total_rows, 1.0))

        raw_description = row.get(description_column, None)
        if pd.isna(raw_description):
            locations.append(None)
            continue

        text = str(raw_description)
        lowered = text.lower()
        found_locations = []

        # Run NER on the original casing; only the entity text is lowercased
        # for the lookup (lowercasing the input first hides proper nouns).
        for ent in nlp(text).ents:
            if ent.label_ in ("GPE", "LOC"):
                standard_name = state_mapping.get(ent.text.lower())
                if standard_name and standard_name not in found_locations:
                    found_locations.append(standard_name)

        # Keyword fallback for anything the entity recogniser did not tag.
        for pattern, standard_name in alias_patterns:
            if standard_name not in found_locations and pattern.search(lowered):
                found_locations.append(standard_name)

        locations.append(found_locations[0] if found_locations else None)

    progress_bar.progress(1.0)
    df = df.copy()
    df['extracted_location'] = locations
    return df
| # Analyze sentiment by state | |
def analyze_sentiment_by_state(df, sentiment_column='sentiment_score'):
    """Aggregate sentiment per extracted state.

    Args:
        df: DataFrame expected to contain ``extracted_location`` and
            ``sentiment_column``.
        sentiment_column: Name of the numeric sentiment column to average.

    Returns:
        A DataFrame with columns ``extracted_location``, ``avg_sentiment`` and
        ``count`` (one row per state), or None when a required column is
        missing or no row has both a location and a sentiment value.
    """
    required_columns = ['extracted_location', sentiment_column]
    # A dataset without these columns simply has nothing to aggregate; return
    # None rather than letting dropna() raise KeyError.
    if any(column not in df.columns for column in required_columns):
        return None

    df_with_locations = df.dropna(subset=required_columns)
    if df_with_locations.empty:
        return None

    return (
        df_with_locations
        .groupby('extracted_location')[sentiment_column]
        .agg(avg_sentiment='mean', count='count')
        .reset_index()
    )
| # Create India sentiment map | |
def create_india_sentiment_map(sentiment_data, geojson_data, title):
    """Build a choropleth of average sentiment per Indian state.

    Args:
        sentiment_data: Frame with ``extracted_location``, ``avg_sentiment``
            and ``count`` columns, or None.
        geojson_data: GeoJSON whose features are matched on
            ``properties.NAME_1``, or None.
        title: Figure title.

    Returns:
        The Plotly figure, or None when either input is None.
    """
    if sentiment_data is None or geojson_data is None:
        return None

    averages = sentiment_data['avg_sentiment']
    lowest = averages.min()
    highest = averages.max()

    if lowest < 0 and highest > 0:
        # Values on both sides of zero: use a symmetric range around zero.
        bound = max(abs(lowest), abs(highest))
        color_range = [-bound, bound]
    else:
        color_range = [lowest - 0.1, highest + 0.1]

    hover_labels = {
        'avg_sentiment': 'Avg Sentiment',
        'extracted_location': 'State',
        'count': 'Articles',
    }
    figure = px.choropleth_mapbox(
        sentiment_data,
        geojson=geojson_data,
        locations='extracted_location',
        featureidkey="properties.NAME_1",
        color='avg_sentiment',
        color_continuous_scale="RdBu",
        range_color=color_range,
        mapbox_style="carto-positron",
        zoom=3.5,
        center={"lat": 20.5937, "lon": 78.9629},
        opacity=0.7,
        hover_data=['count'],
        labels=hover_labels,
    )

    title_settings = dict(text=title, font=dict(size=20), x=0.5)
    figure.update_layout(
        title=title_settings,
        height=600,
        margin={"r": 0, "t": 50, "l": 0, "b": 0},
    )
    return figure
| # Top locations chart | |
def create_top_locations_chart(df, title):
    """Build a horizontal bar chart of the 15 most frequent locations.

    Returns:
        The Plotly figure, or None when ``extracted_location`` is absent from
        ``df`` or contains no values at all.
    """
    if 'extracted_location' not in df.columns:
        return None
    location_series = df['extracted_location']
    if location_series.isna().all():
        return None

    top_locations = location_series.value_counts().head(15).reset_index()
    top_locations.columns = ['Location', 'Count']

    chart = px.bar(
        top_locations,
        x='Count',
        y='Location',
        orientation='h',
        title=title,
        color='Count',
        color_continuous_scale='Viridis',
    )
    chart.update_layout(height=500, yaxis={'categoryorder': 'total ascending'})
    return chart
| # Discover datasets | |
def discover_datasets():
    """Find CSV datasets under ``BASE_DIR``.

    ``data``, ``output`` and ``BASE_DIR`` itself are scanned in that order
    (non-recursively). The display name is derived from the file stem; when
    two files produce the same name the first one found is kept.

    Returns:
        Dict mapping display name to the CSV path as a string.
    """
    found = {}
    search_order = (BASE_DIR / 'data', BASE_DIR / 'output', BASE_DIR)
    for folder in search_order:
        if not folder.exists():
            continue
        try:
            for csv_path in folder.glob('*.csv'):
                stem = csv_path.stem
                display_name = stem.replace('_articles', '').replace('_', ' ').title()
                found.setdefault(display_name, str(csv_path))
        except Exception:
            # A folder that cannot be listed is skipped; the others are still scanned.
            continue
    return found
| # Load data | |
def load_data(file_path):
    """Read a dataset CSV and add the derived columns the dashboard relies on.

    Derived columns (each added only when it can be computed):
      * ``date``: the first column whose name contains "date", parsed with
        ``pd.to_datetime(..., errors='coerce')``.
      * ``sentiment_value``: copied from the first column whose name contains
        "sentiment", preferring one that is not ``sentiment_score``.
      * ``sentiment_score``: positive/negative/neutral labels mapped to
        1.0/-1.0/0.0; anything else becomes 0.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The DataFrame, or None when loading fails (the error is shown through
        ``st.error``).
    """
    try:
        df = pd.read_csv(file_path)

        date_cols = [col for col in df.columns if 'date' in str(col).lower()]
        if date_cols:
            df['date'] = pd.to_datetime(df[date_cols[0]], errors='coerce')

        if 'sentiment_value' not in df.columns:
            sentiment_cols = [col for col in df.columns if 'sentiment' in str(col).lower()]
            # Prefer a label column; the numeric ``sentiment_score`` column is
            # only used as the label source when nothing else is available.
            label_cols = [col for col in sentiment_cols if col != 'sentiment_score']
            candidates = label_cols or sentiment_cols
            if candidates:
                df['sentiment_value'] = df[candidates[0]]

        if 'sentiment_score' not in df.columns and 'sentiment_value' in df.columns:
            sentiment_map = {'positive': 1.0, 'negative': -1.0, 'neutral': 0.0}
            # astype(str) keeps this working when the column is numeric or
            # entirely empty, where the .str accessor would otherwise raise.
            labels = df['sentiment_value'].astype(str).str.lower()
            df['sentiment_score'] = labels.map(sentiment_map).fillna(0)

        return df
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return None
| # Plotting functions | |
def plot_sentiment_trends(df, title):
    """Plot the yearly share of negative, neutral and positive articles.

    Labels outside positive/negative/neutral (including non-strings) are
    counted as neutral. Rows without a valid date are ignored. ``df`` itself
    is not modified.

    Args:
        df: DataFrame with a datetime ``date`` column and a ``sentiment_value``
            column.
        title: Chart title.

    Returns:
        A matplotlib Figure, or None when a required column is missing or no
        row has a valid date. The caller owns the figure and should close it
        after rendering.
    """
    if 'date' not in df.columns or 'sentiment_value' not in df.columns:
        return None

    sentiment_order = ['negative', 'neutral', 'positive']
    normalised = df['sentiment_value'].apply(
        lambda x: x.lower() if isinstance(x, str) and x.lower() in sentiment_order else "neutral"
    )

    # Build a private working frame so the caller's DataFrame does not gain
    # 'year' / 'sentiment' helper columns.
    trend = pd.DataFrame({
        'year': df['date'].dt.year.to_numpy(),
        'sentiment': normalised.to_numpy(),
    }).dropna(subset=['year'])
    if trend.empty:
        return None
    # Years become floats as soon as one date is NaT; restore integers.
    trend['year'] = trend['year'].astype(int)

    sentiment_counts = trend.groupby(['year', 'sentiment']).size().reset_index(name='count')
    year_totals = sentiment_counts.groupby('year')['count'].transform('sum')
    sentiment_counts['percentage'] = sentiment_counts['count'] / year_totals * 100
    sentiment_pivot = sentiment_counts.pivot(
        index='year', columns='sentiment', values='percentage'
    ).fillna(0)
    for sentiment in sentiment_order:
        if sentiment not in sentiment_pivot.columns:
            sentiment_pivot[sentiment] = 0

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 6))
    colors = {
        'negative': '#e74c3c',
        'neutral': '#95a5a6',
        'positive': '#2ecc71',
    }
    for sentiment in sentiment_order:
        ax.plot(
            sentiment_pivot.index,
            sentiment_pivot[sentiment],
            marker='o',
            linewidth=2.5,
            label=sentiment.capitalize(),
            color=colors[sentiment],
            markersize=7,
        )
    # Years are whole numbers; avoid ticks such as 2020.5.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_ylabel('Percentage (%)', fontweight='bold')
    ax.set_xlabel('Year', fontweight='bold')
    ax.set_title(title, fontweight='bold', pad=15)
    ax.legend(loc='best', frameon=True)
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    return fig
def create_sentiment_pie(df, title):
    """Build a pie chart of the sentiment label distribution.

    Labels are compared case-insensitively and missing values are ignored.

    Returns:
        The Plotly figure, or None when ``sentiment_value`` is absent or holds
        no values.
    """
    if 'sentiment_value' not in df.columns:
        return None

    # dropna + astype(str) keeps the .str accessor usable for numeric or
    # entirely empty columns.
    sentiment_counts = (
        df['sentiment_value'].dropna().astype(str).str.lower().value_counts()
    )
    if sentiment_counts.empty:
        return None

    labels = [s.title() for s in sentiment_counts.index]
    fig = px.pie(
        values=sentiment_counts.values,
        names=labels,
        # color_discrete_map is keyed on the values passed as ``color``;
        # without this argument the mapping below is not applied.
        color=labels,
        title=title,
        color_discrete_map={
            'Positive': '#2ecc71',
            'Neutral': '#95a5a6',
            'Negative': '#e74c3c',
        },
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    return fig
| # Run scraper function | |
def run_scraper(source, topic, max_workers=4, max_articles=100):
    """Run the scraper for ``source`` and cap the resulting CSV.

    The scraper class is imported lazily from the ``scrapers`` package. While
    it runs, the process working directory is switched to
    ``BASE_DIR / 'output'`` and restored afterwards.

    Args:
        source: One of 'toi', 'ndtv', 'wion', 'scroll'.
        topic: Search topic passed to the scraper.
        max_workers: Worker count handed to the scraper constructor.
        max_articles: Maximum number of rows kept in the output CSV.

    Returns:
        ``{'success': True, 'file': <csv path>, 'count': <rows>}`` on success,
        otherwise ``{'success': False, 'error': <message>}``. Exceptions are
        reported through the error entry rather than raised.
    """
    try:
        source_map = {
            'toi': ('scrapers.toi_scraper', 'TOIArticleScraper'),
            'ndtv': ('scrapers.ndtv_scraper', 'NDTVArticleScraper'),
            'wion': ('scrapers.wion_scraper', 'WIONArticleScraper'),
            'scroll': ('scrapers.scroll_scraper', 'ScrollArticleScraper'),
        }
        if source not in source_map:
            return {'success': False, 'error': f'Unknown source: {source}'}

        module_name, class_name = source_map[source]
        try:
            module = __import__(module_name, fromlist=[class_name])
            ScrapeClass = getattr(module, class_name)
        except (ImportError, AttributeError) as e:
            # Keep the underlying reason so a missing dependency inside the
            # scraper can be told apart from a missing scraper file.
            return {
                'success': False,
                'error': (
                    'Scraper module not found. Please ensure scraper files '
                    f'are available. ({e})'
                ),
            }

        scraper = ScrapeClass(max_workers=max_workers)
        output_dir = BASE_DIR / 'output'
        output_dir.mkdir(parents=True, exist_ok=True)

        # The output CSV is read relative to the working directory below, so
        # run the scraper from inside the output folder.
        original_dir = os.getcwd()
        os.chdir(output_dir)
        try:
            if source == 'toi':
                topic_url = f"{scraper.base_url}/topic/{topic}/news"
                final_csv = scraper.scrape_topic(topic_url, topic)
            elif source == 'ndtv':
                final_csv = scraper.scrape_topic(topic)
            else:  # 'wion' and 'scroll' share the same call signature
                final_csv = scraper.scrape_topic(topic.lower(), topic)

            if not final_csv:
                return {
                    'success': False,
                    'error': 'Scraper finished without producing an output file.',
                }

            articles = (
                getattr(scraper, 'articles', None)
                or getattr(scraper, 'scraped_articles', None)
                or []
            )
            article_count = len(articles)

            # Consult the CSV when the scraper exposes no article list (so the
            # cap is still enforced) or when the list exceeds the cap.
            if article_count == 0 or article_count > max_articles:
                df = pd.read_csv(final_csv)
                if len(df) > max_articles:
                    df.head(max_articles).to_csv(final_csv, index=False)
                article_count = min(len(df), max_articles)

            return {
                'success': True,
                'file': str(output_dir / final_csv),
                'count': article_count,
            }
        finally:
            os.chdir(original_dir)
    except Exception as e:
        return {
            'success': False,
            'error': str(e),
        }
| # MAIN APP PAGES | |
def show_home_page():
    """Render the landing page: banner, feature cards and dataset metrics."""
    st.markdown(
        '<h1 class="main-header">π° News Scraper & Analysis Platform</h1>',
        unsafe_allow_html=True,
    )
    intro_html = """
<div style="text-align: center; padding: 20px; background-color: #f8f9fa;
border-radius: 10px; margin: 20px 0;">
<h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
<p>Scrape or upload articles from major Indian news sources and analyze sentiment trends</p>
</div>
"""
    st.markdown(intro_html, unsafe_allow_html=True)
    st.info(f"πΎ **Storage Location:** `{BASE_DIR}` (temporary - cleared on restart)")

    # One (heading, description) pair per column, rendered left to right.
    feature_cards = (
        ("### π Scrape", "Collect articles from major Indian news sources"),
        ("### π€ Upload", "Upload pre-scraped CSV files for analysis"),
        ("### π Analyze", "Automatic sentiment classification and trend analysis"),
        ("### π Visualize", "Interactive charts and geographic sentiment mapping"),
    )
    for column, (heading, blurb) in zip(st.columns(4), feature_cards):
        with column:
            st.markdown('<div class="feature-card">', unsafe_allow_html=True)
            st.markdown(heading)
            st.write(blurb)
            st.markdown('</div>', unsafe_allow_html=True)

    datasets = discover_datasets()
    if not datasets:
        return

    st.markdown("---")
    st.markdown("### π Available Datasets")
    metric_columns = st.columns(min(len(datasets), 4))
    # zip stops at the shorter input, so at most four datasets are shown.
    for column, (name, path) in zip(metric_columns, datasets.items()):
        with column:
            df = load_data(path)
            if df is None:
                continue
            st.markdown('<div class="metric-card">', unsafe_allow_html=True)
            st.metric(name, f"{len(df):,} articles")
            st.markdown('</div>', unsafe_allow_html=True)
def _refresh_dataset_listing():
    """Clear the dataset-discovery cache when ``discover_datasets`` has one."""
    # ``.clear`` exists only when discover_datasets is wrapped by a Streamlit
    # cache decorator; a plain function has no such attribute, so look it up
    # defensively instead of raising AttributeError after a successful save.
    clear_cache = getattr(discover_datasets, 'clear', None)
    if callable(clear_cache):
        clear_cache()


def _render_upload_tab():
    """Render the CSV upload widget and store an uploaded file under data/."""
    st.markdown("### π€ Upload Your Data")
    uploaded_file = st.file_uploader(
        "Upload CSV file with scraped articles",
        type=['csv'],
        help="Upload a CSV file with columns: title, date, desc, sentiment_value, etc."
    )
    if not uploaded_file:
        return
    try:
        data_dir = BASE_DIR / 'data'
        data_dir.mkdir(parents=True, exist_ok=True)
        # The file name is supplied by the client: keep only its final path
        # component so it cannot point outside the data directory.
        safe_name = Path(uploaded_file.name).name or 'upload.csv'
        save_path = data_dir / safe_name
        with open(save_path, 'wb') as f:
            f.write(uploaded_file.getbuffer())
        st.success(f"β File uploaded successfully! Saved to `{save_path}`")
        st.info("Go to the Analysis page to view your data.")
        _refresh_dataset_listing()
        with st.expander("π Preview Data"):
            df = pd.read_csv(save_path)
            st.dataframe(df.head(10), use_container_width=True)
            st.caption(f"Total rows: {len(df):,}")
    except Exception as e:
        st.error(f"β Error saving file: {e}")


def _render_scrape_tab():
    """Render the scraper form and run the selected scraper on demand."""
    st.markdown("### π Scrape News Articles")
    st.info("""
**Note:** This feature requires the scraper modules to be available.
On HF Spaces, scraping may timeout for large datasets. For production use,
consider running scrapers locally and uploading the results.
""")
    col1, col2 = st.columns(2)
    with col1:
        news_source = st.selectbox(
            "News Source",
            ["TOI (Times of India)", "NDTV", "WION", "Scroll.in"],
            help="Select which news website to scrape from"
        )
    with col2:
        topic = st.text_input(
            "Topic to Search",
            placeholder="e.g., Climate Change, Politics, Education",
            help="Enter the topic you want to scrape articles about"
        )
    col3, col4 = st.columns(2)
    with col3:
        max_workers = st.slider(
            "Worker Threads",
            min_value=1,
            max_value=8,
            value=4,
            help="More workers = faster scraping, but may overload the server"
        )
    with col4:
        max_articles = st.number_input(
            "Max Articles",
            min_value=10,
            max_value=500,
            value=100,
            step=10,
            help="Maximum number of articles to scrape (set lower for HF Spaces)"
        )

    if not st.button("π Start Scraping", type="primary", disabled=not topic):
        return
    if not topic:
        st.error("Please enter a topic to search for!")
        return

    source_map = {
        "TOI (Times of India)": "toi",
        "NDTV": "ndtv",
        "WION": "wion",
        "Scroll.in": "scroll"
    }
    source_code = source_map[news_source]
    try:
        with st.spinner(f"π Scraping {news_source} for '{topic}'... This may take a few minutes."):
            result = run_scraper(source_code, topic, max_workers, max_articles)
        if result['success']:
            st.success(f"β Successfully scraped {result['count']} articles!")
            st.info(f"π Saved to: `{result['file']}`")
            _refresh_dataset_listing()
            with st.expander("π Preview Scraped Data"):
                df = pd.read_csv(result['file'])
                st.dataframe(df.head(10), use_container_width=True)
                st.caption(f"Total rows: {len(df):,}")
            st.balloons()
        else:
            st.error(f"β Scraping failed: {result['error']}")
    except Exception as e:
        st.error(f"β Error during scraping: {str(e)}")
        st.info("Scraper modules may not be available on this platform. Consider uploading pre-scraped data instead.")


def _render_dataset_downloads():
    """List every discovered dataset with its row count and a download button."""
    datasets = discover_datasets()
    if not datasets:
        st.info("No datasets found. Upload a CSV file or scrape articles to get started!")
        return
    st.markdown("### π Available Datasets")
    for name, path in datasets.items():
        col_a, col_b, col_c = st.columns([3, 1, 1])
        with col_a:
            st.text(f"π {name}")
        with col_b:
            df = load_data(path)
            if df is not None:
                st.text(f"{len(df):,} articles")
        with col_c:
            if Path(path).exists():
                with open(path, 'rb') as f:
                    st.download_button(
                        "β¬οΈ",
                        f,
                        file_name=f"{name}.csv",
                        mime='text/csv',
                        key=f"dl_{name}"
                    )


def _render_storage_info():
    """Show how many CSV files currently live under ``BASE_DIR``."""
    st.markdown("### πΎ Storage Information")
    try:
        if BASE_DIR.exists():
            file_count = sum(1 for _ in BASE_DIR.rglob('*.csv'))
            st.metric("CSV Files", file_count)
            st.caption(f"Location: `{BASE_DIR}`")
        else:
            st.info("No data directory created yet. Upload a file to get started.")
    except Exception as e:
        st.error(f"Could not access storage: {e}")


def show_scraper_page():
    """Render the "Upload & Scrape" page.

    The page consists of an upload tab, a scraping tab, the list of available
    datasets with download buttons, and a short storage summary.
    """
    st.markdown('<h2 class="sub-header">π€ Data Upload & Scraper</h2>', unsafe_allow_html=True)
    st.warning("""
β οΈ **Hugging Face Spaces Notice:**
- Data is stored in `/tmp` and will be cleared on app restart
- Download your processed data regularly
- Scraping may timeout on HF Spaces - consider running locally for large scrapes
""")
    tab1, tab2 = st.tabs(["π€ Upload Data", "π Scrape Articles"])
    with tab1:
        _render_upload_tab()
    with tab2:
        _render_scrape_tab()

    st.markdown("---")
    _render_dataset_downloads()

    st.markdown("---")
    _render_storage_info()
def _render_analysis_summary(df):
    """Render the four headline metrics above the analysis tabs."""
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("π Total Articles", f"{len(df):,}")
    with col2:
        if 'date' in df.columns:
            # Drop NaT first: otherwise the years are floats ("2020.0") or
            # "nan-nan" when no date could be parsed.
            valid_years = df['date'].dt.year.dropna()
            if valid_years.empty:
                years = "N/A"
            else:
                years = f"{int(valid_years.min())}-{int(valid_years.max())}"
            st.metric("π Years", years)
    if 'sentiment_value' in df.columns:
        # astype(str) keeps .str usable for numeric or empty label columns.
        labels = df['sentiment_value'].astype(str).str.lower()
        with col3:
            pos_pct = (labels == 'positive').mean() * 100
            st.metric("π Positive", f"{pos_pct:.1f}%")
        with col4:
            neg_pct = (labels == 'negative').mean() * 100
            st.metric("π Negative", f"{neg_pct:.1f}%")


def _render_trends_tab(df, selected):
    """Render the yearly sentiment trend chart."""
    # Pass a copy so the loaded frame keeps exactly the columns it had when
    # it is written back to disk by the Geographic tab.
    fig = plot_sentiment_trends(df.copy(), f"{selected} - Sentiment Trends")
    if fig is not None:
        st.pyplot(fig)
        # Release the figure; pyplot otherwise keeps it alive across re-runs.
        plt.close(fig)
    else:
        st.info("Insufficient data for trend analysis")


def _render_distribution_tab(df):
    """Render the sentiment pie chart with a per-label breakdown."""
    col_a, col_b = st.columns([2, 1])
    with col_a:
        pie_fig = create_sentiment_pie(df, "Sentiment Distribution")
        if pie_fig:
            st.plotly_chart(pie_fig, use_container_width=True)
    with col_b:
        if 'sentiment_value' in df.columns:
            st.markdown("### Breakdown")
            counts = df['sentiment_value'].value_counts()
            for sentiment, count in counts.items():
                st.metric(str(sentiment).title(), f"{count:,}")


def _render_geographic_tab(df, selected, dataset_path):
    """Render location extraction, or the state map once locations exist."""
    st.markdown("### πΊοΈ Geographic Sentiment Analysis")
    if 'extracted_location' not in df.columns:
        if 'desc' in df.columns or 'description' in df.columns:
            if st.button("π Extract Locations from Articles"):
                with st.spinner("Extracting locations... This may take a few minutes."):
                    desc_col = 'desc' if 'desc' in df.columns else 'description'
                    df = extract_locations_from_descriptions(df, desc_col)
                    df.to_csv(dataset_path, index=False)
                st.success("β Locations extracted successfully!")
                st.rerun()
        else:
            st.info("No description column found. Cannot extract locations.")
        return

    col_left, col_right = st.columns([3, 2])
    with col_left:
        st.markdown("#### Sentiment by State")
        india_geojson = load_india_geojson()
        if india_geojson:
            # Without a score column there is nothing to aggregate.
            sentiment_by_state = (
                analyze_sentiment_by_state(df)
                if 'sentiment_score' in df.columns else None
            )
            if sentiment_by_state is not None and not sentiment_by_state.empty:
                map_fig = create_india_sentiment_map(
                    sentiment_by_state,
                    india_geojson,
                    f"{selected} - Sentiment by Indian States"
                )
                if map_fig:
                    st.plotly_chart(map_fig, use_container_width=True)
                with st.expander("π State-wise Statistics"):
                    sentiment_by_state_display = sentiment_by_state.sort_values('count', ascending=False)
                    st.dataframe(
                        sentiment_by_state_display,
                        use_container_width=True,
                        hide_index=True
                    )
            else:
                st.warning("No location data with valid sentiment found.")
        else:
            st.error("Could not load India map data.")
    with col_right:
        st.markdown("#### Top Mentioned Locations")
        top_loc_fig = create_top_locations_chart(df, "Top 15 Locations")
        if top_loc_fig:
            st.plotly_chart(top_loc_fig, use_container_width=True)
        total_articles = len(df)
        articles_with_location = int(df['extracted_location'].notna().sum())
        # Guard the division for an empty dataset.
        coverage = (articles_with_location / total_articles) * 100 if total_articles else 0.0
        st.metric("Location Coverage", f"{coverage:.1f}%")
        st.caption(f"{articles_with_location:,} out of {total_articles:,} articles have location data")


def _render_articles_tab(df):
    """Render the filterable list of individual articles."""
    has_sentiment = 'sentiment_value' in df.columns
    has_date = 'date' in df.columns

    col_a, col_b, col_c = st.columns(3)
    with col_a:
        if has_sentiment:
            # dropna + astype(str): sorting a mix of NaN and strings raises.
            sentiment_options = sorted(
                df['sentiment_value'].dropna().astype(str).unique().tolist()
            )
        else:
            sentiment_options = []
        sentiment_filter = st.selectbox("Sentiment", ["All"] + sentiment_options)
    with col_b:
        if has_date:
            years = sorted(int(year) for year in df['date'].dt.year.dropna().unique())
            year_filter = st.selectbox("Year", ["All"] + years)
        else:
            year_filter = "All"
    with col_c:
        num_articles = st.slider("Display", 5, 50, 10)

    filtered_df = df.copy()
    if sentiment_filter != "All" and has_sentiment:
        filtered_df = filtered_df[filtered_df['sentiment_value'].astype(str) == sentiment_filter]
    if year_filter != "All" and has_date:
        filtered_df = filtered_df[filtered_df['date'].dt.year == year_filter]

    st.write(f"Showing {min(num_articles, len(filtered_df))} of {len(filtered_df)} articles")
    sentiment_emojis = {
        'positive': 'π',
        'negative': 'π',
        'neutral': 'π'
    }
    for _, row in filtered_df.head(num_articles).iterrows():
        with st.expander(f"π° {row.get('title', 'Untitled')}"):
            col_x, col_y = st.columns([3, 1])
            with col_x:
                st.write(row.get('desc', row.get('description', 'No description')))
                link = row.get('link')
                # Skip rows whose link cell is empty.
                if isinstance(link, str) and link:
                    st.markdown(f"[Read more β]({link})")
            with col_y:
                raw_sentiment = row.get('sentiment_value', 'Unknown')
                # Missing labels come back as NaN (a float), which has no
                # .lower()/.title(); display them as 'Unknown'.
                sentiment = 'Unknown' if pd.isna(raw_sentiment) else str(raw_sentiment)
                sentiment_emoji = sentiment_emojis.get(sentiment.lower(), 'β')
                st.metric("Sentiment", f"{sentiment_emoji} {sentiment.title()}")
                article_date = row.get('date')
                # NaT does not support strftime, so only format real dates.
                if article_date is not None and pd.notna(article_date):
                    st.caption(f"π {article_date.strftime('%d %b %Y')}")


def show_analysis_page():
    """Render the sentiment analysis dashboard for one selected dataset.

    Shows headline metrics followed by four tabs: trends, distribution,
    geographic analysis and an article browser. When no dataset exists the
    user is pointed to the upload page instead.
    """
    st.markdown('<h2 class="sub-header">π Sentiment Analysis Dashboard</h2>',
                unsafe_allow_html=True)
    datasets = discover_datasets()
    if not datasets:
        st.warning("β οΈ No datasets available. Please upload a CSV file or scrape articles first!")
        st.info("π Go to the 'Upload & Scrape' page to get started.")
        return

    selected = st.selectbox("Select Dataset", options=list(datasets.keys()))
    if not selected:
        return
    df = load_data(datasets[selected])
    if df is None:
        return

    _render_analysis_summary(df)
    st.markdown("---")
    tab1, tab2, tab3, tab4 = st.tabs(["π Trends", "π₯§ Distribution", "πΊοΈ Geographic", "π Articles"])
    with tab1:
        _render_trends_tab(df, selected)
    with tab2:
        _render_distribution_tab(df)
    with tab3:
        _render_geographic_tab(df, selected, datasets[selected])
    with tab4:
        _render_articles_tab(df)
def show_about_page():
    """Render the static "About" page."""
    page_header = '<h2 class="sub-header">βΉοΈ About This Platform</h2>'
    st.markdown(page_header, unsafe_allow_html=True)

    # Static markdown body; passed to Streamlit in a single call.
    about_text = """
## π― Overview
This platform provides sentiment analysis and visualization for news articles,
specifically designed for Indian news sources. It includes both upload functionality
and integrated web scraping capabilities.
### β¨ Key Features
- **Web Scraping**: Built-in scrapers for TOI, NDTV, WION, and Scroll.in
- **Data Upload**: Upload pre-scraped CSV files for analysis
- **Automatic Analysis**: Sentiment classification and scoring
- **Interactive Visualizations**: Trends, distributions, and comparisons
- **Geographic Analysis**: State-wise sentiment mapping for India
- **Data Export**: Download processed datasets
### π§ Technical Stack
- **Frontend**: Streamlit
- **Data Processing**: Pandas, NumPy
- **Visualization**: Plotly, Matplotlib
- **NLP**: spaCy for location extraction
- **Web Scraping**: BeautifulSoup4, Selenium (if available)
- **Storage**: `/tmp` directory (HF Spaces compatible)
### π How to Use
1. **Scrape or Upload**:
- Use the built-in scraper to collect articles from news sources
- Or upload your own CSV file with pre-scraped data
2. **Analyze**: Go to Analysis page and select your dataset
3. **Explore**: View trends, distributions, and geographic sentiment
4. **Extract Locations**: Use the Geographic tab to extract state information
5. **Export**: Download processed data for further use
### π Supported News Sources
- **Times of India (TOI)**: India's largest English daily
- **NDTV**: Leading Indian news channel
- **WION**: International news perspective
- **Scroll.in**: Independent digital news
### π CSV File Format
Your uploaded CSV should contain these columns:
- `title`: Article headline
- `date`: Publication date
- `desc` or `description`: Article content/summary
- `sentiment_value`: Sentiment label (positive/negative/neutral)
- `link` (optional): URL to original article
### β οΈ Hugging Face Spaces Limitations
- Data stored in `/tmp` is temporary and cleared on restart
- Scraping may timeout for large datasets (>500 articles)
- Download your processed data regularly
- For heavy scraping, consider running locally
### π‘ Tips for Best Results
- Start with smaller scrapes (50-100 articles) to test
- Use specific topics for better quality results
- Lower worker threads (2-4) for stability on HF Spaces
- Download results immediately after scraping
### π€ Support
For issues or questions, please refer to the documentation or contact support.
---
**Version**: 2.0.0 (HF Spaces Edition with Scraping)
**Last Updated**: October 2025
**Storage**: `/tmp/news_scraper`
"""
    st.markdown(about_text)
| # MAIN APP | |
def main():
    """Application entry point: set up state, draw the sidebar, show a page.

    Any exception raised while rendering is reported in the page together
    with its traceback rather than propagating.
    """
    try:
        load_css()
        init_session_state()
        if not setup_directories():
            st.error("Failed to setup directories. Some features may not work.")

        # Navigation label -> page renderer. The same labels feed the radio
        # widget, so the dispatch below cannot drift out of sync with it.
        pages = {
            "π Home": show_home_page,
            "π€ Upload & Scrape": show_scraper_page,
            "π Analysis": show_analysis_page,
            "βΉοΈ About": show_about_page,
        }

        with st.sidebar:
            st.markdown("### π° News Analysis")
            st.markdown("---")
            page = st.radio(
                "Navigation",
                list(pages),
                label_visibility="collapsed"
            )
            st.markdown("---")
            st.markdown("### πΎ Storage")
            st.caption(f"Location: `{BASE_DIR}`")
            st.caption("β οΈ Temporary storage")
            st.markdown("---")

            # Quick stats are best-effort: a failure here must not prevent
            # the selected page from rendering.
            try:
                datasets = discover_datasets()
                if datasets:
                    st.markdown("### π Quick Stats")
                    total_articles = 0
                    for path in datasets.values():
                        try:
                            df = load_data(path)
                        except Exception:
                            # Skip a dataset that cannot be read; a bare
                            # ``except`` would also trap interpreter-exit and
                            # other non-error signals.
                            continue
                        if df is not None:
                            total_articles += len(df)
                    if total_articles > 0:
                        st.metric("Total Articles", f"{total_articles:,}")
                        st.metric("Datasets", len(datasets))
                else:
                    st.info("No data yet. Upload a CSV to start!")
            except Exception:
                st.caption("Quick stats unavailable")

        # Unknown labels fall back to the About page.
        pages.get(page, show_about_page)()
    except Exception as e:
        st.error(f"β Application error: {str(e)}")
        st.info("Try refreshing the page. If the problem persists, the app may need to restart.")
        with st.expander("π Error Details"):
            import traceback
            st.code(traceback.format_exc())


if __name__ == "__main__":
    main()