Nishitha03 commited on
Commit
eb34004
·
verified ·
1 Parent(s): 6e39ec1

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +626 -869
src/streamlit_app.py CHANGED
@@ -1,297 +1,240 @@
 
 
 
 
 
1
# --- Standard library ---
import json
import os
import time
import warnings

# --- Third-party ---
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import plotly.express as px
import plotly.graph_objects as go
import requests
import spacy
from tqdm import tqdm

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Streamlit needs a writable config dir (e.g. in read-only container deployments).
config_dir = os.environ.get("STREAMLIT_CONFIG_DIR", "/tmp/.streamlit")
os.makedirs(config_dir, exist_ok=True)

# Set page configuration (must be the first Streamlit call in the script)
st.set_page_config(
    page_title="Sentiment Analysis of RSS Articles",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="expanded"
)
30
 
31
- # Custom CSS for styling
32
def load_css():
    """Inject the dashboard's shared CSS classes into the current page."""
    st.markdown("""
    <style>
    .main-header {
        font-size: 3rem !important;
        font-weight: 700 !important;
        text-align: center !important;
        padding: 2rem 0 !important;
    }
    .sub-header {
        font-size: 2rem !important;
        font-weight: 600 !important;
        padding: 1rem 0 !important;
    }
    .newspaper-card {
        background-color: #f8f9fa;
        border-radius: 10px;
        padding: 20px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        text-align: center;
        margin-bottom: 20px;
    }
    .newspaper-title {
        font-size: 1.5rem;
        font-weight: 600;
        margin-bottom: 10px;
    }
    .entry-page {
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
        height: 100vh;
        position: fixed;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
    }
    .entry-container {
        text-align: center;
        background-color: #f8f9fa;
        padding: 3rem;
        border-radius: 20px;
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
        max-width: 800px;
    }
    .button-container {
        margin-top: 2rem;
    }
    .footer {
        text-align: center;
        padding: 1rem;
        color: #6c757d;
        margin-top: 2rem;
        border-top: 1px solid #dee2e6;
    }
    </style>
    """, unsafe_allow_html=True)
91
 
92
# Constants
INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'


# India GeoJSON loading function
@st.cache_data
def load_india_geojson():
    """Load India GeoJSON data for mapping.

    Returns the parsed GeoJSON dict, or None when it cannot be fetched
    by either the direct download or the geopandas fallback.
    """
    try:
        # Bounded timeout so a dead network cannot hang the app forever.
        response = requests.get(INDIA_GEOJSON_URL, timeout=30)
        response.raise_for_status()
        return json.loads(response.text)
    except Exception as e:
        st.error(f"Failed to load GeoJSON: {e}")
        st.info("Trying fallback method...")
        try:
            # Fallback: pip install geopandas
            import geopandas as gpd
            india = gpd.read_file(INDIA_GEOJSON_URL)
            return json.loads(india.to_json())
        except Exception:
            st.error("Error: Could not load India GeoJSON. Please ensure internet connection.")
            return None
113
 
114
# Load spaCy model (with caching)
@st.cache_resource
def load_spacy_model():
    """Return the 'en_core_web_sm' spaCy pipeline, downloading it on first use."""
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        st.info("Downloading spaCy model... This may take a moment.")
        import subprocess
        import sys
        # Use the running interpreter (not a bare "python") so the model
        # installs into this environment, and fail loudly if it can't.
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
        return spacy.load("en_core_web_sm")
124
 
125
# State mapping dictionary
def get_state_mapping():
    """Return a mapping of lowercase place-name variants to canonical
    Indian state / union-territory names.

    Covers standard state names, union territories (including common
    aliases), and major cities mapped to their home state.
    """
    states = {
        'andhra pradesh': 'Andhra Pradesh',
        'arunachal pradesh': 'Arunachal Pradesh',
        'assam': 'Assam',
        'bihar': 'Bihar',
        'chhattisgarh': 'Chhattisgarh',
        'goa': 'Goa',
        'gujarat': 'Gujarat',
        'haryana': 'Haryana',
        'himachal pradesh': 'Himachal Pradesh',
        'jharkhand': 'Jharkhand',
        'karnataka': 'Karnataka',
        'kerala': 'Kerala',
        'madhya pradesh': 'Madhya Pradesh',
        'maharashtra': 'Maharashtra',
        'manipur': 'Manipur',
        'meghalaya': 'Meghalaya',
        'mizoram': 'Mizoram',
        'nagaland': 'Nagaland',
        'odisha': 'Odisha',
        'punjab': 'Punjab',
        'rajasthan': 'Rajasthan',
        'sikkim': 'Sikkim',
        'tamil nadu': 'Tamil Nadu',
        'telangana': 'Telangana',
        'tripura': 'Tripura',
        'uttar pradesh': 'Uttar Pradesh',
        'uttarakhand': 'Uttarakhand',
        'west bengal': 'West Bengal',
    }
    union_territories = {
        'delhi': 'Delhi',
        'new delhi': 'Delhi',
        'jammu and kashmir': 'Jammu and Kashmir',
        'j&k': 'Jammu and Kashmir',
        'ladakh': 'Ladakh',
        'chandigarh': 'Chandigarh',
        'puducherry': 'Puducherry',
        'pondicherry': 'Puducherry',
        'andaman and nicobar': 'Andaman and Nicobar Islands',
        'dadra and nagar haveli': 'Dadra and Nagar Haveli and Daman and Diu',
        'daman and diu': 'Dadra and Nagar Haveli and Daman and Diu',
        'lakshadweep': 'Lakshadweep',
    }
    cities = {
        'mumbai': 'Maharashtra',
        'kolkata': 'West Bengal',
        'chennai': 'Tamil Nadu',
        'bangalore': 'Karnataka',
        'bengaluru': 'Karnataka',
        'hyderabad': 'Telangana',
        'ahmedabad': 'Gujarat',
        'lucknow': 'Uttar Pradesh',
        'jaipur': 'Rajasthan',
        'srinagar': 'Jammu and Kashmir',
        'varanasi': 'Uttar Pradesh',
        'kochi': 'Kerala',
        'pune': 'Maharashtra',
        'agra': 'Uttar Pradesh',
        'bhopal': 'Madhya Pradesh',
        'patna': 'Bihar',
    }
    return {**states, **union_territories, **cities}
188
 
189
# Function to extract locations from descriptions
@st.cache_data
def extract_locations_from_descriptions(df, description_column='desc'):
    """
    Extract Indian state names from a description column using spaCy NER
    plus direct substring matching.

    Returns a copy of *df* with a new 'extracted_location' column holding
    the first matched state name, or None when nothing matched.
    """
    with st.spinner("Extracting location data from articles..."):
        nlp = load_spacy_model()
        state_mapping = get_state_mapping()

        locations = []
        total = len(df)

        progress_bar = st.progress(0)

        for idx, description in enumerate(df[description_column]):
            # Update progress every 100 rows to keep UI overhead low;
            # guard against division by zero on an empty dataframe.
            if idx % 100 == 0 and total:
                progress_bar.progress(min(idx / total, 1.0))

            if pd.isna(description):
                locations.append(None)
                continue

            description = str(description).lower()
            doc = nlp(description)

            # NER pass: keep GPE/LOC entities that resolve to a known state/city.
            found_locations = []
            for ent in doc.ents:
                if ent.label_ in ["GPE", "LOC"]:
                    loc_name = ent.text.lower()
                    if loc_name in state_mapping:
                        found_locations.append(state_mapping[loc_name])

            # Substring pass catches names the NER model missed.
            # NOTE(review): plain substring matching can false-positive on
            # short names embedded in longer words (e.g. 'goa') — confirm.
            for state_var, standard_name in state_mapping.items():
                if state_var in description and standard_name not in found_locations:
                    found_locations.append(standard_name)

            # Store the first found location, or None if none found.
            locations.append(found_locations[0] if found_locations else None)

        progress_bar.progress(1.0)

        # Work on a copy to avoid mutating the caller's dataframe.
        df = df.copy()
        df['extracted_location'] = locations

        st.success(f"Locations extracted. Found locations in {df['extracted_location'].notna().sum()} of {len(df)} articles.")
        return df
 
248
 
249
# Function to analyze sentiment by state
def analyze_sentiment_by_state(df, sentiment_column='sentiment_score'):
    """
    Aggregate per-state average sentiment and article counts.

    Returns a DataFrame with columns [extracted_location, avg_sentiment,
    count], or None when no row has both a location and a sentiment value.
    """
    usable = df.dropna(subset=['extracted_location', sentiment_column])

    if usable.empty:
        st.warning("No locations found with valid sentiment values. Cannot create map.")
        return None

    # Named aggregation keeps the output columns explicit.
    grouped = usable.groupby('extracted_location')[sentiment_column]
    return grouped.agg(avg_sentiment='mean', count='count').reset_index()
268
 
269
# Function to create India sentiment map
def create_india_sentiment_map(sentiment_data, geojson_data, newspaper_name):
    """
    Create a choropleth map of India showing average sentiment by state.

    *sentiment_data* must carry 'extracted_location', 'avg_sentiment' and
    'count' columns (as produced by analyze_sentiment_by_state).
    """
    # Property name in the GeoJSON features that holds the state name.
    state_property = 'NAME_1'

    # Determine color scale range based on data.
    min_sentiment = sentiment_data['avg_sentiment'].min()
    max_sentiment = sentiment_data['avg_sentiment'].max()

    # Use a symmetrical range when sentiment spans negative to positive,
    # so zero sits at the middle of the diverging colormap.
    if min_sentiment < 0 and max_sentiment > 0:
        abs_max = max(abs(min_sentiment), abs(max_sentiment))
        color_range = [-abs_max, abs_max]
    else:
        # Add a small buffer to the range.
        color_range = [min_sentiment - 0.1, max_sentiment + 0.1]

    fig = px.choropleth_mapbox(
        sentiment_data,
        geojson=geojson_data,
        locations='extracted_location',
        featureidkey=f"properties.{state_property}",
        color='avg_sentiment',
        color_continuous_scale="RdBu",
        range_color=color_range,
        # NOTE(review): the map style/zoom/center arguments were lost in a
        # bad merge at this point; these are conventional values for an
        # all-India view — confirm against the original.
        mapbox_style="carto-positron",
        zoom=3.5,
        center={"lat": 22.5, "lon": 80.0},
        opacity=0.7,
        hover_data=['count'],
        labels={
            'avg_sentiment': 'Average Sentiment',
            'extracted_location': 'State',
            'count': 'Article Count'
        }
    )

    # Customize the layout.
    fig.update_layout(
        title=dict(
            text=f'{newspaper_name} - Sentiment Analysis by Indian States',
            font=dict(size=24, color='#2c3e50'),
            x=0.5,
            y=0.95
        ),
        height=800,
        margin={"r": 0, "t": 50, "l": 0, "b": 0}
    )

    # Explain the color scale below the map.
    fig.add_annotation(
        x=0.5, y=0.02,
        xref="paper", yref="paper",
        text="Color scale: Red (Negative) to Blue (Positive)",
        showarrow=False,
        font=dict(size=14)
    )

    return fig
332
 
333
# Function to plot sentiment trends by year (from original code)
def plot_sentiment_trends_by_year(df, newspaper_name):
    """Plot per-year percentages of negative/neutral/positive articles.

    Returns the matplotlib Figure. The caller's *df* is not modified.
    """
    # Set the style to a clean, modern look.
    plt.style.use('seaborn-v0_8-whitegrid')

    # Custom font settings.
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
    plt.rcParams['font.size'] = 11
    plt.rcParams['axes.titlesize'] = 16
    plt.rcParams['axes.labelsize'] = 12

    # Work on a copy so the caller's dataframe is not mutated
    # (the original version added 'year'/'sentiment' columns in place).
    df = df.copy()

    # Convert date to datetime and extract year.
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year

    # Ensure only known sentiments are used; anything else becomes neutral.
    valid_sentiments = {"positive", "negative", "neutral"}
    df['sentiment'] = df['sentiment_value'].apply(
        lambda x: x.lower() if isinstance(x, str) and x.lower() in valid_sentiments else "neutral"
    )

    # Count the number of articles per sentiment per year.
    sentiment_counts = df.groupby(['year', 'sentiment']).size().reset_index(name='count')

    # Calculate total articles per year.
    year_totals = sentiment_counts.groupby('year')['count'].sum().reset_index(name='total')

    # Merge the counts with totals to calculate percentages.
    sentiment_counts = sentiment_counts.merge(year_totals, on='year')
    sentiment_counts['percentage'] = sentiment_counts['count'] / sentiment_counts['total'] * 100

    # Pivot the data for easier plotting.
    sentiment_pivot = sentiment_counts.pivot(index='year', columns='sentiment', values='percentage').fillna(0)

    # Ensure all sentiment columns exist.
    for sentiment in ['negative', 'neutral', 'positive']:
        if sentiment not in sentiment_pivot.columns:
            sentiment_pivot[sentiment] = 0

    # Sort by year (ascending for timeline).
    sentiment_pivot = sentiment_pivot.sort_index()

    fig, ax = plt.subplots(figsize=(12, 7))

    # Define custom colors.
    colors = {
        'negative': '#5D3FD3',  # rich purple
        'neutral': '#9D4EDD',   # lavender purple
        'positive': '#00897B'   # teal green
    }

    # Plot one line per sentiment.
    # NOTE(review): two argument lines here were hidden by a diff-hunk gap;
    # the y-series and marker are reconstructed from the surrounding
    # marker* arguments — confirm against the original.
    for sentiment in ['negative', 'neutral', 'positive']:
        ax.plot(
            sentiment_pivot.index,
            sentiment_pivot[sentiment],
            marker='o',
            linewidth=2.5,
            label=sentiment.capitalize(),
            color=colors[sentiment],
            markersize=8,
            markeredgecolor='white',
            markeredgewidth=1.5
        )

    # Add article counts as annotations under the negative line.
    for year in sentiment_pivot.index:
        total = year_totals.loc[year_totals['year'] == year, 'total'].values[0]
        ax.annotate(
            f"{total:,}",
            xy=(year, sentiment_pivot.loc[year, 'negative'] - 5),
            xytext=(0, -25),
            textcoords='offset points',
            ha='center',
            fontsize=9,
            color='gray'
        )

    # Indicate what the annotated numbers represent.
    ax.text(
        sentiment_pivot.index[0],
        -12,
        "Article Count",
        fontsize=9,
        color='gray',
        ha='center'
    )

    # Set x-axis to only show years (integers).
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Set y-axis limits and labels.
    ax.set_ylim(0, max(100, sentiment_pivot.max().max() * 1.1))
    ax.set_ylabel('Percentage (%)', fontweight='bold')
    ax.set_xlabel('Year', fontweight='bold')

    ax.set_title(f'{newspaper_name} - Sentiment Trends by Year', fontweight='bold', pad=20)

    # Customize legend.
    legend = ax.legend(
        loc='upper right',
        frameon=True,
        framealpha=0.95,
        edgecolor='lightgray',
        title='Sentiment'
    )
    legend.get_title().set_fontweight('bold')

    # Remove spines for a cleaner look.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(0.5)
    ax.spines['bottom'].set_linewidth(0.5)

    # Add grid lines.
    ax.grid(axis='y', linestyle='--', alpha=0.3, color='gray')

    # Add subtle background color.
    fig.patch.set_facecolor('#F8F9FA')
    ax.set_facecolor('#F8F9FA')

    # Add percentage labels at the end of each line.
    last_year = sentiment_pivot.index[-1]
    for sentiment in ['negative', 'neutral', 'positive']:
        if last_year in sentiment_pivot.index:
            last_value = sentiment_pivot.loc[last_year, sentiment]
            ax.annotate(
                f"{last_value:.1f}%",
                xy=(last_year, last_value),
                xytext=(5, 0),
                textcoords='offset points',
                fontweight='bold',
                color=colors[sentiment]
            )

    # Add a data source footer.
    plt.figtext(
        0.01, 0.01,
        f"Data source: Analysis of {df.shape[0]:,} articles",
        fontsize=8,
        color='gray'
    )

    # Add horizontal line at 50% for reference.
    ax.axhline(y=50, color='gray', linestyle='-', alpha=0.2)
    ax.text(sentiment_pivot.index[0], 51, "50%", fontsize=8, color='gray')

    plt.tight_layout(pad=2.0)

    return fig
484
-
485
# Function to plot article volume by year (from original code)
def plot_article_volume_by_year(df, newspaper_name):
    """Plot the number of articles per year.

    Returns the matplotlib Figure. The caller's *df* is not modified.
    """
    # Set the style to a clean, modern look.
    plt.style.use('seaborn-v0_8-whitegrid')

    # Custom font settings.
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']

    # Work on a copy so the caller's dataframe is not mutated
    # (the original version added a 'year' column in place).
    df = df.copy()
    df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year

    # Count articles per year.
    article_counts = df.groupby('year').size().reset_index(name='count')

    fig, ax = plt.subplots(figsize=(12, 5))

    # Line for the per-year article count.
    ax.plot(
        article_counts['year'],
        article_counts['count'],
        marker='o',
        linewidth=2.5,
        color='#3949AB',
        markersize=8,
        markeredgecolor='white',
        markeredgewidth=1.5
    )

    # Fill area under the line.
    ax.fill_between(
        article_counts['year'],
        article_counts['count'],
        alpha=0.2,
        color='#3949AB'
    )

    # Set x-axis to only show years (integers).
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Add count labels above each point.
    for year, count in zip(article_counts['year'], article_counts['count']):
        ax.annotate(
            f"{count:,}",
            xy=(year, count),
            xytext=(0, 10),
            textcoords='offset points',
            ha='center',
            fontweight='bold',
            fontsize=10
        )

    ax.set_ylabel('Number of Articles', fontweight='bold')
    ax.set_xlabel('Year', fontweight='bold')
    ax.set_title(f'{newspaper_name} - Article Volume by Year', fontweight='bold', pad=20)

    # Remove spines for a cleaner look.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.grid(axis='y', linestyle='--', alpha=0.3, color='gray')

    # Add subtle background color.
    fig.patch.set_facecolor('#F8F9FA')
    ax.set_facecolor('#F8F9FA')

    plt.tight_layout()
    return fig
560
 
561
# Function to create a comparison bar chart of newspapers
def create_newspaper_comparison(dataframes, newspaper_names):
    """
    Build a grouped bar chart of positive/neutral/negative article counts
    for each newspaper. Input dataframes are not modified.
    """
    comparison_data = []

    for i, df in enumerate(dataframes):
        if df is None:
            continue

        # Normalize sentiment labels into a local series instead of
        # writing a 'sentiment' column into the caller's frame.
        if 'sentiment_value' in df.columns:
            sentiments = df['sentiment_value'].apply(
                lambda x: x.lower() if isinstance(x, str) and x.lower() in ["positive", "negative", "neutral"] else "neutral"
            )
        else:
            sentiments = df['sentiment']

        sentiment_counts = sentiments.value_counts().to_dict()

        for sentiment in ['positive', 'negative', 'neutral']:
            comparison_data.append({
                'Newspaper': newspaper_names[i],
                'Sentiment': sentiment.capitalize(),
                'Count': sentiment_counts.get(sentiment, 0)
            })

    comparison_df = pd.DataFrame(comparison_data)

    # Grouped bar chart, one color per sentiment.
    fig = px.bar(
        comparison_df,
        x='Newspaper',
        y='Count',
        color='Sentiment',
        barmode='group',
        title='Sentiment Distribution Across Newspapers',
        color_discrete_map={
            'Positive': '#00897B',
            'Neutral': '#9D4EDD',
            'Negative': '#5D3FD3'
        }
    )

    fig.update_layout(
        height=500,
        legend_title='Sentiment',
        xaxis_title='',
        yaxis_title='Number of Articles'
    )

    return fig
611
 
612
# Function to create a top locations bar chart
def create_top_locations_chart(df, newspaper_name):
    """Create a horizontal bar chart of the top 15 mentioned locations."""
    have_data = 'extracted_location' in df.columns and not df['extracted_location'].isna().all()
    if not have_data:
        # Placeholder figure when nothing can be charted.
        fig = go.Figure()
        fig.add_annotation(
            text="No location data available",
            showarrow=False,
            font=dict(size=20)
        )
        fig.update_layout(height=400)
        return fig

    # Count articles per location and keep the 15 most frequent.
    location_counts = df['extracted_location'].value_counts().reset_index()
    location_counts.columns = ['Location', 'Article Count']
    top_locations = location_counts.head(15)

    fig = px.bar(
        top_locations,
        y='Location',
        x='Article Count',
        title=f'Top 15 Locations Mentioned in {newspaper_name} Articles',
        orientation='h',
        color='Article Count',
        color_continuous_scale='Viridis'
    )
    fig.update_layout(
        height=500,
        yaxis={'categoryorder': 'total ascending'}
    )
    return fig
650
 
651
def create_top_politicians_chart(df, newspaper_name):
    """Create a horizontal bar chart of the top mentioned politicians.

    Expects *df* to carry 'Politician' and 'Mentions' columns.
    """
    if 'Politician' not in df.columns or df['Politician'].isna().all():
        # Placeholder figure when nothing can be charted.
        # (A leftover debug print(df.head()) was removed here.)
        fig = go.Figure()
        fig.add_annotation(
            text="No politician data available",
            showarrow=False,
            font=dict(size=20)
        )
        fig.update_layout(height=400)
        return fig

    fig = px.bar(
        df,
        y='Politician',
        x='Mentions',
        title=f'Top 10 Politicians Mentioned in {newspaper_name} Articles',
        orientation='h',
        color='Mentions',
        color_continuous_scale='Viridis'
    )

    fig.update_layout(
        height=500,
        yaxis={'categoryorder': 'total ascending'}
    )

    return fig
686
-
687
# Function to load data
@st.cache_data
def load_data(newspaper_name):
    """Load one newspaper's article CSV and ensure sentiment columns exist.

    Returns a DataFrame, or None when the file is missing, unreadable, or
    lacks the required columns.
    """
    try:
        # CSV path follows the data/<name>_articles.csv convention.
        file_path = f"data/{newspaper_name.lower().replace(' ', '_')}_articles.csv"
        df = pd.read_csv(file_path)

        # Check if required columns exist.
        for col in ['date', 'sentiment_value']:
            if col not in df.columns:
                st.error(f"Required column '{col}' not found in {file_path}")
                return None

        # Derive a numeric sentiment score from the label when absent.
        if 'sentiment_score' not in df.columns:
            sentiment_map = {
                'positive': 1.0,
                'negative': -1.0,
                'neutral': 0.0
            }
            df['sentiment_score'] = df['sentiment_value'].str.lower().map(sentiment_map).fillna(0)

        return df

    except Exception as e:
        st.error(f"Error loading data for {newspaper_name}: {str(e)}")
        return None
717
-
718
# Entry page
def show_entry_page():
    """Render the landing screen with a single 'Explore Analysis' button."""
    st.markdown('<div class="entry-page">', unsafe_allow_html=True)
    st.markdown('<div class="entry-container">', unsafe_allow_html=True)

    st.markdown('<h1 class="main-header">Sentiment Analysis of RSS Articles</h1>', unsafe_allow_html=True)
    st.markdown("""
    <p style="font-size: 1.2rem; margin-bottom: 2rem;">
    Analyze the sentiments of news articles from various RSS feeds across different newspapers.
    Discover trends, patterns, and insights through interactive visualizations.
    </p>
    """, unsafe_allow_html=True)

    st.markdown('<div class="button-container">', unsafe_allow_html=True)
    # Clicking flips the session flag; Streamlit's rerun then shows the dashboard.
    if st.button("Explore Analysis", key="entry_explore", use_container_width=True):
        st.session_state.show_entry = False
    st.markdown('</div>', unsafe_allow_html=True)

    st.markdown('</div>', unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)
738
-
739
# Home page with newspaper cards
def show_home_page():
    """Render the dashboard home: comparison chart plus one card per newspaper."""
    st.markdown('<h1 class="main-header">RSS Articles Sentiment Analysis Dashboard</h1>', unsafe_allow_html=True)

    # List of newspapers.
    newspapers = ["Print", "Scroll", "Sentinel", "NDTV"]

    # Load data for all newspapers (entries may be None on failure).
    dataframes = []
    for newspaper in newspapers:
        df = load_data(newspaper)
        dataframes.append(df)

    # Show comparison chart of all newspapers.
    st.markdown('<h2 class="sub-header">Newspaper Sentiment Comparison</h2>', unsafe_allow_html=True)
    comparison_fig = create_newspaper_comparison(dataframes, newspapers)
    st.plotly_chart(comparison_fig, use_container_width=True)

    # Create a 2x2 grid for newspaper cards.
    col1, col2 = st.columns(2)
    col3, col4 = st.columns(2)
    cols = [col1, col2, col3, col4]

    # Create a card for each newspaper.
    for i, newspaper in enumerate(newspapers):
        df = dataframes[i]
        with cols[i]:
            st.markdown('<div class="newspaper-card">', unsafe_allow_html=True)
            st.markdown(f'<div class="newspaper-title">{newspaper}</div>', unsafe_allow_html=True)

            # Only show counts if data is available.
            if df is not None:
                if 'sentiment_value' in df.columns:
                    sentiment_counts = df['sentiment_value'].str.lower().value_counts()

                    # Three side-by-side metrics, one per sentiment.
                    pos_col, neu_col, neg_col = st.columns(3)
                    with pos_col:
                        st.metric("Positive", sentiment_counts.get('positive', 0))
                    with neu_col:
                        st.metric("Neutral", sentiment_counts.get('neutral', 0))
                    with neg_col:
                        st.metric("Negative", sentiment_counts.get('negative', 0))
                else:
                    st.write("Sentiment data not available")
            else:
                st.write("Data not available")

            # Button routes to the per-newspaper analysis page.
            if st.button("View Analysis", key=f"view_{newspaper}"):
                st.session_state.current_newspaper = newspaper
                st.session_state.show_newspaper_analysis = True
                st.rerun()

            st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
 
796
# Function to process all newspapers with location extraction
@st.cache_data
def preprocess_newspapers_with_locations(newspapers):
    """Load every newspaper, extract locations, and aggregate state sentiment.

    Returns a dict keyed by newspaper name; each value carries the enriched
    dataframe plus per-state sentiment and the shared GeoJSON, or an 'error'
    message when processing was not possible.
    """
    # Load GeoJSON for the India map (shared by every newspaper entry).
    india_geojson = load_india_geojson()
    if india_geojson is None:
        st.error("Could not load India GeoJSON. Please check your internet connection.")
        return {}

    processed_data = {}

    for newspaper in newspapers:
        # Load the raw data.
        df = load_data(newspaper)

        if df is None:
            processed_data[newspaper] = {
                'error': f"Could not load data for {newspaper}"
            }
            continue

        if 'desc' not in df.columns:
            processed_data[newspaper] = {
                'df': df,
                'error': "Description column 'desc' not found"
            }
            continue

        # Extract locations if not already done.
        if 'extracted_location' not in df.columns:
            df = extract_locations_from_descriptions(df, 'desc')

        processed_data[newspaper] = {
            'df': df,
            'sentiment_by_state': analyze_sentiment_by_state(df),
            'india_geojson': india_geojson
        }

    return processed_data
836
-
837
- # Newspaper analysis page
838
- # Newspaper analysis page
839
- def show_newspaper_analysis():
840
- # Add back button
841
- if st.button("← Back to Home"):
842
- st.session_state.show_newspaper_analysis = False
843
- st.rerun()
844
-
845
- newspaper = st.session_state.current_newspaper
846
- st.markdown(f'<h1 class="main-header">{newspaper} - Sentiment Analysis</h1>', unsafe_allow_html=True)
847
 
848
- # Load data for this newspaper
849
- df = load_data(newspaper)
850
 
851
- if df is not None:
852
- # Get or preprocess location data
853
- if 'processed_data' not in st.session_state:
854
- with st.spinner("Processing newspaper data..."):
855
- st.session_state.processed_data = preprocess_newspapers_with_locations(["Print", "Scroll", "Sentinel", "NDTV"])
856
-
857
- processed_data = st.session_state.processed_data.get(newspaper, {})
858
-
859
- # Display article count and date range
860
- article_count = len(df)
861
-
862
- # Convert date column to datetime to get min and max dates
863
- df['date'] = pd.to_datetime(df['date'], errors='coerce')
864
- min_date = df['date'].min().strftime('%d %b, %Y') if not pd.isna(df['date'].min()) else "Unknown"
865
- max_date = df['date'].max().strftime('%d %b, %Y') if not pd.isna(df['date'].max()) else "Unknown"
866
-
867
- # Create metrics row
868
- col1, col2, col3 = st.columns(3)
869
- with col1:
870
- st.metric("Total Articles", f"{article_count:,}")
871
- with col2:
872
- st.metric("First Article", min_date)
873
- with col3:
874
- st.metric("Latest Article", max_date)
875
 
876
- # Show sentiment trends by year
877
- st.markdown('<h2 class="sub-header">Sentiment Trends Over Time</h2>', unsafe_allow_html=True)
878
- try:
879
- sentiment_trend_fig = plot_sentiment_trends_by_year(df, newspaper)
880
- st.pyplot(sentiment_trend_fig)
881
- except Exception as e:
882
- st.error(f"Error generating sentiment trends chart: {str(e)}")
883
-
884
- # Show article volume by year
885
- st.markdown('<h2 class="sub-header">Article Volume by Year</h2>', unsafe_allow_html=True)
886
- try:
887
- volume_fig = plot_article_volume_by_year(df, newspaper)
888
- st.pyplot(volume_fig)
889
- except Exception as e:
890
- st.error(f"Error generating article volume chart: {str(e)}")
891
-
892
- # Create two columns for location analysis
893
- col1, col2 = st.columns(2)
894
-
895
- with col1:
896
- # Top mentioned locations
897
- st.markdown('<h2 class="sub-header">Top Mentioned Locations</h2>', unsafe_allow_html=True)
898
 
899
- if 'extracted_location' in df.columns:
900
- top_locations_fig = create_top_locations_chart(df, newspaper)
901
- st.plotly_chart(top_locations_fig, use_container_width=True)
902
- else:
903
- if 'desc' in df.columns:
904
- st.info("Location data not yet extracted. Click the button below to extract locations.")
905
- if st.button("Extract Locations", key=f"extract_{newspaper}"):
906
- with st.spinner("Extracting locations..."):
907
- df = extract_locations_from_descriptions(df)
908
- # Update the processed data
909
- processed_data['df'] = df
910
- sentiment_by_state = analyze_sentiment_by_state(df)
911
- processed_data['sentiment_by_state'] = sentiment_by_state
912
- st.session_state.processed_data[newspaper] = processed_data
913
- st.experimental_rerun()
914
- else:
915
- st.warning("Description column not found. Cannot extract locations.")
916
 
917
- # Top mentioned politicians - Now placed below the locations graph in the same column
918
- st.markdown('<h2 class="sub-header">Top Mentioned Politicians</h2>', unsafe_allow_html=True)
 
 
919
 
920
- if 'desc' in df.columns:
921
- # Check if rss_personalities is defined, if not you'll need to define it
922
- if 'rss_personalities' not in locals() and 'rss_personalities' not in globals():
923
- # Define your list of politicians here or import it
924
- rss_personalities = ["Narendra Modi", "Amit Shah", "Rajnath Singh", "Mohan Bhagwat", "Yogi Adityanath", "Nitin Gadkari"]
925
-
926
- top_politicians = pussy.count_politicians_in_descriptions(df, rss_personalities).head(10)
927
- top_politicians_fig = create_top_politicians_chart(top_politicians, newspaper)
928
- st.plotly_chart(top_politicians_fig, use_container_width=True)
929
- else:
930
- st.warning("Description column not found. Cannot analyze politicians.")
931
-
932
- with col2:
933
- # Sentiment by state map
934
- st.markdown('<h2 class="sub-header">Sentiment by State</h2>', unsafe_allow_html=True)
935
 
936
- sentiment_by_state = processed_data.get('sentiment_by_state')
937
- india_geojson = processed_data.get('india_geojson')
938
 
939
- if sentiment_by_state is not None and india_geojson is not None and not sentiment_by_state.empty:
940
- try:
941
- map_fig = create_india_sentiment_map(sentiment_by_state, india_geojson, newspaper)
942
- st.plotly_chart(map_fig, use_container_width=True)
943
- except Exception as e:
944
- st.error(f"Error creating sentiment map: {str(e)}")
945
- else:
946
- if 'error' in processed_data:
947
- st.warning(processed_data['error'])
948
  else:
949
- st.info("Sentiment data not available. Extract locations first.")
950
-
951
- # Add section for detailed article analysis
952
- st.markdown('<h2 class="sub-header">Article Analysis</h2>', unsafe_allow_html=True)
953
-
954
- # Add filters for article display
955
- col1, col2, col3 = st.columns(3)
956
-
957
- with col1:
958
- # Sentiment filter
959
- sentiment_options = ["All"] + sorted(df['sentiment_value'].unique().tolist())
960
- selected_sentiment = st.selectbox("Filter by Sentiment", sentiment_options)
961
-
962
- with col2:
963
- # Year filter
964
- year_options = ["All"] + sorted(df['date'].dt.year.dropna().unique().astype(int).tolist())
965
- selected_year = st.selectbox("Filter by Year", year_options)
966
-
967
- with col3:
968
- # Location filter (if available)
969
- location_options = ["All"]
970
- if 'extracted_location' in df.columns:
971
- location_options += sorted(df['extracted_location'].dropna().unique().tolist())
972
- selected_location = st.selectbox("Filter by Location", location_options)
973
-
974
- # Apply filters
975
- filtered_df = df.copy()
976
-
977
- if selected_sentiment != "All":
978
- filtered_df = filtered_df[filtered_df['sentiment_value'] == selected_sentiment]
979
-
980
- if selected_year != "All":
981
- filtered_df = filtered_df[filtered_df['date'].dt.year == selected_year]
982
-
983
- if selected_location != "All" and 'extracted_location' in filtered_df.columns:
984
- filtered_df = filtered_df[filtered_df['extracted_location'] == selected_location]
985
-
986
- # Show article count after filtering
987
- st.write(f"Displaying {len(filtered_df)} articles based on your filters.")
988
-
989
- # Display articles in an expandable format
990
- if not filtered_df.empty:
991
- for index, row in filtered_df.head(50).iterrows():
992
- title = row.get('title', 'Untitled')
993
- date = row['date'].strftime('%d %b, %Y') if pd.notna(row['date']) else 'Unknown date'
994
- sentiment = row.get('sentiment_value', 'Unknown sentiment')
995
- description = row.get('desc', 'No description available')
996
- link = row.get('link', 'No link available')
997
 
998
- # Format sentiment with color
999
- sentiment_color = {
1000
- 'positive': 'green',
1001
- 'neutral': 'gray',
1002
- 'negative': 'red'
1003
- }.get(sentiment.lower(), 'gray')
 
 
 
1004
 
1005
- # Create expandable card for each article
1006
- with st.expander(f"{title} - {date}"):
1007
- st.markdown(f"**Sentiment:** <span style='color:{sentiment_color}'>{sentiment.capitalize()}</span>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
 
1009
- if 'extracted_location' in row and pd.notna(row['extracted_location']):
1010
- st.markdown(f"**Location:** {row['extracted_location']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
 
1012
- st.markdown("**Description:**")
1013
- st.markdown(f"{description}")
1014
- st.markdown(f"**Link:** {link}")
 
 
 
 
 
 
 
 
 
 
1015
 
1016
- if len(filtered_df) > 50:
1017
- st.info(f"Showing 50 out of {len(filtered_df)} articles. Apply more filters to narrow down results.")
1018
- else:
1019
- st.info("No articles match your selected filters.")
1020
- else:
1021
- st.error(f"Could not load data for {newspaper}. Please check if the data file exists.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1022
 
1023
- # Main function
1024
  def main():
1025
- # Load CSS
1026
  load_css()
 
 
1027
 
1028
- # Initialize session state variables if not exists
1029
- if 'show_entry' not in st.session_state:
1030
- st.session_state.show_entry = True
1031
-
1032
- if 'show_newspaper_analysis' not in st.session_state:
1033
- st.session_state.show_newspaper_analysis = False
1034
-
1035
- if 'current_newspaper' not in st.session_state:
1036
- st.session_state.current_newspaper = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
 
1038
- # Display appropriate page based on session state
1039
- if st.session_state.show_entry:
1040
- show_entry_page()
1041
- elif st.session_state.show_newspaper_analysis:
1042
- show_newspaper_analysis()
1043
- else:
1044
  show_home_page()
1045
-
1046
- # Footer
1047
- st.markdown('<div class="footer">', unsafe_allow_html=True)
1048
- st.markdown('RSS Sentiment Analysis Dashboard - Developed with Streamlit', unsafe_allow_html=True)
1049
- st.markdown('</div>', unsafe_allow_html=True)
 
1050
 
1051
  if __name__ == "__main__":
1052
- main()
 
1
+ """
2
+ Unified News Scraper & Sentiment Analysis Application
3
+ Combines scraping, processing, and visualization in one interface
4
+ """
5
+
6
  import streamlit as st
7
  import pandas as pd
8
  import matplotlib.pyplot as plt
 
9
  import plotly.express as px
10
  import plotly.graph_objects as go
11
  from matplotlib.ticker import MaxNLocator
12
+ import subprocess
13
+ import sys
14
  import os
15
+ from pathlib import Path
16
  import time
17
+ from datetime import datetime
18
+ import warnings
19
  import json
20
  import requests
21
  import spacy
 
 
 
 
22
 
 
23
  warnings.filterwarnings('ignore')
 
 
24
 
25
+ # Constants
26
+ INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
27
+
28
+ # Page config
29
  st.set_page_config(
30
+ page_title="News Scraper & Analysis Platform",
31
  page_icon="πŸ“°",
32
  layout="wide",
33
  initial_sidebar_state="expanded"
34
  )
35
 
36
+ # Custom CSS
37
def load_css():
    """Inject the app's global CSS (headers, cards, status boxes, tab styling).

    Must be called once per rerun before any styled markup is rendered;
    relies on unsafe_allow_html to pass the <style> block through.
    """
    st.markdown("""
    <style>
    .main-header {
        font-size: 2.8rem !important;
        font-weight: 700 !important;
        text-align: center !important;
        padding: 1.5rem 0 !important;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .sub-header {
        font-size: 1.8rem !important;
        font-weight: 600 !important;
        padding: 1rem 0 !important;
        color: #2c3e50;
    }
    .feature-card {
        background: white;
        border-radius: 15px;
        padding: 25px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        margin: 10px 0;
        transition: transform 0.3s;
    }
    .feature-card:hover {
        transform: translateY(-5px);
        box-shadow: 0 8px 12px rgba(0, 0, 0, 0.15);
    }
    .metric-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 10px;
        padding: 20px;
        color: white;
        text-align: center;
    }
    .status-running {
        background-color: #fff3cd;
        border-left: 4px solid #ffc107;
        padding: 15px;
        border-radius: 5px;
    }
    .status-success {
        background-color: #d4edda;
        border-left: 4px solid #28a745;
        padding: 15px;
        border-radius: 5px;
    }
    .status-error {
        background-color: #f8d7da;
        border-left: 4px solid #dc3545;
        padding: 15px;
        border-radius: 5px;
    }
    .stTabs [data-baseweb="tab-list"] {
        gap: 24px;
    }
    .stTabs [data-baseweb="tab"] {
        padding: 10px 20px;
        background-color: #f8f9fa;
        border-radius: 8px 8px 0 0;
    }
    </style>
    """, unsafe_allow_html=True)
102
 
103
+ # Initialize session state
104
def init_session_state():
    """Seed st.session_state with the keys the app expects, if absent.

    Safe to call on every rerun: existing values are never overwritten.
    """
    for key, default in (
        ('scraped_data', {}),
        ('scraping_active', False),
        ('processing_status', {}),
        ('selected_dataset', None),
    ):
        if key not in st.session_state:
            st.session_state[key] = default
114
+
115
+ # Setup directories
116
def setup_directories():
    """Create the working directories the app writes to (no error if present)."""
    for folder in ('output', 'data', 'temp'):
        Path(folder).mkdir(exist_ok=True)
119
 
120
+ # Load India GeoJSON
121
@st.cache_data
def load_india_geojson():
    """Fetch India state boundaries as GeoJSON (cached by Streamlit).

    Returns the parsed GeoJSON dict, or None when the download or parse
    fails (a warning is shown in the UI instead of raising).
    """
    try:
        response = requests.get(INDIA_GEOJSON_URL, timeout=10)
        # Fail fast on HTTP 4xx/5xx instead of JSON-decoding an error page.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        st.warning(f"Could not load India map: {e}")
        return None
 
 
 
 
 
 
 
 
130
 
131
+ # Load spaCy model
132
@st.cache_resource
def load_spacy_model():
    """Return the small English spaCy pipeline, downloading it on first use.

    Cached as a resource so the model is loaded once per process.
    Raises subprocess.CalledProcessError if the download itself fails.
    """
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        # Model not installed: fetch it with the *running* interpreter.
        # A bare "python" could resolve to a different environment on PATH,
        # and check_call (unlike call) surfaces a failed download.
        st.info("Downloading spaCy model...")
        subprocess.check_call(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
        )
        return spacy.load("en_core_web_sm")
141
 
142
+ # State mapping
143
def get_state_mapping():
    """Map lowercase spellings of Indian states, UTs, aliases and major
    cities to their canonical state/UT name.

    Insertion order matters to callers that take the *first* substring
    match, so the original ordering (states, then aliases, then cities)
    is preserved exactly.
    """
    states = [
        'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
        'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
        'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra',
        'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
        'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
        'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
    ]
    pairs = [(name.lower(), name) for name in states]
    # Union territories and alternate spellings.
    pairs += [
        ('delhi', 'Delhi'), ('new delhi', 'Delhi'),
        ('jammu and kashmir', 'Jammu and Kashmir'),
        ('j&k', 'Jammu and Kashmir'),
        ('ladakh', 'Ladakh'), ('chandigarh', 'Chandigarh'),
        ('puducherry', 'Puducherry'),
    ]
    # Major cities resolved to their state.
    pairs += [
        ('mumbai', 'Maharashtra'), ('kolkata', 'West Bengal'),
        ('chennai', 'Tamil Nadu'), ('bangalore', 'Karnataka'),
        ('bengaluru', 'Karnataka'), ('hyderabad', 'Telangana'),
        ('ahmedabad', 'Gujarat'), ('pune', 'Maharashtra'),
        ('jaipur', 'Rajasthan'),
    ]
    return dict(pairs)
162
 
163
+ # Extract locations from text
164
@st.cache_data
def extract_locations_from_descriptions(df, description_column='desc'):
    """Extract state names from description using spaCy.

    Returns a copy of *df* with an 'extracted_location' column holding the
    first matched canonical state name per row (None when nothing matched).
    NOTE(review): st.cache_data means the progress bar only renders on a
    cache miss — confirm that is acceptable UX.
    """
    nlp = load_spacy_model()
    state_mapping = get_state_mapping()

    locations = []
    progress_bar = st.progress(0)

    for idx, (_, row) in enumerate(df.iterrows()):
        # Update the bar only every 100 rows to keep UI overhead low.
        if idx % 100 == 0:
            progress_bar.progress(min(idx / len(df), 1.0))

        # Missing description -> no location for this row.
        if pd.isna(row.get(description_column, None)):
            locations.append(None)
            continue

        description = str(row[description_column]).lower()
        doc = nlp(description)

        found_locations = []
        # Pass 1: spaCy named entities (geo-political / location labels)
        # that map to a known state name.
        for ent in doc.ents:
            if ent.label_ in ["GPE", "LOC"]:
                loc_name = ent.text.lower()
                if loc_name in state_mapping:
                    found_locations.append(state_mapping[loc_name])

        # Pass 2: plain substring scan over every known spelling; the
        # dedup guard only applies here, so mapping order decides ties.
        for state_var, standard_name in state_mapping.items():
            if state_var in description and standard_name not in found_locations:
                found_locations.append(standard_name)

        # Keep only the first match per article.
        locations.append(found_locations[0] if found_locations else None)

    progress_bar.progress(1.0)
    df = df.copy()  # do not mutate the caller's (cached) frame
    df['extracted_location'] = locations

    return df
202
 
203
+ # Analyze sentiment by state
204
def analyze_sentiment_by_state(df, sentiment_column='sentiment_score'):
    """Aggregate mean sentiment and article count per extracted state.

    Returns a DataFrame with columns
    ['extracted_location', 'avg_sentiment', 'count'], or None when no row
    carries both a location and a sentiment value.
    """
    usable = df.dropna(subset=['extracted_location', sentiment_column])

    if usable.empty:
        return None

    return (
        usable.groupby('extracted_location')[sentiment_column]
        .agg(avg_sentiment='mean', count='count')
        .reset_index()
    )
217
 
218
+ # Create India sentiment map
219
+ def create_india_sentiment_map(sentiment_data, geojson_data, title):
220
+ """Create choropleth map of India showing sentiment by state"""
221
+ if sentiment_data is None or geojson_data is None:
222
+ return None
 
 
223
 
 
224
  min_sentiment = sentiment_data['avg_sentiment'].min()
225
  max_sentiment = sentiment_data['avg_sentiment'].max()
226
 
 
227
  if min_sentiment < 0 and max_sentiment > 0:
228
  abs_max = max(abs(min_sentiment), abs(max_sentiment))
229
  color_range = [-abs_max, abs_max]
230
  else:
 
231
  color_range = [min_sentiment - 0.1, max_sentiment + 0.1]
232
 
 
233
  fig = px.choropleth_mapbox(
234
  sentiment_data,
235
  geojson=geojson_data,
236
  locations='extracted_location',
237
+ featureidkey="properties.NAME_1",
238
  color='avg_sentiment',
239
  color_continuous_scale="RdBu",
240
  range_color=color_range,
 
244
  opacity=0.7,
245
  hover_data=['count'],
246
  labels={
247
+ 'avg_sentiment': 'Avg Sentiment',
248
  'extracted_location': 'State',
249
+ 'count': 'Articles'
250
  }
251
  )
252
 
 
253
  fig.update_layout(
254
+ title=dict(text=title, font=dict(size=20), x=0.5),
255
+ height=600,
 
 
 
 
 
256
  margin={"r":0,"t":50,"l":0,"b":0}
257
  )
258
 
259
+ return fig
260
+
261
+ # Top locations chart
262
def create_top_locations_chart(df, title):
    """Horizontal bar chart of the 15 most-mentioned locations.

    Returns a plotly Figure, or None when no extracted_location data exists.
    """
    if 'extracted_location' not in df.columns or df['extracted_location'].isna().all():
        return None

    counts = (
        df['extracted_location']
        .value_counts()
        .head(15)
        .rename_axis('Location')
        .reset_index(name='Count')
    )

    fig = px.bar(
        counts,
        y='Location',
        x='Count',
        title=title,
        orientation='h',
        color='Count',
        color_continuous_scale='Viridis'
    )

    fig.update_layout(height=500, yaxis={'categoryorder': 'total ascending'})
    return fig
282
 
283
+ # Discover datasets
284
@st.cache_data
def discover_datasets():
    """Scan the data/ and output/ folders for CSV files.

    Returns {display name: path string}; the display name is the file stem
    with '_articles' stripped, underscores spaced, and title-cased.
    """
    found = {}
    for folder in (Path('data'), Path('output')):
        if not folder.exists():
            continue
        for csv_path in folder.glob('*.csv'):
            label = csv_path.stem.replace('_articles', '').replace('_', ' ').title()
            found[label] = str(csv_path)
    return found
293
+
294
+ # Load data
295
@st.cache_data
def load_data(file_path):
    """Read a scraped-articles CSV and normalise its columns.

    Adds/standardises 'date' (parsed from the first *date*-named column),
    'sentiment_value' (first *sentiment*-named column) and a numeric
    'sentiment_score' derived from the label. Returns the DataFrame, or
    None on any failure (error is shown in the UI).
    """
    try:
        df = pd.read_csv(file_path)

        # First column whose name contains 'date' becomes the parsed date.
        date_col = next((c for c in df.columns if 'date' in c.lower()), None)
        if date_col is not None:
            df['date'] = pd.to_datetime(df[date_col], errors='coerce')

        # First column whose name contains 'sentiment' becomes the label.
        sentiment_col = next((c for c in df.columns if 'sentiment' in c.lower()), None)
        if sentiment_col is not None and 'sentiment_value' not in df.columns:
            df['sentiment_value'] = df[sentiment_col]

        # Derive a numeric score from the label when not already present.
        if 'sentiment_score' not in df.columns and 'sentiment_value' in df.columns:
            sentiment_map = {'positive': 1.0, 'negative': -1.0, 'neutral': 0.0}
            df['sentiment_score'] = (
                df['sentiment_value'].str.lower().map(sentiment_map).fillna(0)
            )

        return df
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return None
317
 
318
+ # Run scraper
319
def run_scraper_async(source, topic, workers, interval):
    """Launch main.py as a background scraper process.

    Returns the subprocess.Popen handle (line-buffered text stdout), or
    None if the process could not be started — in which case the error is
    reported in the UI instead of being silently swallowed.
    """
    cmd = [
        sys.executable, "main.py",
        "--source", source,
        "--topic", topic,
        "--workers", str(workers),
        "--interval", str(interval)
    ]

    try:
        return subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            # Merge stderr into stdout: the caller only drains stdout, so a
            # separate stderr pipe could fill up and deadlock the child.
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )
    except Exception as e:
        st.error(f"Failed to start scraper: {e}")
        return None
339
 
340
+ # Plotting functions
341
+ def plot_sentiment_trends(df, title):
342
+ if 'date' not in df.columns or 'sentiment_value' not in df.columns:
343
+ return None
344
+
345
+ plt.style.use('seaborn-v0_8-whitegrid')
346
+ df['year'] = df['date'].dt.year
347
+
348
  valid_sentiments = {"positive", "negative", "neutral"}
349
+ df['sentiment'] = df['sentiment_value'].apply(
350
+ lambda x: x.lower() if isinstance(x, str) and x.lower() in valid_sentiments else "neutral"
351
+ )
352
+
353
  sentiment_counts = df.groupby(['year', 'sentiment']).size().reset_index(name='count')
 
 
354
  year_totals = sentiment_counts.groupby('year')['count'].sum().reset_index(name='total')
 
 
355
  sentiment_counts = sentiment_counts.merge(year_totals, on='year')
356
  sentiment_counts['percentage'] = sentiment_counts['count'] / sentiment_counts['total'] * 100
357
+
358
+ sentiment_pivot = sentiment_counts.pivot(
359
+ index='year', columns='sentiment', values='percentage'
360
+ ).fillna(0)
361
+
362
  for sentiment in ['negative', 'neutral', 'positive']:
363
  if sentiment not in sentiment_pivot.columns:
364
  sentiment_pivot[sentiment] = 0
365
+
366
+ fig, ax = plt.subplots(figsize=(12, 6))
367
+
 
 
 
 
 
368
  colors = {
369
+ 'negative': '#e74c3c',
370
+ 'neutral': '#95a5a6',
371
+ 'positive': '#2ecc71'
372
  }
373
+
 
374
  for sentiment in ['negative', 'neutral', 'positive']:
375
  ax.plot(
376
  sentiment_pivot.index,
 
379
  linewidth=2.5,
380
  label=sentiment.capitalize(),
381
  color=colors[sentiment],
382
+ markersize=7
 
 
383
  )
384
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  ax.set_ylabel('Percentage (%)', fontweight='bold')
386
  ax.set_xlabel('Year', fontweight='bold')
387
+ ax.set_title(title, fontweight='bold', pad=15)
388
+ ax.legend(loc='best', frameon=True)
389
+ ax.grid(axis='y', linestyle='--', alpha=0.3)
390
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  plt.tight_layout()
 
392
  return fig
393
 
394
def create_sentiment_pie(df, title):
    """Pie chart of the sentiment-label distribution.

    Returns a plotly Figure, or None when there is no sentiment column.
    """
    if 'sentiment_value' not in df.columns:
        return None

    counts = df['sentiment_value'].str.lower().value_counts()
    labels = [label.title() for label in counts.index]

    fig = px.pie(
        values=counts.values,
        names=labels,
        title=title,
        color_discrete_map={
            'Positive': '#2ecc71',
            'Neutral': '#95a5a6',
            'Negative': '#e74c3c'
        }
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    return fig
412
 
413
+ # MAIN APP PAGES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
def show_home_page():
    """Render the landing page: banner, three feature cards, dataset stats."""
    st.markdown('<h1 class="main-header">πŸ“° News Scraper & Analysis Platform</h1>',
                unsafe_allow_html=True)

    st.markdown("""
    <div style="text-align: center; padding: 20px; background-color: #f8f9fa;
                border-radius: 10px; margin: 20px 0;">
        <h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
        <p>Scrape articles from major Indian news sources and analyze sentiment trends</p>
    </div>
    """, unsafe_allow_html=True)

    # Feature cards: one column per (heading, blurb) pair.
    cards = [
        ("### πŸ” Scrape", "Collect articles from TOI, NDTV, WION, and Scroll.in"),
        ("### πŸ“Š Analyze", "Automatic sentiment classification and trend analysis"),
        ("### πŸ“ˆ Visualize", "Interactive charts and geographic sentiment mapping"),
    ]
    for column, (heading, blurb) in zip(st.columns(3), cards):
        with column:
            st.markdown('<div class="feature-card">', unsafe_allow_html=True)
            st.markdown(heading)
            st.write(blurb)
            st.markdown('</div>', unsafe_allow_html=True)

    # Quick stats for up to four discovered datasets.
    datasets = discover_datasets()
    if datasets:
        st.markdown("---")
        st.markdown("### πŸ“Š Available Datasets")

        cols = st.columns(min(len(datasets), 4))
        for idx, (name, path) in enumerate(list(datasets.items())[:4]):
            with cols[idx]:
                df = load_data(path)
                if df is not None:
                    st.markdown('<div class="metric-card">', unsafe_allow_html=True)
                    st.metric(name, f"{len(df):,} articles")
                    st.markdown('</div>', unsafe_allow_html=True)
462
+
463
def show_scraper_page():
    """Render the scraper configuration form and drive a scraping run.

    Launches main.py via run_scraper_async and streams its stdout into the
    status line while advancing a heuristic progress bar (+1% per output
    line, capped at 95% until the process exits).
    """
    st.markdown('<h2 class="sub-header">πŸ” Article Scraper</h2>', unsafe_allow_html=True)

    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown("### Configuration")

        # Internal source codes are shown with friendly labels.
        source = st.selectbox(
            "News Source",
            options=['toi', 'ndtv', 'wion', 'scroll'],
            format_func=lambda x: {
                'toi': 'πŸ“° Times of India',
                'ndtv': 'πŸ“Ί NDTV',
                'wion': '🌍 WION',
                'scroll': 'πŸ“œ Scroll.in'
            }[x]
        )

        topic = st.text_input("Topic", placeholder="e.g., Climate Change, Technology")

        col_a, col_b = st.columns(2)
        with col_a:
            workers = st.slider("Workers", 1, 10, 4)
        with col_b:
            interval = st.slider("Save Interval (s)", 60, 600, 300, step=60)

    with col2:
        st.markdown("### Quick Guide")
        st.info("""
        **Steps:**
        1. Select news source
        2. Enter search topic
        3. Configure settings
        4. Click Start
        5. Monitor progress
        """)

    st.markdown("---")

    # Button is disabled until a topic is entered.
    if st.button("πŸš€ Start Scraping", disabled=not topic, type="primary"):
        with st.spinner("Initializing scraper..."):
            st.markdown('<div class="status-running">', unsafe_allow_html=True)
            st.write(f"⏳ Scraping **{source.upper()}** for **'{topic}'**...")
            st.markdown('</div>', unsafe_allow_html=True)

            progress_bar = st.progress(0)
            status_text = st.empty()

            process = run_scraper_async(source, topic, workers, interval)

            # NOTE(review): only stdout is drained here; if the child is
            # started with a separate stderr pipe, heavy stderr output can
            # fill that pipe and stall the child — confirm stderr handling.
            if process:
                output_lines = []
                progress = 0

                while True:
                    line = process.stdout.readline()
                    # EOF and process finished -> stop polling.
                    if not line and process.poll() is not None:
                        break
                    if line:
                        output_lines.append(line.strip())
                        status_text.text(line.strip())
                        # Heuristic progress: +1% per line, hold at 95%.
                        progress = min(progress + 1, 95)
                        progress_bar.progress(progress / 100)

                progress_bar.progress(100)

                if process.returncode == 0:
                    st.markdown('<div class="status-success">', unsafe_allow_html=True)
                    st.success("βœ… Scraping completed successfully!")
                    st.markdown('</div>', unsafe_allow_html=True)
                    st.balloons()
                else:
                    st.markdown('<div class="status-error">', unsafe_allow_html=True)
                    st.error("❌ Scraping failed. Check logs.")
                    with st.expander("View Logs"):
                        # Show only the tail of the output for readability.
                        st.code("\n".join(output_lines[-20:]))
                    st.markdown('</div>', unsafe_allow_html=True)
541
 
542
def show_analysis_page():
    """Render the analysis dashboard for one selected dataset.

    Shows overview metrics plus four tabs: sentiment trends, distribution,
    geographic (state-level) analysis, and a filterable article browser.
    """
    st.markdown('<h2 class="sub-header">πŸ“Š Sentiment Analysis Dashboard</h2>',
                unsafe_allow_html=True)

    datasets = discover_datasets()

    if not datasets:
        st.warning("⚠️ No datasets available. Please scrape some articles first!")
        return

    # Dataset selector
    selected = st.selectbox("Select Dataset", options=list(datasets.keys()))

    if selected:
        df = load_data(datasets[selected])

        if df is not None:
            # Overview metrics
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("πŸ“„ Total Articles", f"{len(df):,}")

            with col2:
                if 'date' in df.columns:
                    years = f"{df['date'].dt.year.min()}-{df['date'].dt.year.max()}"
                    st.metric("πŸ“… Years", years)

            with col3:
                if 'sentiment_value' in df.columns:
                    # Share of rows labelled 'positive' (case-insensitive).
                    pos_pct = (df['sentiment_value'].str.lower() == 'positive').mean() * 100
                    st.metric("😊 Positive", f"{pos_pct:.1f}%")

            with col4:
                if 'sentiment_value' in df.columns:
                    neg_pct = (df['sentiment_value'].str.lower() == 'negative').mean() * 100
                    st.metric("😞 Negative", f"{neg_pct:.1f}%")

            st.markdown("---")

            # Visualizations
            tab1, tab2, tab3, tab4 = st.tabs(["πŸ“ˆ Trends", "πŸ₯§ Distribution", "πŸ—ΊοΈ Geographic", "πŸ“ Articles"])

            with tab1:
                fig = plot_sentiment_trends(df, f"{selected} - Sentiment Trends")
                if fig:
                    st.pyplot(fig)
                else:
                    st.info("Insufficient data for trend analysis")

            with tab2:
                col_a, col_b = st.columns([2, 1])
                with col_a:
                    pie_fig = create_sentiment_pie(df, "Sentiment Distribution")
                    if pie_fig:
                        st.plotly_chart(pie_fig, use_container_width=True)

                with col_b:
                    if 'sentiment_value' in df.columns:
                        st.markdown("### Breakdown")
                        counts = df['sentiment_value'].value_counts()
                        for sentiment, count in counts.items():
                            st.metric(sentiment.title(), f"{count:,}")

            with tab3:
                st.markdown("### πŸ—ΊοΈ Geographic Sentiment Analysis")

                # Check if locations already extracted
                if 'extracted_location' not in df.columns:
                    if 'desc' in df.columns or 'description' in df.columns:
                        if st.button("πŸ” Extract Locations from Articles"):
                            with st.spinner("Extracting locations... This may take a few minutes."):
                                desc_col = 'desc' if 'desc' in df.columns else 'description'
                                df = extract_locations_from_descriptions(df, desc_col)
                                # Save updated dataframe so extraction persists
                                # across reruns (overwrites the source CSV).
                                df.to_csv(datasets[selected], index=False)
                                st.success("βœ… Locations extracted successfully!")
                                st.rerun()
                    else:
                        st.info("No description column found. Cannot extract locations.")
                else:
                    # Show geographic analysis
                    col_left, col_right = st.columns([3, 2])

                    with col_left:
                        st.markdown("#### Sentiment by State")

                        # Load geojson
                        india_geojson = load_india_geojson()

                        if india_geojson:
                            # Analyze sentiment by state
                            sentiment_by_state = analyze_sentiment_by_state(df)

                            if sentiment_by_state is not None and not sentiment_by_state.empty:
                                map_fig = create_india_sentiment_map(
                                    sentiment_by_state,
                                    india_geojson,
                                    f"{selected} - Sentiment by Indian States"
                                )
                                if map_fig:
                                    st.plotly_chart(map_fig, use_container_width=True)

                                # Show state statistics
                                with st.expander("πŸ“Š State-wise Statistics"):
                                    sentiment_by_state_display = sentiment_by_state.sort_values('count', ascending=False)
                                    st.dataframe(
                                        sentiment_by_state_display,
                                        use_container_width=True,
                                        hide_index=True
                                    )
                            else:
                                st.warning("No location data with valid sentiment found.")
                        else:
                            st.error("Could not load India map data.")

                    with col_right:
                        st.markdown("#### Top Mentioned Locations")
                        top_loc_fig = create_top_locations_chart(df, "Top 15 Locations")
                        if top_loc_fig:
                            st.plotly_chart(top_loc_fig, use_container_width=True)

                        # Location coverage stats
                        total_articles = len(df)
                        articles_with_location = df['extracted_location'].notna().sum()
                        coverage = (articles_with_location / total_articles) * 100

                        st.metric("Location Coverage", f"{coverage:.1f}%")
                        st.caption(f"{articles_with_location:,} out of {total_articles:,} articles have location data")

            with tab4:
                # Filters
                col_a, col_b, col_c = st.columns(3)

                with col_a:
                    sentiment_filter = st.selectbox(
                        "Sentiment",
                        ["All"] + sorted(df['sentiment_value'].unique().tolist())
                    )

                with col_b:
                    if 'date' in df.columns:
                        years = sorted(df['date'].dt.year.dropna().unique())
                        year_filter = st.selectbox("Year", ["All"] + years)
                    else:
                        year_filter = "All"

                with col_c:
                    num_articles = st.slider("Display", 5, 50, 10)

                # Apply filters
                filtered_df = df.copy()
                if sentiment_filter != "All":
                    filtered_df = filtered_df[filtered_df['sentiment_value'] == sentiment_filter]
                if year_filter != "All" and 'date' in df.columns:
                    filtered_df = filtered_df[filtered_df['date'].dt.year == year_filter]

                st.write(f"Showing {min(num_articles, len(filtered_df))} of {len(filtered_df)} articles")

                # Display articles
                for idx, row in filtered_df.head(num_articles).iterrows():
                    with st.expander(f"πŸ“° {row.get('title', 'Untitled')}"):
                        col_x, col_y = st.columns([3, 1])

                        with col_x:
                            st.write(row.get('desc', row.get('description', 'No description')))
                            if 'link' in row:
                                st.markdown(f"[Read more β†’]({row['link']})")

                        with col_y:
                            sentiment = row.get('sentiment_value', 'Unknown')
                            sentiment_emoji = {
                                'positive': '😊',
                                'negative': '😞',
                                'neutral': '😐'
                            }.get(sentiment.lower(), '❓')

                            st.metric("Sentiment", f"{sentiment_emoji} {sentiment.title()}")
                            if 'date' in row:
                                st.caption(f"πŸ“… {row['date'].strftime('%d %b %Y')}")
722
+
723
def show_about_page():
    """Render the static About page (feature overview, stack, usage guide)."""
    st.markdown('<h2 class="sub-header">ℹ️ About This Platform</h2>',
                unsafe_allow_html=True)

    st.markdown("""
    ## 🎯 Overview

    This platform provides a complete pipeline for news article collection and sentiment analysis,
    specifically designed for Indian news sources.

    ### ✨ Key Features

    - **Multi-Source Scraping**: Collect articles from TOI, NDTV, WION, and Scroll.in
    - **Real-Time Monitoring**: Track scraping progress live
    - **Automatic Analysis**: Sentiment classification and scoring
    - **Interactive Visualizations**: Trends, distributions, and comparisons
    - **Data Export**: Download processed datasets

    ### πŸ”§ Technical Stack

    - **Frontend**: Streamlit
    - **Data Processing**: Pandas, NumPy
    - **Visualization**: Plotly, Matplotlib
    - **NLP**: spaCy, Transformers
    - **Scraping**: BeautifulSoup, Requests

    ### πŸ“– How to Use

    1. **Scrape**: Navigate to the Scraper page and configure your search
    2. **Wait**: Monitor the real-time progress
    3. **Analyze**: Go to Analysis page and select your dataset
    4. **Export**: Download processed data for further use

    ### 🀝 Support

    For issues or questions, please refer to the documentation or contact support.

    ---

    **Version**: 1.0.0
    **Last Updated**: October 2025
    """)
765
 
766
+ # MAIN APP
767
  def main():
 
768
  load_css()
769
+ init_session_state()
770
+ setup_directories()
771
 
772
+ # Sidebar navigation
773
+ with st.sidebar:
774
+ st.image("https://via.placeholder.com/150x50?text=News+Scraper", use_container_width=True)
775
+ st.markdown("---")
776
+
777
+ page = st.radio(
778
+ "Navigation",
779
+ ["🏠 Home", "πŸ” Scraper", "πŸ“Š Analysis", "ℹ️ About"],
780
+ label_visibility="collapsed"
781
+ )
782
+
783
+ st.markdown("---")
784
+
785
+ # Quick stats in sidebar
786
+ datasets = discover_datasets()
787
+ if datasets:
788
+ st.markdown("### πŸ“Š Quick Stats")
789
+ total_articles = 0
790
+ for path in datasets.values():
791
+ df = load_data(path)
792
+ if df is not None:
793
+ total_articles += len(df)
794
+
795
+ st.metric("Total Articles", f"{total_articles:,}")
796
+ st.metric("Datasets", len(datasets))
797
 
798
+ # Route to pages
799
+ if page == "🏠 Home":
 
 
 
 
800
  show_home_page()
801
+ elif page == "πŸ” Scraper":
802
+ show_scraper_page()
803
+ elif page == "πŸ“Š Analysis":
804
+ show_analysis_page()
805
+ else:
806
+ show_about_page()
807
 
808
  if __name__ == "__main__":
809
+ main()