Nishitha03 committed on
Commit
8a0f05d
·
verified ·
1 Parent(s): eb34004

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +213 -152
src/streamlit_app.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  Unified News Scraper & Sentiment Analysis Application
3
  Combines scraping, processing, and visualization in one interface
 
4
  """
5
 
6
  import streamlit as st
@@ -25,6 +26,9 @@ warnings.filterwarnings('ignore')
25
  # Constants
26
  INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
27
 
 
 
 
28
  # Page config
29
  st.set_page_config(
30
  page_title="News Scraper & Analysis Platform",
@@ -106,7 +110,8 @@ def init_session_state():
106
  'scraped_data': {},
107
  'scraping_active': False,
108
  'processing_status': {},
109
- 'selected_dataset': None
 
110
  }
111
  for key, value in defaults.items():
112
  if key not in st.session_state:
@@ -114,8 +119,15 @@ def init_session_state():
114
 
115
  # Setup directories
116
  def setup_directories():
117
- for dir_name in ['output', 'data', 'temp']:
118
- Path(dir_name).mkdir(exist_ok=True)
 
 
 
 
 
 
 
119
 
120
  # Load India GeoJSON
121
  @st.cache_data
@@ -135,8 +147,7 @@ def load_spacy_model():
135
  return spacy.load("en_core_web_sm")
136
  except OSError:
137
  st.info("Downloading spaCy model...")
138
- import subprocess
139
- subprocess.call(["python", "-m", "spacy", "download", "en_core_web_sm"])
140
  return spacy.load("en_core_web_sm")
141
 
142
  # State mapping
@@ -283,12 +294,26 @@ def create_top_locations_chart(df, title):
283
  # Discover datasets
284
  @st.cache_data
285
  def discover_datasets():
 
286
  datasets = {}
287
- for directory in [Path('data'), Path('output')]:
 
 
 
 
 
 
 
 
288
  if directory.exists():
289
- for csv_file in directory.glob('*.csv'):
290
- name = csv_file.stem.replace('_articles', '').replace('_', ' ').title()
291
- datasets[name] = str(csv_file)
 
 
 
 
 
292
  return datasets
293
 
294
  # Load data
@@ -315,28 +340,6 @@ def load_data(file_path):
315
  st.error(f"Error loading data: {str(e)}")
316
  return None
317
 
318
- # Run scraper
319
- def run_scraper_async(source, topic, workers, interval):
320
- cmd = [
321
- sys.executable, "main.py",
322
- "--source", source,
323
- "--topic", topic,
324
- "--workers", str(workers),
325
- "--interval", str(interval)
326
- ]
327
-
328
- try:
329
- process = subprocess.Popen(
330
- cmd,
331
- stdout=subprocess.PIPE,
332
- stderr=subprocess.PIPE,
333
- text=True,
334
- bufsize=1
335
- )
336
- return process
337
- except Exception as e:
338
- return None
339
-
340
  # Plotting functions
341
  def plot_sentiment_trends(df, title):
342
  if 'date' not in df.columns or 'sentiment_value' not in df.columns:
@@ -420,17 +423,20 @@ def show_home_page():
420
  <div style="text-align: center; padding: 20px; background-color: #f8f9fa;
421
  border-radius: 10px; margin: 20px 0;">
422
  <h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
423
- <p>Scrape articles from major Indian news sources and analyze sentiment trends</p>
424
  </div>
425
  """, unsafe_allow_html=True)
426
 
 
 
 
427
  # Feature cards
428
  col1, col2, col3 = st.columns(3)
429
 
430
  with col1:
431
  st.markdown('<div class="feature-card">', unsafe_allow_html=True)
432
- st.markdown("### πŸ” Scrape")
433
- st.write("Collect articles from TOI, NDTV, WION, and Scroll.in")
434
  st.markdown('</div>', unsafe_allow_html=True)
435
 
436
  with col2:
@@ -461,83 +467,92 @@ def show_home_page():
461
  st.markdown('</div>', unsafe_allow_html=True)
462
 
463
  def show_scraper_page():
464
- st.markdown('<h2 class="sub-header">πŸ” Article Scraper</h2>', unsafe_allow_html=True)
465
-
466
- col1, col2 = st.columns([2, 1])
467
-
468
- with col1:
469
- st.markdown("### Configuration")
470
-
471
- source = st.selectbox(
472
- "News Source",
473
- options=['toi', 'ndtv', 'wion', 'scroll'],
474
- format_func=lambda x: {
475
- 'toi': 'πŸ“° Times of India',
476
- 'ndtv': 'πŸ“Ί NDTV',
477
- 'wion': '🌍 WION',
478
- 'scroll': 'πŸ“œ Scroll.in'
479
- }[x]
480
- )
481
-
482
- topic = st.text_input("Topic", placeholder="e.g., Climate Change, Technology")
483
-
484
- col_a, col_b = st.columns(2)
485
- with col_a:
486
- workers = st.slider("Workers", 1, 10, 4)
487
- with col_b:
488
- interval = st.slider("Save Interval (s)", 60, 600, 300, step=60)
489
-
490
- with col2:
491
- st.markdown("### Quick Guide")
492
- st.info("""
493
- **Steps:**
494
- 1. Select news source
495
- 2. Enter search topic
496
- 3. Configure settings
497
- 4. Click Start
498
- 5. Monitor progress
499
- """)
500
 
501
  st.markdown("---")
502
 
503
- if st.button("πŸš€ Start Scraping", disabled=not topic, type="primary"):
504
- with st.spinner("Initializing scraper..."):
505
- st.markdown('<div class="status-running">', unsafe_allow_html=True)
506
- st.write(f"⏳ Scraping **{source.upper()}** for **'{topic}'**...")
507
- st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
508
 
509
- progress_bar = st.progress(0)
510
- status_text = st.empty()
 
511
 
512
- process = run_scraper_async(source, topic, workers, interval)
 
513
 
514
- if process:
515
- output_lines = []
516
- progress = 0
517
-
518
- while True:
519
- line = process.stdout.readline()
520
- if not line and process.poll() is not None:
521
- break
522
- if line:
523
- output_lines.append(line.strip())
524
- status_text.text(line.strip())
525
- progress = min(progress + 1, 95)
526
- progress_bar.progress(progress / 100)
527
-
528
- progress_bar.progress(100)
529
-
530
- if process.returncode == 0:
531
- st.markdown('<div class="status-success">', unsafe_allow_html=True)
532
- st.success("βœ… Scraping completed successfully!")
533
- st.markdown('</div>', unsafe_allow_html=True)
534
- st.balloons()
535
- else:
536
- st.markdown('<div class="status-error">', unsafe_allow_html=True)
537
- st.error("❌ Scraping failed. Check logs.")
538
- with st.expander("View Logs"):
539
- st.code("\n".join(output_lines[-20:]))
540
- st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
 
542
  def show_analysis_page():
543
  st.markdown('<h2 class="sub-header">πŸ“Š Sentiment Analysis Dashboard</h2>',
@@ -546,7 +561,8 @@ def show_analysis_page():
546
  datasets = discover_datasets()
547
 
548
  if not datasets:
549
- st.warning("⚠️ No datasets available. Please scrape some articles first!")
 
550
  return
551
 
552
  # Dataset selector
@@ -727,15 +743,15 @@ def show_about_page():
727
  st.markdown("""
728
  ## 🎯 Overview
729
 
730
- This platform provides a complete pipeline for news article collection and sentiment analysis,
731
  specifically designed for Indian news sources.
732
 
733
  ### ✨ Key Features
734
 
735
- - **Multi-Source Scraping**: Collect articles from TOI, NDTV, WION, and Scroll.in
736
- - **Real-Time Monitoring**: Track scraping progress live
737
  - **Automatic Analysis**: Sentiment classification and scoring
738
  - **Interactive Visualizations**: Trends, distributions, and comparisons
 
739
  - **Data Export**: Download processed datasets
740
 
741
  ### πŸ”§ Technical Stack
@@ -743,15 +759,31 @@ def show_about_page():
743
  - **Frontend**: Streamlit
744
  - **Data Processing**: Pandas, NumPy
745
  - **Visualization**: Plotly, Matplotlib
746
- - **NLP**: spaCy, Transformers
747
- - **Scraping**: BeautifulSoup, Requests
748
 
749
  ### πŸ“– How to Use
750
 
751
- 1. **Scrape**: Navigate to the Scraper page and configure your search
752
- 2. **Wait**: Monitor the real-time progress
753
- 3. **Analyze**: Go to Analysis page and select your dataset
754
- 4. **Export**: Download processed data for further use
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755
 
756
  ### 🀝 Support
757
 
@@ -759,51 +791,80 @@ def show_about_page():
759
 
760
  ---
761
 
762
- **Version**: 1.0.0
763
- **Last Updated**: October 2025
 
764
  """)
765
 
766
  # MAIN APP
767
  def main():
768
- load_css()
769
- init_session_state()
770
- setup_directories()
771
-
772
- # Sidebar navigation
773
- with st.sidebar:
774
- st.image("https://via.placeholder.com/150x50?text=News+Scraper", use_container_width=True)
775
- st.markdown("---")
776
 
777
- page = st.radio(
778
- "Navigation",
779
- ["🏠 Home", "πŸ” Scraper", "πŸ“Š Analysis", "ℹ️ About"],
780
- label_visibility="collapsed"
781
- )
782
 
783
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
 
785
- # Quick stats in sidebar
786
- datasets = discover_datasets()
787
- if datasets:
788
- st.markdown("### πŸ“Š Quick Stats")
789
- total_articles = 0
790
- for path in datasets.values():
791
- df = load_data(path)
792
- if df is not None:
793
- total_articles += len(df)
794
 
795
- st.metric("Total Articles", f"{total_articles:,}")
796
- st.metric("Datasets", len(datasets))
797
-
798
- # Route to pages
799
- if page == "🏠 Home":
800
- show_home_page()
801
- elif page == "πŸ” Scraper":
802
- show_scraper_page()
803
- elif page == "πŸ“Š Analysis":
804
- show_analysis_page()
805
- else:
806
- show_about_page()
807
 
808
  if __name__ == "__main__":
809
  main()
 
1
  """
2
  Unified News Scraper & Sentiment Analysis Application
3
  Combines scraping, processing, and visualization in one interface
4
+ Modified for Hugging Face Spaces - uses /tmp directory
5
  """
6
 
7
  import streamlit as st
 
26
  # Constants
27
  INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
28
 
29
+ # Base directory for all file operations - using /tmp for HF Spaces
30
+ BASE_DIR = Path('/tmp/news_scraper')
31
+
32
  # Page config
33
  st.set_page_config(
34
  page_title="News Scraper & Analysis Platform",
 
110
  'scraped_data': {},
111
  'scraping_active': False,
112
  'processing_status': {},
113
+ 'selected_dataset': None,
114
+ 'base_dir': BASE_DIR
115
  }
116
  for key, value in defaults.items():
117
  if key not in st.session_state:
 
119
 
120
  # Setup directories
121
def setup_directories():
    """Create the working sub-directories under /tmp.

    Returns True when every directory exists afterwards, False when
    creation failed (the failure is surfaced in the Streamlit UI).
    """
    subdirs = ('output', 'data', 'temp')
    try:
        for name in subdirs:
            (BASE_DIR / name).mkdir(parents=True, exist_ok=True)
    except Exception as e:
        st.error(f"❌ Directory setup error: {e}")
        return False
    return True
131
 
132
  # Load India GeoJSON
133
  @st.cache_data
 
147
  return spacy.load("en_core_web_sm")
148
  except OSError:
149
  st.info("Downloading spaCy model...")
150
+ subprocess.call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
 
151
  return spacy.load("en_core_web_sm")
152
 
153
  # State mapping
 
294
  # Discover datasets
295
@st.cache_data
def discover_datasets():
    """Scan the /tmp data directories and map display names to CSV paths.

    The first occurrence of a display name wins, so a file under data/
    shadows a same-named file under output/ or the base directory.
    """
    found = {}
    # Check /tmp/news_scraper directories, most specific first.
    for folder in (BASE_DIR / 'data', BASE_DIR / 'output', BASE_DIR):
        if not folder.exists():
            continue
        try:
            for path in folder.glob('*.csv'):
                label = path.stem.replace('_articles', '').replace('_', ' ').title()
                # setdefault keeps the earlier (higher-priority) match.
                found.setdefault(label, str(path))
        except Exception:
            # Unreadable directory: best-effort scan, skip it.
            continue
    return found
318
 
319
  # Load data
 
340
  st.error(f"Error loading data: {str(e)}")
341
  return None
342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  # Plotting functions
344
  def plot_sentiment_trends(df, title):
345
  if 'date' not in df.columns or 'sentiment_value' not in df.columns:
 
423
  <div style="text-align: center; padding: 20px; background-color: #f8f9fa;
424
  border-radius: 10px; margin: 20px 0;">
425
  <h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
426
+ <p>Upload or scrape articles from major Indian news sources and analyze sentiment trends</p>
427
  </div>
428
  """, unsafe_allow_html=True)
429
 
430
+ # Show current storage location
431
+ st.info(f"πŸ’Ύ **Storage Location:** `{BASE_DIR}` (temporary - cleared on restart)")
432
+
433
  # Feature cards
434
  col1, col2, col3 = st.columns(3)
435
 
436
  with col1:
437
  st.markdown('<div class="feature-card">', unsafe_allow_html=True)
438
+ st.markdown("### πŸ“€ Upload")
439
+ st.write("Upload pre-scraped CSV files for analysis")
440
  st.markdown('</div>', unsafe_allow_html=True)
441
 
442
  with col2:
 
467
  st.markdown('</div>', unsafe_allow_html=True)
468
 
469
def _render_upload_section():
    """Render the CSV uploader and persist an uploaded file under BASE_DIR/data."""
    st.markdown("### πŸ“€ Upload Your Data")
    uploaded_file = st.file_uploader(
        "Upload CSV file with scraped articles",
        type=['csv'],
        help="Upload a CSV file with columns: title, date, desc, sentiment_value, etc."
    )
    if not uploaded_file:
        return
    try:
        # Save to /tmp directory so the Analysis page can discover it.
        data_dir = BASE_DIR / 'data'
        data_dir.mkdir(parents=True, exist_ok=True)

        save_path = data_dir / uploaded_file.name
        save_path.write_bytes(uploaded_file.getbuffer())

        st.success(f"βœ… File uploaded successfully! Saved to `{save_path}`")
        st.info("Go to the Analysis page to view your data.")

        # Clear the cached dataset listing so the new file shows up immediately.
        discover_datasets.clear()

        # Show preview of the freshly saved file.
        with st.expander("πŸ“‹ Preview Data"):
            df = pd.read_csv(save_path)
            st.dataframe(df.head(10), use_container_width=True)
            st.caption(f"Total rows: {len(df):,}")

    except Exception as e:
        st.error(f"❌ Error saving file: {e}")


def _render_dataset_list():
    """List every discovered dataset with its row count and a download button."""
    datasets = discover_datasets()
    if not datasets:
        st.info("No datasets found. Upload a CSV file to get started!")
        return
    st.markdown("### πŸ“ Available Datasets")
    for name, path in datasets.items():
        col_a, col_b, col_c = st.columns([3, 1, 1])
        with col_a:
            st.text(f"πŸ“„ {name}")
        with col_b:
            df = load_data(path)
            if df is not None:
                st.text(f"{len(df):,} articles")
        with col_c:
            if Path(path).exists():
                with open(path, 'rb') as f:
                    st.download_button(
                        "⬇️",
                        f,
                        file_name=f"{name}.csv",
                        mime='text/csv',
                        key=f"dl_{name}"
                    )


def _render_storage_info():
    """Show how many CSV files live under BASE_DIR (temporary /tmp storage)."""
    st.markdown("### πŸ’Ύ Storage Information")
    try:
        if BASE_DIR.exists():
            file_count = sum(1 for _ in BASE_DIR.rglob('*.csv'))
            st.metric("CSV Files", file_count)
            st.caption(f"Location: `{BASE_DIR}`")
        else:
            st.info("No data directory created yet. Upload a file to get started.")
    except Exception as e:
        st.error(f"Could not access storage: {e}")


def show_scraper_page():
    """Upload & management page: CSV upload, dataset listing, storage summary.

    Replaces live scraping on Hugging Face Spaces, where long-running
    subprocesses are impractical and writable storage is limited to /tmp.
    """
    st.markdown('<h2 class="sub-header">πŸ“€ Data Upload & Management</h2>', unsafe_allow_html=True)

    # Important notice for HF Spaces
    st.warning("""
    ⚠️ **Hugging Face Spaces Notice:**
    - Data is stored in `/tmp` and will be cleared on app restart
    - Download your processed data regularly
    - For actual scraping, run the scraper locally and upload results here
    """)

    st.markdown("---")
    _render_upload_section()
    st.markdown("---")
    _render_dataset_list()
    st.markdown("---")
    _render_storage_info()
 
557
  def show_analysis_page():
558
  st.markdown('<h2 class="sub-header">πŸ“Š Sentiment Analysis Dashboard</h2>',
 
561
  datasets = discover_datasets()
562
 
563
  if not datasets:
564
+ st.warning("⚠️ No datasets available. Please upload a CSV file first!")
565
+ st.info("πŸ‘‰ Go to the 'Upload & Management' page to upload your data.")
566
  return
567
 
568
  # Dataset selector
 
743
  st.markdown("""
744
  ## 🎯 Overview
745
 
746
+ This platform provides sentiment analysis and visualization for news articles,
747
  specifically designed for Indian news sources.
748
 
749
  ### ✨ Key Features
750
 
751
+ - **Data Upload**: Upload pre-scraped CSV files for analysis
 
752
  - **Automatic Analysis**: Sentiment classification and scoring
753
  - **Interactive Visualizations**: Trends, distributions, and comparisons
754
+ - **Geographic Analysis**: State-wise sentiment mapping for India
755
  - **Data Export**: Download processed datasets
756
 
757
  ### πŸ”§ Technical Stack
 
759
  - **Frontend**: Streamlit
760
  - **Data Processing**: Pandas, NumPy
761
  - **Visualization**: Plotly, Matplotlib
762
+ - **NLP**: spaCy for location extraction
763
+ - **Storage**: `/tmp` directory (HF Spaces compatible)
764
 
765
  ### πŸ“– How to Use
766
 
767
+ 1. **Upload**: Navigate to the Upload page and upload your CSV file
768
+ 2. **Analyze**: Go to Analysis page and select your dataset
769
+ 3. **Explore**: View trends, distributions, and geographic sentiment
770
+ 4. **Extract Locations**: Use the Geographic tab to extract state information
771
+ 5. **Export**: Download processed data for further use
772
+
773
+ ### πŸ“‹ CSV File Format
774
+
775
+ Your uploaded CSV should contain these columns:
776
+ - `title`: Article headline
777
+ - `date`: Publication date
778
+ - `desc` or `description`: Article content/summary
779
+ - `sentiment_value`: Sentiment label (positive/negative/neutral)
780
+ - `link` (optional): URL to original article
781
+
782
+ ### ⚠️ Hugging Face Spaces Limitations
783
+
784
+ - Data stored in `/tmp` is temporary and cleared on restart
785
+ - Download your processed data regularly
786
+ - For scraping, run locally and upload results here
787
 
788
  ### 🀝 Support
789
 
 
791
 
792
  ---
793
 
794
+ **Version**: 1.0.0 (HF Spaces Edition)
795
+ **Last Updated**: October 2025
796
+ **Storage**: `/tmp/news_scraper`
797
  """)
798
 
799
  # MAIN APP
800
def main():
    """Application entry point: set up state/dirs, render sidebar, route pages."""
    try:
        load_css()
        init_session_state()

        # Setup directories and show status
        if not setup_directories():
            st.error("Failed to setup directories. Some features may not work.")

        # Sidebar navigation
        with st.sidebar:
            st.markdown("### πŸ“° News Analysis")
            st.markdown("---")

            page = st.radio(
                "Navigation",
                ["🏠 Home", "πŸ“€ Upload & Manage", "πŸ“Š Analysis", "ℹ️ About"],
                label_visibility="collapsed"
            )

            st.markdown("---")

            # Storage info in sidebar
            st.markdown("### πŸ’Ύ Storage")
            # Fix: was an f-string with no placeholders (ruff F541).
            st.caption("Location: `/tmp/news_scraper`")
            st.caption("⚠️ Temporary storage")

            st.markdown("---")

            # Quick stats in sidebar (best-effort; never blocks rendering)
            try:
                datasets = discover_datasets()
                if datasets:
                    st.markdown("### πŸ“Š Quick Stats")
                    total_articles = 0
                    for path in datasets.values():
                        try:
                            df = load_data(path)
                            if df is not None:
                                total_articles += len(df)
                        # Fix: bare `except:` also swallowed SystemExit /
                        # KeyboardInterrupt; catch only real errors.
                        except Exception:
                            continue

                    if total_articles > 0:
                        st.metric("Total Articles", f"{total_articles:,}")
                        st.metric("Datasets", len(datasets))
                else:
                    st.info("No data yet. Upload a CSV to start!")
            except Exception:
                # Stats are decorative; swallow errors deliberately.
                pass

        # Route to pages
        if page == "🏠 Home":
            show_home_page()
        elif page == "πŸ“€ Upload & Manage":
            show_scraper_page()
        elif page == "πŸ“Š Analysis":
            show_analysis_page()
        else:
            show_about_page()

    except Exception as e:
        # Top-level guard so the Space shows a message instead of a blank page.
        st.error(f"❌ Application error: {str(e)}")
        st.info("Try refreshing the page. If the problem persists, the app may need to restart.")

        with st.expander("πŸ” Error Details"):
            import traceback
            st.code(traceback.format_exc())
 
 
 
 
 
868
 
869
  if __name__ == "__main__":
870
  main()