Nishitha03 committed on
Commit
8a0f05d
·
verified ·
1 Parent(s): eb34004

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +213 -152
src/streamlit_app.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  Unified News Scraper & Sentiment Analysis Application
3
  Combines scraping, processing, and visualization in one interface
 
4
  """
5
 
6
  import streamlit as st
@@ -25,6 +26,9 @@ warnings.filterwarnings('ignore')
25
  # Constants
26
  INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
27
 
 
 
 
28
  # Page config
29
  st.set_page_config(
30
  page_title="News Scraper & Analysis Platform",
@@ -106,7 +110,8 @@ def init_session_state():
106
  'scraped_data': {},
107
  'scraping_active': False,
108
  'processing_status': {},
109
- 'selected_dataset': None
 
110
  }
111
  for key, value in defaults.items():
112
  if key not in st.session_state:
@@ -114,8 +119,15 @@ def init_session_state():
114
 
115
  # Setup directories
116
  def setup_directories():
117
- for dir_name in ['output', 'data', 'temp']:
118
- Path(dir_name).mkdir(exist_ok=True)
 
 
 
 
 
 
 
119
 
120
  # Load India GeoJSON
121
  @st.cache_data
@@ -135,8 +147,7 @@ def load_spacy_model():
135
  return spacy.load("en_core_web_sm")
136
  except OSError:
137
  st.info("Downloading spaCy model...")
138
- import subprocess
139
- subprocess.call(["python", "-m", "spacy", "download", "en_core_web_sm"])
140
  return spacy.load("en_core_web_sm")
141
 
142
  # State mapping
@@ -283,12 +294,26 @@ def create_top_locations_chart(df, title):
283
  # Discover datasets
284
  @st.cache_data
285
  def discover_datasets():
 
286
  datasets = {}
287
- for directory in [Path('data'), Path('output')]:
 
 
 
 
 
 
 
 
288
  if directory.exists():
289
- for csv_file in directory.glob('*.csv'):
290
- name = csv_file.stem.replace('_articles', '').replace('_', ' ').title()
291
- datasets[name] = str(csv_file)
 
 
 
 
 
292
  return datasets
293
 
294
  # Load data
@@ -315,28 +340,6 @@ def load_data(file_path):
315
  st.error(f"Error loading data: {str(e)}")
316
  return None
317
 
318
- # Run scraper
319
- def run_scraper_async(source, topic, workers, interval):
320
- cmd = [
321
- sys.executable, "main.py",
322
- "--source", source,
323
- "--topic", topic,
324
- "--workers", str(workers),
325
- "--interval", str(interval)
326
- ]
327
-
328
- try:
329
- process = subprocess.Popen(
330
- cmd,
331
- stdout=subprocess.PIPE,
332
- stderr=subprocess.PIPE,
333
- text=True,
334
- bufsize=1
335
- )
336
- return process
337
- except Exception as e:
338
- return None
339
-
340
  # Plotting functions
341
  def plot_sentiment_trends(df, title):
342
  if 'date' not in df.columns or 'sentiment_value' not in df.columns:
@@ -420,17 +423,20 @@ def show_home_page():
420
  <div style="text-align: center; padding: 20px; background-color: #f8f9fa;
421
  border-radius: 10px; margin: 20px 0;">
422
  <h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
423
- <p>Scrape articles from major Indian news sources and analyze sentiment trends</p>
424
  </div>
425
  """, unsafe_allow_html=True)
426
 
 
 
 
427
  # Feature cards
428
  col1, col2, col3 = st.columns(3)
429
 
430
  with col1:
431
  st.markdown('<div class="feature-card">', unsafe_allow_html=True)
432
- st.markdown("### πŸ” Scrape")
433
- st.write("Collect articles from TOI, NDTV, WION, and Scroll.in")
434
  st.markdown('</div>', unsafe_allow_html=True)
435
 
436
  with col2:
@@ -461,83 +467,92 @@ def show_home_page():
461
  st.markdown('</div>', unsafe_allow_html=True)
462
 
463
  def show_scraper_page():
464
- st.markdown('<h2 class="sub-header">πŸ” Article Scraper</h2>', unsafe_allow_html=True)
465
-
466
- col1, col2 = st.columns([2, 1])
467
-
468
- with col1:
469
- st.markdown("### Configuration")
470
-
471
- source = st.selectbox(
472
- "News Source",
473
- options=['toi', 'ndtv', 'wion', 'scroll'],
474
- format_func=lambda x: {
475
- 'toi': 'πŸ“° Times of India',
476
- 'ndtv': 'πŸ“Ί NDTV',
477
- 'wion': '🌍 WION',
478
- 'scroll': 'πŸ“œ Scroll.in'
479
- }[x]
480
- )
481
-
482
- topic = st.text_input("Topic", placeholder="e.g., Climate Change, Technology")
483
-
484
- col_a, col_b = st.columns(2)
485
- with col_a:
486
- workers = st.slider("Workers", 1, 10, 4)
487
- with col_b:
488
- interval = st.slider("Save Interval (s)", 60, 600, 300, step=60)
489
-
490
- with col2:
491
- st.markdown("### Quick Guide")
492
- st.info("""
493
- **Steps:**
494
- 1. Select news source
495
- 2. Enter search topic
496
- 3. Configure settings
497
- 4. Click Start
498
- 5. Monitor progress
499
- """)
500
 
501
  st.markdown("---")
502
 
503
- if st.button("πŸš€ Start Scraping", disabled=not topic, type="primary"):
504
- with st.spinner("Initializing scraper..."):
505
- st.markdown('<div class="status-running">', unsafe_allow_html=True)
506
- st.write(f"⏳ Scraping **{source.upper()}** for **'{topic}'**...")
507
- st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
508
 
509
- progress_bar = st.progress(0)
510
- status_text = st.empty()
 
511
 
512
- process = run_scraper_async(source, topic, workers, interval)
 
513
 
514
- if process:
515
- output_lines = []
516
- progress = 0
517
-
518
- while True:
519
- line = process.stdout.readline()
520
- if not line and process.poll() is not None:
521
- break
522
- if line:
523
- output_lines.append(line.strip())
524
- status_text.text(line.strip())
525
- progress = min(progress + 1, 95)
526
- progress_bar.progress(progress / 100)
527
-
528
- progress_bar.progress(100)
529
-
530
- if process.returncode == 0:
531
- st.markdown('<div class="status-success">', unsafe_allow_html=True)
532
- st.success("βœ… Scraping completed successfully!")
533
- st.markdown('</div>', unsafe_allow_html=True)
534
- st.balloons()
535
- else:
536
- st.markdown('<div class="status-error">', unsafe_allow_html=True)
537
- st.error("❌ Scraping failed. Check logs.")
538
- with st.expander("View Logs"):
539
- st.code("\n".join(output_lines[-20:]))
540
- st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
 
542
  def show_analysis_page():
543
  st.markdown('<h2 class="sub-header">πŸ“Š Sentiment Analysis Dashboard</h2>',
@@ -546,7 +561,8 @@ def show_analysis_page():
546
  datasets = discover_datasets()
547
 
548
  if not datasets:
549
- st.warning("⚠️ No datasets available. Please scrape some articles first!")
 
550
  return
551
 
552
  # Dataset selector
@@ -727,15 +743,15 @@ def show_about_page():
727
  st.markdown("""
728
  ## 🎯 Overview
729
 
730
- This platform provides a complete pipeline for news article collection and sentiment analysis,
731
  specifically designed for Indian news sources.
732
 
733
  ### ✨ Key Features
734
 
735
- - **Multi-Source Scraping**: Collect articles from TOI, NDTV, WION, and Scroll.in
736
- - **Real-Time Monitoring**: Track scraping progress live
737
  - **Automatic Analysis**: Sentiment classification and scoring
738
  - **Interactive Visualizations**: Trends, distributions, and comparisons
 
739
  - **Data Export**: Download processed datasets
740
 
741
  ### πŸ”§ Technical Stack
@@ -743,15 +759,31 @@ def show_about_page():
743
  - **Frontend**: Streamlit
744
  - **Data Processing**: Pandas, NumPy
745
  - **Visualization**: Plotly, Matplotlib
746
- - **NLP**: spaCy, Transformers
747
- - **Scraping**: BeautifulSoup, Requests
748
 
749
  ### πŸ“– How to Use
750
 
751
- 1. **Scrape**: Navigate to the Scraper page and configure your search
752
- 2. **Wait**: Monitor the real-time progress
753
- 3. **Analyze**: Go to Analysis page and select your dataset
754
- 4. **Export**: Download processed data for further use
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755
 
756
  ### 🀝 Support
757
 
@@ -759,51 +791,80 @@ def show_about_page():
759
 
760
  ---
761
 
762
- **Version**: 1.0.0
763
- **Last Updated**: October 2025
 
764
  """)
765
 
766
  # MAIN APP
767
  def main():
768
- load_css()
769
- init_session_state()
770
- setup_directories()
771
-
772
- # Sidebar navigation
773
- with st.sidebar:
774
- st.image("https://via.placeholder.com/150x50?text=News+Scraper", use_container_width=True)
775
- st.markdown("---")
776
 
777
- page = st.radio(
778
- "Navigation",
779
- ["🏠 Home", "πŸ” Scraper", "πŸ“Š Analysis", "ℹ️ About"],
780
- label_visibility="collapsed"
781
- )
782
 
783
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
 
785
- # Quick stats in sidebar
786
- datasets = discover_datasets()
787
- if datasets:
788
- st.markdown("### πŸ“Š Quick Stats")
789
- total_articles = 0
790
- for path in datasets.values():
791
- df = load_data(path)
792
- if df is not None:
793
- total_articles += len(df)
794
 
795
- st.metric("Total Articles", f"{total_articles:,}")
796
- st.metric("Datasets", len(datasets))
797
-
798
- # Route to pages
799
- if page == "🏠 Home":
800
- show_home_page()
801
- elif page == "πŸ” Scraper":
802
- show_scraper_page()
803
- elif page == "πŸ“Š Analysis":
804
- show_analysis_page()
805
- else:
806
- show_about_page()
807
 
808
  if __name__ == "__main__":
809
  main()
 
1
  """
2
  Unified News Scraper & Sentiment Analysis Application
3
  Combines scraping, processing, and visualization in one interface
4
+ Modified for Hugging Face Spaces - uses /tmp directory
5
  """
6
 
7
  import streamlit as st
 
26
  # Constants
27
  INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
28
 
29
+ # Base directory for all file operations - using /tmp for HF Spaces
30
+ BASE_DIR = Path('/tmp/news_scraper')
31
+
32
  # Page config
33
  st.set_page_config(
34
  page_title="News Scraper & Analysis Platform",
 
110
  'scraped_data': {},
111
  'scraping_active': False,
112
  'processing_status': {},
113
+ 'selected_dataset': None,
114
+ 'base_dir': BASE_DIR
115
  }
116
  for key, value in defaults.items():
117
  if key not in st.session_state:
 
119
 
120
  # Setup directories
121
def setup_directories():
    """Create the working sub-directories under /tmp.

    Returns True when every directory exists afterwards, False when
    creation failed (the failure is surfaced in the Streamlit UI).
    """
    subdirs = ('output', 'data', 'temp')
    try:
        for name in subdirs:
            (BASE_DIR / name).mkdir(parents=True, exist_ok=True)
    except Exception as e:
        st.error(f"❌ Directory setup error: {e}")
        return False
    return True
131
 
132
  # Load India GeoJSON
133
  @st.cache_data
 
147
  return spacy.load("en_core_web_sm")
148
  except OSError:
149
  st.info("Downloading spaCy model...")
150
+ subprocess.call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
 
151
  return spacy.load("en_core_web_sm")
152
 
153
  # State mapping
 
294
  # Discover datasets
295
@st.cache_data
def discover_datasets():
    """Scan the /tmp data directories and map display names to CSV paths.

    The first occurrence of a display name wins, so a file under data/
    shadows a same-named file under output/ or the base directory.
    """
    found = {}
    # Check /tmp/news_scraper directories, most specific first.
    for folder in (BASE_DIR / 'data', BASE_DIR / 'output', BASE_DIR):
        if not folder.exists():
            continue
        try:
            for path in folder.glob('*.csv'):
                label = path.stem.replace('_articles', '').replace('_', ' ').title()
                # setdefault keeps the earlier (higher-priority) match.
                found.setdefault(label, str(path))
        except Exception:
            # Unreadable directory: best-effort scan, skip it.
            continue
    return found
318
 
319
  # Load data
 
340
  st.error(f"Error loading data: {str(e)}")
341
  return None
342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  # Plotting functions
344
  def plot_sentiment_trends(df, title):
345
  if 'date' not in df.columns or 'sentiment_value' not in df.columns:
 
423
  <div style="text-align: center; padding: 20px; background-color: #f8f9fa;
424
  border-radius: 10px; margin: 20px 0;">
425
  <h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
426
+ <p>Upload or scrape articles from major Indian news sources and analyze sentiment trends</p>
427
  </div>
428
  """, unsafe_allow_html=True)
429
 
430
+ # Show current storage location
431
+ st.info(f"πŸ’Ύ **Storage Location:** `{BASE_DIR}` (temporary - cleared on restart)")
432
+
433
  # Feature cards
434
  col1, col2, col3 = st.columns(3)
435
 
436
  with col1:
437
  st.markdown('<div class="feature-card">', unsafe_allow_html=True)
438
+ st.markdown("### πŸ“€ Upload")
439
+ st.write("Upload pre-scraped CSV files for analysis")
440
  st.markdown('</div>', unsafe_allow_html=True)
441
 
442
  with col2:
 
467
  st.markdown('</div>', unsafe_allow_html=True)
468
 
469
def _render_upload_section():
    """Render the CSV uploader and persist an uploaded file under BASE_DIR/data."""
    st.markdown("### πŸ“€ Upload Your Data")
    uploaded_file = st.file_uploader(
        "Upload CSV file with scraped articles",
        type=['csv'],
        help="Upload a CSV file with columns: title, date, desc, sentiment_value, etc."
    )
    if not uploaded_file:
        return
    try:
        # Save to /tmp directory so the Analysis page can discover it.
        data_dir = BASE_DIR / 'data'
        data_dir.mkdir(parents=True, exist_ok=True)

        save_path = data_dir / uploaded_file.name
        save_path.write_bytes(uploaded_file.getbuffer())

        st.success(f"βœ… File uploaded successfully! Saved to `{save_path}`")
        st.info("Go to the Analysis page to view your data.")

        # Clear the cached dataset listing so the new file shows up immediately.
        discover_datasets.clear()

        # Show preview of the freshly saved file.
        with st.expander("πŸ“‹ Preview Data"):
            df = pd.read_csv(save_path)
            st.dataframe(df.head(10), use_container_width=True)
            st.caption(f"Total rows: {len(df):,}")

    except Exception as e:
        st.error(f"❌ Error saving file: {e}")


def _render_dataset_list():
    """List every discovered dataset with its row count and a download button."""
    datasets = discover_datasets()
    if not datasets:
        st.info("No datasets found. Upload a CSV file to get started!")
        return
    st.markdown("### πŸ“ Available Datasets")
    for name, path in datasets.items():
        col_a, col_b, col_c = st.columns([3, 1, 1])
        with col_a:
            st.text(f"πŸ“„ {name}")
        with col_b:
            df = load_data(path)
            if df is not None:
                st.text(f"{len(df):,} articles")
        with col_c:
            if Path(path).exists():
                with open(path, 'rb') as f:
                    st.download_button(
                        "⬇️",
                        f,
                        file_name=f"{name}.csv",
                        mime='text/csv',
                        key=f"dl_{name}"
                    )


def _render_storage_info():
    """Show how many CSV files live under BASE_DIR (temporary /tmp storage)."""
    st.markdown("### πŸ’Ύ Storage Information")
    try:
        if BASE_DIR.exists():
            file_count = sum(1 for _ in BASE_DIR.rglob('*.csv'))
            st.metric("CSV Files", file_count)
            st.caption(f"Location: `{BASE_DIR}`")
        else:
            st.info("No data directory created yet. Upload a file to get started.")
    except Exception as e:
        st.error(f"Could not access storage: {e}")


def show_scraper_page():
    """Upload & management page: CSV upload, dataset listing, storage summary.

    Replaces live scraping on Hugging Face Spaces, where long-running
    subprocesses are impractical and writable storage is limited to /tmp.
    """
    st.markdown('<h2 class="sub-header">πŸ“€ Data Upload & Management</h2>', unsafe_allow_html=True)

    # Important notice for HF Spaces
    st.warning("""
    ⚠️ **Hugging Face Spaces Notice:**
    - Data is stored in `/tmp` and will be cleared on app restart
    - Download your processed data regularly
    - For actual scraping, run the scraper locally and upload results here
    """)

    st.markdown("---")
    _render_upload_section()
    st.markdown("---")
    _render_dataset_list()
    st.markdown("---")
    _render_storage_info()
 
557
  def show_analysis_page():
558
  st.markdown('<h2 class="sub-header">πŸ“Š Sentiment Analysis Dashboard</h2>',
 
561
  datasets = discover_datasets()
562
 
563
  if not datasets:
564
+ st.warning("⚠️ No datasets available. Please upload a CSV file first!")
565
+ st.info("πŸ‘‰ Go to the 'Upload & Management' page to upload your data.")
566
  return
567
 
568
  # Dataset selector
 
743
  st.markdown("""
744
  ## 🎯 Overview
745
 
746
+ This platform provides sentiment analysis and visualization for news articles,
747
  specifically designed for Indian news sources.
748
 
749
  ### ✨ Key Features
750
 
751
+ - **Data Upload**: Upload pre-scraped CSV files for analysis
 
752
  - **Automatic Analysis**: Sentiment classification and scoring
753
  - **Interactive Visualizations**: Trends, distributions, and comparisons
754
+ - **Geographic Analysis**: State-wise sentiment mapping for India
755
  - **Data Export**: Download processed datasets
756
 
757
  ### πŸ”§ Technical Stack
 
759
  - **Frontend**: Streamlit
760
  - **Data Processing**: Pandas, NumPy
761
  - **Visualization**: Plotly, Matplotlib
762
+ - **NLP**: spaCy for location extraction
763
+ - **Storage**: `/tmp` directory (HF Spaces compatible)
764
 
765
  ### πŸ“– How to Use
766
 
767
+ 1. **Upload**: Navigate to the Upload page and upload your CSV file
768
+ 2. **Analyze**: Go to Analysis page and select your dataset
769
+ 3. **Explore**: View trends, distributions, and geographic sentiment
770
+ 4. **Extract Locations**: Use the Geographic tab to extract state information
771
+ 5. **Export**: Download processed data for further use
772
+
773
+ ### πŸ“‹ CSV File Format
774
+
775
+ Your uploaded CSV should contain these columns:
776
+ - `title`: Article headline
777
+ - `date`: Publication date
778
+ - `desc` or `description`: Article content/summary
779
+ - `sentiment_value`: Sentiment label (positive/negative/neutral)
780
+ - `link` (optional): URL to original article
781
+
782
+ ### ⚠️ Hugging Face Spaces Limitations
783
+
784
+ - Data stored in `/tmp` is temporary and cleared on restart
785
+ - Download your processed data regularly
786
+ - For scraping, run locally and upload results here
787
 
788
  ### 🀝 Support
789
 
 
791
 
792
  ---
793
 
794
+ **Version**: 1.0.0 (HF Spaces Edition)
795
+ **Last Updated**: October 2025
796
+ **Storage**: `/tmp/news_scraper`
797
  """)
798
 
799
  # MAIN APP
800
def main():
    """Application entry point: set up state/dirs, render sidebar, route pages."""
    try:
        load_css()
        init_session_state()

        # Setup directories and show status
        if not setup_directories():
            st.error("Failed to setup directories. Some features may not work.")

        # Sidebar navigation
        with st.sidebar:
            st.markdown("### πŸ“° News Analysis")
            st.markdown("---")

            page = st.radio(
                "Navigation",
                ["🏠 Home", "πŸ“€ Upload & Manage", "πŸ“Š Analysis", "ℹ️ About"],
                label_visibility="collapsed"
            )

            st.markdown("---")

            # Storage info in sidebar
            st.markdown("### πŸ’Ύ Storage")
            # Fix: was an f-string with no placeholders (ruff F541).
            st.caption("Location: `/tmp/news_scraper`")
            st.caption("⚠️ Temporary storage")

            st.markdown("---")

            # Quick stats in sidebar (best-effort; never blocks rendering)
            try:
                datasets = discover_datasets()
                if datasets:
                    st.markdown("### πŸ“Š Quick Stats")
                    total_articles = 0
                    for path in datasets.values():
                        try:
                            df = load_data(path)
                            if df is not None:
                                total_articles += len(df)
                        # Fix: bare `except:` also swallowed SystemExit /
                        # KeyboardInterrupt; catch only real errors.
                        except Exception:
                            continue

                    if total_articles > 0:
                        st.metric("Total Articles", f"{total_articles:,}")
                        st.metric("Datasets", len(datasets))
                else:
                    st.info("No data yet. Upload a CSV to start!")
            except Exception:
                # Stats are decorative; swallow errors deliberately.
                pass

        # Route to pages
        if page == "🏠 Home":
            show_home_page()
        elif page == "πŸ“€ Upload & Manage":
            show_scraper_page()
        elif page == "πŸ“Š Analysis":
            show_analysis_page()
        else:
            show_about_page()

    except Exception as e:
        # Top-level guard so the Space shows a message instead of a blank page.
        st.error(f"❌ Application error: {str(e)}")
        st.info("Try refreshing the page. If the problem persists, the app may need to restart.")

        with st.expander("πŸ” Error Details"):
            import traceback
            st.code(traceback.format_exc())
 
 
 
 
 
868
 
869
  if __name__ == "__main__":
870
  main()