Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +213 -152
src/streamlit_app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
Unified News Scraper & Sentiment Analysis Application
|
| 3 |
Combines scraping, processing, and visualization in one interface
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import streamlit as st
|
|
@@ -25,6 +26,9 @@ warnings.filterwarnings('ignore')
|
|
| 25 |
# Constants
|
| 26 |
INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
# Page config
|
| 29 |
st.set_page_config(
|
| 30 |
page_title="News Scraper & Analysis Platform",
|
|
@@ -106,7 +110,8 @@ def init_session_state():
|
|
| 106 |
'scraped_data': {},
|
| 107 |
'scraping_active': False,
|
| 108 |
'processing_status': {},
|
| 109 |
-
'selected_dataset': None
|
|
|
|
| 110 |
}
|
| 111 |
for key, value in defaults.items():
|
| 112 |
if key not in st.session_state:
|
|
@@ -114,8 +119,15 @@ def init_session_state():
|
|
| 114 |
|
| 115 |
# Setup directories
|
| 116 |
def setup_directories():
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
# Load India GeoJSON
|
| 121 |
@st.cache_data
|
|
@@ -135,8 +147,7 @@ def load_spacy_model():
|
|
| 135 |
return spacy.load("en_core_web_sm")
|
| 136 |
except OSError:
|
| 137 |
st.info("Downloading spaCy model...")
|
| 138 |
-
|
| 139 |
-
subprocess.call(["python", "-m", "spacy", "download", "en_core_web_sm"])
|
| 140 |
return spacy.load("en_core_web_sm")
|
| 141 |
|
| 142 |
# State mapping
|
|
@@ -283,12 +294,26 @@ def create_top_locations_chart(df, title):
|
|
| 283 |
# Discover datasets
|
| 284 |
@st.cache_data
|
| 285 |
def discover_datasets():
|
|
|
|
| 286 |
datasets = {}
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
if directory.exists():
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
return datasets
|
| 293 |
|
| 294 |
# Load data
|
|
@@ -315,28 +340,6 @@ def load_data(file_path):
|
|
| 315 |
st.error(f"Error loading data: {str(e)}")
|
| 316 |
return None
|
| 317 |
|
| 318 |
-
# Run scraper
|
| 319 |
-
def run_scraper_async(source, topic, workers, interval):
|
| 320 |
-
cmd = [
|
| 321 |
-
sys.executable, "main.py",
|
| 322 |
-
"--source", source,
|
| 323 |
-
"--topic", topic,
|
| 324 |
-
"--workers", str(workers),
|
| 325 |
-
"--interval", str(interval)
|
| 326 |
-
]
|
| 327 |
-
|
| 328 |
-
try:
|
| 329 |
-
process = subprocess.Popen(
|
| 330 |
-
cmd,
|
| 331 |
-
stdout=subprocess.PIPE,
|
| 332 |
-
stderr=subprocess.PIPE,
|
| 333 |
-
text=True,
|
| 334 |
-
bufsize=1
|
| 335 |
-
)
|
| 336 |
-
return process
|
| 337 |
-
except Exception as e:
|
| 338 |
-
return None
|
| 339 |
-
|
| 340 |
# Plotting functions
|
| 341 |
def plot_sentiment_trends(df, title):
|
| 342 |
if 'date' not in df.columns or 'sentiment_value' not in df.columns:
|
|
@@ -420,17 +423,20 @@ def show_home_page():
|
|
| 420 |
<div style="text-align: center; padding: 20px; background-color: #f8f9fa;
|
| 421 |
border-radius: 10px; margin: 20px 0;">
|
| 422 |
<h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
|
| 423 |
-
<p>
|
| 424 |
</div>
|
| 425 |
""", unsafe_allow_html=True)
|
| 426 |
|
|
|
|
|
|
|
|
|
|
| 427 |
# Feature cards
|
| 428 |
col1, col2, col3 = st.columns(3)
|
| 429 |
|
| 430 |
with col1:
|
| 431 |
st.markdown('<div class="feature-card">', unsafe_allow_html=True)
|
| 432 |
-
st.markdown("###
|
| 433 |
-
st.write("
|
| 434 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 435 |
|
| 436 |
with col2:
|
|
@@ -461,83 +467,92 @@ def show_home_page():
|
|
| 461 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 462 |
|
| 463 |
def show_scraper_page():
|
| 464 |
-
st.markdown('<h2 class="sub-header"
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
options=['toi', 'ndtv', 'wion', 'scroll'],
|
| 474 |
-
format_func=lambda x: {
|
| 475 |
-
'toi': 'π° Times of India',
|
| 476 |
-
'ndtv': 'πΊ NDTV',
|
| 477 |
-
'wion': 'π WION',
|
| 478 |
-
'scroll': 'π Scroll.in'
|
| 479 |
-
}[x]
|
| 480 |
-
)
|
| 481 |
-
|
| 482 |
-
topic = st.text_input("Topic", placeholder="e.g., Climate Change, Technology")
|
| 483 |
-
|
| 484 |
-
col_a, col_b = st.columns(2)
|
| 485 |
-
with col_a:
|
| 486 |
-
workers = st.slider("Workers", 1, 10, 4)
|
| 487 |
-
with col_b:
|
| 488 |
-
interval = st.slider("Save Interval (s)", 60, 600, 300, step=60)
|
| 489 |
-
|
| 490 |
-
with col2:
|
| 491 |
-
st.markdown("### Quick Guide")
|
| 492 |
-
st.info("""
|
| 493 |
-
**Steps:**
|
| 494 |
-
1. Select news source
|
| 495 |
-
2. Enter search topic
|
| 496 |
-
3. Configure settings
|
| 497 |
-
4. Click Start
|
| 498 |
-
5. Monitor progress
|
| 499 |
-
""")
|
| 500 |
|
| 501 |
st.markdown("---")
|
| 502 |
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
-
|
| 510 |
-
|
|
|
|
| 511 |
|
| 512 |
-
|
|
|
|
| 513 |
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
def show_analysis_page():
|
| 543 |
st.markdown('<h2 class="sub-header">π Sentiment Analysis Dashboard</h2>',
|
|
@@ -546,7 +561,8 @@ def show_analysis_page():
|
|
| 546 |
datasets = discover_datasets()
|
| 547 |
|
| 548 |
if not datasets:
|
| 549 |
-
st.warning("β οΈ No datasets available. Please
|
|
|
|
| 550 |
return
|
| 551 |
|
| 552 |
# Dataset selector
|
|
@@ -727,15 +743,15 @@ def show_about_page():
|
|
| 727 |
st.markdown("""
|
| 728 |
## π― Overview
|
| 729 |
|
| 730 |
-
This platform provides
|
| 731 |
specifically designed for Indian news sources.
|
| 732 |
|
| 733 |
### β¨ Key Features
|
| 734 |
|
| 735 |
-
- **
|
| 736 |
-
- **Real-Time Monitoring**: Track scraping progress live
|
| 737 |
- **Automatic Analysis**: Sentiment classification and scoring
|
| 738 |
- **Interactive Visualizations**: Trends, distributions, and comparisons
|
|
|
|
| 739 |
- **Data Export**: Download processed datasets
|
| 740 |
|
| 741 |
### π§ Technical Stack
|
|
@@ -743,15 +759,31 @@ def show_about_page():
|
|
| 743 |
- **Frontend**: Streamlit
|
| 744 |
- **Data Processing**: Pandas, NumPy
|
| 745 |
- **Visualization**: Plotly, Matplotlib
|
| 746 |
-
- **NLP**: spaCy
|
| 747 |
-
- **
|
| 748 |
|
| 749 |
### π How to Use
|
| 750 |
|
| 751 |
-
1. **
|
| 752 |
-
2. **
|
| 753 |
-
3. **
|
| 754 |
-
4. **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 755 |
|
| 756 |
### π€ Support
|
| 757 |
|
|
@@ -759,51 +791,80 @@ def show_about_page():
|
|
| 759 |
|
| 760 |
---
|
| 761 |
|
| 762 |
-
**Version**: 1.0.0
|
| 763 |
-
**Last Updated**: October 2025
|
|
|
|
| 764 |
""")
|
| 765 |
|
| 766 |
# MAIN APP
|
| 767 |
def main():
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
# Sidebar navigation
|
| 773 |
-
with st.sidebar:
|
| 774 |
-
st.image("https://via.placeholder.com/150x50?text=News+Scraper", use_container_width=True)
|
| 775 |
-
st.markdown("---")
|
| 776 |
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
label_visibility="collapsed"
|
| 781 |
-
)
|
| 782 |
|
| 783 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
|
| 785 |
-
#
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
show_scraper_page()
|
| 803 |
-
elif page == "π Analysis":
|
| 804 |
-
show_analysis_page()
|
| 805 |
-
else:
|
| 806 |
-
show_about_page()
|
| 807 |
|
| 808 |
if __name__ == "__main__":
|
| 809 |
main()
|
|
|
|
| 1 |
"""
|
| 2 |
Unified News Scraper & Sentiment Analysis Application
|
| 3 |
Combines scraping, processing, and visualization in one interface
|
| 4 |
+
Modified for Hugging Face Spaces - uses /tmp directory
|
| 5 |
"""
|
| 6 |
|
| 7 |
import streamlit as st
|
|
|
|
| 26 |
# Constants
|
| 27 |
INDIA_GEOJSON_URL = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_state.geojson'
|
| 28 |
|
| 29 |
+
# Base directory for all file operations - using /tmp for HF Spaces
|
| 30 |
+
BASE_DIR = Path('/tmp/news_scraper')
|
| 31 |
+
|
| 32 |
# Page config
|
| 33 |
st.set_page_config(
|
| 34 |
page_title="News Scraper & Analysis Platform",
|
|
|
|
| 110 |
'scraped_data': {},
|
| 111 |
'scraping_active': False,
|
| 112 |
'processing_status': {},
|
| 113 |
+
'selected_dataset': None,
|
| 114 |
+
'base_dir': BASE_DIR
|
| 115 |
}
|
| 116 |
for key, value in defaults.items():
|
| 117 |
if key not in st.session_state:
|
|
|
|
| 119 |
|
| 120 |
# Setup directories
|
| 121 |
def setup_directories():
    """Create the working directories under ``BASE_DIR``.

    Ensures that the ``output``, ``data`` and ``temp`` sub-directories
    exist, creating missing parents as needed. A failure is reported in
    the Streamlit UI rather than raised.

    Returns:
        bool: ``True`` when every directory exists or was created,
        ``False`` when the filesystem rejected one of the ``mkdir`` calls.
    """
    try:
        for dir_name in ('output', 'data', 'temp'):
            (BASE_DIR / dir_name).mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # mkdir fails only with OSError subclasses (permission denied,
        # read-only filesystem, a regular file already at that path), so
        # anything else is a programming error and should not be hidden.
        st.error(f"❌ Directory setup error: {e}")
        return False
    return True
|
| 131 |
|
| 132 |
# Load India GeoJSON
|
| 133 |
@st.cache_data
|
|
|
|
| 147 |
return spacy.load("en_core_web_sm")
|
| 148 |
except OSError:
|
| 149 |
st.info("Downloading spaCy model...")
|
| 150 |
+
subprocess.call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
|
|
|
|
| 151 |
return spacy.load("en_core_web_sm")
|
| 152 |
|
| 153 |
# State mapping
|
|
|
|
| 294 |
# Discover datasets
|
| 295 |
@st.cache_data
def discover_datasets():
    """Map display names to the CSV files found under ``BASE_DIR``.

    Looks (non-recursively) in ``BASE_DIR/data``, ``BASE_DIR/output`` and
    ``BASE_DIR`` itself, in that order. The display name is the file stem
    with ``_articles`` removed, underscores turned into spaces, and
    title-casing applied. When two files yield the same display name, the
    first one encountered is kept.

    The result is memoised by ``st.cache_data``; code that adds a file has
    to call ``discover_datasets.clear()`` for it to show up.

    Returns:
        dict[str, str]: display name -> path of the CSV file, as a string.
    """
    datasets = {}
    search_paths = (BASE_DIR / 'data', BASE_DIR / 'output', BASE_DIR)

    for directory in search_paths:
        if not directory.is_dir():
            continue
        try:
            # glob() yields entries in filesystem order; sorting makes both
            # the listing and the winner of a name collision deterministic.
            csv_files = sorted(directory.glob('*.csv'))
        except OSError:
            # Best effort: an unreadable directory is skipped, not fatal.
            continue

        for csv_file in csv_files:
            name = csv_file.stem.replace('_articles', '').replace('_', ' ').title()
            datasets.setdefault(name, str(csv_file))

    return datasets
|
| 318 |
|
| 319 |
# Load data
|
|
|
|
| 340 |
st.error(f"Error loading data: {str(e)}")
|
| 341 |
return None
|
| 342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
# Plotting functions
|
| 344 |
def plot_sentiment_trends(df, title):
|
| 345 |
if 'date' not in df.columns or 'sentiment_value' not in df.columns:
|
|
|
|
| 423 |
<div style="text-align: center; padding: 20px; background-color: #f8f9fa;
|
| 424 |
border-radius: 10px; margin: 20px 0;">
|
| 425 |
<h3>Complete Pipeline for News Collection and Sentiment Analysis</h3>
|
| 426 |
+
<p>Upload or scrape articles from major Indian news sources and analyze sentiment trends</p>
|
| 427 |
</div>
|
| 428 |
""", unsafe_allow_html=True)
|
| 429 |
|
| 430 |
+
# Show current storage location
|
| 431 |
+
st.info(f"πΎ **Storage Location:** `{BASE_DIR}` (temporary - cleared on restart)")
|
| 432 |
+
|
| 433 |
# Feature cards
|
| 434 |
col1, col2, col3 = st.columns(3)
|
| 435 |
|
| 436 |
with col1:
|
| 437 |
st.markdown('<div class="feature-card">', unsafe_allow_html=True)
|
| 438 |
+
st.markdown("### π€ Upload")
|
| 439 |
+
st.write("Upload pre-scraped CSV files for analysis")
|
| 440 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 441 |
|
| 442 |
with col2:
|
|
|
|
| 467 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 468 |
|
| 469 |
def show_scraper_page():
    """Render the "Data Upload & Management" page.

    The page is built top to bottom from four sections:

    1. A notice that storage under ``/tmp`` is temporary.
    2. A CSV uploader. The upload is written to ``BASE_DIR/data``, the
       ``discover_datasets`` cache is cleared, and the first 10 rows are
       shown in an expander.
    3. One row per discovered dataset: name, article count (via
       ``load_data``) and a download button.
    4. The number of CSV files found anywhere below ``BASE_DIR``.

    Errors are shown in the UI; nothing is returned.
    """
    # NOTE(review): the lone "π" glyphs in the labels below are mis-encoded
    # emoji whose original code points cannot be recovered from this text;
    # restore them from the upstream source.
    st.markdown('<h2 class="sub-header">📤 Data Upload & Management</h2>', unsafe_allow_html=True)

    # Important notice for HF Spaces
    st.warning("""
    ⚠️ **Hugging Face Spaces Notice:**
    - Data is stored in `/tmp` and will be cleared on app restart
    - Download your processed data regularly
    - For actual scraping, run the scraper locally and upload results here
    """)

    st.markdown("---")

    # File uploader
    st.markdown("### 📤 Upload Your Data")
    uploaded_file = st.file_uploader(
        "Upload CSV file with scraped articles",
        type=['csv'],
        help="Upload a CSV file with columns: title, date, desc, sentiment_value, etc."
    )

    if uploaded_file is not None:
        try:
            # Save to /tmp directory
            data_dir = BASE_DIR / 'data'
            data_dir.mkdir(parents=True, exist_ok=True)

            # The file name comes from the client: keep only its final path
            # component so a name such as "../../x.csv" cannot escape data_dir.
            save_path = data_dir / Path(uploaded_file.name).name
            with open(save_path, 'wb') as f:
                f.write(uploaded_file.getbuffer())

            st.success(f"✅ File uploaded successfully! Saved to `{save_path}`")
            st.info("Go to the Analysis page to view your data.")

            # Clear cache to refresh dataset list
            discover_datasets.clear()

            # Show preview
            with st.expander("π Preview Data"):
                df = pd.read_csv(save_path)
                st.dataframe(df.head(10), use_container_width=True)
                st.caption(f"Total rows: {len(df):,}")

        except Exception as e:
            st.error(f"❌ Error saving file: {e}")

    st.markdown("---")

    # Check available datasets
    datasets = discover_datasets()
    if datasets:
        st.markdown("### π Available Datasets")
        for name, path in datasets.items():
            col_a, col_b, col_c = st.columns([3, 1, 1])
            with col_a:
                st.text(f"π {name}")
            with col_b:
                df = load_data(path)
                if df is not None:
                    st.text(f"{len(df):,} articles")
            with col_c:
                # The cached listing may be stale, so re-check the file.
                if Path(path).exists():
                    with open(path, 'rb') as f:
                        st.download_button(
                            "⬇️",
                            f,
                            file_name=f"{name}.csv",
                            mime='text/csv',
                            key=f"dl_{name}"
                        )
    else:
        st.info("No datasets found. Upload a CSV file to get started!")

    st.markdown("---")

    # Storage info
    st.markdown("### 💾 Storage Information")
    try:
        if BASE_DIR.exists():
            # Count files
            file_count = sum(1 for _ in BASE_DIR.rglob('*.csv'))
            st.metric("CSV Files", file_count)
            st.caption(f"Location: `{BASE_DIR}`")
        else:
            st.info("No data directory created yet. Upload a file to get started.")
    except Exception as e:
        st.error(f"Could not access storage: {e}")
|
| 556 |
|
| 557 |
def show_analysis_page():
|
| 558 |
st.markdown('<h2 class="sub-header">π Sentiment Analysis Dashboard</h2>',
|
|
|
|
| 561 |
datasets = discover_datasets()
|
| 562 |
|
| 563 |
if not datasets:
|
| 564 |
+
st.warning("β οΈ No datasets available. Please upload a CSV file first!")
|
| 565 |
+
st.info("π Go to the 'Upload & Management' page to upload your data.")
|
| 566 |
return
|
| 567 |
|
| 568 |
# Dataset selector
|
|
|
|
| 743 |
st.markdown("""
|
| 744 |
## π― Overview
|
| 745 |
|
| 746 |
+
This platform provides sentiment analysis and visualization for news articles,
|
| 747 |
specifically designed for Indian news sources.
|
| 748 |
|
| 749 |
### β¨ Key Features
|
| 750 |
|
| 751 |
+
- **Data Upload**: Upload pre-scraped CSV files for analysis
|
|
|
|
| 752 |
- **Automatic Analysis**: Sentiment classification and scoring
|
| 753 |
- **Interactive Visualizations**: Trends, distributions, and comparisons
|
| 754 |
+
- **Geographic Analysis**: State-wise sentiment mapping for India
|
| 755 |
- **Data Export**: Download processed datasets
|
| 756 |
|
| 757 |
### π§ Technical Stack
|
|
|
|
| 759 |
- **Frontend**: Streamlit
|
| 760 |
- **Data Processing**: Pandas, NumPy
|
| 761 |
- **Visualization**: Plotly, Matplotlib
|
| 762 |
+
- **NLP**: spaCy for location extraction
|
| 763 |
+
- **Storage**: `/tmp` directory (HF Spaces compatible)
|
| 764 |
|
| 765 |
### π How to Use
|
| 766 |
|
| 767 |
+
1. **Upload**: Navigate to the Upload page and upload your CSV file
|
| 768 |
+
2. **Analyze**: Go to Analysis page and select your dataset
|
| 769 |
+
3. **Explore**: View trends, distributions, and geographic sentiment
|
| 770 |
+
4. **Extract Locations**: Use the Geographic tab to extract state information
|
| 771 |
+
5. **Export**: Download processed data for further use
|
| 772 |
+
|
| 773 |
+
### π CSV File Format
|
| 774 |
+
|
| 775 |
+
Your uploaded CSV should contain these columns:
|
| 776 |
+
- `title`: Article headline
|
| 777 |
+
- `date`: Publication date
|
| 778 |
+
- `desc` or `description`: Article content/summary
|
| 779 |
+
- `sentiment_value`: Sentiment label (positive/negative/neutral)
|
| 780 |
+
- `link` (optional): URL to original article
|
| 781 |
+
|
| 782 |
+
### β οΈ Hugging Face Spaces Limitations
|
| 783 |
+
|
| 784 |
+
- Data stored in `/tmp` is temporary and cleared on restart
|
| 785 |
+
- Download your processed data regularly
|
| 786 |
+
- For scraping, run locally and upload results here
|
| 787 |
|
| 788 |
### π€ Support
|
| 789 |
|
|
|
|
| 791 |
|
| 792 |
---
|
| 793 |
|
| 794 |
+
**Version**: 1.0.0 (HF Spaces Edition)
|
| 795 |
+
**Last Updated**: October 2025
|
| 796 |
+
**Storage**: `/tmp/news_scraper`
|
| 797 |
""")
|
| 798 |
|
| 799 |
# MAIN APP
|
| 800 |
def main():
    """Run the Streamlit application.

    Loads the CSS, initialises session state, creates the storage
    directories, draws the sidebar (navigation, storage location, quick
    stats) and then renders the page selected in the sidebar. Any
    exception raised along the way is shown in the UI together with its
    traceback instead of propagating.
    """
    # Page labels are defined once so the radio options and the routing
    # below cannot drift apart.
    # NOTE(review): the lone "π" glyphs are mis-encoded emoji whose original
    # code points cannot be recovered from this text; restore from upstream.
    page_home = "π Home"
    page_upload = "📤 Upload & Manage"
    page_analysis = "π Analysis"
    page_about = "ℹ️ About"

    try:
        load_css()
        init_session_state()

        # Setup directories and show status
        if not setup_directories():
            st.error("Failed to setup directories. Some features may not work.")

        # Sidebar navigation
        with st.sidebar:
            st.markdown("### 📰 News Analysis")
            st.markdown("---")

            page = st.radio(
                "Navigation",
                [page_home, page_upload, page_analysis, page_about],
                label_visibility="collapsed"
            )

            st.markdown("---")

            # Storage info in sidebar
            st.markdown("### 💾 Storage")
            st.caption(f"Location: `{BASE_DIR}`")
            st.caption("⚠️ Temporary storage")

            st.markdown("---")

            # Quick stats in sidebar
            try:
                datasets = discover_datasets()
                if datasets:
                    st.markdown("### π Quick Stats")
                    total_articles = 0
                    for path in datasets.values():
                        try:
                            df = load_data(path)
                        except Exception:
                            # One unreadable dataset must not hide the rest.
                            # `Exception`, not a bare except, so that
                            # KeyboardInterrupt/SystemExit still propagate.
                            continue
                        if df is not None:
                            total_articles += len(df)

                    if total_articles > 0:
                        st.metric("Total Articles", f"{total_articles:,}")
                    # NOTE(review): indentation is lost in the source; this
                    # metric is assumed to belong to `if datasets:` rather
                    # than to `if total_articles > 0:` - confirm upstream.
                    st.metric("Datasets", len(datasets))
                else:
                    st.info("No data yet. Upload a CSV to start!")
            except Exception:
                # Stats are best effort, but say so instead of failing silently.
                st.caption("Quick stats unavailable")

        # Route to pages
        if page == page_home:
            show_home_page()
        elif page == page_upload:
            show_scraper_page()
        elif page == page_analysis:
            show_analysis_page()
        else:
            show_about_page()

    except Exception as e:
        st.error(f"❌ Application error: {e}")
        st.info("Try refreshing the page. If the problem persists, the app may need to restart.")

        with st.expander("π Error Details"):
            import traceback
            st.code(traceback.format_exc())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
|
| 869 |
if __name__ == "__main__":
|
| 870 |
main()
|