Spaces:

Arjon07CSE
/

SPF_News_Scrapper

Runtime error

App Files Community

Arjon07CSE committed on Dec 6, 2025

Commit

ebe173a

verified ·

1 Parent(s): 4abd6da

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -110

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import time
 import urllib.parse
 import feedparser
-# --- 1. STABLE SEARCH ENGINE (No external libraries) ---
 class GoogleNewsEngine:
     def __init__(self, lang='bn', country='BD'):
         self.lang = lang.lower()
@@ -15,34 +15,46 @@ class GoogleNewsEngine:
         self.BASE_URL = 'https://news.google.com/rss'
     def search(self, query, from_=None, to_=None):
         full_query = query
         if from_: full_query += f" after:{from_}"
         if to_: full_query += f" before:{to_}"
         encoded_query = urllib.parse.quote(full_query)
         url = (f"{self.BASE_URL}/search?q={encoded_query}"
                f"&hl={self.lang}-{self.country}"
                f"&gl={self.country}"
                f"&ceid={self.country}:{self.lang}")
         return feedparser.parse(url)
-# --- 2. CONFIGURATION ---
-SESSION_TIMEOUT_SECONDS = 1800  # 30 Minutes
-AUTH_USERS = [("admin", "admin123"), ("user", "user123")]
-guide_text = """
-### 🇧🇩 Search Guide
-* **AND**: `বিএনপি AND নির্বাচন`
-* **OR**: `বন্যা OR জলোচ্ছ্বাস`
-* **Exclude**: `ক্রিকেট -সাকিব`
 """
 # --- 3. HELPER FUNCTIONS ---
 def scrape_article_content(url):
     try:
         headers = {'User-Agent': 'Mozilla/5.0'}
         response = requests.get(url, headers=headers, timeout=4)
         if response.status_code == 200:
             soup = BeautifulSoup(response.content, 'html.parser')
             text = ' '.join([p.get_text() for p in soup.find_all('p')])
             return text[:500] + "..." if len(text) > 500 else text
         return "Content extraction failed."
@@ -50,129 +62,107 @@ def scrape_article_content(url):
         return "N/A (Scraping Blocked)"
 def perform_search(query, start_date, end_date, lang, country, fetch_content):
     log_text = ""
-    gn = GoogleNewsEngine(lang={'Bangla':'bn', 'English':'en'}.get(lang, 'bn'),
-                          country={'Bangladesh':'BD', 'USA':'US', 'UK':'GB'}.get(country, 'BD'))
     try:
         search_result = gn.search(query=query, from_=start_date, to_=end_date)
         entries = search_result.entries
-        if not entries: return None, "⚠️ No articles found.", None
         news_data = []
         log_text += f"✅ Found {len(entries)} articles.\n"
         for entry in entries:
             item = {
-                'Date': entry.get('published', 'N/A')[:16],
-                'Source': entry.get('source', {}).get('title', 'Google News'),
                 'Title': entry.title,
                 'Link': entry.link
             }
             if fetch_content:
                 item['Snippet'] = scrape_article_content(entry.link)
-                time.sleep(0.1)
             news_data.append(item)
         df = pd.DataFrame(news_data)
         filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
         df.to_csv(filename, index=False, encoding='utf-8-sig')
-        return df, log_text + "🚀 Complete.", filename
-    except Exception as e:
-        return None, f"❌ Error: {str(e)}", None
-# --- 4. VISIBILITY & AUTH LOGIC ---
-# Using gr.update() to fix the sticking login screen
-def authenticate(username, password):
-    for u, p in AUTH_USERS:
-        if username == u and password == p:
-            # HIDE Login, SHOW App
-            return (
-                gr.update(visible=False),
-                gr.update(visible=True),
-                {"logged_in": True, "time": time.time(), "user": username}
-            )
-    return gr.update(), gr.update(), None
-def check_session_and_search(query, start, end, lang, country, fetch, session_data):
-    # Check Session
-    if not session_data or not session_data.get("logged_in") or (time.time() - session_data.get("time")) > SESSION_TIMEOUT_SECONDS:
-        # FAIL: Show Login, Hide App
-        return (
-            gr.update(visible=True),
-            gr.update(visible=False),
-            None, None, "⚠️ Session Expired.", None
-        )
-    # SUCCESS: Search
-    df, log, csv = perform_search(query, start, end, lang, country, fetch)
-    # MAINTAIN: Hide Login, Show App
-    return (
-        gr.update(visible=False),
-        gr.update(visible=True),
-        df, csv, log, session_data
-    )
-def manual_logout():
-    # SHOW Login, HIDE App
-    return (
-        gr.update(visible=True),
-        gr.update(visible=False),
-        None, "Logged out."
-    )
-# --- 5. UI LAYOUT (CRASH FIX: Removed 'theme' and 'css') ---
 with gr.Blocks(title="BD News Analyst") as app:
-    session_state = gr.State()
-    # === LOGIN GROUP ===
-    with gr.Group(visible=True) as login_view:
-        with gr.Row():
-            with gr.Column(scale=1): pass
-            with gr.Column(scale=1):
-                gr.Markdown("## 🔐 Login Required")
-                u_in = gr.Textbox(label="Username")
-                p_in = gr.Textbox(label="Password", type="password")
-                l_btn = gr.Button("Login", variant="primary")
-            with gr.Column(scale=1): pass
-    # === APP GROUP ===
-    with gr.Group(visible=False) as app_view:
-        with gr.Row():
-            with gr.Column(scale=4):
-                gr.Markdown("# 🇧🇩 BD News Intelligence")
-            with gr.Column(scale=1):
-                logout_btn = gr.Button("Logout", variant="stop")
-        with gr.Row():
-            # Controls
-            with gr.Column(scale=1):
-                gr.Markdown("### Configuration")
-                gr.Markdown(guide_text)
-                query_in = gr.Textbox(label="Search Keyword", value="বিএনপি")
-                with gr.Row():
-                    start_in = gr.Textbox(label="Start", value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
-                    end_in = gr.Textbox(label="End", value=datetime.now().strftime('%Y-%m-%d'))
-                with gr.Row():
-                    lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
-                    country_in = gr.Dropdown(["Bangladesh", "USA"], value="Bangladesh", label="Region")
-                fetch_chk = gr.Checkbox(label="Deep Scrape?", value=False)
-                run_btn = gr.Button("Run Analysis", variant="primary")
-                status_box = gr.Textbox(label="Status", interactive=False)
-            # Results
-            with gr.Column(scale=2):
-                results_df = gr.Dataframe(label="Results", interactive=False)
-                download_btn = gr.File(label="Download CSV")
-    # === EVENTS ===
-    l_btn.click(authenticate, [u_in, p_in], [login_view, app_view, session_state])
-    logout_btn.click(manual_logout, None, [login_view, app_view, session_state, status_box])
-    run_btn.click(check_session_and_search, [query_in, start_in, end_in, lang_in, country_in, fetch_chk, session_state], [login_view, app_view, results_df, download_btn, status_box, session_state])
 if __name__ == "__main__":
     app.launch()

 import urllib.parse
 import feedparser
+# --- 1. ROBUST SEARCH ENGINE ---
 class GoogleNewsEngine:
     def __init__(self, lang='bn', country='BD'):
         self.lang = lang.lower()
         self.BASE_URL = 'https://news.google.com/rss'
     def search(self, query, from_=None, to_=None):
+        # Construct query with standard Google operators
         full_query = query
         if from_: full_query += f" after:{from_}"
         if to_: full_query += f" before:{to_}"
+        # URL Encode
         encoded_query = urllib.parse.quote(full_query)
+        # Construct RSS URL
         url = (f"{self.BASE_URL}/search?q={encoded_query}"
                f"&hl={self.lang}-{self.country}"
                f"&gl={self.country}"
                f"&ceid={self.country}:{self.lang}")
         return feedparser.parse(url)
+# --- 2. SOPHISTICATED GUIDE MARKDOWN ---
+guide_markdown = """
+### 🇧🇩 Advanced Search Intelligence (সার্চ গাইড)
+Master your queries using these professional operators to filter noise.
+| Search Goal | Operator | Example (Copy & Paste) | Explanation |
+| :--- | :--- | :--- | :--- |
+| **Precise Match** | `AND` | `বিএনপি AND নির্বাচন` | Finds articles containing **BOTH** 'BNP' and 'Election'. |
+| **Broad Search** | `OR` | `বন্যা OR জলোচ্ছ্বাস` | Finds articles containing **EITHER** 'Flood' or 'Surge'. |
+| **Noise Filtering** | `-` (Minus) | `ক্রিকেট -সাকিব` | Finds 'Cricket' news but **REMOVES** any mention of 'Shakib'. |
+| **Exact Phrase** | `""` (Quotes) | `"পদ্মা সেতু"` | Finds the exact sequence of words, not just scattered keywords. |
+| **Complex Logic** | `( )` | `(ঢাকা OR চট্টগ্রাম) AND ডেঙ্গু` | Finds Dengue news specifically for **Dhaka OR Chittagong**. |
+| **Source Specific** | `site:` | `site:prothomalo.com রাজনীতি` | Finds 'Politics' news **ONLY** from Prothom Alo. |
 """
 # --- 3. HELPER FUNCTIONS ---
 def scrape_article_content(url):
+    """Fetches article text if Deep Scrape is enabled."""
     try:
         headers = {'User-Agent': 'Mozilla/5.0'}
         response = requests.get(url, headers=headers, timeout=4)
         if response.status_code == 200:
             soup = BeautifulSoup(response.content, 'html.parser')
+            # Join paragraphs safely
             text = ' '.join([p.get_text() for p in soup.find_all('p')])
             return text[:500] + "..." if len(text) > 500 else text
         return "Content extraction failed."
         return "N/A (Scraping Blocked)"
 def perform_search(query, start_date, end_date, lang, country, fetch_content):
+    """Main logic to fetch news and return DataFrame."""
     log_text = ""
+    # Map friendly names to codes
+    l_map = {'Bangla':'bn', 'English':'en'}
+    c_map = {'Bangladesh':'BD', 'USA':'US', 'UK':'GB', 'India':'IN'}
+    gn = GoogleNewsEngine(lang=l_map.get(lang, 'bn'), country=c_map.get(country, 'BD'))
     try:
+        # Perform Search
         search_result = gn.search(query=query, from_=start_date, to_=end_date)
         entries = search_result.entries
+        if not entries:
+            return None, "⚠️ No articles found. Try changing dates or keywords.", None
         news_data = []
         log_text += f"✅ Found {len(entries)} articles.\n"
+        # Process Results
         for entry in entries:
+            # Safe data extraction
+            pub_date = entry.get('published', 'N/A')[:16]
+            source = entry.get('source', {}).get('title', 'Google News')
             item = {
+                'Date': pub_date,
+                'Source': source,
                 'Title': entry.title,
                 'Link': entry.link
             }
+            # Deep Scrape (Optional)
             if fetch_content:
                 item['Snippet'] = scrape_article_content(entry.link)
+                time.sleep(0.1) # Be polite to servers
             news_data.append(item)
+        # Create Dataframe and CSV
         df = pd.DataFrame(news_data)
         filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
         df.to_csv(filename, index=False, encoding='utf-8-sig')
+        return df, log_text + "🚀 Analysis Complete.", filename
+    except Exception as e:
+        return None, f"❌ System Error: {str(e)}", None
+# --- 4. UI LAYOUT ---
+# NO THEME ARGUMENT to ensure stability
 with gr.Blocks(title="BD News Analyst") as app:
+    with gr.Row():
+        with gr.Column(scale=4):
+            gr.Markdown("# 🇧🇩 BD News Intelligence Tool")
+            gr.Markdown("Search, Filter, and Analyze Bangladeshi News Data in seconds.")
+        with gr.Column(scale=1):
+            pass
+    with gr.Row():
+        # --- LEFT COLUMN: INPUTS ---
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Search Configuration")
+            # The Sophisticated Guide
+            with gr.Accordion("📘 Search Operator Cheat Sheet (Click to Open)", open=True):
+                gr.Markdown(guide_markdown)
+            query_in = gr.Textbox(
+                label="Search Keyword (Supports Boolean Logic)",
+                value="রাজনীতি",
+                placeholder="e.g. অর্থনীতি AND (রিজার্ভ OR ডলার)"
+            )
+            with gr.Row():
+                start_in = gr.Textbox(label="Start Date", value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
+                end_in = gr.Textbox(label="End Date", value=datetime.now().strftime('%Y-%m-%d'))
+            with gr.Row():
+                lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
+                country_in = gr.Dropdown(["Bangladesh", "USA", "UK", "India"], value="Bangladesh", label="Region")
+            fetch_chk = gr.Checkbox(label="Deep Scrape? (Fetch full article text - Slower)", value=False)
+            run_btn = gr.Button("🚀 Run Analysis", variant="primary")
+            status_box = gr.Textbox(label="System Status", interactive=False, lines=2)
+        # --- RIGHT COLUMN: OUTPUTS ---
+        with gr.Column(scale=2):
+            gr.Markdown("### 📊 Search Results")
+            results_df = gr.Dataframe(label="News Data Table", interactive=False, wrap=True)
+            download_btn = gr.File(label="📥 Download CSV Report")
+    # --- EVENTS ---
+    run_btn.click(
+        fn=perform_search,
+        inputs=[query_in, start_in, end_in, lang_in, country_in, fetch_chk],
+        outputs=[results_df, status_box, download_btn]
+    )
 if __name__ == "__main__":
     app.launch()