Spaces:

Arjon07CSE
/

SPF_News_Scrapper

Runtime error

App Files Files Community

Arjon07CSE committed about 1 month ago

Commit

84faf8a

verified ·

1 Parent(s): ebe173a

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -116

app.py CHANGED Viewed

@@ -1,168 +1,212 @@
 import gradio as gr
 import pandas as pd
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
 import time
-import urllib.parse
-import feedparser
-# --- 1. ROBUST SEARCH ENGINE ---
-class GoogleNewsEngine:
-    def __init__(self, lang='bn', country='BD'):
-        self.lang = lang.lower()
-        self.country = country.upper()
-        self.BASE_URL = 'https://news.google.com/rss'
-    def search(self, query, from_=None, to_=None):
-        # Construct query with standard Google operators
-        full_query = query
-        if from_: full_query += f" after:{from_}"
-        if to_: full_query += f" before:{to_}"
-        # URL Encode
-        encoded_query = urllib.parse.quote(full_query)
-        # Construct RSS URL
-        url = (f"{self.BASE_URL}/search?q={encoded_query}"
-               f"&hl={self.lang}-{self.country}"
-               f"&gl={self.country}"
-               f"&ceid={self.country}:{self.lang}")
-        return feedparser.parse(url)
-# --- 2. SOPHISTICATED GUIDE MARKDOWN ---
-guide_markdown = """
-### 🇧🇩 Advanced Search Intelligence (সার্চ গাইড)
-Master your queries using these professional operators to filter noise.
-| Search Goal | Operator | Example (Copy & Paste) | Explanation |
-| :--- | :--- | :--- | :--- |
-| **Precise Match** | `AND` | `বিএনপি AND নির্বাচন` | Finds articles containing **BOTH** 'BNP' and 'Election'. |
-| **Broad Search** | `OR` | `বন্যা OR জলোচ্ছ্বাস` | Finds articles containing **EITHER** 'Flood' or 'Surge'. |
-| **Noise Filtering** | `-` (Minus) | `ক্রিকেট -সাকিব` | Finds 'Cricket' news but **REMOVES** any mention of 'Shakib'. |
-| **Exact Phrase** | `""` (Quotes) | `"পদ্মা সেতু"` | Finds the exact sequence of words, not just scattered keywords. |
-| **Complex Logic** | `( )` | `(ঢাকা OR চট্টগ্রাম) AND ডেঙ্গু` | Finds Dengue news specifically for **Dhaka OR Chittagong**. |
-| **Source Specific** | `site:` | `site:prothomalo.com রাজনীতি` | Finds 'Politics' news **ONLY** from Prothom Alo. |
-"""
-# --- 3. HELPER FUNCTIONS ---
 def scrape_article_content(url):
-    """Fetches article text if Deep Scrape is enabled."""
     try:
         headers = {'User-Agent': 'Mozilla/5.0'}
         response = requests.get(url, headers=headers, timeout=4)
         if response.status_code == 200:
             soup = BeautifulSoup(response.content, 'html.parser')
-            # Join paragraphs safely
-            text = ' '.join([p.get_text() for p in soup.find_all('p')])
             return text[:500] + "..." if len(text) > 500 else text
         return "Content extraction failed."
     except Exception:
         return "N/A (Scraping Blocked)"
 def perform_search(query, start_date, end_date, lang, country, fetch_content):
-    """Main logic to fetch news and return DataFrame."""
     log_text = ""
-    # Map friendly names to codes
-    l_map = {'Bangla':'bn', 'English':'en'}
-    c_map = {'Bangladesh':'BD', 'USA':'US', 'UK':'GB', 'India':'IN'}
-    gn = GoogleNewsEngine(lang=l_map.get(lang, 'bn'), country=c_map.get(country, 'BD'))
     try:
-        # Perform Search
         search_result = gn.search(query=query, from_=start_date, to_=end_date)
-        entries = search_result.entries
-        if not entries:
-            return None, "⚠️ No articles found. Try changing dates or keywords.", None
         news_data = []
-        log_text += f"✅ Found {len(entries)} articles.\n"
-        # Process Results
-        for entry in entries:
-            # Safe data extraction
-            pub_date = entry.get('published', 'N/A')[:16]
-            source = entry.get('source', {}).get('title', 'Google News')
             item = {
-                'Date': pub_date,
-                'Source': source,
                 'Title': entry.title,
                 'Link': entry.link
             }
-            # Deep Scrape (Optional)
             if fetch_content:
                 item['Snippet'] = scrape_article_content(entry.link)
-                time.sleep(0.1) # Be polite to servers
             news_data.append(item)
-        # Create Dataframe and CSV
         df = pd.DataFrame(news_data)
         filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
         df.to_csv(filename, index=False, encoding='utf-8-sig')
-        return df, log_text + "🚀 Analysis Complete.", filename
     except Exception as e:
         return None, f"❌ System Error: {str(e)}", None
-# --- 4. UI LAYOUT ---
-# NO THEME ARGUMENT to ensure stability
-with gr.Blocks(title="BD News Analyst") as app:
-    with gr.Row():
-        with gr.Column(scale=4):
-            gr.Markdown("# 🇧🇩 BD News Intelligence Tool")
-            gr.Markdown("Search, Filter, and Analyze Bangladeshi News Data in seconds.")
-        with gr.Column(scale=1):
-            pass
-    with gr.Row():
-        # --- LEFT COLUMN: INPUTS ---
-        with gr.Column(scale=1):
-            gr.Markdown("### ⚙️ Search Configuration")
-            # The Sophisticated Guide
-            with gr.Accordion("📘 Search Operator Cheat Sheet (Click to Open)", open=True):
-                gr.Markdown(guide_markdown)
-            query_in = gr.Textbox(
-                label="Search Keyword (Supports Boolean Logic)",
-                value="রাজনীতি",
-                placeholder="e.g. অর্থনীতি AND (রিজার্ভ OR ডলার)"
             )
-            with gr.Row():
-                start_in = gr.Textbox(label="Start Date", value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
-                end_in = gr.Textbox(label="End Date", value=datetime.now().strftime('%Y-%m-%d'))
-            with gr.Row():
-                lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
-                country_in = gr.Dropdown(["Bangladesh", "USA", "UK", "India"], value="Bangladesh", label="Region")
-            fetch_chk = gr.Checkbox(label="Deep Scrape? (Fetch full article text - Slower)", value=False)
-            run_btn = gr.Button("🚀 Run Analysis", variant="primary")
-            status_box = gr.Textbox(label="System Status", interactive=False, lines=2)
-        # --- RIGHT COLUMN: OUTPUTS ---
-        with gr.Column(scale=2):
-            gr.Markdown("### 📊 Search Results")
-            results_df = gr.Dataframe(label="News Data Table", interactive=False, wrap=True)
-            download_btn = gr.File(label="📥 Download CSV Report")
-    # --- EVENTS ---
     run_btn.click(
-        fn=perform_search,
-        inputs=[query_in, start_in, end_in, lang_in, country_in, fetch_chk],
-        outputs=[results_df, status_box, download_btn]
     )
 if __name__ == "__main__":
-    app.launch()

import hmac
import os
import time
from datetime import datetime, timedelta

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pygooglenews import GoogleNews
+# --- CONFIGURATION ---
+SESSION_TIMEOUT_SECONDS = 1800  # 30 Minutes
+AUTH_USERS = [
+    ("admin", "admin123"),
+    ("user", "user123")
+]
# --- BACKEND LOGIC ---
def scrape_article_content(url):
    """Fetch ``url`` and return a short plain-text snippet of its paragraphs.

    The page is requested with a browser-like User-Agent and a 4 second
    timeout. The text of every ``<p>`` element is whitespace-trimmed, empty
    paragraphs are dropped, and the rest is joined with single spaces. Text
    longer than 500 characters is cut to 500 and suffixed with ``"..."``.

    Returns:
        The snippet, ``"Content extraction failed."`` when the response
        status is not 200 or no paragraph text was found, or
        ``"N/A (Scraping Blocked)"`` when any exception is raised.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=4)
        if response.status_code != 200:
            return "Content extraction failed."
        soup = BeautifulSoup(response.content, 'html.parser')
        chunks = (p.get_text(" ", strip=True) for p in soup.find_all('p'))
        text = ' '.join(chunk for chunk in chunks if chunk)
        if not text:
            return "Content extraction failed."
        return text[:500] + "..." if len(text) > 500 else text
    except Exception:
        # Deliberately best-effort: one unreachable or malformed article
        # must not abort the surrounding search.
        return "N/A (Scraping Blocked)"
 def perform_search(query, start_date, end_date, lang, country, fetch_content):
     log_text = ""
+    lang_map = {'Bangla': 'bn', 'English': 'en'}
+    country_map = {'Bangladesh': 'BD', 'USA': 'US', 'UK': 'GB', 'India': 'IN'}
+    gn = GoogleNews(lang=lang_map.get(lang, 'bn'), country=country_map.get(country, 'BD'))
     try:
+        # Validate Dates
+        datetime.strptime(start_date, '%Y-%m-%d')
+        datetime.strptime(end_date, '%Y-%m-%d')
+    except ValueError:
+        return None, "❌ Error: Invalid date format. Please use YYYY-MM-DD.", None
+    try:
+        log_text += f"🔎 Searching: {query} ({start_date} to {end_date})\n"
         search_result = gn.search(query=query, from_=start_date, to_=end_date)
+        entries = search_result['entries']
+        if not entries:
+            return None, "⚠️ No articles found. Try a different keyword or date range.", None
         news_data = []
+        log_text += f"✅ Found {len(entries)} articles. Processing...\n"
+        for i, entry in enumerate(entries):
             item = {
+                'Date': entry.published[:16] if 'published' in entry else 'N/A', # Shorten date string
+                'Source': entry.source['title'],
                 'Title': entry.title,
                 'Link': entry.link
             }
             if fetch_content:
                 item['Snippet'] = scrape_article_content(entry.link)
+                time.sleep(0.1)
             news_data.append(item)
         df = pd.DataFrame(news_data)
+        # Save CSV
         filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
         df.to_csv(filename, index=False, encoding='utf-8-sig')
+        return df, log_text + "🚀 Process Complete.", filename
     except Exception as e:
         return None, f"❌ System Error: {str(e)}", None
+# --- AUTHENTICATION LOGIC ---
+def authenticate(username, password):
+    for valid_user, valid_pass in AUTH_USERS:
+        if username == valid_user and password == valid_pass:
+            return (
+                gr.update(visible=False),
+                gr.update(visible=True),
+                {"logged_in": True, "time": time.time(), "user": username},
+                gr.update(value="") # Clear password
             )
+    return gr.update(), gr.update(), None, gr.update()
+def check_session_and_search(query, start, end, lang, country, fetch, session_data):
+    # 1. Check Login Status
+    if not session_data or not session_data.get("logged_in"):
+        return (gr.update(visible=True), gr.update(visible=False), None, None, "⚠️ Session Expired.", None)
+    # 2. Check Timeout
+    if (time.time() - session_data.get("time")) > SESSION_TIMEOUT_SECONDS:
+        return (gr.update(visible=True), gr.update(visible=False), None, None, "⚠️ Timeout (30m). Log in again.", None)
+    # 3. Perform Search
+    df, log, csv = perform_search(query, start, end, lang, country, fetch)
+    return (gr.update(visible=False), gr.update(visible=True), df, csv, log, session_data)
def manual_logout():
    """Return the updates that end the session from the logout button.

    Returns:
        A 4-tuple: an update showing the login view, an update hiding the
        dashboard, ``None`` to clear the session state, and the text shown in
        the login message area.
    """
    return gr.update(visible=True), gr.update(visible=False), None, "Logged out."
# --- UI THEME & MARKDOWN ---
# Soft base theme with a blue/slate palette and small text and spacing sizes.
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate",
    text_size="sm",
    spacing_size="sm",
)
# Cheat sheet rendered inside the search-guide accordion. Multi-word terms in
# the Exclude example are quoted: an unquoted `-` applies only to the single
# word that follows it.
guide_markdown = """
### 🇧🇩 Search Logic Guide (সার্চ গাইড)
Create powerful filters using these operators.

| Goal | Operator | Example (Copy & Paste) | Description |
| :--- | :--- | :--- | :--- |
| **Both Required** | `AND` | `বিএনপি AND নির্বাচন` | Finds articles containing **both** keywords. |
| **Either One** | `OR` | `বন্যা OR জলোচ্ছ্বাস` | Finds articles containing **either** word. |
| **Exclude** | `-` | `"আওয়ামী লীগ" -"শেখ হাসিনা"` | Finds 'Awami League' but **removes** articles mentioning 'Sheikh Hasina'. Quote multi-word terms. |
| **Exact Phrase** | `""` | `"পদ্মা সেতু"` | Finds the exact phrase 'Padma Bridge', not just the separate words. |
| **Complex** | `()` | `(ঢাকা OR চট্টগ্রাম) AND ডেঙ্গু` | Finds Dengue news specifically for Dhaka or Chittagong. |
"""
# --- MAIN APP LAYOUT ---
# Two stacked views share one page: the login column is visible at start and
# the dashboard column is hidden until `authenticate` swaps their visibility.
with gr.Blocks(theme=theme, title="BD News Analyst Pro", css="footer {visibility: hidden}") as app:
    # Holds the session dict produced by `authenticate`; None when logged out.
    session_state = gr.State()

    # === LOGIN VIEW ===
    with gr.Column(visible=True) as login_view:
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                pass  # Spacer
            with gr.Column(scale=1):
                gr.Markdown("## 🔐 News Analyst Pro \n Please login to access the dashboard.")
                u_in = gr.Textbox(label="Username", placeholder="Enter username")
                p_in = gr.Textbox(label="Password", type="password", placeholder="Enter password")
                l_btn = gr.Button("Login", variant="primary")
                l_msg = gr.Markdown("")  # receives the logout message
            with gr.Column(scale=1):
                pass  # Spacer

    # === DASHBOARD VIEW ===
    with gr.Column(visible=False) as app_view:
        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("# 🇧🇩 Bangladesh News Intelligence Tool")
            with gr.Column(scale=1):
                logout_btn = gr.Button("🚪 Logout", variant="stop", size="sm")

        with gr.Row():
            # --- Left Panel: Controls ---
            with gr.Column(scale=1, variant="panel"):
                gr.Markdown("### ⚙️ Search Configuration")
                # Search Guide Accordion
                with gr.Accordion("📘 How to Search (Click to Expand)", open=True):
                    gr.Markdown(guide_markdown)
                query_in = gr.Textbox(
                    label="Search Keyword (Supports Boolean)",
                    placeholder="e.g. অর্থনীতি AND (রিজার্ভ OR ডলার)",
                    lines=2,
                    value="রাজনীতি",
                )
                with gr.Row():
                    # Default range: the 30 days before the app started.
                    start_in = gr.Textbox(
                        label="Start Date",
                        value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),
                    )
                    end_in = gr.Textbox(label="End Date", value=datetime.now().strftime('%Y-%m-%d'))
                with gr.Row():
                    lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
                    country_in = gr.Dropdown(
                        ["Bangladesh", "USA", "UK", "India"],
                        value="Bangladesh",
                        label="Region",
                    )
                fetch_chk = gr.Checkbox(label="Fetch Full Content? (Slower)", value=False)
                run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
                status_box = gr.Textbox(label="System Status", interactive=False, lines=4)

            # --- Right Panel: Results ---
            with gr.Column(scale=2):
                gr.Markdown("### 📊 Search Results")
                results_df = gr.Dataframe(
                    label="News Data",
                    interactive=False,
                    wrap=True,
                    headers=["Date", "Source", "Title", "Link", "Snippet"],
                )
                download_btn = gr.File(label="📥 Download CSV Report")

    # === INTERACTIONS ===
    l_btn.click(authenticate, [u_in, p_in], [login_view, app_view, session_state, p_in])
    logout_btn.click(manual_logout, None, [login_view, app_view, session_state, l_msg])
    run_btn.click(
        check_session_and_search,
        inputs=[query_in, start_in, end_in, lang_in, country_in, fetch_chk, session_state],
        outputs=[login_view, app_view, results_df, download_btn, status_box, session_state],
    )

if __name__ == "__main__":
    # No share=True: Hugging Face Spaces does not support share links, and a
    # local run would otherwise publish a public tunnel to this app.
    app.launch()