Arjon07CSE committed on
Commit
84faf8a
·
verified ·
1 Parent(s): ebe173a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -116
app.py CHANGED
@@ -1,168 +1,212 @@
1
  import gradio as gr
 
2
  import pandas as pd
3
  from datetime import datetime, timedelta
4
  import requests
5
  from bs4 import BeautifulSoup
6
  import time
7
- import urllib.parse
8
- import feedparser
9
-
10
# --- 1. ROBUST SEARCH ENGINE ---
class GoogleNewsEngine:
    """Minimal client for the Google News RSS search endpoint.

    Builds a locale-aware RSS search URL and hands it to ``feedparser``.
    """

    def __init__(self, lang='bn', country='BD'):
        # Normalise casing once so every URL fragment is well-formed.
        self.lang = lang.lower()
        self.country = country.upper()
        self.BASE_URL = 'https://news.google.com/rss'

    def search(self, query, from_=None, to_=None):
        """Run a search, optionally bounded by Google's after:/before: operators.

        Returns the parsed feed object produced by ``feedparser.parse``.
        """
        # Assemble the query with standard Google date operators.
        pieces = [query]
        if from_:
            pieces.append(f"after:{from_}")
        if to_:
            pieces.append(f"before:{to_}")

        # Percent-encode the whole query for safe URL embedding.
        encoded = urllib.parse.quote(" ".join(pieces))

        locale = f"{self.lang}-{self.country}"
        url = (
            f"{self.BASE_URL}/search?q={encoded}"
            f"&hl={locale}"
            f"&gl={self.country}"
            f"&ceid={self.country}:{self.lang}"
        )
        return feedparser.parse(url)
33
 
34
# --- 2. SOPHISTICATED GUIDE MARKDOWN ---
# Markdown cheat-sheet rendered verbatim in the UI accordion; the example
# queries are in Bangla to match the target audience.  Rendered by Gradio,
# so the table uses standard GitHub-flavoured Markdown syntax.
guide_markdown = """
### 🇧🇩 Advanced Search Intelligence (সার্চ গাইড)
Master your queries using these professional operators to filter noise.

| Search Goal | Operator | Example (Copy & Paste) | Explanation |
| :--- | :--- | :--- | :--- |
| **Precise Match** | `AND` | `বিএনপি AND নির্বাচন` | Finds articles containing **BOTH** 'BNP' and 'Election'. |
| **Broad Search** | `OR` | `বন্যা OR জলোচ্ছ্বাস` | Finds articles containing **EITHER** 'Flood' or 'Surge'. |
| **Noise Filtering** | `-` (Minus) | `ক্রিকেট -সাকিব` | Finds 'Cricket' news but **REMOVES** any mention of 'Shakib'. |
| **Exact Phrase** | `""` (Quotes) | `"পদ্মা সেতু"` | Finds the exact sequence of words, not just scattered keywords. |
| **Complex Logic** | `( )` | `(ঢাকা OR চট্টগ্রাম) AND ডেঙ্গু` | Finds Dengue news specifically for **Dhaka OR Chittagong**. |
| **Source Specific** | `site:` | `site:prothomalo.com রাজনীতি` | Finds 'Politics' news **ONLY** from Prothom Alo. |
"""
48
 
49
# --- 3. HELPER FUNCTIONS ---
def scrape_article_content(url):
    """Fetch *url* and return up to 500 characters of its paragraph text.

    Returns a short diagnostic string when the page is unreachable,
    returns a non-200 status, or the request raises (blocked / timeout).
    """
    try:
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=4)
        if resp.status_code != 200:
            return "Content extraction failed."
        soup = BeautifulSoup(resp.content, 'html.parser')
        # Collapse every <p> into one space-joined string.
        text = ' '.join(p.get_text() for p in soup.find_all('p'))
        if len(text) > 500:
            return text[:500] + "..."
        return text
    except Exception:
        # Any network/parse failure degrades to a sentinel, never a crash.
        return "N/A (Scraping Blocked)"
63
 
64
def perform_search(query, start_date, end_date, lang, country, fetch_content):
    """Fetch Google News results and package them for the Gradio outputs.

    Parameters
    ----------
    query : str            -- search string (supports Google boolean operators)
    start_date, end_date : str -- inclusive bounds, expected 'YYYY-MM-DD'
    lang, country : str    -- UI-friendly names mapped to locale codes below
    fetch_content : bool   -- if True, deep-scrape each article body (slow)

    Returns ``(DataFrame, status_log, csv_filename)`` on success and
    ``(None, error_message, None)`` on any failure, so the three Gradio
    outputs (table, status box, file) can always be filled uniformly.
    """
    log_text = ""

    # Map friendly names to Google locale codes.
    l_map = {'Bangla': 'bn', 'English': 'en'}
    c_map = {'Bangladesh': 'BD', 'USA': 'US', 'UK': 'GB', 'India': 'IN'}

    # Fail fast on malformed dates instead of silently embedding them in
    # the query string, where Google would just ignore the bad operators.
    for candidate in (start_date, end_date):
        try:
            datetime.strptime(candidate, '%Y-%m-%d')
        except (TypeError, ValueError):
            return None, "❌ Error: Invalid date format. Please use YYYY-MM-DD.", None

    gn = GoogleNewsEngine(lang=l_map.get(lang, 'bn'), country=c_map.get(country, 'BD'))

    try:
        # Perform the RSS search.
        search_result = gn.search(query=query, from_=start_date, to_=end_date)
        entries = search_result.entries

        if not entries:
            return None, "⚠️ No articles found. Try changing dates or keywords.", None

        news_data = []
        log_text += f"✅ Found {len(entries)} articles.\n"

        for entry in entries:
            # Safe extraction: RSS items may omit any of these fields.
            pub_date = entry.get('published', 'N/A')[:16]
            source = entry.get('source', {}).get('title', 'Google News')

            item = {
                'Date': pub_date,
                'Source': source,
                'Title': entry.title,
                'Link': entry.link
            }

            # Deep scrape (optional, slower).
            if fetch_content:
                item['Snippet'] = scrape_article_content(entry.link)
                time.sleep(0.1)  # be polite to the article servers

            news_data.append(item)

        # Materialise results and write a UTF-8-with-BOM CSV (Excel-friendly).
        df = pd.DataFrame(news_data)
        filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False, encoding='utf-8-sig')

        return df, log_text + "🚀 Analysis Complete.", filename

    except Exception as e:
        # Broad catch is deliberate: any backend failure must surface as a
        # friendly status message rather than a stack trace in the UI.
        return None, f"❌ System Error: {str(e)}", None
114
 
115
# --- 4. UI LAYOUT ---
# NO THEME ARGUMENT to ensure stability
# Declarative Gradio layout: header row, then a two-column body
# (search controls on the left, results table + CSV download on the right).
# All behaviour is wired through a single run_btn.click event.
with gr.Blocks(title="BD News Analyst") as app:

    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# 🇧🇩 BD News Intelligence Tool")
            gr.Markdown("Search, Filter, and Analyze Bangladeshi News Data in seconds.")
        with gr.Column(scale=1):
            pass  # spacer column to keep the header left-aligned

    with gr.Row():
        # --- LEFT COLUMN: INPUTS ---
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Search Configuration")

            # The Sophisticated Guide (operator cheat-sheet, open by default).
            with gr.Accordion("📘 Search Operator Cheat Sheet (Click to Open)", open=True):
                gr.Markdown(guide_markdown)

            query_in = gr.Textbox(
                label="Search Keyword (Supports Boolean Logic)",
                value="রাজনীতি",
                placeholder="e.g. অর্থনীতি AND (রিজার্ভ OR ডলার)"
            )

            # Default window: last 30 days up to today.
            with gr.Row():
                start_in = gr.Textbox(label="Start Date", value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
                end_in = gr.Textbox(label="End Date", value=datetime.now().strftime('%Y-%m-%d'))

            with gr.Row():
                lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
                country_in = gr.Dropdown(["Bangladesh", "USA", "UK", "India"], value="Bangladesh", label="Region")

            fetch_chk = gr.Checkbox(label="Deep Scrape? (Fetch full article text - Slower)", value=False)

            run_btn = gr.Button("🚀 Run Analysis", variant="primary")
            status_box = gr.Textbox(label="System Status", interactive=False, lines=2)

        # --- RIGHT COLUMN: OUTPUTS ---
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Search Results")
            results_df = gr.Dataframe(label="News Data Table", interactive=False, wrap=True)
            download_btn = gr.File(label="📥 Download CSV Report")

    # --- EVENTS ---
    # perform_search returns (df, log, filename) matching these outputs.
    run_btn.click(
        fn=perform_search,
        inputs=[query_in, start_in, end_in, lang_in, country_in, fetch_chk],
        outputs=[results_df, status_box, download_btn]
    )

if __name__ == "__main__":
    app.launch()
 
1
  import gradio as gr
2
+ from pygooglenews import GoogleNews
3
  import pandas as pd
4
  from datetime import datetime, timedelta
5
  import requests
6
  from bs4 import BeautifulSoup
7
  import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
# --- CONFIGURATION ---
import os  # local to this block: only used to read credential overrides

# Idle time after which a logged-in session is forcibly expired.
SESSION_TIMEOUT_SECONDS = 1800  # 30 Minutes

# (username, password) pairs accepted by the login screen.
# Hard-coded plaintext credentials are a security smell for a deployed app,
# so each value may be overridden via environment variables; the defaults
# keep behavior identical when no override is set.
AUTH_USERS = [
    (os.environ.get("APP_ADMIN_USER", "admin"), os.environ.get("APP_ADMIN_PASS", "admin123")),
    (os.environ.get("APP_USER_USER", "user"), os.environ.get("APP_USER_PASS", "user123")),
]
 
16
+ # --- BACKEND LOGIC ---
 
 
 
 
 
 
 
 
17
 
 
18
def scrape_article_content(url):
    """Download *url* and return its concatenated <p> text, capped at 500 chars.

    Falls back to a short diagnostic string on a non-200 response or any
    request/parse failure, so callers never have to handle exceptions.
    """
    try:
        page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=4)
        if page.status_code == 200:
            parsed = BeautifulSoup(page.content, 'html.parser')
            body = ' '.join(node.get_text() for node in parsed.find_all('p'))
            # Truncate long bodies to keep the results table readable.
            return body[:500] + "..." if len(body) > 500 else body
        return "Content extraction failed."
    except Exception:
        # Network errors, timeouts and bot-blocks all degrade to a sentinel.
        return "N/A (Scraping Blocked)"
31
 
32
def perform_search(query, start_date, end_date, lang, country, fetch_content):
    """Fetch Google News results and package them for the Gradio outputs.

    Parameters
    ----------
    query : str            -- search string (supports Google boolean operators)
    start_date, end_date : str -- inclusive bounds, expected 'YYYY-MM-DD'
    lang, country : str    -- UI-friendly names mapped to locale codes below
    fetch_content : bool   -- if True, deep-scrape each article body (slow)

    Returns ``(DataFrame, status_log, csv_filename)`` on success and
    ``(None, error_message, None)`` on any failure, so the three Gradio
    outputs (table, status box, file) can always be filled uniformly.
    """
    log_text = ""
    lang_map = {'Bangla': 'bn', 'English': 'en'}
    country_map = {'Bangladesh': 'BD', 'USA': 'US', 'UK': 'GB', 'India': 'IN'}

    # Validate dates BEFORE constructing the client: fail fast with a
    # friendly message.  TypeError is included so a None date does not
    # escape as an unhandled exception.
    try:
        datetime.strptime(start_date, '%Y-%m-%d')
        datetime.strptime(end_date, '%Y-%m-%d')
    except (TypeError, ValueError):
        return None, "❌ Error: Invalid date format. Please use YYYY-MM-DD.", None

    gn = GoogleNews(lang=lang_map.get(lang, 'bn'), country=country_map.get(country, 'BD'))

    try:
        log_text += f"🔎 Searching: {query} ({start_date} to {end_date})\n"
        search_result = gn.search(query=query, from_=start_date, to_=end_date)
        entries = search_result['entries']

        if not entries:
            return None, "⚠️ No articles found. Try a different keyword or date range.", None

        news_data = []
        log_text += f"✅ Found {len(entries)} articles. Processing...\n"

        for entry in entries:
            # Safe extraction: one malformed RSS item (e.g. missing
            # 'source') must not abort the whole search via the broad
            # except below.  pygooglenews entries are feedparser dicts,
            # so .get() is available — presumably always; verify if the
            # library is upgraded.
            item = {
                'Date': entry.published[:16] if 'published' in entry else 'N/A',  # shorten date string
                'Source': entry.get('source', {}).get('title', 'Google News'),
                'Title': entry.title,
                'Link': entry.link
            }

            if fetch_content:
                item['Snippet'] = scrape_article_content(entry.link)
                time.sleep(0.1)  # be polite to the article servers

            news_data.append(item)

        df = pd.DataFrame(news_data)

        # Save CSV with a BOM so Excel renders Bangla text correctly.
        filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(filename, index=False, encoding='utf-8-sig')

        return df, log_text + "🚀 Process Complete.", filename

    except Exception as e:
        # Broad catch is deliberate: backend failures surface as a status
        # message in the UI rather than a stack trace.
        return None, f"❌ System Error: {str(e)}", None
80
 
81
# --- AUTHENTICATION LOGIC ---

def authenticate(username, password):
    """Validate credentials against AUTH_USERS.

    On success returns Gradio updates that hide the login view, show the
    dashboard, start a session dict ({'logged_in', 'time', 'user'}) and
    clear the password box.  On failure every component is left untouched
    and the session state is reset to None.
    """
    import hmac  # local import: used only here, keeps top-of-file imports unchanged

    for valid_user, valid_pass in AUTH_USERS:
        # compare_digest performs a constant-time comparison, closing the
        # (theoretical) timing side-channel of a plain == password check.
        user_ok = hmac.compare_digest(str(username), str(valid_user))
        pass_ok = hmac.compare_digest(str(password), str(valid_pass))
        if user_ok and pass_ok:
            return (
                gr.update(visible=False),   # hide login view
                gr.update(visible=True),    # show dashboard
                {"logged_in": True, "time": time.time(), "user": username},
                gr.update(value="")         # Clear password
            )
    return gr.update(), gr.update(), None, gr.update()
93
+
94
def check_session_and_search(query, start, end, lang, country, fetch, session_data):
    """Gate the search behind a valid, non-expired session, then run it.

    Returns a 6-tuple of updates for
    (login_view, app_view, results_df, download_btn, status_box, session_state).
    """
    back_to_login = (gr.update(visible=True), gr.update(visible=False))

    # 1. Not logged in at all -> bounce back to the login view.
    if not session_data or not session_data.get("logged_in"):
        return back_to_login + (None, None, "⚠️ Session Expired.", None)

    # 2. Session older than the allowed idle window -> force re-login.
    elapsed = time.time() - session_data.get("time")
    if elapsed > SESSION_TIMEOUT_SECONDS:
        return back_to_login + (None, None, "⚠️ Timeout (30m). Log in again.", None)

    # 3. Valid session: run the search and keep the dashboard visible.
    df, log, csv = perform_search(query, start, end, lang, country, fetch)
    return (gr.update(visible=False), gr.update(visible=True), df, csv, log, session_data)
106
+
107
def manual_logout():
    """Explicit logout: show the login view, hide the dashboard, drop the session."""
    show_login = gr.update(visible=True)
    hide_dashboard = gr.update(visible=False)
    # Order matches outputs: (login_view, app_view, session_state, l_msg).
    return show_login, hide_dashboard, None, "Logged out."
109
 
110
# --- UI THEME & MARKDOWN ---

# Custom Theme for a Professional Look
# (Soft base theme with compact sizing; applied to gr.Blocks below.)
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate",
    text_size="sm",
    spacing_size="sm",
)

# Operator cheat-sheet rendered verbatim in the dashboard accordion;
# example queries are in Bangla to match the target audience.
guide_markdown = """
### 🇧🇩 Search Logic Guide (সার্চ গাইড)
Create powerful filters using these operators.

| Goal | Operator | Example (Copy & Paste) | Description |
| :--- | :--- | :--- | :--- |
| **Both Required** | `AND` | `বিএনপি AND নির্বাচন` | Finds articles containing **both** keywords. |
| **Either One** | `OR` | `বন্যা OR জলোচ্ছ্বাস` | Finds articles containing **either** word. |
| **Exclude** | `-` | `আওয়ামী লীগ -শেখ হাসিনা` | Finds 'Awami League' but **removes** articles mentioning 'Sheikh Hasina'. |
| **Exact Phrase** | `""` | `"পদ্মা সেতু"` | Finds the exact phrase 'Padma Bridge', not just the separate words. |
| **Complex** | `()` | `(ঢাকা OR চট্টগ্রাম) AND ডেঙ্গু` | Finds Dengue news specifically for Dhaka or Chittagong. |
"""
132
+
133
# --- MAIN APP LAYOUT ---
# Two mutually-exclusive top-level views inside one Blocks app:
# a login view (visible first) and the dashboard (revealed after auth).
# Visibility swaps are driven by the authenticate / logout / search handlers.

with gr.Blocks(theme=theme, title="BD News Analyst Pro", css="footer {visibility: hidden}") as app:

    # Per-browser-session auth state: {'logged_in', 'time', 'user'} or None.
    session_state = gr.State()

    # === LOGIN VIEW ===
    with gr.Column(visible=True) as login_view:
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                pass # Spacer
            with gr.Column(scale=1):
                gr.Markdown("## 🔐 News Analyst Pro \n Please login to access the dashboard.")
                u_in = gr.Textbox(label="Username", placeholder="Enter username")
                p_in = gr.Textbox(label="Password", type="password", placeholder="Enter password")
                l_btn = gr.Button("Login", variant="primary")
                l_msg = gr.Markdown("")
            with gr.Column(scale=1):
                pass # Spacer

    # === DASHBOARD VIEW ===
    with gr.Column(visible=False) as app_view:
        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("# 🇧🇩 Bangladesh News Intelligence Tool")
            with gr.Column(scale=1):
                logout_btn = gr.Button("🚪 Logout", variant="stop", size="sm")

        with gr.Row():
            # --- Left Panel: Controls ---
            with gr.Column(scale=1, variant="panel"):
                gr.Markdown("### ⚙️ Search Configuration")

                # Search Guide Accordion (operator cheat-sheet, open by default)
                with gr.Accordion("📘 How to Search (Click to Expand)", open=True):
                    gr.Markdown(guide_markdown)

                query_in = gr.Textbox(
                    label="Search Keyword (Supports Boolean)",
                    placeholder="e.g. অর্থনীতি AND (রিজার্ভ OR ডলার)",
                    lines=2,
                    value="রাজনীতি"
                )

                # Default window: last 30 days up to today.
                with gr.Row():
                    start_in = gr.Textbox(label="Start Date", value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
                    end_in = gr.Textbox(label="End Date", value=datetime.now().strftime('%Y-%m-%d'))

                with gr.Row():
                    lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
                    country_in = gr.Dropdown(["Bangladesh", "USA", "UK", "India"], value="Bangladesh", label="Region")

                fetch_chk = gr.Checkbox(label="Fetch Full Content? (Slower)", value=False)

                run_btn = gr.Button("🚀 Run Analysis", variant="primary", size="lg")
                status_box = gr.Textbox(label="System Status", interactive=False, lines=4)

            # --- Right Panel: Results ---
            with gr.Column(scale=2):
                gr.Markdown("### 📊 Search Results")
                results_df = gr.Dataframe(
                    label="News Data",
                    interactive=False,
                    wrap=True,
                    headers=["Date", "Source", "Title", "Link", "Snippet"]
                )
                download_btn = gr.File(label="📥 Download CSV Report")

    # === INTERACTIONS ===
    # Handler output tuples must match these output component lists in order.
    l_btn.click(authenticate, [u_in, p_in], [login_view, app_view, session_state, p_in])
    logout_btn.click(manual_logout, None, [login_view, app_view, session_state, l_msg])

    run_btn.click(
        check_session_and_search,
        inputs=[query_in, start_in, end_in, lang_in, country_in, fetch_chk, session_state],
        outputs=[login_view, app_view, results_df, download_btn, status_box, session_state]
    )

if __name__ == "__main__":
    app.launch(share=True)