Arjon07CSE committed on
Commit
ebe173a
·
verified ·
1 Parent(s): 4abd6da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -110
app.py CHANGED
@@ -7,7 +7,7 @@ import time
7
  import urllib.parse
8
  import feedparser
9
 
10
# --- 1. STABLE SEARCH ENGINE (No external libraries) ---
class GoogleNewsEngine:
    """Thin client for the Google News RSS search endpoint."""

    def __init__(self, lang='bn', country='BD'):
        # Google expects a lowercase language code and an uppercase country code.
        self.lang = lang.lower()
        # NOTE(review): this assignment is hidden by a diff hunk in the source;
        # presumably it uppercases the country code — confirm against the full file.
        self.country = country.upper()
        self.BASE_URL = 'https://news.google.com/rss'

    def search(self, query, from_=None, to_=None):
        """Return the parsed RSS feed for *query*, optionally date-bounded.

        `from_`/`to_` map onto Google's inline `after:`/`before:` operators.
        """
        parts = [query]
        if from_:
            parts.append(f"after:{from_}")
        if to_:
            parts.append(f"before:{to_}")

        encoded_query = urllib.parse.quote(" ".join(parts))

        url = (f"{self.BASE_URL}/search?q={encoded_query}"
               f"&hl={self.lang}-{self.country}"
               f"&gl={self.country}"
               f"&ceid={self.country}:{self.lang}")
        return feedparser.parse(url)
27
 
28
# --- 2. CONFIGURATION ---
SESSION_TIMEOUT_SECONDS = 1800  # 30 minutes of inactivity before re-login

# WARNING(security): credentials are hard-coded in source. Move them to
# environment variables or a secrets store before deploying publicly.
AUTH_USERS = [("admin", "admin123"), ("user", "user123")]

# Markdown search guide shown in the UI. Bangla examples repaired:
# "জলোচছ্বা" -> "জলোচ্ছ্বাস" (tidal surge), "সকিব" -> "সাকিব" (Shakib).
guide_text = """
### 🇧🇩 Search Guide
* **AND**: `বিএনপি AND নির্বাচন`
* **OR**: `বন্যা OR জলোচ্ছ্বাস`
* **Exclude**: `ক্রিকেট -সাকিব`
"""
38
 
39
# --- 3. HELPER FUNCTIONS ---
def scrape_article_content(url):
    """Fetch *url* and return up to 500 chars of concatenated <p> text.

    Returns a short status string instead of raising, so one blocked or
    broken site never aborts a whole batch of results.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=4)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = ' '.join(p.get_text() for p in soup.find_all('p'))
            # Truncate long articles so the results table stays readable.
            return text[:500] + "..." if len(text) > 500 else text
        return "Content extraction failed."
    except Exception:
        # Best-effort scrape: swallow network/SSL/parse errors explicitly
        # (the original except clause is hidden by the diff hunk — confirm).
        return "N/A (Scraping Blocked)"
51
 
52
def perform_search(query, start_date, end_date, lang, country, fetch_content):
    """Search Google News and return (DataFrame, log text, CSV filename).

    On failure or when nothing is found, returns (None, message, None).
    """
    log_text = ""
    # Map UI labels to Google locale codes; unknown labels fall back to Bangladesh.
    lang_code = {'Bangla': 'bn', 'English': 'en'}.get(lang, 'bn')
    country_code = {'Bangladesh': 'BD', 'USA': 'US', 'UK': 'GB'}.get(country, 'BD')
    gn = GoogleNewsEngine(lang=lang_code, country=country_code)

    try:
        search_result = gn.search(query=query, from_=start_date, to_=end_date)
        entries = search_result.entries
        if not entries:
            return None, "⚠️ No articles found.", None

        news_data = []
        log_text += f"✅ Found {len(entries)} articles.\n"
        for entry in entries:
            # Feeds occasionally omit published/source — use safe defaults.
            item = {
                'Date': entry.get('published', 'N/A')[:16],
                'Source': entry.get('source', {}).get('title', 'Google News'),
                'Title': entry.title,
                'Link': entry.link,
            }
            if fetch_content:
                item['Snippet'] = scrape_article_content(entry.link)
                time.sleep(0.1)  # throttle scraping between articles
            news_data.append(item)

        df = pd.DataFrame(news_data)
        filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        # utf-8-sig so Excel renders Bangla text correctly.
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        return df, log_text + "🚀 Complete.", filename
    except Exception as e:
        return None, f"❌ Error: {str(e)}", None
81
-
82
# --- 4. VISIBILITY & AUTH LOGIC ---
# Using gr.update() to fix the sticking login screen

def authenticate(username, password):
    """Validate credentials against AUTH_USERS.

    Returns (login_view update, app_view update, session dict). On success the
    login group is hidden and the app group shown; on failure both views are
    left unchanged and the session is cleared (None).
    """
    import hmac  # local import, stdlib only: constant-time string comparison

    for u, p in AUTH_USERS:
        # compare_digest resists timing attacks on the credential check.
        if hmac.compare_digest(username, u) and hmac.compare_digest(password, p):
            return (
                gr.update(visible=False),  # hide login view
                gr.update(visible=True),   # show app view
                {"logged_in": True, "time": time.time(), "user": username},
            )
    return gr.update(), gr.update(), None
95
-
96
def check_session_and_search(query, start, end, lang, country, fetch, session_data):
    """Run a search only when the session is valid; otherwise bounce to login.

    Returns updates for (login_view, app_view, results_df, download_btn,
    status_box, session_state) in that order.
    """
    expired = (
        not session_data
        or not session_data.get("logged_in")
        or (time.time() - session_data.get("time")) > SESSION_TIMEOUT_SECONDS
    )
    if expired:
        # Invalid/stale session: show login, hide app, clear outputs.
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            None, None, "⚠️ Session Expired.", None,
        )

    df, log, csv = perform_search(query, start, end, lang, country, fetch)
    # Valid session: keep the app visible and preserve session state.
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        df, csv, log, session_data,
    )
115
 
116
def manual_logout():
    """Return UI updates that restore the login screen and clear the session."""
    return (
        gr.update(visible=True),   # show login view
        gr.update(visible=False),  # hide app view
        None,                      # wipe session state
        "Logged out.",             # status message
    )
123
 
124
# --- 5. UI LAYOUT (CRASH FIX: Removed 'theme' and 'css') ---
with gr.Blocks(title="BD News Analyst") as app:
    session_state = gr.State()

    # === LOGIN GROUP ===
    with gr.Group(visible=True) as login_view:
        with gr.Row():
            # Empty side columns center the login form.
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=1):
                gr.Markdown("## 🔐 Login Required")
                u_in = gr.Textbox(label="Username")
                p_in = gr.Textbox(label="Password", type="password")
                l_btn = gr.Button("Login", variant="primary")
            with gr.Column(scale=1):
                pass

    # === APP GROUP ===
    with gr.Group(visible=False) as app_view:
        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("# 🇧🇩 BD News Intelligence")
            with gr.Column(scale=1):
                logout_btn = gr.Button("Logout", variant="stop")

        with gr.Row():
            # Controls
            with gr.Column(scale=1):
                gr.Markdown("### Configuration")
                gr.Markdown(guide_text)

                query_in = gr.Textbox(label="Search Keyword", value="বিএনপি")
                with gr.Row():
                    start_in = gr.Textbox(label="Start", value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
                    end_in = gr.Textbox(label="End", value=datetime.now().strftime('%Y-%m-%d'))

                with gr.Row():
                    lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
                    country_in = gr.Dropdown(["Bangladesh", "USA"], value="Bangladesh", label="Region")

                fetch_chk = gr.Checkbox(label="Deep Scrape?", value=False)
                run_btn = gr.Button("Run Analysis", variant="primary")
                status_box = gr.Textbox(label="Status", interactive=False)

            # Results
            with gr.Column(scale=2):
                results_df = gr.Dataframe(label="Results", interactive=False)
                download_btn = gr.File(label="Download CSV")

    # === EVENTS ===
    l_btn.click(authenticate, [u_in, p_in], [login_view, app_view, session_state])
    logout_btn.click(manual_logout, None, [login_view, app_view, session_state, status_box])
    run_btn.click(
        check_session_and_search,
        [query_in, start_in, end_in, lang_in, country_in, fetch_chk, session_state],
        [login_view, app_view, results_df, download_btn, status_box, session_state],
    )

if __name__ == "__main__":
    app.launch()
 
7
  import urllib.parse
8
  import feedparser
9
 
10
# --- 1. ROBUST SEARCH ENGINE ---
class GoogleNewsEngine:
    """Minimal Google News RSS search client built on feedparser."""

    def __init__(self, lang='bn', country='BD'):
        self.lang = lang.lower()
        # NOTE(review): the country assignment is hidden by a diff hunk in the
        # source; assumed to uppercase the code — confirm against the full file.
        self.country = country.upper()
        self.BASE_URL = 'https://news.google.com/rss'

    def search(self, query, from_=None, to_=None):
        """Parse the RSS feed for *query*, optionally bounded by dates."""
        # Construct query with standard Google operators (after:/before:).
        full_query = query
        if from_:
            full_query += f" after:{from_}"
        if to_:
            full_query += f" before:{to_}"

        # URL-encode the query string.
        encoded_query = urllib.parse.quote(full_query)

        # Construct the locale-aware RSS URL.
        url = (f"{self.BASE_URL}/search?q={encoded_query}"
               f"&hl={self.lang}-{self.country}"
               f"&gl={self.country}"
               f"&ceid={self.country}:{self.lang}")
        return feedparser.parse(url)
33
 
34
# --- 2. SOPHISTICATED GUIDE MARKDOWN ---
# Garbled Bangla repaired: "িএপি" -> "বিএনপি", "বন্য" -> "বন্যা",
# "জলোচ্ছ্াস" -> "জলোচ্ছ্বাস" (dropped characters from copy/extraction).
guide_markdown = """
### 🇧🇩 Advanced Search Intelligence (সার্চ গাইড)
Master your queries using these professional operators to filter noise.

| Search Goal | Operator | Example (Copy & Paste) | Explanation |
| :--- | :--- | :--- | :--- |
| **Precise Match** | `AND` | `বিএনপি AND নির্বাচন` | Finds articles containing **BOTH** 'BNP' and 'Election'. |
| **Broad Search** | `OR` | `বন্যা OR জলোচ্ছ্বাস` | Finds articles containing **EITHER** 'Flood' or 'Surge'. |
| **Noise Filtering** | `-` (Minus) | `ক্রিকেট -সাকিব` | Finds 'Cricket' news but **REMOVES** any mention of 'Shakib'. |
| **Exact Phrase** | `""` (Quotes) | `"পদ্মা সেতু"` | Finds the exact sequence of words, not just scattered keywords. |
| **Complex Logic** | `( )` | `(ঢাকা OR চট্টগ্রাম) AND ডেঙ্গু` | Finds Dengue news specifically for **Dhaka OR Chittagong**. |
| **Source Specific** | `site:` | `site:prothomalo.com রাজনীতি` | Finds 'Politics' news **ONLY** from Prothom Alo. |
"""
48
 
49
# --- 3. HELPER FUNCTIONS ---
def scrape_article_content(url):
    """Fetch article text when Deep Scrape is enabled.

    Returns at most 500 characters of joined <p> text, or a short status
    string on failure — the caller treats errors as data, not exceptions.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=4)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Join paragraphs safely; a generator avoids the throwaway list.
            text = ' '.join(p.get_text() for p in soup.find_all('p'))
            return text[:500] + "..." if len(text) > 500 else text
        return "Content extraction failed."
    except Exception:
        # Explicit catch-all by design (the original except clause is hidden
        # by the diff hunk — confirm): scraping is best-effort.
        return "N/A (Scraping Blocked)"
63
 
64
  def perform_search(query, start_date, end_date, lang, country, fetch_content):
65
+ """Main logic to fetch news and return DataFrame."""
66
  log_text = ""
67
+
68
+ # Map friendly names to codes
69
+ l_map = {'Bangla':'bn', 'English':'en'}
70
+ c_map = {'Bangladesh':'BD', 'USA':'US', 'UK':'GB', 'India':'IN'}
71
+
72
+ gn = GoogleNewsEngine(lang=l_map.get(lang, 'bn'), country=c_map.get(country, 'BD'))
73
+
74
  try:
75
+ # Perform Search
76
  search_result = gn.search(query=query, from_=start_date, to_=end_date)
77
  entries = search_result.entries
78
+
79
+ if not entries:
80
+ return None, "⚠️ No articles found. Try changing dates or keywords.", None
81
 
82
  news_data = []
83
  log_text += f"✅ Found {len(entries)} articles.\n"
84
+
85
+ # Process Results
86
  for entry in entries:
87
+ # Safe data extraction
88
+ pub_date = entry.get('published', 'N/A')[:16]
89
+ source = entry.get('source', {}).get('title', 'Google News')
90
+
91
  item = {
92
+ 'Date': pub_date,
93
+ 'Source': source,
94
  'Title': entry.title,
95
  'Link': entry.link
96
  }
97
+
98
+ # Deep Scrape (Optional)
99
  if fetch_content:
100
  item['Snippet'] = scrape_article_content(entry.link)
101
+ time.sleep(0.1) # Be polite to servers
102
+
103
  news_data.append(item)
104
 
105
+ # Create Dataframe and CSV
106
  df = pd.DataFrame(news_data)
107
  filename = f"BD_News_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
108
  df.to_csv(filename, index=False, encoding='utf-8-sig')
109
+
110
+ return df, log_text + "🚀 Analysis Complete.", filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
+ except Exception as e:
113
+ return None, f"❌ System Error: {str(e)}", None
 
 
 
 
 
114
 
115
# --- 4. UI LAYOUT ---
# NO THEME ARGUMENT to ensure stability
with gr.Blocks(title="BD News Analyst") as app:

    # Header row.
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# 🇧🇩 BD News Intelligence Tool")
            gr.Markdown("Search, Filter, and Analyze Bangladeshi News Data in seconds.")
        with gr.Column(scale=1):
            pass  # spacer column

    with gr.Row():
        # --- LEFT COLUMN: INPUTS ---
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Search Configuration")

            # Collapsible operator cheat sheet.
            with gr.Accordion("📘 Search Operator Cheat Sheet (Click to Open)", open=True):
                gr.Markdown(guide_markdown)

            query_in = gr.Textbox(
                label="Search Keyword (Supports Boolean Logic)",
                value="রাজনীতি",
                placeholder="e.g. অর্থনীতি AND (রিজার্ভ OR ডলার)",
            )

            with gr.Row():
                start_in = gr.Textbox(label="Start Date", value=(datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'))
                end_in = gr.Textbox(label="End Date", value=datetime.now().strftime('%Y-%m-%d'))

            with gr.Row():
                lang_in = gr.Dropdown(["Bangla", "English"], value="Bangla", label="Language")
                country_in = gr.Dropdown(["Bangladesh", "USA", "UK", "India"], value="Bangladesh", label="Region")

            fetch_chk = gr.Checkbox(label="Deep Scrape? (Fetch full article text - Slower)", value=False)

            run_btn = gr.Button("🚀 Run Analysis", variant="primary")
            status_box = gr.Textbox(label="System Status", interactive=False, lines=2)

        # --- RIGHT COLUMN: OUTPUTS ---
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Search Results")
            results_df = gr.Dataframe(label="News Data Table", interactive=False, wrap=True)
            download_btn = gr.File(label="📥 Download CSV Report")

    # --- EVENTS ---
    run_btn.click(
        fn=perform_search,
        inputs=[query_in, start_in, end_in, lang_in, country_in, fetch_chk],
        outputs=[results_df, status_box, download_btn],
    )

if __name__ == "__main__":
    app.launch()