pjdevelop committed on
Commit
d5e14e4
·
1 Parent(s): 8f62fb8

Deploy TenderBot to Hugging Face Spaces

Browse files
Files changed (2) hide show
  1. app.py +393 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import requests
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import time
7
+ import random
8
+ from bs4 import BeautifulSoup
9
+ from dateutil.parser import parse
10
+ from datetime import datetime, timedelta
11
+ from requests.adapters import HTTPAdapter
12
+ from urllib3.util.retry import Retry
13
+
14
# ─── 1. OPTIONAL: LLM FOR CORRECTION & PARAPHRASING ────────────────────────────
try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer

    # Loaded once at import time.  Both from_pretrained() calls can fail at
    # runtime (failed model download, missing sentencepiece) — handled below.
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    def correct_text(raw_text: str) -> str:
        """Paraphrase & correct via T5-small, with fallback on error."""
        try:
            prompt = "paraphrase and correct: " + raw_text.strip()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs, max_length=128)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            # Any generation failure degrades gracefully to the raw input.
            return raw_text
except Exception:
    # BUGFIX: `except ImportError` alone missed runtime errors raised by
    # from_pretrained() (e.g. OSError on a failed model download), which
    # would kill the whole app at import time.  Any failure here must
    # instead fall back to a no-op corrector.
    def correct_text(raw_text: str) -> str:
        """Fallback: return the input unchanged when the LLM is unavailable."""
        return raw_text
+
34
# ─── 2. CREATE REQUESTS SESSION WITH RETRY LOGIC ──────────────────────────────
def create_robust_session():
    """Build a requests.Session that transparently retries transient failures.

    Up to 5 attempts with exponential backoff are made for HTTP 429 and
    5xx responses, on both GET and POST, for http and https URLs alike.
    """
    retries = Retry(
        total=5,                                    # total retry attempts
        backoff_factor=1,                           # exponential backoff
        status_forcelist=[429, 500, 502, 503, 504], # retryable status codes
        allowed_methods=["GET", "POST"],            # POST retries included
    )
    retry_adapter = HTTPAdapter(max_retries=retries)

    sess = requests.Session()
    for scheme in ("http://", "https://"):
        sess.mount(scheme, retry_adapter)
    return sess
+
54
# ─── 3. SCRAPER FOR GeM CPPP ────────────────────────────────────────────────────
def scrape_gem_cppp(keyword="", org_name="", start_date=None, end_date=None, max_pages=10):
    """Scrape tender listings from the GeM CPPP portal.

    Parameters
    ----------
    keyword : str
        Free-text filter sent as the tender title search term.
    org_name : str
        Organisation-name filter.
    start_date, end_date : datetime | None
        Optional bounds; sent to the portal as dd-mm-YYYY and also applied
        locally against each tender's parsed closing date.
    max_pages : int
        Hard upper limit on the number of result pages fetched.

    Returns
    -------
    list[dict]
        One dict per tender row (title, organisation, dates, reference id,
        tender link, download link).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://gem.gov.in/cppp',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }

    # Session with transport-level retries for transient HTTP errors.
    session = create_robust_session()

    tenders = []
    page = 1
    total_pages = max_pages

    # BUGFIX: the original handlers did `continue` on Timeout/RequestException
    # without incrementing `page` or counting attempts, so a persistently
    # failing page looped forever.  Cap application-level retries per page.
    max_page_retries = 3
    page_failures = 0

    while page <= total_pages and page <= max_pages:
        try:
            print(f"Fetching page {page} of maximum {max_pages}")

            # Form fields expected by the portal's search endpoint.
            form_data = {
                'page': str(page),
                'tid': '',
                'title': keyword,
                'orgname': org_name,
                'startdate': start_date.strftime('%d-%m-%Y') if start_date else '',
                'enddate': end_date.strftime('%d-%m-%Y') if end_date else '',
                't_outrefid': '',
                'search': '1',
            }

            # Small random delay to avoid tripping the portal's rate limiting.
            time.sleep(random.uniform(0.5, 1.5))

            resp = session.post(
                "https://gem.gov.in/cppp",
                headers=headers,
                data=form_data,
                timeout=(30, 60)  # (connect timeout, read timeout)
            )

            if resp.status_code != 200:
                print(f"Error: Received status code {resp.status_code}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")

            # Results are rendered as the first <table class="table">.
            table = soup.find("table", {"class": "table"})
            if not table:
                print(f"No tender table found on page {page}")
                break

            rows = table.find_all("tr")[1:]  # skip the header row
            if not rows:
                print(f"No tender rows found on page {page}")
                break

            print(f"Found {len(rows)} tender rows on page {page}")

            for row in rows:
                cols = row.find_all("td")
                # Rows with fewer than 8 cells are separators/ads, not tenders.
                if len(cols) < 8:
                    continue

                try:
                    closing = cols[0].get_text(strip=True)
                    opening_date = cols[1].get_text(strip=True)
                    publish_date = cols[2].get_text(strip=True)

                    # The title cell usually wraps the title in an anchor.
                    title_el = cols[3].find("a")
                    title = title_el.get_text(strip=True) if title_el else cols[3].get_text(strip=True)

                    # Resolve relative tender links against the portal root.
                    link = ""
                    if title_el and title_el.has_attr("href"):
                        link = title_el["href"]
                        if link and link.startswith("/"):
                            link = "https://gem.gov.in" + link

                    org = cols[4].get_text(strip=True)

                    # Reference id = title cell text minus the title itself;
                    # fall back to an id-looking "XXX/123" pattern.
                    full_text = cols[3].get_text(strip=True)
                    ref_id = ""
                    if title in full_text:
                        ref_id = full_text.replace(title, "").strip("/").strip()
                    else:
                        id_match = re.search(r'[A-Za-z0-9_-]+/\d+', full_text)
                        if id_match:
                            ref_id = id_match.group(0)

                    dl_el = cols[7].find("a")
                    dl_link = ""
                    if dl_el and dl_el.has_attr("href"):
                        dl_link = dl_el["href"]
                        if dl_link and dl_link.startswith("/"):
                            dl_link = "https://gem.gov.in" + dl_link

                    # Local date filter on the closing date; tenders with an
                    # unparsable date are included rather than silently dropped.
                    try:
                        if closing:
                            cdate = parse(closing)
                            if start_date and cdate < start_date:
                                continue
                            if end_date and cdate > end_date:
                                continue
                    except Exception:
                        pass

                    tenders.append({
                        "Title": title,
                        "Organization": org,
                        "Closing Date": closing,
                        "Opening Date": opening_date,
                        "Published Date": publish_date,
                        "Reference/Tender ID": ref_id,
                        "Tender Link": link,
                        "Download Link": dl_link
                    })

                except Exception as row_err:
                    # One malformed row must not abort the whole page.
                    print(f"Error processing row on page {page}: {row_err}")
                    continue

            # Pagination: look for a "Next" link, and grow total_pages from
            # numbered page links (still bounded by max_pages in the loop test).
            pag = soup.find("ul", {"class": "pagination"})
            next_page_exists = False

            if pag:
                next_link = pag.find("a", string=re.compile(r"Next", re.I))
                if next_link:
                    next_page_exists = True

                # Renamed from `link` to avoid shadowing the tender link above.
                for page_link in pag.find_all("a"):
                    try:
                        page_num = int(page_link.get_text(strip=True))
                        total_pages = max(total_pages, page_num)
                    except (ValueError, TypeError):
                        pass

            if not next_page_exists:
                print(f"No next page found after page {page}")
                break

            # Move to the next page; reset the failure counter for it.
            page += 1
            page_failures = 0

        except requests.Timeout:
            page_failures += 1
            print(f"Timeout error on page {page}. Retrying...")
            if page_failures >= max_page_retries:
                print(f"Giving up on page {page} after {page_failures} timeouts")
                break
            continue

        except requests.RequestException as e:
            page_failures += 1
            print(f"Request error on page {page}: {e}")
            if page_failures >= max_page_retries:
                print(f"Giving up on page {page} after {page_failures} request errors")
                break
            # Wait before retrying the same page.
            time.sleep(5)
            continue

        except Exception as e:
            print(f"Unexpected error on page {page}: {e}")
            break

    print(f"Scraping completed: found {len(tenders)} tenders across {page} pages")
    return tenders
+
239
# ─── 4. SUMMARY GENERATOR (ALL RESULTS) ────────────────────────────────────────
def summarize_tenders(tenders: list[dict]) -> str:
    """Render a Markdown summary of the scraped tenders.

    Tenders are sorted by closing date (newest first) when the dates parse;
    otherwise the incoming order is kept.  Missing keys are now tolerated —
    the original mixed `t.get(...)` with direct `t['Title']`-style access,
    so a partial row raised KeyError.  Output is unchanged for full rows.
    """
    if not tenders:
        return "No tenders were found matching those criteria."

    lines = [f"I found {len(tenders)} tenders matching your criteria:\n"]

    # Sort tenders by closing date (newest first); on any parse failure
    # fall back to the unsorted order.
    try:
        tenders = sorted(tenders,
                         key=lambda x: parse(x.get("Closing Date", "01-01-2000")),
                         reverse=True)
    except Exception:
        pass

    for idx, t in enumerate(tenders, 1):
        title = t.get("Title", "")

        # Title line, rendered as a Markdown link when a URL is available.
        title_line = f"{idx}. "
        if t.get("Tender Link"):
            title_line += f"[{title}]({t['Tender Link']})"
        else:
            title_line += title
        lines.append(title_line)

        lines.append(f" • Organization: {t.get('Organization', '')}")
        lines.append(f" • Closing Date: {t.get('Closing Date', '')}")

        # Optional fields are only shown when non-blank.
        if t.get("Opening Date") and t["Opening Date"].strip():
            lines.append(f" • Opening Date: {t['Opening Date']}")

        if t.get("Published Date") and t["Published Date"].strip():
            lines.append(f" • Published Date: {t['Published Date']}")

        if t.get("Reference/Tender ID") and t["Reference/Tender ID"].strip():
            lines.append(f" • Ref ID: {t['Reference/Tender ID']}")

        if t.get("Download Link") and t["Download Link"].strip():
            lines.append(f" • [Download Tender Document]({t['Download Link']})")

        lines.append("")  # blank line between tenders

    return "\n".join(lines)
+
291
# ─── 5. CHAT FUNCTION ──────────────────────────────────────────────────────────
def chat_fn(user_message: str, history):
    """Turn a free-text chat message into a tender search and summarize hits.

    ``history`` is required by Gradio's ChatInterface signature but unused.
    Extracts an optional date range, organisation, and keyword from the
    message, runs the scraper, and returns a Markdown summary string.
    """
    print(f"User Message: {user_message}")

    try:
        # Optionally paraphrase/correct the message via the LLM helper.
        cleaned = correct_text(user_message)
        print(f"Corrected Text: {cleaned}")

        # --- date range -----------------------------------------------------
        date_patterns = (
            # "from DD/MM/YYYY to DD/MM/YYYY"
            r"from\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+to\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
            # "between DD/MM/YYYY and DD/MM/YYYY"
            r"between\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+and\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
        )

        start_date = end_date = None
        for rx in date_patterns:
            hit = re.search(rx, cleaned, re.I)
            if not hit:
                continue
            try:
                start_date = parse(hit.group(1))
                end_date = parse(hit.group(2))
                print(f"Dates extracted: {start_date} to {end_date}")
                break
            except Exception as e:
                # Try the next pattern on a parse failure.
                print(f"Date parsing error: {e}")

        # --- organisation ---------------------------------------------------
        org_patterns = (
            r"from\s+ministry\s+of\s+(\w+)",
            r"from\s+(\w+)\s+ministry",
            r"by\s+(\w+\s+\w+)",
            r"organization\s+(\w+\s+\w+)",
        )

        lowered = cleaned.lower()
        org = ""
        for rx in org_patterns:
            hit = re.search(rx, lowered)
            if hit:
                org = hit.group(1)
                print(f"Organization extracted: {org}")
                break

        # --- keyword --------------------------------------------------------
        stopwords = {"find", "search", "get", "tenders", "tender", "from", "to",
                     "between", "after", "before", "the", "and", "of", "in"}

        # Prefer an explicit "find/get/search <keyword> tenders" phrase;
        # otherwise keep the non-stopword words of length > 2.
        kw_hit = re.search(r"(?:get|find|search)\s+(.*?)\s+tenders?", lowered)
        if kw_hit:
            keyword = kw_hit.group(1).strip()
        else:
            tokens = re.findall(r"\b\w+\b", lowered)
            keyword = " ".join(w for w in tokens if w not in stopwords and len(w) > 2)

        print(f"Final keyword: '{keyword}'")

        # Run the scraper with whatever filters we managed to extract.
        hits = scrape_gem_cppp(
            keyword=keyword.strip(),
            org_name=org,
            start_date=start_date,
            end_date=end_date,
            max_pages=10,
        )

        bot_reply = summarize_tenders(hits)

    except Exception as e:
        import traceback
        print(f"Error in chat function: {e}")
        print(traceback.format_exc())
        bot_reply = f"Sorry, an error occurred while processing your request: {str(e)}"

    return bot_reply
+
375
# ─── 6. GRADIO APP ─────────────────────────────────────────────────────────────
# Top-level UI definition.  `demo` must remain a module-level name so that
# Hugging Face Spaces can discover and serve it.
with gr.Blocks() as demo:
    gr.Markdown("## Government Tender Search Chatbot")
    gr.Markdown("Ask me to find tenders by keyword, organization, or date range.")
    # Wires chat_fn(user_message, history) -> str into a chat UI with
    # clickable example prompts.
    gr.ChatInterface(
        fn=chat_fn,
        title="TenderBot",
        description="E.g. Search solar panel tenders from 01/06/2025 to 30/06/2025",
        examples=[
            "Find solar panel tenders",
            "Search for IT tenders from Ministry of Defense",
            "Get construction tenders from 01/05/2025 to 30/06/2025"
        ],
    )

if __name__ == "__main__":
    # Launch with appropriate parameters (share=False: Spaces provides hosting).
    demo.launch(debug=True, share=False)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ requests
3
+ beautifulsoup4
4
+ pandas
5
+ python-dateutil
6
+ transformers
7
+ torch
8
+ sentencepiece