pjdevelop committed on
Commit
d5e14e4
·
1 Parent(s): 8f62fb8

Deploy TenderBot to Hugging Face Spaces

Browse files
Files changed (2) hide show
  1. app.py +393 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import requests
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import time
7
+ import random
8
+ from bs4 import BeautifulSoup
9
+ from dateutil.parser import parse
10
+ from datetime import datetime, timedelta
11
+ from requests.adapters import HTTPAdapter
12
+ from urllib3.util.retry import Retry
13
+
14
# ─── 1. OPTIONAL: LLM FOR CORRECTION & PARAPHRASING ────────────────────────────
try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer

    # Loaded once at import time.  Both from_pretrained() calls can fail at
    # runtime (failed model download, missing sentencepiece) — handled below.
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    def correct_text(raw_text: str) -> str:
        """Paraphrase & correct via T5-small, with fallback on error."""
        try:
            prompt = "paraphrase and correct: " + raw_text.strip()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs, max_length=128)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            # Any generation failure degrades gracefully to the raw input.
            return raw_text
except Exception:
    # BUGFIX: `except ImportError` alone missed runtime errors raised by
    # from_pretrained() (e.g. OSError on a failed model download), which
    # would kill the whole app at import time.  Any failure here must
    # instead fall back to a no-op corrector.
    def correct_text(raw_text: str) -> str:
        """Fallback: return the input unchanged when the LLM is unavailable."""
        return raw_text
+
34
# ─── 2. CREATE REQUESTS SESSION WITH RETRY LOGIC ──────────────────────────────
def create_robust_session():
    """Build a requests.Session that transparently retries transient failures.

    Up to 5 attempts with exponential backoff are made for HTTP 429 and
    5xx responses, on both GET and POST, for http and https URLs alike.
    """
    retries = Retry(
        total=5,                                    # total retry attempts
        backoff_factor=1,                           # exponential backoff
        status_forcelist=[429, 500, 502, 503, 504], # retryable status codes
        allowed_methods=["GET", "POST"],            # POST retries included
    )
    retry_adapter = HTTPAdapter(max_retries=retries)

    sess = requests.Session()
    for scheme in ("http://", "https://"):
        sess.mount(scheme, retry_adapter)
    return sess
+
54
# ─── 3. SCRAPER FOR GeM CPPP ────────────────────────────────────────────────────
def scrape_gem_cppp(keyword="", org_name="", start_date=None, end_date=None, max_pages=10):
    """Scrape tender listings from the GeM CPPP portal.

    Parameters
    ----------
    keyword : str
        Free-text filter sent as the tender title search term.
    org_name : str
        Organisation-name filter.
    start_date, end_date : datetime | None
        Optional bounds; sent to the portal as dd-mm-YYYY and also applied
        locally against each tender's parsed closing date.
    max_pages : int
        Hard upper limit on the number of result pages fetched.

    Returns
    -------
    list[dict]
        One dict per tender row (title, organisation, dates, reference id,
        tender link, download link).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://gem.gov.in/cppp',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }

    # Session with transport-level retries for transient HTTP errors.
    session = create_robust_session()

    tenders = []
    page = 1
    total_pages = max_pages

    # BUGFIX: the original handlers did `continue` on Timeout/RequestException
    # without incrementing `page` or counting attempts, so a persistently
    # failing page looped forever.  Cap application-level retries per page.
    max_page_retries = 3
    page_failures = 0

    while page <= total_pages and page <= max_pages:
        try:
            print(f"Fetching page {page} of maximum {max_pages}")

            # Form fields expected by the portal's search endpoint.
            form_data = {
                'page': str(page),
                'tid': '',
                'title': keyword,
                'orgname': org_name,
                'startdate': start_date.strftime('%d-%m-%Y') if start_date else '',
                'enddate': end_date.strftime('%d-%m-%Y') if end_date else '',
                't_outrefid': '',
                'search': '1',
            }

            # Small random delay to avoid tripping the portal's rate limiting.
            time.sleep(random.uniform(0.5, 1.5))

            resp = session.post(
                "https://gem.gov.in/cppp",
                headers=headers,
                data=form_data,
                timeout=(30, 60)  # (connect timeout, read timeout)
            )

            if resp.status_code != 200:
                print(f"Error: Received status code {resp.status_code}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")

            # Results are rendered as the first <table class="table">.
            table = soup.find("table", {"class": "table"})
            if not table:
                print(f"No tender table found on page {page}")
                break

            rows = table.find_all("tr")[1:]  # skip the header row
            if not rows:
                print(f"No tender rows found on page {page}")
                break

            print(f"Found {len(rows)} tender rows on page {page}")

            for row in rows:
                cols = row.find_all("td")
                # Rows with fewer than 8 cells are separators/ads, not tenders.
                if len(cols) < 8:
                    continue

                try:
                    closing = cols[0].get_text(strip=True)
                    opening_date = cols[1].get_text(strip=True)
                    publish_date = cols[2].get_text(strip=True)

                    # The title cell usually wraps the title in an anchor.
                    title_el = cols[3].find("a")
                    title = title_el.get_text(strip=True) if title_el else cols[3].get_text(strip=True)

                    # Resolve relative tender links against the portal root.
                    link = ""
                    if title_el and title_el.has_attr("href"):
                        link = title_el["href"]
                        if link and link.startswith("/"):
                            link = "https://gem.gov.in" + link

                    org = cols[4].get_text(strip=True)

                    # Reference id = title cell text minus the title itself;
                    # fall back to an id-looking "XXX/123" pattern.
                    full_text = cols[3].get_text(strip=True)
                    ref_id = ""
                    if title in full_text:
                        ref_id = full_text.replace(title, "").strip("/").strip()
                    else:
                        id_match = re.search(r'[A-Za-z0-9_-]+/\d+', full_text)
                        if id_match:
                            ref_id = id_match.group(0)

                    dl_el = cols[7].find("a")
                    dl_link = ""
                    if dl_el and dl_el.has_attr("href"):
                        dl_link = dl_el["href"]
                        if dl_link and dl_link.startswith("/"):
                            dl_link = "https://gem.gov.in" + dl_link

                    # Local date filter on the closing date; tenders with an
                    # unparsable date are included rather than silently dropped.
                    try:
                        if closing:
                            cdate = parse(closing)
                            if start_date and cdate < start_date:
                                continue
                            if end_date and cdate > end_date:
                                continue
                    except Exception:
                        pass

                    tenders.append({
                        "Title": title,
                        "Organization": org,
                        "Closing Date": closing,
                        "Opening Date": opening_date,
                        "Published Date": publish_date,
                        "Reference/Tender ID": ref_id,
                        "Tender Link": link,
                        "Download Link": dl_link
                    })

                except Exception as row_err:
                    # One malformed row must not abort the whole page.
                    print(f"Error processing row on page {page}: {row_err}")
                    continue

            # Pagination: look for a "Next" link, and grow total_pages from
            # numbered page links (still bounded by max_pages in the loop test).
            pag = soup.find("ul", {"class": "pagination"})
            next_page_exists = False

            if pag:
                next_link = pag.find("a", string=re.compile(r"Next", re.I))
                if next_link:
                    next_page_exists = True

                # Renamed from `link` to avoid shadowing the tender link above.
                for page_link in pag.find_all("a"):
                    try:
                        page_num = int(page_link.get_text(strip=True))
                        total_pages = max(total_pages, page_num)
                    except (ValueError, TypeError):
                        pass

            if not next_page_exists:
                print(f"No next page found after page {page}")
                break

            # Move to the next page; reset the failure counter for it.
            page += 1
            page_failures = 0

        except requests.Timeout:
            page_failures += 1
            print(f"Timeout error on page {page}. Retrying...")
            if page_failures >= max_page_retries:
                print(f"Giving up on page {page} after {page_failures} timeouts")
                break
            continue

        except requests.RequestException as e:
            page_failures += 1
            print(f"Request error on page {page}: {e}")
            if page_failures >= max_page_retries:
                print(f"Giving up on page {page} after {page_failures} request errors")
                break
            # Wait before retrying the same page.
            time.sleep(5)
            continue

        except Exception as e:
            print(f"Unexpected error on page {page}: {e}")
            break

    print(f"Scraping completed: found {len(tenders)} tenders across {page} pages")
    return tenders
+
239
# ─── 4. SUMMARY GENERATOR (ALL RESULTS) ────────────────────────────────────────
def summarize_tenders(tenders: list[dict]) -> str:
    """Render a Markdown summary of the scraped tenders.

    Tenders are sorted by closing date (newest first) when the dates parse;
    otherwise the incoming order is kept.  Missing keys are now tolerated —
    the original mixed `t.get(...)` with direct `t['Title']`-style access,
    so a partial row raised KeyError.  Output is unchanged for full rows.
    """
    if not tenders:
        return "No tenders were found matching those criteria."

    lines = [f"I found {len(tenders)} tenders matching your criteria:\n"]

    # Sort tenders by closing date (newest first); on any parse failure
    # fall back to the unsorted order.
    try:
        tenders = sorted(tenders,
                         key=lambda x: parse(x.get("Closing Date", "01-01-2000")),
                         reverse=True)
    except Exception:
        pass

    for idx, t in enumerate(tenders, 1):
        title = t.get("Title", "")

        # Title line, rendered as a Markdown link when a URL is available.
        title_line = f"{idx}. "
        if t.get("Tender Link"):
            title_line += f"[{title}]({t['Tender Link']})"
        else:
            title_line += title
        lines.append(title_line)

        lines.append(f" • Organization: {t.get('Organization', '')}")
        lines.append(f" • Closing Date: {t.get('Closing Date', '')}")

        # Optional fields are only shown when non-blank.
        if t.get("Opening Date") and t["Opening Date"].strip():
            lines.append(f" • Opening Date: {t['Opening Date']}")

        if t.get("Published Date") and t["Published Date"].strip():
            lines.append(f" • Published Date: {t['Published Date']}")

        if t.get("Reference/Tender ID") and t["Reference/Tender ID"].strip():
            lines.append(f" • Ref ID: {t['Reference/Tender ID']}")

        if t.get("Download Link") and t["Download Link"].strip():
            lines.append(f" • [Download Tender Document]({t['Download Link']})")

        lines.append("")  # blank line between tenders

    return "\n".join(lines)
+
291
# ─── 5. CHAT FUNCTION ──────────────────────────────────────────────────────────
def chat_fn(user_message: str, history):
    """Turn a free-text chat message into a tender search and summarize hits.

    ``history`` is required by Gradio's ChatInterface signature but unused.
    Extracts an optional date range, organisation, and keyword from the
    message, runs the scraper, and returns a Markdown summary string.
    """
    print(f"User Message: {user_message}")

    try:
        # Optionally paraphrase/correct the message via the LLM helper.
        cleaned = correct_text(user_message)
        print(f"Corrected Text: {cleaned}")

        # --- date range -----------------------------------------------------
        date_patterns = (
            # "from DD/MM/YYYY to DD/MM/YYYY"
            r"from\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+to\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
            # "between DD/MM/YYYY and DD/MM/YYYY"
            r"between\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+and\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
        )

        start_date = end_date = None
        for rx in date_patterns:
            hit = re.search(rx, cleaned, re.I)
            if not hit:
                continue
            try:
                start_date = parse(hit.group(1))
                end_date = parse(hit.group(2))
                print(f"Dates extracted: {start_date} to {end_date}")
                break
            except Exception as e:
                # Try the next pattern on a parse failure.
                print(f"Date parsing error: {e}")

        # --- organisation ---------------------------------------------------
        org_patterns = (
            r"from\s+ministry\s+of\s+(\w+)",
            r"from\s+(\w+)\s+ministry",
            r"by\s+(\w+\s+\w+)",
            r"organization\s+(\w+\s+\w+)",
        )

        lowered = cleaned.lower()
        org = ""
        for rx in org_patterns:
            hit = re.search(rx, lowered)
            if hit:
                org = hit.group(1)
                print(f"Organization extracted: {org}")
                break

        # --- keyword --------------------------------------------------------
        stopwords = {"find", "search", "get", "tenders", "tender", "from", "to",
                     "between", "after", "before", "the", "and", "of", "in"}

        # Prefer an explicit "find/get/search <keyword> tenders" phrase;
        # otherwise keep the non-stopword words of length > 2.
        kw_hit = re.search(r"(?:get|find|search)\s+(.*?)\s+tenders?", lowered)
        if kw_hit:
            keyword = kw_hit.group(1).strip()
        else:
            tokens = re.findall(r"\b\w+\b", lowered)
            keyword = " ".join(w for w in tokens if w not in stopwords and len(w) > 2)

        print(f"Final keyword: '{keyword}'")

        # Run the scraper with whatever filters we managed to extract.
        hits = scrape_gem_cppp(
            keyword=keyword.strip(),
            org_name=org,
            start_date=start_date,
            end_date=end_date,
            max_pages=10,
        )

        bot_reply = summarize_tenders(hits)

    except Exception as e:
        import traceback
        print(f"Error in chat function: {e}")
        print(traceback.format_exc())
        bot_reply = f"Sorry, an error occurred while processing your request: {str(e)}"

    return bot_reply
+
375
# ─── 6. GRADIO APP ─────────────────────────────────────────────────────────────
# Top-level UI definition.  `demo` must remain a module-level name so that
# Hugging Face Spaces can discover and serve it.
with gr.Blocks() as demo:
    gr.Markdown("## Government Tender Search Chatbot")
    gr.Markdown("Ask me to find tenders by keyword, organization, or date range.")
    # Wires chat_fn(user_message, history) -> str into a chat UI with
    # clickable example prompts.
    gr.ChatInterface(
        fn=chat_fn,
        title="TenderBot",
        description="E.g. Search solar panel tenders from 01/06/2025 to 30/06/2025",
        examples=[
            "Find solar panel tenders",
            "Search for IT tenders from Ministry of Defense",
            "Get construction tenders from 01/05/2025 to 30/06/2025"
        ],
    )

if __name__ == "__main__":
    # Launch with appropriate parameters (share=False: Spaces provides hosting).
    demo.launch(debug=True, share=False)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ requests
3
+ beautifulsoup4
4
+ pandas
5
+ python-dateutil
6
+ transformers
7
+ torch
8
+ sentencepiece