import logging
import re
from io import BytesIO

import requests
from bs4 import BeautifulSoup

# Module-wide logging: timestamped INFO-level messages to the root handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Browser-like User-Agent: the PPRA site rejects default library UAs.
_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )
}


def chunk_text(text, max_tokens=3000, chars_per_token=4):
    """Split text into smaller chunks that won't exceed token limits.

    Splits on paragraph boundaries (blank lines). The size cap is an
    approximation: ``max_tokens * chars_per_token`` characters. A single
    paragraph longer than the cap becomes its own oversized chunk rather
    than being cut mid-paragraph.

    Args:
        text: The text to split.
        max_tokens: Approximate token budget per chunk.
        chars_per_token: Assumed average characters per token.

    Returns:
        list[str]: Chunks, each ending with a trailing blank line.
    """
    max_chars = max_tokens * chars_per_token
    chunks = []
    current_chunk = ""
    for para in text.split('\n\n'):
        if len(current_chunk) + len(para) < max_chars:
            current_chunk += para + "\n\n"
        else:
            # Fix: only flush a non-empty chunk. Previously, a first
            # paragraph larger than max_chars appended an empty string.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = para + "\n\n"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def fetch_active_tenders():
    """Fetch active tenders from PPRA website with error handling.

    Scrapes https://www.ppra.org.pk/dad_tenders.asp, locating the tender
    table by header text, then by class, then by largest row count.

    Returns:
        list[dict]: Dicts with keys ``title``, ``department``,
        ``closing_date`` and ``link``; empty list on any failure.
    """
    try:
        logger.info("Fetching active tenders from PPRA website")
        url = "https://www.ppra.org.pk/dad_tenders.asp"
        response = requests.get(url, headers=_HEADERS, timeout=15)
        if response.status_code != 200:
            logger.error(
                f"Failed to fetch tenders. Status code: {response.status_code}"
            )
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        tables = soup.find_all("table")

        # Strategy 1: table whose header row mentions "Tender Description".
        target_table = None
        for table in tables:
            first_row = table.find("tr")
            if first_row and first_row.find("th") and "Tender Description" in first_row.text:
                target_table = table
                break

        # Strategy 2: a table explicitly marked with the "data" class.
        if not target_table:
            target_table = soup.find("table", {"class": "data"})

        # Strategy 3 (last resort): the table with the most rows.
        if not target_table:
            tables = sorted(tables, key=lambda t: len(t.find_all("tr")), reverse=True)
            target_table = tables[0] if tables else None

        if not target_table:
            logger.error("Could not find tender table on the page")
            return []

        rows = target_table.find_all("tr")[1:]  # Skip header row
        tenders = []
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 5:
                link_element = cols[4].find("a")
                link = ""
                if link_element and link_element.has_attr("href"):
                    # Hrefs on the site are relative; prefix the base URL.
                    link = "https://www.ppra.org.pk/" + link_element["href"].strip()
                tender = {
                    "title": cols[0].text.strip(),
                    "department": cols[1].text.strip(),
                    "closing_date": cols[3].text.strip(),
                    "link": link,
                }
                tenders.append(tender)

        logger.info(f"Successfully fetched {len(tenders)} tenders")
        return tenders
    except Exception as e:
        logger.error(f"Error fetching tenders: {str(e)}")
        return []


def fetch_tender_details(url):
    """Fetch and parse tender details from the provided URL.

    Handles both PDF links (first 3 pages extracted via pdfplumber, if
    installed) and HTML pages (scripts/styles stripped, text flattened).

    Args:
        url: Absolute URL of the tender detail page or PDF.

    Returns:
        str: Up to 5000 characters of extracted text, or a human-readable
        error/fallback message. Never raises.
    """
    try:
        logger.info(f"Fetching tender details from {url}")
        response = requests.get(url, headers=_HEADERS, timeout=15)
        if response.status_code != 200:
            logger.error(
                f"Failed to fetch tender details. Status code: {response.status_code}"
            )
            return "Could not fetch details from this link."

        # Check if it's a PDF (by extension or by Content-Type header).
        if url.lower().endswith('.pdf') or 'application/pdf' in response.headers.get('Content-Type', ''):
            logger.info("PDF detected, extracting text preview")
            try:
                import pdfplumber
                # Fix: context manager ensures the PDF handle is closed.
                with pdfplumber.open(BytesIO(response.content)) as pdf:
                    pages = pdf.pages[:3]  # Preview: first few pages only
                    text = "".join(page.extract_text() or "" for page in pages)
                return text[:5000]  # Return first 5000 chars
            except ImportError:
                return "PDF document detected. Install pdfplumber to extract content."

        # HTML content
        soup = BeautifulSoup(response.text, "html.parser")

        # Remove script and style elements before extracting visible text.
        for script in soup(["script", "style"]):
            script.decompose()

        text = soup.get_text(separator="\n", strip=True)
        # Clean up text - collapse runs of blank lines into one.
        text = re.sub(r'\n\s*\n', '\n\n', text)

        logger.info(f"Successfully fetched tender details ({len(text)} chars)")
        return text[:5000]  # Limit to 5000 chars to prevent token issues
    except Exception as e:
        logger.error(f"Error fetching tender details: {str(e)}")
        return f"Could not fetch details from this link. Error: {str(e)}"


def get_ppra_resources():
    """Return PPRA resources dictionary.

    Returns:
        dict[str, str]: Human-readable resource names mapped to absolute
        URLs on ppra.org.pk (pages and PDF documents).
    """
    return {
        "Home": "https://www.ppra.org.pk/",
        "Active Tenders": "https://www.ppra.org.pk/dad_tenders.asp",
        "Procurement Guidelines (PDF)": "https://www.ppra.org.pk/pguidelines.pdf",
        "PPRA Ordinance": "https://www.ppra.org.pk/ordinance.asp",
        "Rules": "https://www.ppra.org.pk/Rules.asp",
        "Regulations Page": "https://www.ppra.org.pk/regulation.asp",
        "Regulations, 2024 - Disposal of Public Assets": "https://www.ppra.org.pk/SRO615I2025.pdf",
        "Specimens for Advertisement (Amended 2024)": "https://www.ppra.org.pk/SRO461I2024.pdf",
        "Blacklisting & Debarment Regulations 2024": "https://www.ppra.org.pk/SRO460I2024.pdf",
        "Review Petition Rule 19(3), 2021": "https://www.ppra.org.pk/SRO19I2021.pdf",
        "Regulation 2009": "https://www.ppra.org.pk/reg2009.pdf",
        "Regulation 2010 - Consultancy Services": "https://www.ppra.org.pk/reg2010.pdf",
        "Regulation 2011": "https://www.ppra.org.pk/reg2011.pdf",
        "Eligible Bidders Tax Compliance 2015": "https://www.ppra.org.pk/reg2015.pdf",
        "Transaction of Business Board Meeting Regulations (2021)": "https://www.ppra.org.pk/SRO15I2021.pdf",
        "Review Petition and Grievances (SRO90I2022)": "https://www.ppra.org.pk/SRO90I2022.pdf",
        "National Standard Procurement Docs (SRO370I2022)": "https://www.ppra.org.pk/SRO370I2022.pdf",
        "Manner of Advertisement (SRO591I2022)": "https://www.ppra.org.pk/SRO591I2022.pdf",
        "Declaration of Beneficial Owners (SRO592I2022)": "https://www.ppra.org.pk/SRO592I2022.pdf",
        "E-Pak Procurement Regulation (SRO296I2023)": "https://www.ppra.org.pk/SRO296I2023.pdf",
        "Board Info": "https://www.ppra.org.pk/board.asp",
    }