Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| from io import BytesIO | |
| import logging | |
# Configure root logging once for the module: INFO level, with timestamp,
# severity and message on every record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
def chunk_text(text, max_tokens=3000, chars_per_token=4):
    """Split text into paragraph-aligned chunks sized for a token budget.

    The budget is approximated as ``max_tokens * chars_per_token`` characters.
    Paragraphs (separated by a blank line) are packed greedily into chunks;
    every paragraph is written back followed by ``"\\n\\n"``. A paragraph
    longer than the budget is hard-split into budget-sized slices so that no
    single paragraph can blow past the limit.

    Args:
        text: The text to split.
        max_tokens: Approximate token budget per chunk.
        chars_per_token: Estimated number of characters per token.

    Returns:
        A list of non-empty string chunks, in original order. Because the
        paragraph separator is appended after the size check, a chunk may
        exceed the character budget by up to two characters.
    """
    separator = "\n\n"
    max_chars = max_tokens * chars_per_token
    chunks = []
    current_chunk = ""

    for para in text.split(separator):
        if len(current_chunk) + len(para) < max_chars:
            current_chunk += para + separator
            continue

        # Flush what has been collected so far; skip the flush when nothing
        # has been collected so no empty chunk is ever emitted.
        if current_chunk:
            chunks.append(current_chunk)

        # A single paragraph larger than the budget is cut into full-size
        # slices; the remainder (1..max_chars chars) starts the next chunk.
        # Guarded so a non-positive budget cannot loop forever.
        if max_chars > 0:
            while len(para) > max_chars:
                chunks.append(para[:max_chars])
                para = para[max_chars:]

        current_chunk = para + separator

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def fetch_active_tenders():
    """Fetch active tenders from the PPRA website.

    Downloads the tender listing page, locates the tender table and converts
    each data row that has at least five cells into a dictionary.

    The table is located by trying, in order: the first table whose first row
    contains a ``<th>`` and the text "Tender Description"; a table with class
    ``data``; the table with the most rows.

    Returns:
        A list of dicts with the keys ``"title"``, ``"department"``,
        ``"closing_date"`` and ``"link"``. ``"link"`` is an empty string when
        the row's fifth cell has no anchor with an ``href``. An empty list is
        returned on a non-200 response, when no table is found, or when any
        exception occurs (the error is logged, not raised).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = "https://www.ppra.org.pk/"
    try:
        logger.info("Fetching active tenders from PPRA website")
        url = "https://www.ppra.org.pk/dad_tenders.asp"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tenders. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        tables = soup.find_all("table")

        # Preferred: the table whose header row mentions "Tender Description".
        target_table = None
        for table in tables:
            first_row = table.find("tr")
            if first_row is None or first_row.find("th") is None:
                continue
            if "Tender Description" in first_row.text:
                target_table = table
                break

        # Fallback 1: a table explicitly marked with class "data".
        if target_table is None:
            target_table = soup.find("table", {"class": "data"})

        # Fallback 2 (last resort): the table with the most rows.
        if target_table is None and tables:
            target_table = max(tables, key=lambda t: len(t.find_all("tr")))

        if target_table is None:
            logger.error("Could not find tender table on the page")
            return []

        tenders = []
        # The first row is treated as the header and skipped.
        for row in target_table.find_all("tr")[1:]:
            cols = row.find_all("td")
            if len(cols) < 5:
                continue

            # NOTE(review): the column positions below (0 = title,
            # 1 = department, 3 = closing date, 4 = link) are taken as-is
            # from the existing code; confirm against the live page layout.
            link = ""
            link_element = cols[4].find("a")
            if link_element is not None and link_element.has_attr("href"):
                # urljoin resolves relative hrefs against the site root and
                # leaves absolute or root-relative hrefs intact, where plain
                # concatenation would produce a malformed URL.
                link = urljoin(base_url, link_element["href"].strip())

            tenders.append({
                "title": cols[0].text.strip(),
                "department": cols[1].text.strip(),
                "closing_date": cols[3].text.strip(),
                "link": link,
            })

        logger.info(f"Successfully fetched {len(tenders)} tenders")
        return tenders
    except Exception as e:
        # Deliberate best-effort boundary: callers always get a list back.
        logger.error(f"Error fetching tenders: {str(e)}")
        return []
def fetch_tender_details(url):
    """Fetch a tender document and return a plain-text preview of it.

    PDF responses (detected by a ``.pdf`` URL suffix or an
    ``application/pdf`` Content-Type) have the text of their first three
    pages extracted with ``pdfplumber``. Any other response is parsed as
    HTML, stripped of ``<script>``/``<style>`` elements and reduced to text.

    Args:
        url: Address of the tender detail page or document.

    Returns:
        Up to 5000 characters of extracted text. On failure a human-readable
        message string is returned instead: for a non-200 response, when
        ``pdfplumber`` is not installed, or when any exception occurs (the
        error is logged, not raised).
    """
    try:
        logger.info(f"Fetching tender details from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tender details. Status code: {response.status_code}")
            return "Could not fetch details from this link."

        # Check if it's a PDF (header comparison is case-insensitive).
        if url.lower().endswith('.pdf') or 'application/pdf' in response.headers.get('Content-Type', '').lower():
            logger.info("PDF detected, extracting text preview")
            # pdfplumber is an optional dependency; only the import itself is
            # guarded so real extraction errors are reported by the outer
            # handler instead of being mistaken for a missing package.
            try:
                import pdfplumber
            except ImportError:
                return "PDF document detected. Install pdfplumber to extract content."

            # Context manager guarantees the PDF handle is closed.
            with pdfplumber.open(BytesIO(response.content)) as pdf:
                # Extract the first few pages only; extract_text() may
                # return None for pages without a text layer.
                text = "".join(page.extract_text() or "" for page in pdf.pages[:3])
            return text[:5000]  # Return first 5000 chars

        # HTML content
        soup = BeautifulSoup(response.text, "html.parser")

        # Remove script and style elements so their source is not in the text.
        for element in soup(["script", "style"]):
            element.decompose()

        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank/whitespace-only lines into one blank line.
        text = re.sub(r'\n\s*\n', '\n\n', text)

        logger.info(f"Successfully fetched tender details ({len(text)} chars)")
        return text[:5000]  # Limit to 5000 chars to prevent token issues
    except Exception as e:
        # Deliberate best-effort boundary: callers always get a string back.
        logger.error(f"Error fetching tender details: {str(e)}")
        return f"Could not fetch details from this link. Error: {str(e)}"
def get_ppra_resources():
    """Build the mapping of PPRA resource labels to their URLs.

    Returns:
        A new dict on every call, keyed by display label with the resource
        URL as value, in the order listed below.
    """
    resource_pairs = (
        ("Home", "https://www.ppra.org.pk/"),
        ("Active Tenders", "https://www.ppra.org.pk/dad_tenders.asp"),
        ("Procurement Guidelines (PDF)", "https://www.ppra.org.pk/pguidelines.pdf"),
        ("PPRA Ordinance", "https://www.ppra.org.pk/ordinance.asp"),
        ("Rules", "https://www.ppra.org.pk/Rules.asp"),
        ("Regulations Page", "https://www.ppra.org.pk/regulation.asp"),
        ("Regulations, 2024 - Disposal of Public Assets", "https://www.ppra.org.pk/SRO615I2025.pdf"),
        ("Specimens for Advertisement (Amended 2024)", "https://www.ppra.org.pk/SRO461I2024.pdf"),
        ("Blacklisting & Debarment Regulations 2024", "https://www.ppra.org.pk/SRO460I2024.pdf"),
        ("Review Petition Rule 19(3), 2021", "https://www.ppra.org.pk/SRO19I2021.pdf"),
        ("Regulation 2009", "https://www.ppra.org.pk/reg2009.pdf"),
        ("Regulation 2010 - Consultancy Services", "https://www.ppra.org.pk/reg2010.pdf"),
        ("Regulation 2011", "https://www.ppra.org.pk/reg2011.pdf"),
        ("Eligible Bidders Tax Compliance 2015", "https://www.ppra.org.pk/reg2015.pdf"),
        ("Transaction of Business Board Meeting Regulations (2021)", "https://www.ppra.org.pk/SRO15I2021.pdf"),
        ("Review Petition and Grievances (SRO90I2022)", "https://www.ppra.org.pk/SRO90I2022.pdf"),
        ("National Standard Procurement Docs (SRO370I2022)", "https://www.ppra.org.pk/SRO370I2022.pdf"),
        ("Manner of Advertisement (SRO591I2022)", "https://www.ppra.org.pk/SRO591I2022.pdf"),
        ("Declaration of Beneficial Owners (SRO592I2022)", "https://www.ppra.org.pk/SRO592I2022.pdf"),
        ("E-Pak Procurement Regulation (SRO296I2023)", "https://www.ppra.org.pk/SRO296I2023.pdf"),
        ("Board Info", "https://www.ppra.org.pk/board.asp"),
    )
    resources = dict(resource_pairs)
    return resources