"""Utilities for scraping active tenders and tender details from the PPRA (Pakistan) website."""
import logging
import re
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def chunk_text(text, max_tokens=3000, chars_per_token=4):
    """Split *text* into chunks that stay under an approximate token limit.

    The budget is ``max_tokens * chars_per_token`` characters per chunk.
    Text is split on blank lines (paragraphs); paragraphs are packed
    greedily into chunks, and a single paragraph longer than the budget
    is hard-split so no chunk ever exceeds the limit (the previous
    implementation emitted oversized paragraphs unsplit, and could also
    append an empty leading chunk).

    Args:
        text: The text to split.
        max_tokens: Approximate token budget per chunk.
        chars_per_token: Estimated characters per token.

    Returns:
        list[str]: Chunks whose concatenation preserves the input text
        (each paragraph keeps a trailing blank-line separator).
    """
    max_chars = max_tokens * chars_per_token
    chunks = []
    current_chunk = ""
    for para in text.split('\n\n'):
        # Re-attach the separator so the budget check counts it too.
        piece = para + "\n\n"
        if len(current_chunk) + len(piece) <= max_chars:
            current_chunk += piece
            continue
        if current_chunk:
            chunks.append(current_chunk)
        # Hard-split a paragraph that alone exceeds the budget.
        while len(piece) > max_chars:
            chunks.append(piece[:max_chars])
            piece = piece[max_chars:]
        current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def fetch_active_tenders():
    """Fetch active tenders from the PPRA website with error handling.

    Scrapes https://www.ppra.org.pk/dad_tenders.asp, locates the table
    whose header row mentions "Tender Description" (with fallbacks), and
    extracts one record per data row.

    Returns:
        list[dict]: Dicts with keys ``title``, ``department``,
        ``closing_date`` and ``link``; empty list on any failure.
    """
    try:
        logger.info("Fetching active tenders from PPRA website")
        url = "https://www.ppra.org.pk/dad_tenders.asp"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tenders. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        tables = soup.find_all("table")

        # Find the table whose header row contains "Tender Description".
        target_table = None
        for table in tables:
            header_row = table.find("tr")  # hoisted: was looked up three times per table
            if header_row and header_row.find("th") and "Tender Description" in header_row.text:
                target_table = table
                break

        if not target_table:
            # Fallback 1: a table explicitly classed as data.
            target_table = soup.find("table", {"class": "data"})
        if not target_table:
            # Fallback 2: the table with the most rows is most likely the listing.
            tables = sorted(tables, key=lambda t: len(t.find_all("tr")), reverse=True)
            target_table = tables[0] if tables else None
        if not target_table:
            logger.error("Could not find tender table on the page")
            return []

        rows = target_table.find_all("tr")[1:]  # Skip header
        tenders = []
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 5:
                link_element = cols[4].find("a")
                link = ""
                if link_element and link_element.has_attr("href"):
                    # urljoin handles absolute-path and relative hrefs correctly;
                    # plain concatenation produced "…pk//path" for hrefs starting with "/".
                    link = urljoin("https://www.ppra.org.pk/", link_element["href"].strip())
                tender = {
                    "title": cols[0].text.strip(),
                    "department": cols[1].text.strip(),
                    "closing_date": cols[3].text.strip(),
                    "link": link
                }
                tenders.append(tender)

        logger.info(f"Successfully fetched {len(tenders)} tenders")
        return tenders
    except Exception as e:
        logger.error(f"Error fetching tenders: {str(e)}")
        return []
def fetch_tender_details(url):
    """Fetch and parse tender details from the provided URL.

    Handles both PDF documents (first 3 pages extracted via pdfplumber,
    if installed) and HTML pages (scripts/styles stripped, text
    normalized).

    Args:
        url: Absolute URL of the tender detail page or document.

    Returns:
        str: Up to 5000 characters of extracted text, or a human-readable
        error message if the document could not be retrieved or parsed.
    """
    try:
        logger.info(f"Fetching tender details from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tender details. Status code: {response.status_code}")
            return "Could not fetch details from this link."

        # Detect PDFs by extension or by the served Content-Type header.
        if url.lower().endswith('.pdf') or 'application/pdf' in response.headers.get('Content-Type', ''):
            logger.info("PDF detected, extracting text preview")
            try:
                import pdfplumber
                # Context manager closes the PDF handle (the previous
                # version leaked it by never calling close()).
                with pdfplumber.open(BytesIO(response.content)) as pdf:
                    text = ""
                    # Extract only the first few pages as a preview.
                    for i in range(min(3, len(pdf.pages))):
                        text += pdf.pages[i].extract_text() or ""
                return text[:5000]  # Return first 5000 chars
            except ImportError:
                return "PDF document detected. Install pdfplumber to extract content."

        # HTML content: drop non-visible elements before extracting text.
        soup = BeautifulSoup(response.text, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank lines into a single paragraph break.
        text = re.sub(r'\n\s*\n', '\n\n', text)
        logger.info(f"Successfully fetched tender details ({len(text)} chars)")
        return text[:5000]  # Limit to 5000 chars to prevent token issues
    except Exception as e:
        logger.error(f"Error fetching tender details: {str(e)}")
        return f"Could not fetch details from this link. Error: {str(e)}"
def get_ppra_resources():
    """Return a mapping of human-readable PPRA resource names to their URLs."""
    base = "https://www.ppra.org.pk/"
    entries = [
        ("Home", ""),
        ("Active Tenders", "dad_tenders.asp"),
        ("Procurement Guidelines (PDF)", "pguidelines.pdf"),
        ("PPRA Ordinance", "ordinance.asp"),
        ("Rules", "Rules.asp"),
        ("Regulations Page", "regulation.asp"),
        ("Regulations, 2024 - Disposal of Public Assets", "SRO615I2025.pdf"),
        ("Specimens for Advertisement (Amended 2024)", "SRO461I2024.pdf"),
        ("Blacklisting & Debarment Regulations 2024", "SRO460I2024.pdf"),
        ("Review Petition Rule 19(3), 2021", "SRO19I2021.pdf"),
        ("Regulation 2009", "reg2009.pdf"),
        ("Regulation 2010 - Consultancy Services", "reg2010.pdf"),
        ("Regulation 2011", "reg2011.pdf"),
        ("Eligible Bidders Tax Compliance 2015", "reg2015.pdf"),
        ("Transaction of Business Board Meeting Regulations (2021)", "SRO15I2021.pdf"),
        ("Review Petition and Grievances (SRO90I2022)", "SRO90I2022.pdf"),
        ("National Standard Procurement Docs (SRO370I2022)", "SRO370I2022.pdf"),
        ("Manner of Advertisement (SRO591I2022)", "SRO591I2022.pdf"),
        ("Declaration of Beneficial Owners (SRO592I2022)", "SRO592I2022.pdf"),
        ("E-Pak Procurement Regulation (SRO296I2023)", "SRO296I2023.pdf"),
        ("Board Info", "board.asp"),
    ]
    return {name: base + path for name, path in entries}