# PPRA-Copilot / utils.py
# Last update: commit 538dec9 (verified), by Shami96
import logging
import re
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Configure module-wide logging once at import time.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)
def chunk_text(text, max_tokens=3000, chars_per_token=4):
    """Split text into chunks that won't exceed the token limit.

    Splits on paragraph boundaries (blank lines) where possible; a single
    paragraph longer than the limit is hard-split by character count so no
    chunk ever exceeds ``max_tokens * chars_per_token`` characters (the
    original implementation could emit oversized chunks here).

    Args:
        text: The text to split.
        max_tokens: Approximate token budget per chunk.
        chars_per_token: Rough characters-per-token estimate used to turn the
            token budget into a character budget.

    Returns:
        list[str]: The chunks, each ending with a paragraph separator.
        Empty input yields an empty list.
    """
    if not text:
        return []
    max_chars = max_tokens * chars_per_token
    chunks = []
    current_chunk = ""
    for para in text.split('\n\n'):
        piece = para + "\n\n"  # Re-attach the separator removed by split()
        if len(current_chunk) + len(piece) <= max_chars:
            current_chunk += piece
        else:
            if current_chunk:
                chunks.append(current_chunk)
            # Hard-split a paragraph that alone exceeds the budget.
            while len(piece) > max_chars:
                chunks.append(piece[:max_chars])
                piece = piece[max_chars:]
            current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def fetch_active_tenders():
    """Fetch the list of active tenders from the PPRA website.

    Returns:
        list[dict]: One dict per tender with keys ``title``, ``department``,
        ``closing_date`` and ``link`` (absolute URL, or "" when the row has
        no link). Returns an empty list on any network or parsing failure.
    """
    try:
        logger.info("Fetching active tenders from PPRA website")
        url = "https://www.ppra.org.pk/dad_tenders.asp"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tenders. Status code: {response.status_code}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")
        tables = soup.find_all("table")
        # Primary strategy: the table whose header row mentions "Tender Description".
        target_table = None
        for table in tables:
            first_row = table.find("tr")
            if first_row and first_row.find("th") and "Tender Description" in first_row.text:
                target_table = table
                break
        if not target_table:
            # Fallback 1: a table explicitly marked with class="data".
            target_table = soup.find("table", {"class": "data"})
        if not target_table:
            # Fallback 2: the table with the most rows is most likely the listing.
            tables = sorted(tables, key=lambda t: len(t.find_all("tr")), reverse=True)
            target_table = tables[0] if tables else None
        if not target_table:
            logger.error("Could not find tender table on the page")
            return []
        rows = target_table.find_all("tr")[1:]  # Skip header row
        tenders = []
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 5:
                link_element = cols[4].find("a")
                link = ""
                if link_element and link_element.has_attr("href"):
                    # urljoin keeps already-absolute hrefs intact and avoids
                    # the "//" the original produced for leading-slash hrefs.
                    link = urljoin("https://www.ppra.org.pk/", link_element["href"].strip())
                tender = {
                    "title": cols[0].text.strip(),
                    "department": cols[1].text.strip(),
                    # NOTE(review): column index 2 is skipped; column 3 holds
                    # the closing date on the current page layout — confirm if
                    # the site markup changes.
                    "closing_date": cols[3].text.strip(),
                    "link": link
                }
                tenders.append(tender)
        logger.info(f"Successfully fetched {len(tenders)} tenders")
        return tenders
    except Exception as e:
        # Boundary catch: any failure degrades to an empty tender list.
        logger.error(f"Error fetching tenders: {str(e)}")
        return []
def fetch_tender_details(url):
    """Fetch a tender's detail page (HTML or PDF) and return a text preview.

    Args:
        url: Absolute URL of the tender detail page or document.

    Returns:
        str: Up to 5000 characters of extracted text, or a human-readable
        error message when the content could not be fetched or parsed.
    """
    try:
        logger.info(f"Fetching tender details from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tender details. Status code: {response.status_code}")
            return "Could not fetch details from this link."
        # PDF detection: by URL extension or by the server-declared content type.
        if url.lower().endswith('.pdf') or 'application/pdf' in response.headers.get('Content-Type', ''):
            logger.info("PDF detected, extracting text preview")
            try:
                import pdfplumber
            except ImportError:
                return "PDF document detected. Install pdfplumber to extract content."
            # Context manager ensures the PDF handle is closed — the original
            # leaked it because pdfplumber.open() was never closed.
            with pdfplumber.open(BytesIO(response.content)) as pdf:
                text = ""
                # A preview only needs the first few pages.
                for page in pdf.pages[:3]:
                    text += page.extract_text() or ""
            return text[:5000]  # Cap preview size to limit downstream token usage
        # HTML content
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-content elements before extracting text.
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank lines into a single paragraph break.
        text = re.sub(r'\n\s*\n', '\n\n', text)
        logger.info(f"Successfully fetched tender details ({len(text)} chars)")
        return text[:5000]  # Cap preview size to limit downstream token usage
    except Exception as e:
        # Boundary catch: degrade to an explanatory message rather than crash.
        logger.error(f"Error fetching tender details: {str(e)}")
        return f"Could not fetch details from this link. Error: {str(e)}"
def get_ppra_resources():
    """Return a mapping of PPRA resource names to their absolute URLs."""
    base = "https://www.ppra.org.pk/"
    resources = {
        "Home": base,
        "Active Tenders": base + "dad_tenders.asp",
        "Procurement Guidelines (PDF)": base + "pguidelines.pdf",
        "PPRA Ordinance": base + "ordinance.asp",
        "Rules": base + "Rules.asp",
        "Regulations Page": base + "regulation.asp",
        "Regulations, 2024 - Disposal of Public Assets": base + "SRO615I2025.pdf",
        "Specimens for Advertisement (Amended 2024)": base + "SRO461I2024.pdf",
        "Blacklisting & Debarment Regulations 2024": base + "SRO460I2024.pdf",
        "Review Petition Rule 19(3), 2021": base + "SRO19I2021.pdf",
        "Regulation 2009": base + "reg2009.pdf",
        "Regulation 2010 - Consultancy Services": base + "reg2010.pdf",
        "Regulation 2011": base + "reg2011.pdf",
        "Eligible Bidders Tax Compliance 2015": base + "reg2015.pdf",
        "Transaction of Business Board Meeting Regulations (2021)": base + "SRO15I2021.pdf",
        "Review Petition and Grievances (SRO90I2022)": base + "SRO90I2022.pdf",
        "National Standard Procurement Docs (SRO370I2022)": base + "SRO370I2022.pdf",
        "Manner of Advertisement (SRO591I2022)": base + "SRO591I2022.pdf",
        "Declaration of Beneficial Owners (SRO592I2022)": base + "SRO592I2022.pdf",
        "E-Pak Procurement Regulation (SRO296I2023)": base + "SRO296I2023.pdf",
        "Board Info": base + "board.asp",
    }
    return resources