# PPRA-Copilot / utils.py
# Last update: commit 538dec9 (verified), by Shami96
import logging
import re
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Configure module-wide logging once at import time.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)
def chunk_text(text, max_tokens=3000, chars_per_token=4):
    """Split text into chunks that won't exceed the token limit.

    Splits on paragraph boundaries (blank lines) where possible; a single
    paragraph longer than the limit is hard-split by character count so no
    chunk ever exceeds ``max_tokens * chars_per_token`` characters (the
    original implementation could emit oversized chunks here).

    Args:
        text: The text to split.
        max_tokens: Approximate token budget per chunk.
        chars_per_token: Rough characters-per-token estimate used to turn the
            token budget into a character budget.

    Returns:
        list[str]: The chunks, each ending with a paragraph separator.
        Empty input yields an empty list.
    """
    if not text:
        return []
    max_chars = max_tokens * chars_per_token
    chunks = []
    current_chunk = ""
    for para in text.split('\n\n'):
        piece = para + "\n\n"  # Re-attach the separator removed by split()
        if len(current_chunk) + len(piece) <= max_chars:
            current_chunk += piece
        else:
            if current_chunk:
                chunks.append(current_chunk)
            # Hard-split a paragraph that alone exceeds the budget.
            while len(piece) > max_chars:
                chunks.append(piece[:max_chars])
                piece = piece[max_chars:]
            current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def fetch_active_tenders():
    """Fetch the list of active tenders from the PPRA website.

    Returns:
        list[dict]: One dict per tender with keys ``title``, ``department``,
        ``closing_date`` and ``link`` (absolute URL, or "" when the row has
        no link). Returns an empty list on any network or parsing failure.
    """
    try:
        logger.info("Fetching active tenders from PPRA website")
        url = "https://www.ppra.org.pk/dad_tenders.asp"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tenders. Status code: {response.status_code}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")
        tables = soup.find_all("table")
        # Primary strategy: the table whose header row mentions "Tender Description".
        target_table = None
        for table in tables:
            first_row = table.find("tr")
            if first_row and first_row.find("th") and "Tender Description" in first_row.text:
                target_table = table
                break
        if not target_table:
            # Fallback 1: a table explicitly marked with class="data".
            target_table = soup.find("table", {"class": "data"})
        if not target_table:
            # Fallback 2: the table with the most rows is most likely the listing.
            tables = sorted(tables, key=lambda t: len(t.find_all("tr")), reverse=True)
            target_table = tables[0] if tables else None
        if not target_table:
            logger.error("Could not find tender table on the page")
            return []
        rows = target_table.find_all("tr")[1:]  # Skip header row
        tenders = []
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 5:
                link_element = cols[4].find("a")
                link = ""
                if link_element and link_element.has_attr("href"):
                    # urljoin keeps already-absolute hrefs intact and avoids
                    # the "//" the original produced for leading-slash hrefs.
                    link = urljoin("https://www.ppra.org.pk/", link_element["href"].strip())
                tender = {
                    "title": cols[0].text.strip(),
                    "department": cols[1].text.strip(),
                    # NOTE(review): column index 2 is skipped; column 3 holds
                    # the closing date on the current page layout — confirm if
                    # the site markup changes.
                    "closing_date": cols[3].text.strip(),
                    "link": link
                }
                tenders.append(tender)
        logger.info(f"Successfully fetched {len(tenders)} tenders")
        return tenders
    except Exception as e:
        # Boundary catch: any failure degrades to an empty tender list.
        logger.error(f"Error fetching tenders: {str(e)}")
        return []
def fetch_tender_details(url):
    """Fetch a tender's detail page (HTML or PDF) and return a text preview.

    Args:
        url: Absolute URL of the tender detail page or document.

    Returns:
        str: Up to 5000 characters of extracted text, or a human-readable
        error message when the content could not be fetched or parsed.
    """
    try:
        logger.info(f"Fetching tender details from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tender details. Status code: {response.status_code}")
            return "Could not fetch details from this link."
        # PDF detection: by URL extension or by the server-declared content type.
        if url.lower().endswith('.pdf') or 'application/pdf' in response.headers.get('Content-Type', ''):
            logger.info("PDF detected, extracting text preview")
            try:
                import pdfplumber
            except ImportError:
                return "PDF document detected. Install pdfplumber to extract content."
            # Context manager ensures the PDF handle is closed — the original
            # leaked it because pdfplumber.open() was never closed.
            with pdfplumber.open(BytesIO(response.content)) as pdf:
                text = ""
                # A preview only needs the first few pages.
                for page in pdf.pages[:3]:
                    text += page.extract_text() or ""
            return text[:5000]  # Cap preview size to limit downstream token usage
        # HTML content
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-content elements before extracting text.
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank lines into a single paragraph break.
        text = re.sub(r'\n\s*\n', '\n\n', text)
        logger.info(f"Successfully fetched tender details ({len(text)} chars)")
        return text[:5000]  # Cap preview size to limit downstream token usage
    except Exception as e:
        # Boundary catch: degrade to an explanatory message rather than crash.
        logger.error(f"Error fetching tender details: {str(e)}")
        return f"Could not fetch details from this link. Error: {str(e)}"
def get_ppra_resources():
    """Return a mapping of PPRA resource names to their absolute URLs."""
    base = "https://www.ppra.org.pk/"
    resources = {
        "Home": base,
        "Active Tenders": base + "dad_tenders.asp",
        "Procurement Guidelines (PDF)": base + "pguidelines.pdf",
        "PPRA Ordinance": base + "ordinance.asp",
        "Rules": base + "Rules.asp",
        "Regulations Page": base + "regulation.asp",
        "Regulations, 2024 - Disposal of Public Assets": base + "SRO615I2025.pdf",
        "Specimens for Advertisement (Amended 2024)": base + "SRO461I2024.pdf",
        "Blacklisting & Debarment Regulations 2024": base + "SRO460I2024.pdf",
        "Review Petition Rule 19(3), 2021": base + "SRO19I2021.pdf",
        "Regulation 2009": base + "reg2009.pdf",
        "Regulation 2010 - Consultancy Services": base + "reg2010.pdf",
        "Regulation 2011": base + "reg2011.pdf",
        "Eligible Bidders Tax Compliance 2015": base + "reg2015.pdf",
        "Transaction of Business Board Meeting Regulations (2021)": base + "SRO15I2021.pdf",
        "Review Petition and Grievances (SRO90I2022)": base + "SRO90I2022.pdf",
        "National Standard Procurement Docs (SRO370I2022)": base + "SRO370I2022.pdf",
        "Manner of Advertisement (SRO591I2022)": base + "SRO591I2022.pdf",
        "Declaration of Beneficial Owners (SRO592I2022)": base + "SRO592I2022.pdf",
        "E-Pak Procurement Regulation (SRO296I2023)": base + "SRO296I2023.pdf",
        "Board Info": base + "board.asp",
    }
    return resources