"""Utilities for scraping active tenders and tender details from the PPRA (Pakistan) website."""
import logging
import re
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def chunk_text(text, max_tokens=3000, chars_per_token=4):
    """Split *text* into chunks that stay under an approximate token limit.

    The budget is ``max_tokens * chars_per_token`` characters per chunk.
    Text is split on blank lines (paragraphs); paragraphs are packed
    greedily into chunks, and a single paragraph longer than the budget
    is hard-split so no chunk ever exceeds the limit (the previous
    implementation emitted oversized paragraphs unsplit, and could also
    append an empty leading chunk).

    Args:
        text: The text to split.
        max_tokens: Approximate token budget per chunk.
        chars_per_token: Estimated characters per token.

    Returns:
        list[str]: Chunks whose concatenation preserves the input text
        (each paragraph keeps a trailing blank-line separator).
    """
    max_chars = max_tokens * chars_per_token
    chunks = []
    current_chunk = ""
    for para in text.split('\n\n'):
        # Re-attach the separator so the budget check counts it too.
        piece = para + "\n\n"
        if len(current_chunk) + len(piece) <= max_chars:
            current_chunk += piece
            continue
        if current_chunk:
            chunks.append(current_chunk)
        # Hard-split a paragraph that alone exceeds the budget.
        while len(piece) > max_chars:
            chunks.append(piece[:max_chars])
            piece = piece[max_chars:]
        current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def fetch_active_tenders():
    """Fetch active tenders from the PPRA website with error handling.

    Scrapes https://www.ppra.org.pk/dad_tenders.asp, locates the table
    whose header row mentions "Tender Description" (with fallbacks), and
    extracts one record per data row.

    Returns:
        list[dict]: Dicts with keys ``title``, ``department``,
        ``closing_date`` and ``link``; empty list on any failure.
    """
    try:
        logger.info("Fetching active tenders from PPRA website")
        url = "https://www.ppra.org.pk/dad_tenders.asp"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tenders. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        tables = soup.find_all("table")

        # Find the table whose header row contains "Tender Description".
        target_table = None
        for table in tables:
            header_row = table.find("tr")  # hoisted: was looked up three times per table
            if header_row and header_row.find("th") and "Tender Description" in header_row.text:
                target_table = table
                break

        if not target_table:
            # Fallback 1: a table explicitly classed as data.
            target_table = soup.find("table", {"class": "data"})
        if not target_table:
            # Fallback 2: the table with the most rows is most likely the listing.
            tables = sorted(tables, key=lambda t: len(t.find_all("tr")), reverse=True)
            target_table = tables[0] if tables else None
        if not target_table:
            logger.error("Could not find tender table on the page")
            return []

        rows = target_table.find_all("tr")[1:]  # Skip header
        tenders = []
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 5:
                link_element = cols[4].find("a")
                link = ""
                if link_element and link_element.has_attr("href"):
                    # urljoin handles absolute-path and relative hrefs correctly;
                    # plain concatenation produced "…pk//path" for hrefs starting with "/".
                    link = urljoin("https://www.ppra.org.pk/", link_element["href"].strip())
                tender = {
                    "title": cols[0].text.strip(),
                    "department": cols[1].text.strip(),
                    "closing_date": cols[3].text.strip(),
                    "link": link
                }
                tenders.append(tender)

        logger.info(f"Successfully fetched {len(tenders)} tenders")
        return tenders
    except Exception as e:
        logger.error(f"Error fetching tenders: {str(e)}")
        return []
def fetch_tender_details(url):
    """Fetch and parse tender details from the provided URL.

    Handles both PDF documents (first 3 pages extracted via pdfplumber,
    if installed) and HTML pages (scripts/styles stripped, text
    normalized).

    Args:
        url: Absolute URL of the tender detail page or document.

    Returns:
        str: Up to 5000 characters of extracted text, or a human-readable
        error message if the document could not be retrieved or parsed.
    """
    try:
        logger.info(f"Fetching tender details from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            logger.error(f"Failed to fetch tender details. Status code: {response.status_code}")
            return "Could not fetch details from this link."

        # Detect PDFs by extension or by the served Content-Type header.
        if url.lower().endswith('.pdf') or 'application/pdf' in response.headers.get('Content-Type', ''):
            logger.info("PDF detected, extracting text preview")
            try:
                import pdfplumber
                # Context manager closes the PDF handle (the previous
                # version leaked it by never calling close()).
                with pdfplumber.open(BytesIO(response.content)) as pdf:
                    text = ""
                    # Extract only the first few pages as a preview.
                    for i in range(min(3, len(pdf.pages))):
                        text += pdf.pages[i].extract_text() or ""
                return text[:5000]  # Return first 5000 chars
            except ImportError:
                return "PDF document detected. Install pdfplumber to extract content."

        # HTML content: drop non-visible elements before extracting text.
        soup = BeautifulSoup(response.text, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank lines into a single paragraph break.
        text = re.sub(r'\n\s*\n', '\n\n', text)
        logger.info(f"Successfully fetched tender details ({len(text)} chars)")
        return text[:5000]  # Limit to 5000 chars to prevent token issues
    except Exception as e:
        logger.error(f"Error fetching tender details: {str(e)}")
        return f"Could not fetch details from this link. Error: {str(e)}"
def get_ppra_resources():
    """Return a mapping of human-readable PPRA resource names to their URLs."""
    base = "https://www.ppra.org.pk/"
    entries = [
        ("Home", ""),
        ("Active Tenders", "dad_tenders.asp"),
        ("Procurement Guidelines (PDF)", "pguidelines.pdf"),
        ("PPRA Ordinance", "ordinance.asp"),
        ("Rules", "Rules.asp"),
        ("Regulations Page", "regulation.asp"),
        ("Regulations, 2024 - Disposal of Public Assets", "SRO615I2025.pdf"),
        ("Specimens for Advertisement (Amended 2024)", "SRO461I2024.pdf"),
        ("Blacklisting & Debarment Regulations 2024", "SRO460I2024.pdf"),
        ("Review Petition Rule 19(3), 2021", "SRO19I2021.pdf"),
        ("Regulation 2009", "reg2009.pdf"),
        ("Regulation 2010 - Consultancy Services", "reg2010.pdf"),
        ("Regulation 2011", "reg2011.pdf"),
        ("Eligible Bidders Tax Compliance 2015", "reg2015.pdf"),
        ("Transaction of Business Board Meeting Regulations (2021)", "SRO15I2021.pdf"),
        ("Review Petition and Grievances (SRO90I2022)", "SRO90I2022.pdf"),
        ("National Standard Procurement Docs (SRO370I2022)", "SRO370I2022.pdf"),
        ("Manner of Advertisement (SRO591I2022)", "SRO591I2022.pdf"),
        ("Declaration of Beneficial Owners (SRO592I2022)", "SRO592I2022.pdf"),
        ("E-Pak Procurement Regulation (SRO296I2023)", "SRO296I2023.pdf"),
        ("Board Info", "board.asp"),
    ]
    return {name: base + path for name, path in entries}