# Hugging Face Spaces page header (not part of the program): NSamson1/Tender_Matcher — Running — File size: 8,074 Bytes — dffabb7

#!/usr/bin/env python3
"""
src/parser.py — Tender Document Parser
Handles .txt, .html, .pdf files and extracts structured fields.
"""

import os
import re
import json
from pathlib import Path
from datetime import datetime


def detect_language(text: str) -> str:
    """Guess whether *text* is French or English from keyword hits.

    The text is lower-cased and split into word tokens. Each keyword scores
    at most once, no matter how often it occurs. Short function words must
    equal a whole token; longer content words are treated as stems and match
    any token that starts with them (so "projet" also matches "projets").

    Matching on tokens instead of raw substrings avoids false hits such as
    "les" inside "files", "est" inside "test" or "and" inside "candidature".

    Args:
        text: Document text to classify.

    Returns:
        "fr" when strictly more French than English keywords are found,
        otherwise "en" (ties and empty text therefore yield "en").
    """
    fr_exact = {"pour", "dans", "nous", "les", "des", "une", "est", "avec"}
    fr_stems = ("financ", "candid", "subvention", "appel", "projet", "éligib")
    en_exact = {"for", "the", "and", "with"}
    en_stems = ("grant", "funding", "applicants", "eligible", "organization",
                "support", "submit", "proposal")

    # A set is enough: only the presence of a keyword matters, not its frequency.
    tokens = set(re.findall(r"\w+", text.lower()))

    def count_hits(exact_words, stems):
        whole_word_hits = len(exact_words & tokens)
        stem_hits = sum(
            1 for stem in stems
            if any(token.startswith(stem) for token in tokens)
        )
        return whole_word_hits + stem_hits

    fr_count = count_hits(fr_exact, fr_stems)
    en_count = count_hits(en_exact, en_stems)
    return "fr" if fr_count > en_count else "en"


def extract_budget(text: str) -> int:
    """Extract the largest budget figure mentioned in *text*.

    Two kinds of mentions are recognised (case-insensitively):

    * plain currency amounts: "USD 50,000", "$50,000", "50,000 USD"
      (commas are treated as thousands separators);
    * amounts in millions: "2.5 million" is read as 2,500,000.

    Args:
        text: Document text to scan.

    Returns:
        The largest amount found, as an int, or 0 when nothing matches.
    """
    currency_patterns = (
        r'USD\s*([\d,]+)',
        r'\$([\d,]+)',
        r'([\d,]+)\s*USD',
    )
    amounts = []
    for pattern in currency_patterns:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            digits = raw.replace(",", "")
            # A capture made only of commas leaves nothing to convert.
            if digits:
                amounts.append(int(digits))

    # "N million" is handled on its own so the decimal point keeps its meaning
    # (stripping it would turn "2.5 million" into 25).
    for raw in re.findall(r'([\d.]+)\s*million', text, re.IGNORECASE):
        try:
            # round(), not int(): 8.2 * 1_000_000 is 8199999.999999999 in
            # binary floating point and would otherwise truncate to 8199999.
            amounts.append(round(float(raw) * 1_000_000))
        except (ValueError, OverflowError):
            # Malformed number such as "." or "1.2.3", or an absurdly large value.
            continue

    return max(amounts) if amounts else 0


def extract_deadline(text: str) -> str:
    """Extract the application deadline date from *text*.

    Looks for one of the keywords "deadline", "date limite", "submission" or
    "soumission" (in that priority order, any letter case), followed by colons
    and/or whitespace and a date written as "<day> <month word> <year>",
    e.g. "Deadline: 15 March 2025" or "Date limite : 30 juin 2031".
    Any year from 2000 to 2099 is accepted.

    Args:
        text: Document text to scan.

    Returns:
        The matched date string exactly as written in the text, or
        "Unknown" when no keyword/date pair is found.
    """
    keywords = ("deadline", "date limite", "submission", "soumission")
    date_part = r'[:\s]+([0-9]{1,2}\s+\w+\s+20[0-9]{2})'
    for keyword in keywords:
        found = re.search(keyword + date_part, text, re.IGNORECASE)
        if found:
            return found.group(1).strip()
    return "Unknown"


def extract_sector(text: str, filename: str = "") -> str:
    """Classify a tender into a sector.

    A sector name appearing in the (lower-cased) filename wins outright.
    Otherwise each sector is scored by how many of its keywords occur as
    substrings of the lower-cased text, and the highest scorer is returned.
    Ties go to the sector listed first.

    Args:
        text: Document text.
        filename: Optional file name that may embed the sector name.

    Returns:
        One of the known sector names, or "general" when no keyword matches.
    """
    known_sectors = ("agritech", "healthtech", "cleantech", "edtech", "fintech", "wastetech")

    # 1) The filename is the most reliable hint when it names a sector.
    name_lc = filename.lower()
    named = next((label for label in known_sectors if label in name_lc), None)
    if named is not None:
        return named

    # 2) Fall back to keyword evidence in the body.
    cues = {
        "agritech": ["agri", "farming", "agriculture", "crop", "smallholder"],
        "healthtech": ["health", "santé", "medical", "téléméde", "clinic"],
        "cleantech": ["clean", "solar", "energy", "renewable", "énergie"],
        "edtech": ["educat", "learn", "school", "digital literacy", "tablet"],
        "fintech": ["finance", "microloan", "mobile money", "credit", "saving"],
        "wastetech": ["waste", "biogas", "compost", "circular", "déchets"],
    }
    body = text.lower()
    tallies = {
        label: sum(1 for cue in cue_list if cue in body)
        for label, cue_list in cues.items()
    }

    # Strict ">" keeps the earliest sector on ties.
    winner, top_score = "general", 0
    for label in known_sectors:
        if tallies[label] > top_score:
            winner, top_score = label, tallies[label]
    return winner


def extract_region(text: str) -> str:
    """Extract the target region from *text*.

    Regions are checked in the order East, West, Central, Southern Africa;
    the first region with any keyword present in the lower-cased text wins.

    A keyword only counts when it begins at the start of a word. Demonyms
    such as "Rwandan" or "Kenyan" still match, but a keyword buried inside
    another word does not (e.g. "mali" in "normalize" or "Somalia").

    Args:
        text: Document text to scan.

    Returns:
        The matching region name, or "Africa" when no keyword is found.
    """
    regions = {
        "East Africa": ["east africa", "rwanda", "kenya", "uganda", "ethiopia", "tanzania"],
        "West Africa": ["west africa", "senegal", "ghana", "nigeria", "mali", "côte d'ivoire"],
        "Central Africa": ["central africa", "drc", "cameroon", "congo", "kinshasa"],
        "Southern Africa": ["southern africa", "zambia", "zimbabwe", "mozambique", "malawi"],
    }
    text_lower = text.lower()
    for region, keywords in regions.items():
        alternation = "|".join(re.escape(kw) for kw in keywords)
        # (?<!\w) requires that no word character precedes the keyword.
        if re.search(rf"(?<!\w)(?:{alternation})", text_lower):
            return region
    return "Africa"


def extract_title(text: str, filename: str = "") -> str:
    """Pick a tender title from the top of the document.

    Only the first five non-blank lines are considered. A line qualifies when
    it is longer than 10 characters, does not start with "#", and has no ":"
    within its first three characters. A known announcement prefix is removed
    from the qualifying line.

    Args:
        text: Document text.
        filename: Optional file name used when no line qualifies.

    Returns:
        The title line; otherwise the filename stem with underscores turned
        into spaces and title-cased; otherwise "Unknown Tender".
    """
    known_prefixes = (
        "GRANT OPPORTUNITY:",
        "FUNDING CALL:",
        "APPEL À CANDIDATURES :",
        "APPEL À PROJETS :",
    )

    trimmed = (raw.strip() for raw in text.split("\n"))
    head = [entry for entry in trimmed if entry][:5]

    for candidate in head:
        # Guard clause: skip short lines, markdown-style headers and "X: ..." labels.
        if len(candidate) <= 10 or candidate.startswith("#") or ":" in candidate[:3]:
            continue
        for marker in known_prefixes:
            if candidate.startswith(marker):
                return candidate[len(marker):].strip()
        return candidate

    # Nothing usable in the text: fall back to the file name.
    if not filename:
        return "Unknown Tender"
    return Path(filename).stem.replace("_", " ").title()


def parse_txt(filepath: str) -> str:
    """Read a .txt tender file and return its contents.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    dropped so it does not end up glued to the first line of the text.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 as well and strips an optional BOM.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        return f.read()


def parse_html(filepath: str) -> str:
    """Read an .html tender file and return its visible text.

    Processing steps:
      1. Drop <script> and <style> elements together with their contents,
         so code and CSS do not pollute the extracted text.
      2. Replace every remaining tag with a space.
      3. Decode HTML character references (&amp;, &nbsp;, &eacute;, ...).
      4. Collapse all runs of whitespace into single spaces and trim.

    Args:
        filepath: Path of the HTML file to read (decoded as UTF-8; a leading
            byte-order mark is ignored).

    Returns:
        The extracted text as a single line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Imported locally so the block does not depend on a new module-level import.
    from html import unescape

    with open(filepath, "r", encoding="utf-8-sig") as f:
        markup = f.read()

    markup = re.sub(r"(?is)<(script|style)\b[^>]*>.*?</\1\s*>", " ", markup)
    text = re.sub(r"<[^>]+>", " ", markup)
    # Decode entities only after tag stripping, so an escaped "&lt;b&gt;"
    # stays literal text instead of being removed as a tag.
    text = unescape(text)
    # \s also matches the non-breaking space produced by &nbsp;.
    return re.sub(r"\s+", " ", text).strip()


def _extract_pdf_text(filepath: str) -> str:
    """Return the text of a PDF, or "" when extraction fails.

    Uses pypdf when it can be imported; if the import fails, falls back to
    the `pdftotext` command-line tool. Any other error yields "".
    """
    try:
        from pypdf import PdfReader
        page_texts = [(page.extract_text() or "") for page in PdfReader(filepath).pages]
        return "\n".join(page_texts).strip()
    except ImportError:
        # pypdf is not installed: try the pdftotext CLI instead.
        try:
            import subprocess
            proc = subprocess.run(["pdftotext", filepath, "-"], capture_output=True, text=True)
        except Exception:
            return ""
        return proc.stdout if proc.returncode == 0 else ""
    except Exception:
        return ""


def parse_file(filepath: str) -> dict:
    """
    Turn one tender document into a structured record.

    The reader is chosen by file extension: .txt, .html/.htm and .pdf have
    dedicated handling; anything else is read as UTF-8 text with undecodable
    bytes ignored. A PDF that yields no text gets a placeholder message as
    its text.

    Returns:
        dict with keys: id, title, sector, budget, deadline, region, language, raw_text, filepath
    """
    source = Path(filepath)
    suffix = source.suffix.lower()

    if suffix == ".txt":
        text = parse_txt(filepath)
    elif suffix in (".html", ".htm"):
        text = parse_html(filepath)
    elif suffix == ".pdf":
        text = _extract_pdf_text(filepath)
        if not text.strip():
            text = f"[PDF: {source.name} — text extraction failed, file may be scanned/image-based]"
    else:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            text = handle.read()

    # The id is whatever precedes the first underscore of the stem
    # (the whole stem when there is no underscore).
    tender_id = source.stem.split("_")[0]

    record = {
        "id": tender_id,
        "title": extract_title(text, source.name),
        "sector": extract_sector(text, source.name),
        "budget": extract_budget(text),
        "deadline": extract_deadline(text),
        "region": extract_region(text),
        "language": detect_language(text),
        "raw_text": text,
        "filepath": str(filepath),
    }
    return record


def load_tenders(tenders_dir: str = "data/tenders") -> list:
    """Parse every supported tender document found directly in a directory.

    Entries are visited in sorted order and only those with a .txt, .html,
    .htm or .pdf suffix are parsed. A document that fails to parse is
    reported with a warning and skipped. A summary line is printed at the end.

    Args:
        tenders_dir: Directory that holds the tender documents.

    Returns:
        List of tender records, one per successfully parsed document.
    """
    supported = {".txt", ".html", ".htm", ".pdf"}
    documents = [
        entry for entry in sorted(Path(tenders_dir).iterdir())
        if entry.suffix.lower() in supported
    ]

    tenders = []
    for doc in documents:
        try:
            record = parse_file(str(doc))
        except Exception as e:
            # Best effort: one bad document must not abort the whole load.
            print(f"  [WARN] Could not parse {doc.name}: {e}")
        else:
            tenders.append(record)

    print(f"  Loaded {len(tenders)} tenders from {tenders_dir}")
    return tenders


def load_profiles(profiles_path: str = "data/profiles.json") -> list:
    """Load business profiles from a JSON file.

    The file is decoded explicitly as UTF-8 so that accented characters load
    the same way on every platform, rather than depending on the locale's
    default encoding. A summary line is printed after loading.

    Args:
        profiles_path: Path of the JSON file holding the profiles.

    Returns:
        The decoded JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(profiles_path, "r", encoding="utf-8") as f:
        profiles = json.load(f)
    print(f"  Loaded {len(profiles)} profiles from {profiles_path}")
    return profiles


if __name__ == "__main__":
    # Manual smoke run: load the default tenders directory and preview
    # at most the first three parsed records.
    tenders = load_tenders()
    preview = tenders[:3]
    for t in preview:
        summary = f"\n  {t['id']} | {t['sector']} | {t['language']} | budget={t['budget']} | deadline={t['deadline']}"
        print(summary)
        # Titles are cut to 60 characters to keep the preview on one line.
        print(f"  Title: {t['title'][:60]}")