#!/usr/bin/env python3
"""
src/parser.py — Tender Document Parser

Handles .txt, .html, .pdf files and extracts structured fields.
"""

import html
import json
import os
import re
import subprocess
from datetime import datetime
from pathlib import Path

# --- Language detection markers -------------------------------------------
# "Words" must equal a whole token; "stems" only need to start a token.
# Raw substring matching is avoided on purpose: it counts "and" inside
# "candidature" and "for" inside "information", which skews French text
# towards English.
_FR_WORDS = ("pour", "dans", "nous", "les", "des", "une", "est", "avec")
_FR_STEMS = ("financ", "candid", "subvention", "appel", "projet", "éligib")
_EN_WORDS = ("for", "the", "and", "with")
_EN_STEMS = ("grant", "funding", "applicants", "eligible", "organization",
             "support", "submit", "proposal")

# --- Budget patterns -------------------------------------------------------
_AMOUNT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (r"USD\s*([\d,]+)", r"\$([\d,]+)", r"([\d,]+)\s*USD")
)
# A number with at most one "." or "," separator, followed by "million(s)".
_MILLION_PATTERN = re.compile(r"(\d*[.,]?\d+)\s*million", re.IGNORECASE)

# --- Deadline patterns (tried in this order) -------------------------------
_DEADLINE_PATTERNS = tuple(
    re.compile(label + r"[:\s]+([0-9]{1,2}\s+\w+\s+20[0-9]{2})", re.IGNORECASE)
    for label in ("deadline", "date limite", "submission", "soumission")
)

# --- Sector keywords (insertion order is the tie-break order) --------------
_SECTOR_KEYWORDS = {
    "agritech": ["agri", "farming", "agriculture", "crop", "smallholder"],
    "healthtech": ["health", "santé", "medical", "téléméde", "clinic"],
    "cleantech": ["clean", "solar", "energy", "renewable", "énergie"],
    "edtech": ["educat", "learn", "school", "digital literacy", "tablet"],
    "fintech": ["finance", "microloan", "mobile money", "credit", "saving"],
    "wastetech": ["waste", "biogas", "compost", "circular", "déchets"],
}

# --- Region keywords (insertion order is the priority order) ---------------
_REGION_KEYWORDS = {
    "East Africa": ["east africa", "rwanda", "kenya", "uganda", "ethiopia", "tanzania"],
    "West Africa": ["west africa", "senegal", "ghana", "nigeria", "mali", "côte d'ivoire"],
    "Central Africa": ["central africa", "drc", "cameroon", "congo", "kinshasa"],
    "Southern Africa": ["southern africa", "zambia", "zimbabwe", "mozambique", "malawi"],
}
# Keywords must start at a word boundary so "mali" no longer matches inside
# "somalia" or "normalized"; suffixes stay allowed ("kenyan", "malian").
_REGION_PATTERNS = {
    region: re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(kw) for kw in keywords) + r")"
    )
    for region, keywords in _REGION_KEYWORDS.items()
}

_TITLE_PREFIXES = (
    "GRANT OPPORTUNITY:",
    "FUNDING CALL:",
    "APPEL À CANDIDATURES :",
    "APPEL À PROJETS :",
)

_SUPPORTED_SUFFIXES = {".txt", ".html", ".htm", ".pdf"}


def _count_markers(tokens: set, words: tuple, stems: tuple) -> int:
    """Count how many distinct markers (whole words + stems) occur in *tokens*."""
    word_hits = sum(1 for word in words if word in tokens)
    stem_hits = sum(
        1 for stem in stems if any(token.startswith(stem) for token in tokens)
    )
    return word_hits + stem_hits


def detect_language(text: str) -> str:
    """Simple rule-based language detection (FR vs EN). CPU-only, no deps.

    The text is lower-cased and split into word tokens; each language scores
    one point per distinct marker found. Returns "fr" only when French
    strictly outscores English, otherwise "en" (including ties and empty
    input).
    """
    tokens = set(re.findall(r"\w+", text.lower()))
    fr_count = _count_markers(tokens, _FR_WORDS, _FR_STEMS)
    en_count = _count_markers(tokens, _EN_WORDS, _EN_STEMS)
    return "fr" if fr_count > en_count else "en"


def _millions_to_units(token: str) -> int:
    """Convert the number preceding 'million' (e.g. "2.5", "1,5") to units."""
    if "," in token:
        whole, frac = token.split(",")
        if whole and len(frac) == 3:
            # "1,500 million": the comma is a thousands separator.
            token = whole + frac
        else:
            # "1,5 million": French-style decimal comma.
            token = whole + "." + frac
    # round() rather than int() so float representation error cannot
    # truncate the result one unit short.
    return round(float(token) * 1_000_000)


def extract_budget(text: str) -> int:
    """Extract the largest budget figure from text.

    Recognises "USD 1,000", "$1,000", "1,000 USD" and "<n> million" (with a
    decimal point or decimal comma). Returns 0 when no amount is found.
    """
    amounts = []
    for pattern in _AMOUNT_PATTERNS:
        for raw in pattern.findall(text):
            digits = raw.replace(",", "")
            if digits:  # a match made only of commas carries no number
                amounts.append(int(digits))
    for raw in _MILLION_PATTERN.findall(text):
        amounts.append(_millions_to_units(raw))
    return max(amounts, default=0)


def extract_deadline(text: str) -> str:
    """Extract application deadline date.

    Looks for "<label>: <day> <month> <20xx>" where the label is one of
    deadline / date limite / submission / soumission (any letter case).
    Returns the date text as written, or "Unknown" when nothing matches.
    """
    for pattern in _DEADLINE_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1).strip()
    return "Unknown"


def extract_sector(text: str, filename: str = "") -> str:
    """Extract sector from content or filename.

    A sector name contained in the filename wins outright. Otherwise each
    sector scores one point per keyword present in the text; the highest
    score wins (first-listed sector on ties), or "general" if nothing hits.
    """
    filename_lower = filename.lower()
    for sector in _SECTOR_KEYWORDS:
        if sector in filename_lower:
            return sector

    text_lower = text.lower()
    scores = {
        sector: sum(1 for kw in keywords if kw in text_lower)
        for sector, keywords in _SECTOR_KEYWORDS.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "general"


def extract_region(text: str) -> str:
    """Extract target region from text.

    Returns the first region (in declaration order) with a keyword starting
    at a word boundary in the text, or "Africa" when none is mentioned.
    """
    text_lower = text.lower()
    for region, pattern in _REGION_PATTERNS.items():
        if pattern.search(text_lower):
            return region
    return "Africa"


def extract_title(text: str, filename: str = "") -> str:
    """Extract tender title from first meaningful line.

    Only the first five non-blank lines are considered. Falls back to a
    title derived from *filename*, then to "Unknown Tender".
    """
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    for line in lines[:5]:
        # Skip boilerplate headers: very short lines, markdown-style "#"
        # headings, and lines with a colon in the first three characters.
        if len(line) <= 10 or line.startswith("#") or ":" in line[:3]:
            continue
        # Clean common prefixes
        for prefix in _TITLE_PREFIXES:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                break
        if line:  # a line that was only a prefix has no title; try the next
            return line
    # Fallback: derive from filename
    return Path(filename).stem.replace("_", " ").title() if filename else "Unknown Tender"


def parse_txt(filepath: str) -> str:
    """Read a .txt tender file and return its text.

    Decoded as UTF-8; a leading byte-order mark, if present, is dropped so
    it cannot end up inside the extracted title.
    """
    with open(filepath, "r", encoding="utf-8-sig") as f:
        return f.read()


def parse_html(filepath: str) -> str:
    """Read an .html tender file and return its visible text.

    Comments and <script>/<style> blocks are removed, remaining tags are
    stripped, character references are decoded and whitespace is collapsed
    to single spaces.
    """
    with open(filepath, "r", encoding="utf-8-sig") as f:
        markup = f.read()
    markup = re.sub(r"<!--.*?-->", " ", markup, flags=re.DOTALL)
    markup = re.sub(
        r"<(script|style)\b[^>]*>.*?</\1\s*>",
        " ",
        markup,
        flags=re.IGNORECASE | re.DOTALL,
    )
    text = re.sub(r"<[^>]+>", " ", markup)
    # Decode entities only after tags are gone, so an escaped "&lt;b&gt;"
    # is kept as literal text rather than stripped as a tag.
    text = html.unescape(text)
    return re.sub(r"\s+", " ", text).strip()


def _run_pdftotext(filepath: str) -> str:
    """Extract PDF text with the pdftotext CLI; return "" on any failure."""
    try:
        result = subprocess.run(
            ["pdftotext", filepath, "-"], capture_output=True, text=True
        )
    except (OSError, subprocess.SubprocessError, UnicodeError):
        # Binary missing / not executable, or undecodable output.
        return ""
    return result.stdout if result.returncode == 0 else ""


def _extract_pdf_text(filepath: str) -> str:
    """Best-effort PDF text extraction: pypdf first, pdftotext as fallback."""
    try:
        from pypdf import PdfReader

        reader = PdfReader(filepath)
        pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages).strip()
    except ImportError:
        # Fallback: try pdftotext CLI if pypdf not installed
        return _run_pdftotext(filepath)
    except Exception:
        # Deliberately broad: an unreadable PDF is reported through the
        # placeholder text in parse_file rather than aborting the load.
        return ""


def parse_file(filepath: str) -> dict:
    """
    Parse any supported file format and return a structured tender record.

    Files with an unrecognised extension are read as UTF-8 text with
    undecodable bytes ignored. The record id is the part of the file stem
    before the first underscore.

    Returns:
        dict with keys: id, title, sector, budget, deadline, region,
        language, raw_text, filepath
    """
    path = Path(filepath)
    ext = path.suffix.lower()

    if ext == ".txt":
        text = parse_txt(filepath)
    elif ext in (".html", ".htm"):
        text = parse_html(filepath)
    elif ext == ".pdf":
        text = _extract_pdf_text(filepath)
        if not text.strip():
            text = f"[PDF: {path.name} — text extraction failed, file may be scanned/image-based]"
    else:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

    # split() returns the whole stem when there is no underscore.
    tender_id = path.stem.split("_")[0]

    return {
        "id": tender_id,
        "title": extract_title(text, path.name),
        "sector": extract_sector(text, path.name),
        "budget": extract_budget(text),
        "deadline": extract_deadline(text),
        "region": extract_region(text),
        "language": detect_language(text),
        "raw_text": text,
        "filepath": str(filepath),
    }


def load_tenders(tenders_dir: str = "data/tenders") -> list:
    """Load and parse all tender documents from a directory.

    Only regular files with a supported extension are parsed, in sorted
    path order. A file that fails to parse is reported and skipped.
    Raises whatever Path.iterdir raises if the directory cannot be listed.
    """
    tenders = []
    for fpath in sorted(Path(tenders_dir).iterdir()):
        if fpath.suffix.lower() not in _SUPPORTED_SUFFIXES or not fpath.is_file():
            continue
        try:
            tenders.append(parse_file(str(fpath)))
        except Exception as e:
            # One bad document must not abort the whole batch.
            print(f" [WARN] Could not parse {fpath.name}: {e}")
    print(f" Loaded {len(tenders)} tenders from {tenders_dir}")
    return tenders


def load_profiles(profiles_path: str = "data/profiles.json") -> list:
    """Load business profiles from a JSON file.

    The file is decoded as UTF-8 (BOM tolerated) instead of the platform
    default, so accented text loads identically everywhere.
    """
    with open(profiles_path, "r", encoding="utf-8-sig") as f:
        profiles = json.load(f)
    print(f" Loaded {len(profiles)} profiles from {profiles_path}")
    return profiles


def main() -> None:
    """Print a short summary of the first three tenders in the default directory."""
    tenders = load_tenders()
    for t in tenders[:3]:
        print(f"\n {t['id']} | {t['sector']} | {t['language']} | budget={t['budget']} | deadline={t['deadline']}")
        print(f" Title: {t['title'][:60]}")


if __name__ == "__main__":
    main()