Spaces:
Running
Running
File size: 8,074 Bytes
#!/usr/bin/env python3
"""
src/parser.py — Tender Document Parser
Handles .txt, .html, .pdf files and extracts structured fields.
"""
import os
import re
import json
from pathlib import Path
from datetime import datetime
def detect_language(text: str) -> str:
    """Guess whether *text* is French or English by keyword voting.

    The text is split into words first. Short function words only count
    when they occur as whole words, so ``"est"`` does not fire on ``"test"``
    and ``"des"`` does not fire on ``"design"``. Longer content stems such as
    ``"financ"`` count when any word starts with them, which keeps inflected
    forms (``financement``, ``grants``) matching.

    Args:
        text: Document text in any casing.

    Returns:
        ``"fr"`` when strictly more French cues than English cues are
        present, otherwise ``"en"`` (ties and empty input yield ``"en"``).
    """
    fr_exact = ("pour", "dans", "nous", "les", "des", "une", "est", "avec")
    fr_stems = ("financ", "candid", "subvention", "appel", "projet", "éligib")
    en_exact = ("for", "the", "and", "with")
    en_stems = ("grant", "funding", "applicants", "eligible", "organization",
                "support", "submit", "proposal")

    # Distinct lower-cased words; \w is Unicode-aware so accented letters
    # stay inside their word.
    words = set(re.findall(r"\w+", text.lower()))

    def count_cues(exact, stems):
        # Each cue votes at most once, however often it appears.
        whole = sum(1 for cue in exact if cue in words)
        prefixed = sum(
            1 for stem in stems if any(word.startswith(stem) for word in words)
        )
        return whole + prefixed

    fr_count = count_cues(fr_exact, fr_stems)
    en_count = count_cues(en_exact, en_stems)
    return "fr" if fr_count > en_count else "en"
def extract_budget(text: str) -> int:
    """Return the largest budget figure mentioned in *text*.

    Two kinds of figures are collected:

    * plain amounts tagged with ``USD`` or ``$`` (commas are treated as
      thousands separators), and
    * amounts followed by the word ``million`` (case-insensitive), which
      are scaled by 1,000,000.

    For ``million`` figures a comma is read as a thousands separator when it
    groups digits in threes (``1,500 million``) and as a decimal comma
    otherwise (``1,5 million``). The scaled value is rounded rather than
    truncated so binary floating-point error cannot shave a unit off.

    Args:
        text: Document text to scan.

    Returns:
        The largest amount found as an integer, or ``0`` when no figure is
        recognised.
    """
    amounts = []

    # Plain currency-tagged figures. The "million" pattern is deliberately
    # not in this list: stripping the decimal point from "2.5" would
    # produce a meaningless 25.
    plain_patterns = (
        r'USD\s*([\d,]+)',
        r'\$([\d,]+)',
        r'([\d,]+)\s*USD',
    )
    for pattern in plain_patterns:
        for figure in re.findall(pattern, text, re.IGNORECASE):
            digits = figure.replace(",", "")
            if digits:  # a lone "," can match the character class
                amounts.append(int(digits))

    # Figures expressed in millions. The token must start and end with a
    # digit so sentence punctuation ("... 2024. Million ...") is not read
    # as part of a number.
    million_pattern = r'(\d(?:[\d.,]*\d)?)\s*million'
    for figure in re.findall(million_pattern, text, re.IGNORECASE):
        if "." in figure:
            normalised = figure.replace(",", "")   # 1,250.5 -> 1250.5
        elif re.fullmatch(r'\d{1,3}(?:,\d{3})+', figure):
            normalised = figure.replace(",", "")   # 1,500   -> 1500
        else:
            normalised = figure.replace(",", ".")  # 1,5     -> 1.5
        try:
            amounts.append(round(float(normalised) * 1_000_000))
        except (ValueError, OverflowError):
            # Malformed tokens such as "1.2.3" are ignored.
            continue

    return max(amounts) if amounts else 0
def extract_deadline(text: str) -> str:
    """Extract the application deadline as written in the document.

    Looks for one of the labels ``deadline``, ``date limite``,
    ``submission`` or ``soumission`` (any casing), followed by colons and/or
    whitespace and then a date of the form ``<day> <month-word> <year>``.
    The day may carry the French ordinal suffix (``1er``) and the year may
    be any ``20xx`` year.

    Labels are tried in the order listed above; the first label that
    matches anywhere in the text wins.

    Args:
        text: Document text to scan.

    Returns:
        The matched date string (for example ``"15 March 2025"``), or
        ``"Unknown"`` when no labelled date is found.
    """
    # day (optionally "1er") + month word + four-digit 20xx year
    date_part = r'([0-9]{1,2}(?:er)?\s+\w+\s+20[0-9]{2})'
    labels = ("deadline", "date limite", "submission", "soumission")

    for label in labels:
        found = re.search(label + r'[:\s]+' + date_part, text, re.IGNORECASE)
        if found:
            return found.group(1).strip()
    return "Unknown"
def extract_sector(text: str, filename: str = "") -> str:
    """Classify a tender into a sector label.

    A sector name embedded in the file name takes precedence. Otherwise the
    sector whose keywords appear most often (one point per keyword present
    as a substring of the lower-cased text) is returned; on a tie the sector
    listed first wins. ``"general"`` is returned when nothing matches.
    """
    keyword_map = {
        "agritech": ["agri", "farming", "agriculture", "crop", "smallholder"],
        "healthtech": ["health", "santé", "medical", "téléméde", "clinic"],
        "cleantech": ["clean", "solar", "energy", "renewable", "énergie"],
        "edtech": ["educat", "learn", "school", "digital literacy", "tablet"],
        "fintech": ["finance", "microloan", "mobile money", "credit", "saving"],
        "wastetech": ["waste", "biogas", "compost", "circular", "déchets"],
    }

    # 1) File-name hint.
    name = filename.lower()
    hinted = next((label for label in keyword_map if label in name), None)
    if hinted is not None:
        return hinted

    # 2) Keyword vote over the content; strict ">" keeps the earliest
    #    sector on ties.
    haystack = text.lower()
    winner, top_hits = "general", 0
    for label, hints in keyword_map.items():
        hits = sum(hint in haystack for hint in hints)
        if hits > top_hits:
            winner, top_hits = label, hits
    return winner
def extract_region(text: str) -> str:
    """Extract the target region from *text*.

    Each region has a list of place names. A name counts only when it
    appears at the start of a word, so ``"mali"`` no longer fires inside
    ``"normalize"``, ``"formalities"`` or ``"Somalia"``, while demonyms such
    as ``"Kenyan"`` or ``"Nigerian"`` still match. Regions are tested in the
    order they are declared and the first one with any match is returned.

    Args:
        text: Document text in any casing.

    Returns:
        One of ``"East Africa"``, ``"West Africa"``, ``"Central Africa"``,
        ``"Southern Africa"``, or the fallback ``"Africa"`` when no place
        name is recognised.
    """
    regions = {
        "East Africa": ["east africa", "rwanda", "kenya", "uganda", "ethiopia", "tanzania"],
        "West Africa": ["west africa", "senegal", "ghana", "nigeria", "mali", "côte d'ivoire"],
        "Central Africa": ["central africa", "drc", "cameroon", "congo", "kinshasa"],
        "Southern Africa": ["southern africa", "zambia", "zimbabwe", "mozambique", "malawi"],
    }
    text_lower = text.lower()
    for region, keywords in regions.items():
        # \b anchors the place name at a word start; no trailing anchor so
        # suffixed forms ("kenyan", "congolese") are still accepted.
        pattern = r"\b(?:" + "|".join(re.escape(kw) for kw in keywords) + ")"
        if re.search(pattern, text_lower):
            return region
    return "Africa"
def extract_title(text: str, filename: str = "") -> str:
    """Pick a tender title from the opening lines of *text*.

    Only the first five non-blank lines are inspected. A line is accepted
    when it is longer than 10 characters, does not start with ``#`` and has
    no colon within its first three characters. A known heading prefix is
    removed from the accepted line. If no line qualifies, the title is
    derived from the file name, or ``"Unknown Tender"`` without one.
    """
    heading_prefixes = (
        "GRANT OPPORTUNITY:",
        "FUNDING CALL:",
        "APPEL À CANDIDATURES :",
        "APPEL À PROJETS :",
    )

    candidates = [raw.strip() for raw in text.split("\n") if raw.strip()]
    for candidate in candidates[:5]:
        # Reject short lines, markdown-style headers and "k: v" boilerplate.
        if len(candidate) <= 10 or candidate.startswith("#") or ":" in candidate[:3]:
            continue
        matched = next((p for p in heading_prefixes if candidate.startswith(p)), None)
        if matched is None:
            return candidate
        return candidate[len(matched):].strip()

    # Nothing usable in the text: fall back to the file name.
    if not filename:
        return "Unknown Tender"
    return Path(filename).stem.replace("_", " ").title()
def parse_txt(filepath: str) -> str:
    """Read a plain-text tender file and return its contents.

    Args:
        filepath: Path of the ``.txt`` file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading byte-order
    # mark, which would otherwise stay glued to the first line of the text.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        return f.read()
def parse_html(filepath: str) -> str:
    """Read an HTML tender file and return its visible text.

    Processing steps, in order:

    1. drop ``<script>`` / ``<style>`` elements together with their bodies,
    2. drop HTML comments,
    3. replace every remaining tag with a space,
    4. decode character references (``&amp;``, ``&nbsp;``, ``&#233;`` ...),
    5. collapse all whitespace runs to a single space and trim.

    Args:
        filepath: Path of the ``.html`` / ``.htm`` file to read.

    Returns:
        The extracted text on a single line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Imported locally so this function does not depend on a module-level
    # import; it also avoids a clash with a local variable named "html".
    from html import unescape

    with open(filepath, "r", encoding="utf-8-sig") as f:
        markup = f.read()

    # Script and style bodies are code, not content.
    markup = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", " ", markup)
    # Comments may contain ">" and would confuse the simple tag stripper.
    markup = re.sub(r"(?s)<!--.*?-->", " ", markup)
    text = re.sub(r"<[^>]+>", " ", markup)
    # Decode entities only after tags are gone, so an escaped "&lt;b&gt;"
    # survives as literal text instead of being stripped as a tag.
    text = unescape(text)
    # \s also matches the non-breaking space produced by &nbsp;.
    return re.sub(r"\s+", " ", text).strip()
def parse_file(filepath: str) -> dict:
    """
    Turn one tender document into a structured record.

    The reader is chosen from the file extension: ``.txt`` and
    ``.html``/``.htm`` go through their dedicated parsers, ``.pdf`` is read
    with ``pypdf`` (falling back to the ``pdftotext`` command when ``pypdf``
    cannot be imported), and anything else is read as UTF-8 text with
    undecodable bytes ignored. A PDF that yields no text gets a placeholder
    message as its text.

    Returns:
        dict with keys: id, title, sector, budget, deadline, region, language, raw_text, filepath
    """
    source = Path(filepath)
    suffix = source.suffix.lower()

    if suffix == ".txt":
        body = parse_txt(filepath)
    elif suffix in (".html", ".htm"):
        body = parse_html(filepath)
    elif suffix == ".pdf":
        body = ""
        try:
            from pypdf import PdfReader
            chunks = []
            for page in PdfReader(filepath).pages:
                chunks.append(page.extract_text() or "")
            body = "\n".join(chunks).strip()
        except ImportError:
            # pypdf unavailable: shell out to pdftotext, best effort.
            try:
                import subprocess
                proc = subprocess.run(
                    ["pdftotext", filepath, "-"],
                    capture_output=True,
                    text=True,
                )
                body = proc.stdout if proc.returncode == 0 else ""
            except Exception:
                body = ""
        except Exception:
            # Any other extraction failure is treated as "no text".
            body = ""
        if not body.strip():
            body = f"[PDF: {source.name} — text extraction failed, file may be scanned/image-based]"
    else:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            body = handle.read()

    # The id is whatever precedes the first underscore of the file stem
    # (the whole stem when there is no underscore).
    record_id = source.stem.partition("_")[0]

    record = {
        "id": record_id,
        "title": extract_title(body, source.name),
        "sector": extract_sector(body, source.name),
        "budget": extract_budget(body),
        "deadline": extract_deadline(body),
        "region": extract_region(body),
        "language": detect_language(body),
        "raw_text": body,
        "filepath": str(filepath),
    }
    return record
def load_tenders(tenders_dir: str = "data/tenders") -> list:
    """Parse every supported document in *tenders_dir*.

    Directory entries are visited in sorted order and only ``.txt``,
    ``.html``, ``.htm`` and ``.pdf`` files (case-insensitive) are parsed. A
    file that fails to parse is reported with a warning and skipped. A
    summary line is printed before the list of records is returned.
    """
    accepted_suffixes = (".txt", ".html", ".htm", ".pdf")
    candidates = [
        entry
        for entry in sorted(Path(tenders_dir).iterdir())
        if entry.suffix.lower() in accepted_suffixes
    ]

    parsed = []
    for entry in candidates:
        try:
            parsed.append(parse_file(str(entry)))
        except Exception as err:
            # One bad file must not abort the whole load.
            print(f" [WARN] Could not parse {entry.name}: {err}")

    print(f" Loaded {len(parsed)} tenders from {tenders_dir}")
    return parsed
def load_profiles(profiles_path: str = "data/profiles.json") -> list:
    """Load business profiles from a JSON file.

    Args:
        profiles_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (annotated as a list of profiles).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read explicitly as UTF-8: without an encoding, open() uses the
    # platform default, which can mis-decode accented characters.
    with open(profiles_path, "r", encoding="utf-8") as f:
        profiles = json.load(f)
    print(f" Loaded {len(profiles)} profiles from {profiles_path}")
    return profiles
if __name__ == "__main__":
    # Manual smoke run: load the default tender directory and preview up to
    # three parsed records.
    for record in load_tenders()[:3]:
        headline = (
            f"\n {record['id']} | {record['sector']} | {record['language']}"
            f" | budget={record['budget']} | deadline={record['deadline']}"
        )
        print(headline)
        print(f" Title: {record['title'][:60]}")
|