Spaces:

NSamson1
/

Tender_Matcher

Running

Samson NIYIZURUGERO

code migration

dffabb7 about 1 month ago

8.07 kB

	#!/usr/bin/env python3
	"""
	src/parser.py — Tender Document Parser
	Handles .txt, .html, .pdf files and extracts structured fields.
	"""

	import os
	import re
	import json
	from pathlib import Path
	from datetime import datetime


	def detect_language(text: str) -> str:
	    """Guess whether *text* is French or English from marker words.

	    The text is lower-cased and split into alphabetic tokens. Each marker
	    counts at most once: function words must equal a token, while content
	    stems only need to start one (so "financ" covers "financement").
	    Matching on tokens avoids mid-word hits such as "les" in "tables",
	    "des" in "design" or "est" in "best". CPU-only, no extra dependencies.

	    Args:
	        text: Document text to classify.

	    Returns:
	        "fr" when strictly more French than English markers are present,
	        otherwise "en" (ties and empty input included).
	    """
	    fr_words = {"pour", "dans", "nous", "les", "des", "une", "est", "avec"}
	    fr_stems = ("financ", "candid", "subvention", "appel", "projet", "éligib")
	    en_words = {"for", "the", "and", "with"}
	    en_stems = ("grant", "funding", "applicants", "eligible", "organization",
	                "support", "submit", "proposal")

	    # Runs of letters only: digits, underscores and punctuation split tokens.
	    tokens = set(re.findall(r"[^\W\d_]+", text.lower()))

	    def marker_count(words, stems):
	        exact = len(words & tokens)
	        prefixed = sum(
	            1 for stem in stems if any(tok.startswith(stem) for tok in tokens)
	        )
	        return exact + prefixed

	    fr_count = marker_count(fr_words, fr_stems)
	    en_count = marker_count(en_words, en_stems)
	    return "fr" if fr_count > en_count else "en"


	def extract_budget(text: str) -> int:
	    """Extract the largest budget figure mentioned in *text*.

	    Recognised forms are "USD 50,000", "$50,000", "50,000 USD" and
	    "<number> million" (for example "2.5 million" or "1,200 million").
	    Commas are treated as thousands separators; matching is
	    case-insensitive.

	    Args:
	        text: Document text to scan.

	    Returns:
	        The largest amount found, as an integer, or 0 when no amount
	        is found.
	    """
	    from decimal import Decimal

	    amounts = []

	    # Plain currency amounts: digits with optional thousands commas.
	    plain_patterns = (
	        r'USD\s*([\d,]+)',
	        r'\$([\d,]+)',
	        r'([\d,]+)\s*USD',
	    )
	    for pattern in plain_patterns:
	        for raw in re.findall(pattern, text, re.IGNORECASE):
	            digits = raw.replace(",", "")
	            if digits:  # a lone "," satisfies [\d,]+ but carries no number
	                amounts.append(int(digits))

	    # "<number> million": keep the decimal point and the thousands commas
	    # together so "1,200 million" is not read as "200 million".
	    million_pattern = r'(\d[\d,]*(?:\.\d+)?)\s*million'
	    for raw in re.findall(million_pattern, text, re.IGNORECASE):
	        # Decimal keeps the scaling exact; int() truncates any fraction.
	        amounts.append(int(Decimal(raw.replace(",", "")) * 1_000_000))

	    return max(amounts) if amounts else 0


	def extract_deadline(text: str) -> str:
	    """Extract the application deadline as it is written in *text*.

	    Looks for one of the labels "Deadline", "Date limite", "Submission"
	    or "Soumission" followed by colons/whitespace and a date of the form
	    "<day> <month word> <year>". Labels are tried in that order, so an
	    earlier label wins even if a later one appears first in the text.
	    Labels match case-insensitively and any year 2000-2099 is accepted.

	    Args:
	        text: Document text to scan.

	    Returns:
	        The matched date text (stripped), or "Unknown" when no labelled
	        date is found.
	    """
	    date = r'([0-9]{1,2}\s+\w+\s+20[0-9]{2})'
	    labels = ("deadline", "date limite", "submission", "soumission")
	    for label in labels:
	        found = re.search(label + r'[:\s]+' + date, text, re.IGNORECASE)
	        if found:
	            return found.group(1).strip()
	    return "Unknown"


	def extract_sector(text: str, filename: str = "") -> str:
	    """Determine the tender's sector, preferring the filename over content.

	    If the lower-cased filename contains a sector name, that sector is
	    returned immediately. Otherwise each sector is scored by how many of
	    its keywords occur (as substrings) in the lower-cased text, and the
	    highest-scoring sector wins; ties go to the sector listed first.

	    Args:
	        text: Document text to scan.
	        filename: Optional file name that may embed the sector name.

	    Returns:
	        One of the known sector names, or "general" when no keyword hits.
	    """
	    keyword_map = {
	        "agritech": ["agri", "farming", "agriculture", "crop", "smallholder"],
	        "healthtech": ["health", "santé", "medical", "téléméde", "clinic"],
	        "cleantech": ["clean", "solar", "energy", "renewable", "énergie"],
	        "edtech": ["educat", "learn", "school", "digital literacy", "tablet"],
	        "fintech": ["finance", "microloan", "mobile money", "credit", "saving"],
	        "wastetech": ["waste", "biogas", "compost", "circular", "déchets"],
	    }

	    # A sector name embedded in the filename wins outright.
	    name_lower = filename.lower()
	    named = next((sector for sector in keyword_map if sector in name_lower), None)
	    if named is not None:
	        return named

	    # Otherwise count keyword hits per sector in the document body.
	    haystack = text.lower()
	    hits = {
	        sector: sum(1 for kw in kws if kw in haystack)
	        for sector, kws in keyword_map.items()
	    }
	    top = max(hits, key=hits.get)
	    if hits[top] > 0:
	        return top
	    return "general"


	def extract_region(text: str) -> str:
	    """Extract the target region from *text*.

	    Regions are checked in the order East, West, Central, Southern Africa
	    and the first one with a keyword hit is returned. Keywords must begin
	    at a word boundary, so "mali" matches "Mali" and "Malian" but not the
	    middle of "formalities", "normalisation" or "Somalia".

	    Args:
	        text: Document text to scan.

	    Returns:
	        The first matching region name, or "Africa" when nothing matches.
	    """
	    regions = {
	        "East Africa": ["east africa", "rwanda", "kenya", "uganda", "ethiopia", "tanzania"],
	        "West Africa": ["west africa", "senegal", "ghana", "nigeria", "mali", "côte d'ivoire"],
	        "Central Africa": ["central africa", "drc", "cameroon", "congo", "kinshasa"],
	        "Southern Africa": ["southern africa", "zambia", "zimbabwe", "mozambique", "malawi"],
	    }
	    # Normalise the typographic apostrophe so "Côte d’Ivoire" also matches.
	    text_lower = text.lower().replace("\u2019", "'")
	    for region, keywords in regions.items():
	        # Leading \b only: demonyms ("kenyan", "congolese") must still match.
	        pattern = r"\b(?:" + "|".join(re.escape(kw) for kw in keywords) + r")"
	        if re.search(pattern, text_lower):
	            return region
	    return "Africa"


	def extract_title(text: str, filename: str = "") -> str:
	    """Extract the tender title from the first meaningful line of *text*.

	    Only the first five non-empty lines are considered. A line is skipped
	    when it is 10 characters or shorter, starts with "#", or has a ":"
	    within its first three characters. A known announcement prefix
	    (e.g. "GRANT OPPORTUNITY:") is removed from the chosen line; if
	    nothing remains after the prefix, the search continues with the next
	    line instead of returning an empty title.

	    Args:
	        text: Document text.
	        filename: Optional file name used to build a fallback title.

	    Returns:
	        The title line, else a title derived from the file name
	        (underscores to spaces, title-cased), else "Unknown Tender".
	    """
	    prefixes = (
	        "GRANT OPPORTUNITY:",
	        "FUNDING CALL:",
	        "APPEL À CANDIDATURES :",
	        "APPEL À PROJETS :",
	    )
	    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
	    for line in lines[:5]:
	        # Skip boilerplate headers.
	        if len(line) <= 10 or line.startswith("#") or ":" in line[:3]:
	            continue
	        for prefix in prefixes:
	            if line.startswith(prefix):
	                line = line[len(prefix):].strip()
	                break
	        if line:  # a bare prefix means the title is on a following line
	            return line
	    # Fallback: derive from filename.
	    return Path(filename).stem.replace("_", " ").title() if filename else "Unknown Tender"


	def parse_txt(filepath: str) -> str:
	    """Read a .txt tender file and return its full text.

	    Args:
	        filepath: Path of the text file.

	    Returns:
	        The decoded file content as a string.

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the content is not valid UTF-8.
	    """
	    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order
	    # mark, which would otherwise end up at the start of the first line.
	    with open(filepath, "r", encoding="utf-8-sig") as f:
	        return f.read()


	def parse_html(filepath: str) -> str:
	    """Read an .html tender file and return its visible text.

	    Script and style elements and HTML comments are dropped entirely, the
	    remaining tags are replaced by spaces, character references such as
	    "&amp;" and "&nbsp;" are decoded, and runs of whitespace (including
	    non-breaking spaces) collapse to single spaces.

	    Args:
	        filepath: Path of the HTML file.

	    Returns:
	        The extracted text on a single line, stripped at both ends.

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the content is not valid UTF-8.
	    """
	    from html import unescape

	    with open(filepath, "r", encoding="utf-8-sig") as f:
	        markup = f.read()
	    # Remove non-visible content first so CSS/JS never reaches the text.
	    markup = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", " ", markup)
	    markup = re.sub(r"(?s)<!--.*?-->", " ", markup)
	    # Simple tag stripper.
	    text = re.sub(r"<[^>]+>", " ", markup)
	    # Decode entities after stripping so "&lt;b&gt;" is not taken for a tag.
	    text = unescape(text)
	    return re.sub(r"\s+", " ", text).strip()


	def parse_file(filepath: str) -> dict:
	    """
	    Parse one tender document and build its structured record.

	    The reader is chosen from the lower-cased file extension: .txt and
	    .html/.htm use parse_txt / parse_html, .pdf uses pypdf (falling back
	    to the pdftotext command only when importing pypdf fails), and any
	    other extension is read as UTF-8 with undecodable bytes ignored.
	    When no text can be extracted from a PDF, a placeholder message is
	    used as the text, so the extracted fields are derived from it.

	    Returns:
	        dict with keys: id, title, sector, budget, deadline, region,
	        language, raw_text, filepath. The id is the part of the file
	        stem before the first underscore.
	    """
	    source = Path(filepath)
	    suffix = source.suffix.lower()

	    if suffix == ".pdf":
	        try:
	            from pypdf import PdfReader
	            page_texts = []
	            for page in PdfReader(filepath).pages:
	                page_texts.append(page.extract_text() or "")
	            text = "\n".join(page_texts).strip()
	        except ImportError:
	            # Fallback: try pdftotext CLI if pypdf not installed
	            try:
	                import subprocess
	                completed = subprocess.run(["pdftotext", filepath, "-"], capture_output=True, text=True)
	                if completed.returncode == 0:
	                    text = completed.stdout
	                else:
	                    text = ""
	            except Exception:
	                text = ""
	        except Exception:
	            # Best effort: an unreadable PDF yields the placeholder below.
	            text = ""
	        if not text.strip():
	            text = f"[PDF: {source.name} — text extraction failed, file may be scanned/image-based]"
	    elif suffix == ".txt":
	        text = parse_txt(filepath)
	    elif suffix in (".html", ".htm"):
	        text = parse_html(filepath)
	    else:
	        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
	            text = handle.read()

	    # split() returns the whole stem when it contains no underscore.
	    tender_id = source.stem.split("_")[0]

	    return {
	        "id": tender_id,
	        "title": extract_title(text, source.name),
	        "sector": extract_sector(text, source.name),
	        "budget": extract_budget(text),
	        "deadline": extract_deadline(text),
	        "region": extract_region(text),
	        "language": detect_language(text),
	        "raw_text": text,
	        "filepath": str(filepath)
	    }


	def load_tenders(tenders_dir: str = "data/tenders") -> list:
	    """Parse every supported tender document found directly in a directory.

	    Entries are visited in sorted order and only .txt, .html, .htm and
	    .pdf files (case-insensitive suffix) are parsed. A file that fails to
	    parse is reported with a warning and skipped. A summary line with the
	    number of loaded tenders is printed at the end.

	    Args:
	        tenders_dir: Directory containing the tender documents.

	    Returns:
	        List of tender records as returned by parse_file.
	    """
	    supported = {".txt", ".html", ".htm", ".pdf"}
	    candidates = [
	        entry
	        for entry in sorted(Path(tenders_dir).iterdir())
	        if entry.suffix.lower() in supported
	    ]

	    parsed = []
	    for entry in candidates:
	        try:
	            parsed.append(parse_file(str(entry)))
	        except Exception as exc:
	            print(f" [WARN] Could not parse {entry.name}: {exc}")

	    print(f" Loaded {len(parsed)} tenders from {tenders_dir}")
	    return parsed


	def load_profiles(profiles_path: str = "data/profiles.json") -> list:
	    """Load business profiles from a JSON file.

	    The file is decoded as UTF-8 regardless of the platform's default
	    encoding, and a leading byte-order mark is tolerated. A summary line
	    with the number of loaded profiles is printed.

	    Args:
	        profiles_path: Path of the JSON file.

	    Returns:
	        The parsed JSON content.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the content is not valid JSON.
	    """
	    with open(profiles_path, "r", encoding="utf-8-sig") as f:
	        profiles = json.load(f)
	    print(f" Loaded {len(profiles)} profiles from {profiles_path}")
	    return profiles


	if __name__ == "__main__":
	    # Smoke test: load the default tender directory and preview at most
	    # the first three parsed records.
	    tenders = load_tenders()
	    for t in tenders[:3]:
	        # Plain "|" separators: "\|" is an invalid escape sequence and
	        # would print the backslash as well.
	        print(f"\n {t['id']} | {t['sector']} | {t['language']} | budget={t['budget']} | deadline={t['deadline']}")
	        print(f" Title: {t['title'][:60]}")