Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| src/utils.py — Shared Utilities | |
| """ | |
| import os | |
| import json | |
| from pathlib import Path | |
def ensure_dir(path: str):
    """Make sure the directory at ``path`` exists.

    Missing parent directories are created too, and a directory that is
    already present is left untouched.
    """
    directory = Path(path)
    directory.mkdir(parents=True, exist_ok=True)
def save_json(data, path: str):
    """Write ``data`` to ``path`` as indented, UTF-8 encoded JSON.

    The parent directory of ``path`` is created first when it is missing.
    Non-ASCII characters are written as-is rather than being escaped.
    """
    parent_dir = Path(path).parent
    ensure_dir(str(parent_dir))
    with open(path, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)
def load_json(path: str):
    """Read the UTF-8 JSON document at ``path`` and return the parsed value."""
    with open(path, "r", encoding="utf-8") as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)
def get_profile_language(profile: dict) -> str:
    """Return the first entry of the profile's ``"languages"`` value.

    Falls back to ``"en"`` when the key is absent or its value is empty.
    """
    languages = profile.get("languages", ["en"])
    if not languages:
        return "en"
    return languages[0]
def format_budget(amount: int) -> str:
    """Render a budget amount as a short display string.

    Amounts of one million or more are shown in millions with one decimal
    (e.g. "USD 2.5M"); amounts of one thousand or more get thousands
    separators (e.g. "USD 12,500"); zero is shown as "Budget TBD"; any
    other value is printed unchanged after the "USD " prefix.
    """
    if amount >= 1_000_000:
        millions = amount / 1_000_000
        return f"USD {millions:.1f}M"
    if amount >= 1_000:
        return f"USD {amount:,}"
    if amount == 0:
        return "Budget TBD"
    return f"USD {amount}"
def print_banner(text: str, width: int = 60):
    """Print ``text`` framed by two rules of ``=`` characters.

    A blank line precedes the top rule; each rule is ``width`` characters
    long and the text line is indented by one space.
    """
    rule = "=" * width
    print(f"\n{rule}")
    print(f" {text}")
    print(rule)
def compute_mrr(gold_matches: dict, predictions: dict, k: int = 5) -> float:
    """
    Compute Mean Reciprocal Rank @ k.

    For every profile in ``gold_matches`` the first ``k`` predictions are
    scanned; the reciprocal of the 1-based position of the first gold
    tender found is that profile's score (nothing is added when there is
    no hit or no predictions). Scores are averaged over all gold profiles.

    Args:
        gold_matches: {profile_id: [tender_id, ...]}
        predictions: {profile_id: [tender_id, ...]} (ordered by model rank)
        k: cutoff applied to each prediction list

    Returns:
        MRR@k score between 0 and 1; 0.0 when ``gold_matches`` is empty.
    """
    reciprocal_total = 0.0
    for pid, relevant in gold_matches.items():
        relevant_ids = set(relevant)
        top_k = predictions.get(pid, [])[:k]
        # 1-based position of the first relevant prediction, if any.
        first_hit = next(
            (pos for pos, tender in enumerate(top_k, 1) if tender in relevant_ids),
            None,
        )
        if first_hit is not None:
            reciprocal_total += 1.0 / first_hit
    n_profiles = len(gold_matches)
    if n_profiles > 0:
        return reciprocal_total / n_profiles
    return 0.0
def compute_recall(gold_matches: dict, predictions: dict, k: int = 5) -> float:
    """
    Compute Recall @ k.

    For every profile in ``gold_matches`` this is the fraction of its
    distinct gold tenders that appear among the first ``k`` predictions.
    A profile with no gold tenders contributes 0. The per-profile values
    are averaged over all gold profiles.

    Args:
        gold_matches: {profile_id: [tender_id, ...]}
        predictions: {profile_id: [tender_id, ...]}
        k: cutoff applied to each prediction list

    Returns:
        Recall@k score between 0 and 1; 0.0 when ``gold_matches`` is empty.
    """
    recall_total = 0.0
    for pid, relevant in gold_matches.items():
        relevant_ids = set(relevant)
        top_k = predictions.get(pid, [])[:k]
        # Duplicated predictions are counted once.
        n_found = len(relevant_ids.intersection(set(top_k)))
        if relevant_ids:
            recall_total += n_found / len(relevant_ids)
    n_profiles = len(gold_matches)
    if n_profiles > 0:
        return recall_total / n_profiles
    return 0.0
def load_gold_matches(gold_path: str = "data/gold_matches.csv") -> dict:
    """Load gold matches from a CSV file.

    The first non-blank line is treated as the header and skipped. Every
    later row contributes its first two columns (profile id, tender id),
    each stripped of surrounding whitespace; rows with fewer than two
    columns are ignored. Quoted fields, including fields that contain
    commas, are parsed by the ``csv`` module.

    Args:
        gold_path: Path to the CSV file, read as UTF-8.

    Returns:
        {profile_id: [tender_id, ...]} with tender ids in file order.
    """
    import csv  # function-scope import keeps this block self-contained

    gold: dict = {}
    # Explicit UTF-8 matches the JSON helpers instead of depending on the
    # platform locale; newline="" is what the csv module expects.
    with open(gold_path, "r", encoding="utf-8", newline="") as f:
        header_seen = False
        for row in csv.reader(f):
            # Skip blank or whitespace-only lines, including any before the header.
            if not row or (len(row) == 1 and not row[0].strip()):
                continue
            if not header_seen:
                header_seen = True
                continue
            if len(row) >= 2:
                pid, tid = row[0].strip(), row[1].strip()
                gold.setdefault(pid, []).append(tid)
    return gold