#!/usr/bin/env python3
# Listing metadata: dffabb7, file size 3,120 bytes.
"""
src/utils.py — Shared Utilities
"""
import csv
import json
import os
from pathlib import Path
def ensure_dir(path: str):
    """
    Make sure a directory exists at ``path``.

    Missing parent directories are created as well; a directory that is
    already present is left untouched.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
def save_json(data, path: str):
    """
    Write ``data`` to ``path`` as a JSON document.

    The parent directory of ``path`` is created first if needed. The file
    is written as UTF-8 with two-space indentation, and non-ASCII
    characters are stored as-is rather than escaped.
    """
    parent_dir = str(Path(path).parent)
    ensure_dir(parent_dir)
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
def load_json(path: str):
    """
    Read the UTF-8 file at ``path`` and return its parsed JSON content.
    """
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def get_profile_language(profile: dict) -> str:
    """
    Get the primary language for a profile.

    Args:
        profile: mapping whose "languages" entry is read.
    Returns:
        The first entry of "languages". Falls back to "en" when the key is
        missing or its value is empty (empty list, empty string, None).
        A bare string value is treated as a single language and returned
        whole.
    """
    langs = profile.get("languages", ["en"])
    if not langs:
        return "en"
    if isinstance(langs, str):
        # Indexing a string would return only its first character
        # ("fr" -> "f"), so a bare string is taken as the language itself.
        return langs
    return langs[0]
def format_budget(amount: int) -> str:
    """
    Render a budget amount as a short human-readable string.

    Zero becomes "Budget TBD"; amounts of at least one million are shown
    in millions with one decimal; amounts of at least one thousand get
    thousands separators; everything else is printed as-is.
    """
    if amount == 0:
        return "Budget TBD"
    if amount >= 1_000_000:
        millions = amount / 1_000_000
        return f"USD {millions:.1f}M"
    if amount >= 1_000:
        return f"USD {amount:,}"
    return f"USD {amount}"
def print_banner(text: str, width: int = 60):
    """
    Print ``text`` framed by two rules of ``width`` "=" characters.

    A blank line is emitted before the top rule.
    """
    rule = "=" * width
    print("\n" + rule, f" {text}", rule, sep="\n")
def compute_mrr(gold_matches: dict, predictions: dict, k: int = 5) -> float:
    """
    Compute Mean Reciprocal Rank @ k.

    For every profile in ``gold_matches`` the first ``k`` predictions are
    scanned; the reciprocal of the 1-based position of the first gold
    tender found is added to the total. Profiles without a hit (or without
    predictions) contribute zero but still count in the average.

    Args:
        gold_matches: {profile_id: [tender_id, ...]}
        predictions: {profile_id: [tender_id, ...]} (ordered by model rank)
        k: cutoff
    Returns:
        MRR@k score, or 0.0 when ``gold_matches`` is empty
    """
    if not gold_matches:
        return 0.0

    total = 0.0
    for pid, relevant in gold_matches.items():
        relevant_ids = set(relevant)
        top_k = predictions.get(pid, [])[:k]
        first_hit = next(
            (pos for pos, tid in enumerate(top_k, start=1) if tid in relevant_ids),
            None,
        )
        if first_hit is not None:
            total += 1.0 / first_hit
    return total / len(gold_matches)
def compute_recall(gold_matches: dict, predictions: dict, k: int = 5) -> float:
    """
    Compute Recall @ k.

    For every profile in ``gold_matches``, recall is the share of its gold
    tenders that appear among the first ``k`` predictions. Profiles with an
    empty gold list contribute zero but still count in the average.

    Args:
        gold_matches: {profile_id: [tender_id, ...]}
        predictions: {profile_id: [tender_id, ...]}
        k: cutoff
    Returns:
        Recall@k score, or 0.0 when ``gold_matches`` is empty
    """
    total = 0.0
    for pid, relevant in gold_matches.items():
        relevant_ids = set(relevant)
        retrieved = set(predictions.get(pid, [])[:k])
        if relevant_ids:
            found = retrieved.intersection(relevant_ids)
            total += len(found) / len(relevant_ids)
    return total / len(gold_matches) if gold_matches else 0.0
def load_gold_matches(gold_path: str = "data/gold_matches.csv") -> dict:
    """
    Load gold matches from CSV. Returns {profile_id: [tender_ids]}.

    The first non-blank row is treated as the header and skipped. In every
    following row the first column is read as the profile id and the second
    as the tender id; any further columns are ignored. Ids are stripped of
    surrounding whitespace. Rows with fewer than two columns, or with a
    blank profile or tender id, are skipped.

    Args:
        gold_path: path of the CSV file (read as UTF-8)
    Returns:
        {profile_id: [tender_id, ...]} with tender ids in file order
    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError)
    """
    gold = {}
    # newline="" lets the csv module handle line endings and quoted fields itself.
    with open(gold_path, "r", encoding="utf-8", newline="") as f:
        rows = (
            row for row in csv.reader(f)
            if any(cell.strip() for cell in row)  # drop blank lines
        )
        next(rows, None)  # skip header
        for row in rows:
            if len(row) < 2:
                continue
            pid, tid = row[0].strip(), row[1].strip()
            # A blank id can never match a prediction, so it is not stored.
            if pid and tid:
                gold.setdefault(pid, []).append(tid)
    return gold
|