import json
import re
from datetime import datetime, timezone
from pathlib import Path

from rapidfuzz import fuzz

# One or more consecutive whitespace characters. Compiled once because every
# normalisation step in this module uses it.
_WHITESPACE_RUN = re.compile(r"\s+")


def log_event(LOG_FILE: "str | Path", event_type: str, data: dict) -> None:
    """Append one event as a JSON line to ``LOG_FILE``.

    Args:
        LOG_FILE: Path of the log file (``pathlib.Path`` or ``str``). Missing
            parent directories are created.
        event_type: Value stored under the ``"event_type"`` key.
        data: JSON-serialisable payload stored under the ``"data"`` key.

    Raises:
        TypeError: If ``data`` contains values ``json.dumps`` cannot encode.
        OSError: If the directory or file cannot be created or written.
    """
    log_path = Path(LOG_FILE)
    # parents=True so a log location nested in not-yet-existing directories
    # does not fail with FileNotFoundError.
    log_path.parent.mkdir(parents=True, exist_ok=True)
    entry = {
        # Naive UTC timestamp, same text format as the deprecated
        # datetime.utcnow().isoformat() (no "+00:00" suffix).
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "event_type": event_type,
        "data": data,
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be pinned instead of depending on the platform locale.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")


def display_to_llm_format(physician_points: list[dict]) -> list[dict]:
    """Convert display-style points into the flatter quote-list form.

    Each input point must provide ``"offsets"`` (an iterable of dicts that
    each have a ``"quote"`` key), ``"category"``, ``"label"`` and
    ``"reasoning"``. The output keeps the last three unchanged and replaces
    ``"offsets"`` with ``"quotes"``, the list of quote values in order.

    Raises:
        KeyError: If a point or offset lacks one of the keys above.
    """
    return [
        {
            "quotes": [o["quote"] for o in point["offsets"]],
            "category": point["category"],
            "label": point["label"],
            "reasoning": point["reasoning"],
        }
        for point in physician_points
    ]


def normalize(text: str) -> str:
    """Collapse every whitespace run to one space and strip both ends."""
    return _WHITESPACE_RUN.sub(" ", text).strip()


def _collapse_whitespace_with_map(document: str) -> tuple[str, list[int]]:
    """Collapse whitespace runs to single spaces, tracking original indices.

    Returns the collapsed text (NOT stripped) and a list of the same length
    in which entry ``i`` is the index in ``document`` of the character that
    produced collapsed character ``i``. A collapsed space maps to the first
    character of its whitespace run.
    """
    pieces: list[str] = []
    index_map: list[int] = []
    cursor = 0
    for run in _WHITESPACE_RUN.finditer(document):
        run_start, run_end = run.span()
        # Non-whitespace text before this run maps one-to-one.
        pieces.append(document[cursor:run_start])
        index_map.extend(range(cursor, run_start))
        # The whole run becomes a single space anchored at the run's start.
        pieces.append(" ")
        index_map.append(run_start)
        cursor = run_end
    pieces.append(document[cursor:])
    index_map.extend(range(cursor, len(document)))
    return "".join(pieces), index_map


def find_quote_offset(document: str, quote: str, threshold: int = 80) -> tuple[int, int] | None:
    """Locate ``quote`` in ``document``, tolerating whitespace and small edits.

    Both texts are whitespace-normalised. An exact occurrence of the
    normalised quote is returned immediately; otherwise every window of the
    quote's length is scored with ``fuzz.ratio`` and the first window with
    the highest score is used, provided that score reaches ``threshold``.

    Args:
        document: Text to search in.
        quote: Text to look for.
        threshold: Minimum ``fuzz.ratio`` score (0-100) for a fuzzy match.

    Returns:
        ``(start, end)`` offsets into the ORIGINAL ``document`` (end
        exclusive), or ``None`` when the quote is empty after normalisation,
        longer than the normalised document, or no window scores at least
        ``threshold``.
    """
    norm_quote = normalize(quote)
    q_len = len(norm_quote)
    if q_len == 0:
        # A zero-length window carries no information; previously this could
        # raise IndexError or return a span reaching the end of the document.
        return None

    norm_doc, norm_to_orig = _collapse_whitespace_with_map(document)

    # Fast path: an identical window is the only way to score 100, and the
    # scan below keeps the first best score, so the first exact occurrence
    # is the same answer without scoring every window.
    best_start = norm_doc.find(norm_quote)
    if best_start == -1:
        best_score = 0
        for i in range(len(norm_doc) - q_len + 1):
            score = fuzz.ratio(norm_quote, norm_doc[i:i + q_len])
            if score > best_score:
                best_score = score
                best_start = i
        if best_start == -1 or best_score < threshold:
            return None

    # Map the window's first and last characters back to the original text.
    last_norm_index = min(best_start + q_len - 1, len(norm_to_orig) - 1)
    orig_start = norm_to_orig[best_start]
    orig_end = norm_to_orig[last_norm_index] + 1
    return (orig_start, orig_end)


def find_all_offsets(document: str, quotes: list[str]) -> list[dict]:
    """Find offsets for a list of quotes.

    Returns one dict per quote, in input order, with keys ``"quote"``,
    ``"start"``, ``"end"`` and ``"found"``. ``"start"`` and ``"end"`` are
    ``None`` when ``find_quote_offset`` found no acceptable match.
    """
    results = []
    for quote in quotes:
        offset = find_quote_offset(document, quote)
        found = offset is not None
        start, end = offset if found else (None, None)
        results.append({
            "quote": quote,
            "start": start,
            "end": end,
            "found": found,
        })
    return results