Spaces:
Sleeping
Sleeping
| import re | |
| import json | |
| from datetime import datetime | |
| from rapidfuzz import fuzz | |
def log_event(LOG_FILE, event_type: str, data: dict):
    """Append one JSON-encoded event record as a line to ``LOG_FILE``.

    Args:
        LOG_FILE: Destination of the log. It must expose ``.parent.mkdir`` and
            be accepted by ``open`` (presumably a ``pathlib.Path`` -- confirm
            against callers).
        event_type: Short label stored under the ``"event_type"`` key.
        data: JSON-serialisable payload stored under the ``"data"`` key.

    Raises:
        TypeError: If ``data`` contains values ``json.dumps`` cannot encode.
        OSError: If the directory or file cannot be created or written.
    """
    # parents=True so a log path nested several missing directories deep works.
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    entry = {
        # Naive UTC timestamp; the format is kept as-is for existing log readers.
        "timestamp": datetime.utcnow().isoformat(),
        "event_type": event_type,
        "data": data,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned instead of depending on the platform default.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
def display_to_llm_format(physician_points: list[dict]) -> list[dict]:
    """Convert display-style points into the quote-list form.

    Each input point must provide ``"offsets"`` (an iterable of dicts that
    each carry a ``"quote"``), ``"category"``, ``"label"`` and ``"reasoning"``.
    The output keeps the three scalar fields and replaces ``"offsets"`` with a
    flat ``"quotes"`` list; any other keys on the input are dropped.
    """
    converted = []
    for point in physician_points:
        quote_texts = []
        for offset in point["offsets"]:
            quote_texts.append(offset["quote"])
        converted.append({
            "quotes": quote_texts,
            "category": point["category"],
            "label": point["label"],
            "reasoning": point["reasoning"],
        })
    return converted
def normalize(text: str) -> str:
    """Collapse each whitespace run in ``text`` to one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed
| def find_quote_offset(document: str, quote: str, threshold: int = 80) -> tuple[int, int] | None: | |
| norm_doc = re.sub(r'\s+', ' ', document) | |
| norm_quote = re.sub(r'\s+', ' ', quote).strip() | |
| norm_to_orig = [] | |
| orig_i = 0 | |
| in_ws = False | |
| for ch in document: | |
| if re.match(r'\s', ch): | |
| if not in_ws: | |
| norm_to_orig.append(orig_i) | |
| in_ws = True | |
| else: | |
| norm_to_orig.append(orig_i) | |
| in_ws = False | |
| orig_i += 1 | |
| q_len = len(norm_quote) | |
| best_score = 0 | |
| best_start = -1 | |
| for i in range(len(norm_doc) - q_len + 1): | |
| window = norm_doc[i:i + q_len] | |
| score = fuzz.ratio(norm_quote, window) | |
| if score > best_score: | |
| best_score = score | |
| best_start = i | |
| if best_score < threshold or best_start == -1: | |
| return None | |
| orig_start = norm_to_orig[best_start] | |
| orig_end = norm_to_orig[min(best_start + q_len - 1, len(norm_to_orig) - 1)] + 1 | |
| return (orig_start, orig_end) | |
def find_all_offsets(document: str, quotes: list[str]) -> list[dict]:
    """Look up each quote in ``document`` and report where it was found.

    Returns one dict per quote, in input order, with the keys ``"quote"``,
    ``"start"``, ``"end"`` and ``"found"``. ``"start"`` and ``"end"`` are
    ``None`` when ``find_quote_offset`` reports no match for that quote.
    """
    located = []
    for quote in quotes:
        span = find_quote_offset(document, quote)
        if span:
            start, end = span[0], span[1]
        else:
            start, end = None, None
        record = {
            "quote": quote,
            "start": start,
            "end": end,
            "found": span is not None,
        }
        located.append(record)
    return located