ClinDoc_CDSS / backend /utils.py
iyadh-bencheikh's picture
Update
764531e
import json
import re
from datetime import datetime, timezone
from pathlib import Path

from rapidfuzz import fuzz
def log_event(LOG_FILE, event_type: str, data: dict):
    """Append one event to the log file as a single JSON line.

    Each entry is an object with the keys ``"timestamp"``, ``"event_type"``
    and ``"data"``.

    Args:
        LOG_FILE: Destination file, as a ``str`` or ``pathlib.Path``. Missing
            parent directories are created.
        event_type: Label stored under ``"event_type"``.
        data: JSON-serializable payload stored under ``"data"``.

    Raises:
        TypeError: If ``data`` holds a value ``json.dumps`` cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    log_path = Path(LOG_FILE)
    # parents=True so a nested, not-yet-existing log directory does not fail.
    log_path.parent.mkdir(parents=True, exist_ok=True)
    entry = {
        # Naive UTC ISO-8601 string: same format datetime.utcnow() produced,
        # without relying on that deprecated call.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "event_type": event_type,
        "data": data,
    }
    # Serialize first so a serialization error never touches the file.
    line = json.dumps(entry, ensure_ascii=False) + "\n"
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned instead of depending on the platform default.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(line)
def display_to_llm_format(physician_points: list[dict]) -> list[dict]:
    """Reduce each point to its quote texts plus category, label and reasoning.

    Args:
        physician_points: Dicts that each provide ``"offsets"`` (an iterable
            of dicts with a ``"quote"`` key), ``"category"``, ``"label"`` and
            ``"reasoning"``.

    Returns:
        One dict per input point, in the same order, with the keys
        ``"quotes"``, ``"category"``, ``"label"`` and ``"reasoning"``. Any
        other keys on the input points are dropped.

    Raises:
        KeyError: If a point or an offset entry lacks one of the keys above.
    """
    converted = []
    for entry in physician_points:
        # Keep only the quoted text of each offset record.
        quote_texts = []
        for span in entry["offsets"]:
            quote_texts.append(span["quote"])
        converted.append({
            "quotes": quote_texts,
            "category": entry["category"],
            "label": entry["label"],
            "reasoning": entry["reasoning"],
        })
    return converted
def normalize(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()
def find_quote_offset(document: str, quote: str, threshold: int = 80) -> tuple[int, int] | None:
norm_doc = re.sub(r'\s+', ' ', document)
norm_quote = re.sub(r'\s+', ' ', quote).strip()
norm_to_orig = []
orig_i = 0
in_ws = False
for ch in document:
if re.match(r'\s', ch):
if not in_ws:
norm_to_orig.append(orig_i)
in_ws = True
else:
norm_to_orig.append(orig_i)
in_ws = False
orig_i += 1
q_len = len(norm_quote)
best_score = 0
best_start = -1
for i in range(len(norm_doc) - q_len + 1):
window = norm_doc[i:i + q_len]
score = fuzz.ratio(norm_quote, window)
if score > best_score:
best_score = score
best_start = i
if best_score < threshold or best_start == -1:
return None
orig_start = norm_to_orig[best_start]
orig_end = norm_to_orig[min(best_start + q_len - 1, len(norm_to_orig) - 1)] + 1
return (orig_start, orig_end)
def find_all_offsets(document: str, quotes: list[str], threshold: int = 80) -> list[dict]:
    """
    Find offsets for a list of quotes.

    Each quote is looked up independently with ``find_quote_offset``.

    Args:
        document: Text to search in.
        quotes: Quotes to locate.
        threshold: Minimum match score forwarded to ``find_quote_offset``.
            The default matches that function's own default, so existing
            two-argument calls behave as before.

    Returns:
        One dict per quote, in input order, with the keys ``"quote"``,
        ``"start"``, ``"end"`` and ``"found"``. ``"start"`` and ``"end"`` are
        ``None`` and ``"found"`` is ``False`` when the quote was not located.
    """
    results = []
    for quote in quotes:
        offset = find_quote_offset(document, quote, threshold=threshold)
        found = offset is not None
        start, end = offset if found else (None, None)
        results.append({
            "quote": quote,
            "start": start,
            "end": end,
            "found": found,
        })
    return results