Spaces:

iyadh01
/

ClinDoc_CDSS

Sleeping

App Files Files Community

ClinDoc_CDSS / backend /utils.py

iyadh-bencheikh

Update

764531e about 1 month ago

raw

history blame contribute delete

2.3 kB


	import re
	import json
	from datetime import datetime
	from rapidfuzz import fuzz


	def log_event(LOG_FILE, event_type: str, data: dict):
	    """
	    Append one event to ``LOG_FILE`` as a single JSON line.

	    Each line is a JSON object with the keys ``timestamp`` (current UTC time
	    in ISO 8601 format, without a UTC offset suffix), ``event_type`` and
	    ``data``.

	    Args:
	        LOG_FILE: Location of the log file, as a ``pathlib.Path`` or a string
	            path. Missing parent directories are created.
	        event_type: Label stored under the ``event_type`` key.
	        data: Payload stored under the ``data`` key; must be serializable by
	            ``json.dumps``.

	    Raises:
	        TypeError: If ``data`` contains values ``json.dumps`` cannot serialize.
	        OSError: If the directory or file cannot be created or written.
	    """
	    from datetime import timezone
	    from pathlib import Path

	    log_path = Path(LOG_FILE)
	    # parents=True so a log file nested several missing levels deep still works.
	    log_path.parent.mkdir(parents=True, exist_ok=True)

	    # datetime.utcnow() is deprecated; dropping tzinfo keeps the exact same
	    # naive-UTC string format that existing log lines already use.
	    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
	    entry = {
	        "timestamp": timestamp,
	        "event_type": event_type,
	        "data": data,
	    }
	    # Serialize before opening so a failing payload never touches the file.
	    line = json.dumps(entry, ensure_ascii=False) + "\n"

	    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
	    # be pinned instead of relying on the platform default.
	    with open(log_path, "a", encoding="utf-8") as f:
	        f.write(line)


	def display_to_llm_format(physician_points: list[dict]) -> list[dict]:
	    """
	    Convert physician points into a reduced list of dicts.

	    For every input point, the ``quote`` value of each entry in its
	    ``offsets`` list is collected under ``quotes``; ``category``, ``label``
	    and ``reasoning`` are copied unchanged. Any other keys are dropped.

	    Args:
	        physician_points: Dicts providing the keys ``offsets`` (an iterable
	            of dicts with a ``quote`` key), ``category``, ``label`` and
	            ``reasoning``.

	    Returns:
	        One dict per input point, in the same order, with the keys
	        ``quotes``, ``category``, ``label`` and ``reasoning``.

	    Raises:
	        KeyError: If a point or one of its offsets lacks a required key.
	    """
	    converted = []
	    for point in physician_points:
	        quotes = []
	        for offset in point["offsets"]:
	            quotes.append(offset["quote"])
	        item = {}
	        item["quotes"] = quotes
	        item["category"] = point["category"]
	        item["label"] = point["label"]
	        item["reasoning"] = point["reasoning"]
	        converted.append(item)
	    return converted



	def normalize(text: str) -> str:
	    """
	    Collapse every run of whitespace in ``text`` into a single space and
	    remove leading and trailing whitespace from the result.
	    """
	    collapsed = re.sub(r'\s+', ' ', text)
	    trimmed = collapsed.strip()
	    return trimmed

	def find_quote_offset(document: str, quote: str, threshold: int = 80) -> tuple[int, int] \| None:
	norm_doc = re.sub(r'\s+', ' ', document)
	norm_quote = re.sub(r'\s+', ' ', quote).strip()

	norm_to_orig = []
	orig_i = 0
	in_ws = False
	for ch in document:
	if re.match(r'\s', ch):
	if not in_ws:
	norm_to_orig.append(orig_i)
	in_ws = True
	else:
	norm_to_orig.append(orig_i)
	in_ws = False
	orig_i += 1

	q_len = len(norm_quote)
	best_score = 0
	best_start = -1

	for i in range(len(norm_doc) - q_len + 1):
	window = norm_doc[i:i + q_len]
	score = fuzz.ratio(norm_quote, window)
	if score > best_score:
	best_score = score
	best_start = i

	if best_score < threshold or best_start == -1:
	return None

	orig_start = norm_to_orig[best_start]
	orig_end = norm_to_orig[min(best_start + q_len - 1, len(norm_to_orig) - 1)] + 1
	return (orig_start, orig_end)



	def find_all_offsets(document: str, quotes: list[str], threshold: int = 80) -> list[dict]:
	    """
	    Find offsets for a list of quotes.

	    Args:
	        document: Text to search in.
	        quotes: Quotes to locate, each looked up independently with
	            ``find_quote_offset``.
	        threshold: Minimum match score forwarded to ``find_quote_offset``;
	            the default matches that function's own default.

	    Returns:
	        One dict per quote, in input order, with the keys ``quote`` (the
	        quote as given), ``start`` and ``end`` (the offsets returned by
	        ``find_quote_offset``, or ``None`` when it returned ``None``) and
	        ``found`` (whether an offset was returned).
	    """
	    results = []
	    for quote in quotes:
	        offset = find_quote_offset(document, quote, threshold=threshold)
	        start, end = offset if offset is not None else (None, None)
	        results.append({
	            "quote": quote,
	            "start": start,
	            "end": end,
	            "found": offset is not None,
	        })
	    return results