Spaces:

amougou-mbida
/

regulens

Running

App Files Files Community

regulens / scripts /utility_functions.py

amougou-mbida

Update scripts/utility_functions.py

37df49d verified 6 days ago

raw

history blame contribute delete

17.5 kB

	import difflib
	import html
	import os
	import json
	import re
	import time
	import random
	import asyncio
	import httpx
	from dotenv import load_dotenv
	import pymupdf
	import pymupdf4llm
	from rapidfuzz import fuzz
	from agentic_doc.parse import parse
	import requests
	from scripts.models import RegulatoryChange
	from scripts.regulatory_change_foundation import (
	CLASSIFICATION_INFO,
	FEW_SHOT_EXAMPLES,
	BASE_PROMPT_TEMPLATE,
	)

	# Load variables from a .env file (if present) into the process environment.
	load_dotenv()
	# RGB colors stored as float tuples with every channel in the 0–1 range.
	# Earlier palette; it is not referenced anywhere in the visible part of this file.
	color_mapping_old = {
	"addition": (0, 0.4, 0), # green
	"deletion": (1, 0, 0), # red
	"modification": (0, 0.6, 1), # blue
	}

	# Current palette, keyed by change type; feeds the tooltip CSS classes in css_styles.
	color_mapping = {
	"addition": (0.0, 0.45, 0.7), # blue
	"deletion": (0.9, 0.6, 0.0), # orange
	"modification": (0.5, 0.5, 0.5), # gray
	}


	def to_rgb(color_tuple):
	    """Convert an RGB tuple of 0–1 floats into a CSS ``rgb(r, g, b)`` string.

	    The first three channels are each scaled by 255 and truncated to an int.
	    """
	    red, green, blue = (int(color_tuple[idx] * 255) for idx in range(3))
	    return f"rgb({red}, {green}, {blue})"


	# Inline <style> block for the generated HTML: link styling plus one tooltip
	# class per change type, colored from color_mapping via to_rgb. Literal CSS
	# braces are doubled because this is an f-string.
	css_styles = f"""
	<style>
	.custom-link {{
	display: inline-block;
	padding: 8px 16px;
	color: white !important;
	text-decoration: none;
	border-radius: 8px;
	transition: background-color 0.3s ease;
	}}
	.custom-link:hover {{
	background-color: #45a049;
	}}
	.tooltip {{
	font-weight: bold;
	cursor: help;
	background-color: white;
	}}
	.addition-tooltip {{
	color: {to_rgb(color_mapping["addition"])};
	}}
	.modification-tooltip {{
	color: {to_rgb(color_mapping["modification"])};
	}}
	.deletion-tooltip {{
	color: {to_rgb(color_mapping["deletion"])};
	}}
	.default-tooltip {{
	color: yellow;
	}}
	</style>
	"""


	def get_color_mapping_hex():
	    """Return ``color_mapping`` with every channel scaled to a 0–255 integer.

	    Despite the name, the values are ``(r, g, b)`` integer tuples, not hex strings.
	    """
	    scaled = {}
	    for change_type, rgb in color_mapping.items():
	        scaled[change_type] = tuple(int(channel * 255) for channel in rgb)
	    return scaled


	def get_tooltip_text(change):
	    """Build the tooltip text ``"<type> - <category>\\n<context>"`` for a change.

	    Missing or empty attributes fall back to "Type unspecified",
	    "Category unspecified" and an empty context respectively.

	    The previous single conditional expression bound the whole
	    ``"Type unspecified" + " - " + ...`` chain to the ``else`` branch, so any
	    change that had a ``type`` returned only the type and silently dropped
	    its category and context.
	    """
	    change_type = getattr(change, "type", None) or "Type unspecified"
	    category = getattr(change, "category", None) or "Category unspecified"
	    context = getattr(change, "context", None) or ""
	    return change_type + " - " + category + "\n" + context


	def highlight_nth(text, change, skip_failed=False):
	    """Wrap the n-th occurrence of ``change.text`` inside ``text`` in a tooltip span.

	    Parameters:
	    - text: the document text to search and annotate
	    - change: object with a ``text`` attribute; ``occurrence_index``, ``type``,
	      ``category`` and ``context`` are used when present
	    - skip_failed: forwarded to ``highlight_fuzzy_match`` when no exact match exists

	    Returns:
	    - ``text`` with the matched section wrapped in a ``<span id='marked_section'>``,
	      or the result of ``highlight_fuzzy_match`` when fewer than n+1 matches exist.

	    Matching is case-insensitive; every whitespace run in the target may match
	    any (shortest possible) stretch of text, including newlines.
	    """
	    n = getattr(change, "occurrence_index", 0) or 0
	    n = max(n, 0)

	    # Escape the target first so that regex metacharacters in the extracted text
	    # (parentheses, brackets, dots, ...) are matched literally instead of being
	    # interpreted -- or raising re.error. re.escape writes whitespace as
	    # backslash + whitespace, which is what the substitution below relaxes.
	    target = re.sub(r"\\\s+", r".*?", re.escape(change.text))
	    pattern = re.compile(target, flags=re.IGNORECASE | re.DOTALL)

	    # Walk matches lazily and stop as soon as the n-th one is reached.
	    match = None
	    for index, candidate in enumerate(pattern.finditer(text)):
	        if index == n:
	            match = candidate
	            break

	    if match is None:
	        return highlight_fuzzy_match(text, change, n, skip_failed=skip_failed)

	    start, end = match.start(), match.end()
	    tooltip_raw = get_tooltip_text(change)
	    tooltip_escaped = html.escape(tooltip_raw, quote=True)
	    highlighted_span = f"""<span id='marked_section' class='tooltip {change.type if hasattr(change, "type") else "default"}-tooltip' title='{tooltip_escaped}'>
	{text[start:end]}
	</span>"""
	    return text[:start] + highlighted_span + text[end:]


	# TODO: check threshold -> 51 would always get a result.
	# A lower threshold guarantees matches, but they may differ from the original
	# target; a threshold that is too high may find no match at all, e.g. when a
	# word is missing.
	def highlight_fuzzy_match(text, change, n=0, threshold=80, skip_failed=False):
	    """Highlight the best fuzzy match of ``change.text`` inside ``text``.

	    A window of ``len(change.text)`` characters slides over ``text`` one character
	    at a time; each window is scored with ``fuzz.partial_ratio`` (case-insensitive)
	    and windows scoring at least ``threshold`` become candidates.

	    Parameters:
	    - text: the document text to search and annotate
	    - change: object with a ``text`` attribute (``type`` etc. are used when present)
	    - n: rank of the candidate to highlight after sorting best-first
	      (clamped to the last candidate)
	    - threshold: minimum score for a window to count as a match
	    - skip_failed: when no candidate exists, return ``text`` unchanged instead of
	      prepending a red "No match found" notice

	    Returns:
	    - ``text`` with the chosen window wrapped in a tooltip span, or the fallback
	      described under ``skip_failed``.
	    """
	    target = change.text
	    window_size = len(target)
	    target_lower = target.lower()  # loop-invariant

	    candidates = []
	    # An empty target would "match" every position, so treat it as no match.
	    if window_size:
	        # "+ 1" so the final window (ending exactly at len(text)) is scored too;
	        # without it a text as long as the target produced no window at all.
	        for start in range(len(text) - window_size + 1):
	            window = text[start : start + window_size]
	            score = fuzz.partial_ratio(window.lower(), target_lower)
	            if score >= threshold:
	                candidates.append((score, start, start + window_size))

	    if not candidates:
	        if skip_failed:
	            return text
	        # The target is extracted text of unknown origin: escape it before
	        # embedding it into the HTML notice.
	        return (
	            f"""
	<span class='hover-tooltip' title='No match found'>
	<strong style='color: red;'>No match found for: "{html.escape(target)}"</strong>
	<br>
	</span>
	<span style="color: red;">Please verify if it is part of the original text or if it was extracted incorrectly.</span><br>
	"""
	            + text
	        )

	    # Pick top-N match
	    candidates.sort(reverse=True)
	    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]

	    tooltip_raw = get_tooltip_text(change)
	    tooltip_escaped = html.escape(tooltip_raw, quote=True)
	    highlighted_span = f"""<span id='marked_section' class='tooltip {change.type if hasattr(change, "type") else "default"}-tooltip' title='{tooltip_escaped}'>{text[start_norm:end_norm]}</span>"""
	    return text[:start_norm] + highlighted_span + text[end_norm:]


	# TODO: check threshold -> 51 would always get a result.
	# A lower threshold guarantees matches, but they may differ from the original
	# target; a threshold that is too high may find no match at all, e.g. when a
	# word is missing.
	def get_best_fuzzy_match(text, change: RegulatoryChange, threshold=65):
	    """Return the section of ``text`` that best fuzzy-matches ``change.text``.

	    A window of ``len(change.text)`` characters slides over ``text`` one character
	    at a time and is scored with ``fuzz.partial_ratio`` (case-insensitive).
	    Windows scoring at least ``threshold`` are sorted best-first and the one at
	    rank ``change.occurrence_index`` (0 when absent, clamped to the last
	    candidate) is returned.

	    Returns:
	    - the matched substring of ``text``, or ``None`` when no window reaches the
	      threshold -- callers must handle ``None``.
	    """
	    n = getattr(change, "occurrence_index", 0) or 0
	    target = change.text
	    window_size = len(target)
	    # An empty target would "match" every position; report it as not found.
	    if not window_size:
	        return None
	    target_lower = target.lower()  # loop-invariant

	    candidates = []
	    # "+ 1" so the final window (ending exactly at len(text)) is scored too;
	    # without it a text as long as the target produced no window at all.
	    for start in range(len(text) - window_size + 1):
	        window = text[start : start + window_size]
	        score = fuzz.partial_ratio(window.lower(), target_lower)
	        if score >= threshold:
	            candidates.append((score, start, start + window_size))

	    if not candidates:
	        return None

	    # Pick top-N match
	    candidates.sort(reverse=True)
	    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]
	    return text[start_norm:end_norm]


	def render_prompt(text, include_nlp=False, preprocessed_data=None):
	    """Fill ``BASE_PROMPT_TEMPLATE`` for one chunk of text.

	    Parameters:
	    - text: the chunk to analyse
	    - include_nlp: whether NLP insights should be embedded in the prompt
	    - preprocessed_data: mapping with ``"entities"`` and ``"noun_chunks"`` lists
	      whose items carry a ``"text"`` key; only items whose text occurs in
	      ``text`` are included

	    Returns:
	    - the formatted prompt string. The NLP-related placeholders are empty unless
	      ``include_nlp`` is set and ``preprocessed_data`` is truthy.
	    """
	    classification_json = json.dumps(CLASSIFICATION_INFO, indent=2)
	    few_shot_json = json.dumps(FEW_SHOT_EXAMPLES, indent=2)

	    # Defaults for the "no NLP" case; overridden below when insights are requested.
	    nlp_section = ""
	    nlp_insights = ""
	    evidence_block = ""

	    if include_nlp and preprocessed_data:
	        insights = {
	            "entities": [
	                entity
	                for entity in preprocessed_data["entities"]
	                if entity["text"] in text
	            ],
	            "key_noun_phrases": [
	                chunk
	                for chunk in preprocessed_data["noun_chunks"]
	                if chunk["text"] in text
	            ],
	        }
	        nlp_insights_json = json.dumps(insights, indent=2)

	        nlp_section = ", and NLP insights"
	        nlp_insights = f"\n\nNLP Insights:\n{nlp_insights_json}"
	        evidence_block = ',\n "evidence": {\n "entities_involved": ["relevant named entities"],\n "key_phrases": ["relevant noun phrases or key terms"]\n }'

	    return BASE_PROMPT_TEMPLATE.format(
	        classification_info=classification_json,
	        few_shot_examples=few_shot_json,
	        nlp_section=nlp_section,
	        nlp_insights=nlp_insights,
	        text=text,
	        evidence_block=evidence_block,
	    )


	def save_json_to_file(data, output_dir, output_file):
	    """Write ``data`` as pretty-printed UTF-8 JSON to ``output_dir/output_file``.

	    Parameters:
	    - data: any JSON-serialisable object
	    - output_dir: target directory, created (with parents) when missing; an empty
	      string means the current working directory
	    - output_file: file name inside ``output_dir``

	    Prints the path of the written file.
	    """
	    # exist_ok avoids the check-then-create race of a separate os.path.exists
	    # test; the guard keeps os.makedirs("") from raising for an empty directory.
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    file_path = os.path.join(output_dir, output_file)
	    with open(file_path, "w", encoding="utf-8") as f:
	        json.dump(data, f, indent=4, ensure_ascii=False)

	    print(f"JSON data saved successfully at: {file_path}")


	# Bearer token sent to the NLP microservice; None when the variable is unset.
	MICROSERVICE_KEY = os.getenv("MICROSERVICE_KEY")
	nlp_semaphore = asyncio.Semaphore(100) # Limit to 100 concurrent requests
	# Per-phase httpx timeouts (in seconds) used for every NLP service request.
	timeout = httpx.Timeout(
	connect=20.0, # time to establish connection
	read=60.0, # time to read the response
	write=30.0, # time to send the request
	pool=80.0, # time to acquire a connection from the pool
	)


	async def call_nlp_service(payload, method, max_retries=5, base_delay=1.0):
	    """POST ``payload`` to the NLP preprocessing service and return its JSON reply.

	    Parameters:
	    - payload: request body, sent as form data (``data=``)
	    - method: endpoint name appended to the service base URL
	    - max_retries: maximum number of attempts
	    - base_delay: base (seconds) of the exponential backoff between attempts

	    Returns:
	    - the decoded JSON body of a 200 response

	    Raises:
	    - Exception: on any non-200/non-429 status, when the service is still rate
	      limiting after the last attempt, or when connect/read timeouts or network
	      errors persist through the last attempt (the httpx error is chained).

	    Concurrency is bounded by ``nlp_semaphore``; the slot is held across retries.
	    """
	    url = f"https://amougou-mbida-nlp-preprocessor.hf.space/{method}"
	    headers = {"Authorization": f"Bearer {MICROSERVICE_KEY}"}

	    def backoff(attempt):
	        # Exponential backoff with a little jitter to de-synchronise retries.
	        return base_delay * (2**attempt) + random.uniform(0, 0.5)

	    async with nlp_semaphore:
	        for attempt in range(max_retries):
	            is_last_attempt = attempt == max_retries - 1
	            try:
	                async with httpx.AsyncClient(timeout=timeout) as client:
	                    response = await client.post(url, data=payload, headers=headers)
	            except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.NetworkError) as e:
	                # Retry on network issues
	                if is_last_attempt:
	                    raise Exception(
	                        f"NLP service network error after {max_retries} attempts: {e}"
	                    ) from e
	                await asyncio.sleep(backoff(attempt))
	                continue

	            # Success
	            if response.status_code == 200:
	                return response.json()

	            # Rate limited
	            if response.status_code == 429:
	                if is_last_attempt:
	                    break
	                retry_after = response.headers.get("Retry-After")
	                try:
	                    delay = float(retry_after) if retry_after else backoff(attempt)
	                except ValueError:
	                    # Retry-After may also be an HTTP-date; fall back to backoff
	                    # instead of crashing on float().
	                    delay = backoff(attempt)
	                await asyncio.sleep(delay)
	                continue

	            # Other HTTP errors
	            raise Exception(
	                f"NLP service error: {response.status_code} - {response.text}"
	            )

	    raise Exception(f"NLP service error: failed after {max_retries} retries")


	def lerp_color(value, start_color=(255, 0, 0), end_color=(0, 255, 0)):
	    """
	    Blend two RGB colors linearly and format the result for CSS.

	    Parameters:
	    - value: interpolation factor; 0 yields start_color, 1 yields end_color
	    - start_color: tuple (r, g, b), default red
	    - end_color: tuple (r, g, b), default green

	    Returns:
	    - CSS rgb color string, e.g. 'rgb(255, 0, 0)'; channels are truncated to int
	    """
	    channels = [
	        int(start_color[idx] + (end_color[idx] - start_color[idx]) * value)
	        for idx in range(3)
	    ]
	    return "rgb(" + ", ".join(str(channel) for channel in channels) + ")"


	def extract_markdown(file_bytes: bytes) -> str:
	    """Extract markdown text from PDF bytes using pymupdf4llm.

	    The PDF is opened from memory and closed again once the conversion is done,
	    so the document handle is not leaked.
	    """
	    with pymupdf.open(stream=file_bytes, filetype="pdf") as document:
	        return pymupdf4llm.to_markdown(document)


	def remove_html_comments(text: str) -> str:
	    """Strip every ``<!-- ... -->`` comment (including multi-line ones) from ``text``."""
	    comment_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
	    return comment_pattern.sub("", text)


	def normalize_markdown_indentation(content):
	    """Reduce deep indentation of list items so they are not rendered as code blocks.

	    Lines whose first non-whitespace character is ``-``, ``*`` or ``+`` and that
	    have more than 4 leading whitespace characters are re-indented with
	    ``min(leading // 6, 2)`` single spaces; every other line is kept as is.

	    NOTE(review): the original comment mentioned "2 spaces per level", but the
	    code emits one space per level -- confirm which is intended.
	    """
	    bullet_markers = ("-", "*", "+")

	    def fix_line(raw_line):
	        body = raw_line.lstrip()
	        indent_width = len(raw_line) - len(body)
	        # Only over-indented list items are rewritten.
	        if not body.startswith(bullet_markers) or indent_width <= 4:
	            return raw_line
	        depth = min(indent_width // 6, 2)  # at most 2 levels deep
	        return " " * depth + body

	    return "\n".join(fix_line(raw_line) for raw_line in content.split("\n"))


	def highlight_differences_words(text1: str, text2: str):
	    """
	    Diff two texts token by token and return ``(html_for_text1, html_for_text2)``.

	    Both texts are split on whitespace runs (the runs are kept as tokens, so
	    newlines survive) and compared with ``difflib.SequenceMatcher``. All tokens
	    are HTML-escaped. Differing tokens are wrapped in tooltip spans:
	    - deletion-tooltip: tokens only present in text1 (first result only)
	    - addition-tooltip: tokens only present in text2 (second result only)
	    - modification-tooltip: replaced tokens (both results)
	    """
	    splitter = re.compile(r"(\s+)")
	    tokens_a = splitter.split(text1)
	    tokens_b = splitter.split(text2)

	    # Opening tag used for each kind of difference.
	    opening_tags = {
	        "replace": '<span class="tooltip modification-tooltip" title="Changed">',
	        "delete": '<span class="tooltip deletion-tooltip" title="Removed">',
	        "insert": '<span class="tooltip addition-tooltip" title="Added">',
	    }

	    def wrap(tokens, tag):
	        opening = opening_tags[tag]
	        return [opening + html.escape(token) + "</span>" for token in tokens]

	    matcher = difflib.SequenceMatcher(
	        a=tokens_a, b=tokens_b, isjunk=lambda x: x in " \t"
	    )
	    left_parts = []
	    right_parts = []

	    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
	        chunk_a = tokens_a[i1:i2]
	        chunk_b = tokens_b[j1:j2]
	        if tag == "equal":
	            left_parts += [html.escape(token) for token in chunk_a]
	            right_parts += [html.escape(token) for token in chunk_b]
	        elif tag == "replace":
	            left_parts += wrap(chunk_a, tag)
	            right_parts += wrap(chunk_b, tag)
	        elif tag == "delete":
	            # removed tokens only show up on the text1 side
	            left_parts += wrap(chunk_a, tag)
	        elif tag == "insert":
	            # added tokens only show up on the text2 side
	            right_parts += wrap(chunk_b, tag)

	    return "".join(left_parts), "".join(right_parts)


	def map_categorical_impact_assessment(
	    changes: list[RegulatoryChange],
	) -> list[RegulatoryChange]:
	    """Align each change's action list with the standard actions of its category.

	    For every change whose ``category`` is known, the labels of ``change.actions``
	    are compared (in order) with the category's standard labels. When they
	    differ, ``change.actions`` is replaced by a fresh list of the standard actions
	    with ``completed`` set to False. When they already match, the existing
	    actions -- including their completion status -- are left untouched.

	    The changes are modified in place and the same list is returned.
	    """
	    labels_by_category = {
	        "Textual and Editorial Changes": [
	            "Update documentation",
	            "Adjust UI wording",
	            "Inform stakeholders",
	        ],
	        "Data and Field Changes": [
	            "Add/modify fields",
	            "Create migration scripts",
	            "Update forms/APIs/test cases",
	        ],
	        "Procedural Changes": [
	            "Update process automation",
	            "Adjust workflow steps",
	            "Reassign roles or access",
	        ],
	        "Compliance and Enforcement Changes": [
	            "Implement logging or alerts",
	            "Update compliance documentation",
	            "Conduct internal review",
	        ],
	        "Policy Changes": [
	            "Adjust rule sets",
	            "Revalidate configurations",
	            "Run simulations or validations",
	        ],
	    }

	    for change in changes:
	        if change.category not in labels_by_category:
	            continue
	        expected_labels = labels_by_category[change.category]
	        current_labels = [action.get("label") for action in change.actions]
	        if current_labels == expected_labels:
	            # Labels already match: keep the existing progress.
	            continue
	        # Fresh dicts per change so no two changes share action objects.
	        change.actions = [
	            {"label": label, "completed": False} for label in expected_labels
	        ]
	    return changes

	def landing_ai_available() -> bool:
	    """Probe Landing AI by parsing a one-byte document.

	    Returns False when any reported error mentions "402" (credits exhausted) or
	    when the probe itself raises; True otherwise.
	    """
	    try:
	        probe = parse("c".encode("utf-8"))[0]
	        result = json.loads(probe.model_dump_json())
	        reported_errors = result.get("errors", []) or []
	        out_of_credits = any(
	            "402" in error.get("error", "") for error in reported_errors
	        )
	        if out_of_credits:
	            print("Landing AI credits exhausted.")
	            return False
	        return True
	    except Exception as e:
	        print(f"Error checking Landing AI credits: {e}")
	        return False


	# Probed once at import time; the result decides which extraction methods are offered.
	landing_ai_available_flag = landing_ai_available()
	# Maps extraction-method keys to display labels. The "agentic" (Landing AI)
	# entry is only present when the availability probe above succeeded.
	extraction_methods = (
	{
	"agentic": "Agentic (Landing AI)",
	"llm": "LLM (gpt-4o-mini)",
	"pymupdf": "PyMuPDF (PDF Parsing Library)",
	}
	if landing_ai_available_flag
	else {
	"pymupdf": "PyMuPDF (PDF Parsing Library)",
	"llm": "LLM (gpt-4o-mini)",
	}
	)