import html
import os
import json
import re
from rapidfuzz import fuzz
import requests
from scripts.regulatory_change_foundation import (
CLASSIFICATION_INFO,
FEW_SHOT_EXAMPLES,
BASE_PROMPT_TEMPLATE,
)
# Highlight colors per change classification, as RGB tuples in the 0-1 range.
# get_color_mapping_hex() scales these to 0-255 integer channels; to_rgb()
# formats such a tuple as a CSS rgb() string.
color_mapping = {
    "addition": (0, 0.4, 0),  # green
    "deletion": (1, 0, 0),  # red
    "modification": (0, 0.6, 1),  # blue
}
def to_rgb(color_tuple):
    """Format a 0-1 range RGB tuple as a CSS 'rgb(r, g, b)' string."""
    r, g, b = (int(color_tuple[i] * 255) for i in range(3))
    return "rgb({}, {}, {})".format(r, g, b)
# NOTE(review): this stylesheet string is empty and the f-string has no
# placeholders — the CSS rules appear to have been stripped from this copy
# of the file; confirm against the original before relying on it.
css_styles = f"""
"""
def get_color_mapping_hex():
    """Return color_mapping with each channel scaled from 0-1 floats to 0-255 ints."""
    scaled = {}
    for label, rgb in color_mapping.items():
        scaled[label] = tuple(int(channel * 255) for channel in rgb)
    return scaled
def get_tooltip_text(change):
    """Build the tooltip line for a change: '<type> - <category>', newline, context.

    Each missing key falls back to a placeholder ('' for context).
    """
    return (
        change.get("type", "Type unspecified")
        + " - "
        + change.get("category", "Category unspecified")
        + "\n"
        + change.get("context", "")
    )


def highlight_nth(text, change, skip_failed=False):
    """Highlight the n-th regex occurrence of change["text"] inside text.

    Parameters:
    - text: the full document text to annotate
    - change: dict with key "text" (snippet to find), optional
      "occurrence_index" (which occurrence to highlight, default 0), and
      the tooltip fields read by get_tooltip_text
    - skip_failed: forwarded to the fuzzy fallback; when True, failures
      return text unchanged instead of prepending a notice

    Returns text with the matched span wrapped in the highlight string, or
    the result of highlight_fuzzy_match when no exact match exists.
    """
    n = change.get("occurrence_index", 0)
    # Escape the snippet so regex metacharacters ((), [], ., ?, ...) match
    # literally, and join its words with ".*?" so the snippet still matches
    # when whitespace was reflowed. The previous code ran
    # re.sub(r"\\\s+", ...) on the RAW snippet, which only rewrote
    # backslash+whitespace and left metacharacters live — special characters
    # could raise re.error or silently change the match.
    target = ".*?".join(re.escape(part) for part in change["text"].split())
    matches = list(re.finditer(target, text, flags=re.IGNORECASE | re.DOTALL))
    if len(matches) > n:
        match = matches[n]
        start, end = match.start(), match.end()
        tooltip_raw = get_tooltip_text(change)
        # NOTE(review): tooltip_escaped is computed but unused below — the
        # span markup looks like it was stripped of its HTML; confirm upstream.
        tooltip_escaped = html.escape(tooltip_raw, quote=True)
        highlighted_span = f"""
{text[start:end]}
"""
        return text[:start] + highlighted_span + text[end:]
    else:
        # No exact occurrence: fall back to the fuzzy sliding-window search.
        return highlight_fuzzy_match(text, change, n, skip_failed=skip_failed)
# TODO: tune the threshold — at 51 a result is always returned.
# Lowering it guarantees matches, but they may differ from the original
# target; raising it too high can miss real matches (e.g. when a word is
# missing).
def highlight_fuzzy_match(text, change, n=0, threshold=80, skip_failed=False):
    """Fuzzy-highlight change["text"] in text using a sliding-window scan.

    Every window of len(target) characters is scored with
    rapidfuzz.fuzz.partial_ratio (case-insensitive); windows scoring at
    least `threshold` become candidates, and the n-th best-scoring one
    (clamped to the last candidate) is highlighted.

    Parameters:
    - text: full document text
    - change: dict with at least "text", plus the tooltip fields
    - n: rank of the candidate to highlight (0 = best score)
    - threshold: minimum partial_ratio score (0-100) to accept a window
    - skip_failed: when True and nothing qualifies, return text unchanged
      instead of prepending a "No match found" notice

    Returns the annotated text, or text with/without a failure notice when
    no window reaches the threshold.
    """
    target = change["text"]
    window_size = len(target)
    step = 1
    candidates = []
    # "+ 1" so the final window — target flush against the end of text —
    # is scanned too; the original upper bound len(text) - window_size
    # skipped it (off-by-one).
    for i in range(0, len(text) - window_size + 1, step):
        window = text[i : i + window_size]
        score = fuzz.partial_ratio(window.lower(), target.lower())
        if score >= threshold:
            candidates.append((score, i, i + window_size))
    if not candidates and not skip_failed:
        return (
            f"""
No match found for: "{target}"
Please verify if it is part of the original text or if it was extracted incorrectly.
"""
            + text
        )
    if not candidates and skip_failed:
        return text
    # Rank candidates by score descending (ties break on later offsets),
    # then pick the n-th one, clamped to the last available candidate.
    candidates.sort(reverse=True)
    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]
    tooltip_raw = get_tooltip_text(change)
    # NOTE(review): tooltip_escaped is unused and the "span" below adds no
    # markup — the HTML wrapper appears stripped from this copy; confirm.
    tooltip_escaped = html.escape(tooltip_raw, quote=True)
    highlighted_span = f"""{text[start_norm:end_norm]}"""
    return text[:start_norm] + highlighted_span + text[end_norm:]
# TODO: tune the threshold — at 51 a result is always returned.
# Lowering it guarantees matches, but they may differ from the original
# target; raising it too high can miss real matches (e.g. when a word is
# missing).
def get_best_fuzzy_match(text, change, threshold=65):
    """Find the best fuzzy match for a change in the text and return the matched section.

    Slides a window of len(change["text"]) characters across text, scores
    each with rapidfuzz.fuzz.partial_ratio (case-insensitive), keeps
    windows scoring at least `threshold`, and returns the text of the n-th
    best-scoring window (n = change.get("occurrence_index", 0), clamped to
    the last candidate).

    Returns None when no window reaches the threshold — caller needs to
    account for a potentially None return value.
    """
    n = change.get("occurrence_index", 0)
    target = change["text"]
    window_size = len(target)
    step = 1
    candidates = []
    # "+ 1" so a match flush against the end of text is scanned too; the
    # original upper bound len(text) - window_size skipped it (off-by-one).
    for i in range(0, len(text) - window_size + 1, step):
        window = text[i : i + window_size]
        score = fuzz.partial_ratio(window.lower(), target.lower())
        if score >= threshold:
            candidates.append((score, i, i + window_size))
    if not candidates:
        return None
    # Rank by score descending, then pick the n-th candidate (clamped).
    candidates.sort(reverse=True)
    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]
    return text[start_norm:end_norm]
def render_prompt(text, include_nlp=False, preprocessed_data=None):
    """Fill BASE_PROMPT_TEMPLATE for one chunk of text.

    When include_nlp is true and preprocessed_data is provided, the
    entities and noun chunks whose surface text occurs in this chunk are
    embedded as an "NLP Insights" section and an evidence block is added to
    the expected output schema; otherwise those template slots stay empty.
    """
    nlp_section = ""
    nlp_insights = ""
    evidence_block = ""
    if include_nlp and preprocessed_data:
        relevant_entities = [
            ent for ent in preprocessed_data["entities"] if ent["text"] in text
        ]
        relevant_nouns = [
            nc for nc in preprocessed_data["noun_chunks"] if nc["text"] in text
        ]
        insights_payload = {
            "entities": relevant_entities,
            "key_noun_phrases": relevant_nouns,
        }
        nlp_section = ", and NLP insights"
        nlp_insights = "\n\nNLP Insights:\n" + json.dumps(insights_payload, indent=2)
        evidence_block = ',\n "evidence": {\n "entities_involved": ["relevant named entities"],\n "key_phrases": ["relevant noun phrases or key terms"]\n }'
    return BASE_PROMPT_TEMPLATE.format(
        classification_info=json.dumps(CLASSIFICATION_INFO, indent=2),
        few_shot_examples=json.dumps(FEW_SHOT_EXAMPLES, indent=2),
        nlp_section=nlp_section,
        nlp_insights=nlp_insights,
        text=text,
        evidence_block=evidence_block,
    )
def save_json_to_file(data, output_dir, output_file):
    """Serialize data as pretty-printed UTF-8 JSON under output_dir.

    Parameters:
    - data: any json.dump-serializable object
    - output_dir: directory to write into (created if missing, parents too)
    - output_file: file name within output_dir

    Side effects: creates the directory, writes the file, and prints the
    resulting path.
    """
    # exist_ok=True avoids the check-then-create race of the original
    # os.path.exists()/os.makedirs() pair (another process could create the
    # directory between the two calls and crash makedirs).
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, output_file)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"JSON data saved successfully at: {file_path}")
def call_nlp_service(payload, method, timeout=30):
    """POST payload to the hosted NLP preprocessor service and return its JSON.

    Parameters:
    - payload: form data passed to requests.post(data=...)
    - method: endpoint name appended to the service base URL
    - timeout: seconds before the request aborts (new, backward-compatible
      default 30) — without a timeout, requests.post can block forever on a
      stalled service

    Returns the decoded JSON response body.

    Raises Exception with the status code and body on any non-200 response;
    requests' own errors (timeout, connection failure) propagate unchanged.
    """
    url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"
    response = requests.post(url, data=payload, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    raise Exception(f"NLP service error: {response.status_code} - {response.text}")
def lerp_color(value, start_color=(255, 0, 0), end_color=(0, 255, 0)):
"""
Linearly interpolate between start_color and end_color by value.
Parameters:
- value: float between 0 and 1
- start_color: tuple (r, g, b), default red
- end_color: tuple (r, g, b), default green
Returns:
- CSS rgb color string, e.g. 'rgb(255, 0, 0)'
"""
r = int(start_color[0] + (end_color[0] - start_color[0]) * value)
g = int(start_color[1] + (end_color[1] - start_color[1]) * value)
b = int(start_color[2] + (end_color[2] - start_color[2]) * value)
return f"rgb({r}, {g}, {b})"