"""Utilities for highlighting regulatory changes in text and for building
LLM prompts / calling the hosted NLP preprocessing service."""

import html
import json
import os
import re

import requests
from rapidfuzz import fuzz

from scripts.regulatory_change_foundation import (
    CLASSIFICATION_INFO,
    FEW_SHOT_EXAMPLES,
    BASE_PROMPT_TEMPLATE,
)

# Highlight colors as RGB tuples in the 0-1 range, keyed by change type.
color_mapping = {
    "addition": (0, 0.4, 0),  # green
    "deletion": (1, 0, 0),  # red
    "modification": (0, 0.6, 1),  # blue
}


def to_rgb(color_tuple):
    """Convert a 0-1 range (r, g, b) tuple to a CSS 'rgb(r, g, b)' string."""
    return f"rgb({int(color_tuple[0] * 255)}, {int(color_tuple[1] * 255)}, {int(color_tuple[2] * 255)})"


# NOTE(review): the stylesheet content appears to have been lost during
# extraction -- only whitespace remains. Restore the original CSS rules.
# (Dropped the pointless `f` prefix: the literal has no placeholders.)
css_styles = """
"""


def get_color_mapping_hex():
    """Return color_mapping with each channel scaled to 0-255 integers."""
    return {key: tuple(int(c * 255) for c in rgb) for key, rgb in color_mapping.items()}


def get_tooltip_text(change):
    """Build tooltip text '<type> - <category>' + newline + '<context>'.

    Missing "type"/"category" keys fall back to explanatory placeholders;
    a missing "context" falls back to the empty string.
    """
    return (
        change.get("type", "Type unspecified")
        + " - "
        + change.get("category", "Category unspecified")
        + "\n"
        + change.get("context", "")
    )


def highlight_nth(text, change, skip_failed=False):
    """Highlight the n-th regex occurrence of change["text"] inside text.

    n comes from change["occurrence_index"] (default 0). When fewer than
    n+1 exact matches exist, falls back to highlight_fuzzy_match.
    """
    n = change.get("occurrence_index", 0)
    # Build the pattern from escaped literal tokens joined by non-greedy
    # gaps. The previous pattern re.sub(r"\\\s+", ...) only rewrote a
    # literal backslash followed by whitespace (a double-escaping bug) and
    # left regex metacharacters in change["text"] unescaped, which could
    # raise re.error or silently mis-match.
    target = r".*?".join(map(re.escape, change["text"].split()))
    matches = list(re.finditer(target, text, flags=re.IGNORECASE | re.DOTALL))
    if len(matches) > n:
        match = matches[n]
        start, end = match.start(), match.end()
        tooltip_raw = get_tooltip_text(change)
        tooltip_escaped = html.escape(tooltip_raw, quote=True)
        # NOTE(review): tooltip_escaped is unused below -- the wrapping
        # highlight markup (presumably a styled <span title=...>) appears to
        # have been stripped from this file; restore the original span.
        highlighted_span = f""" {text[start:end]} """
        return text[:start] + highlighted_span + text[end:]
    else:
        return highlight_fuzzy_match(text, change, n, skip_failed=skip_failed)


# TODO: check threshold -> 51 would always get a result.
# If we make it lower we get guaranteed matches but they might differ from the
# original target; if the threshold is too high we might not find any match,
# e.g. when a word is missing.
def _sliding_window_candidates(text, target, threshold):
    """Score every len(target)-sized window of text against target.

    Returns a list of (score, start, end) tuples whose rapidfuzz
    partial_ratio score (case-insensitive) reaches threshold. Shared by
    highlight_fuzzy_match and get_best_fuzzy_match so both use the same
    candidate search.
    """
    window_size = len(target)
    target_lower = target.lower()
    candidates = []
    # "+ 1" so the final window is scored too; the previous
    # range(0, len(text) - window_size) skipped it, missing even an exact
    # whole-text match.
    for i in range(0, len(text) - window_size + 1):
        window = text[i : i + window_size]
        score = fuzz.partial_ratio(window.lower(), target_lower)
        if score >= threshold:
            candidates.append((score, i, i + window_size))
    return candidates


def highlight_fuzzy_match(text, change, n=0, threshold=80, skip_failed=False):
    """Highlight the n-th best fuzzy match of change["text"] in text.

    When nothing scores >= threshold: returns text unchanged if skip_failed,
    otherwise prepends a "No match found" notice to text.
    """
    target = change["text"]
    candidates = _sliding_window_candidates(text, target, threshold)
    if not candidates:
        if skip_failed:
            return text
        return (
            f""" No match found for: "{target}"
Please verify if it is part of the original text or if it was extracted incorrectly.
"""
            + text
        )
    # Pick top-N match: sort by score descending (position breaks ties).
    candidates.sort(reverse=True)
    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]
    tooltip_raw = get_tooltip_text(change)
    tooltip_escaped = html.escape(tooltip_raw, quote=True)
    # NOTE(review): tooltip_escaped is unused below -- the wrapping highlight
    # markup appears to have been stripped from this file; restore it.
    highlighted_span = f"""{text[start_norm:end_norm]}"""
    return text[:start_norm] + highlighted_span + text[end_norm:]


def get_best_fuzzy_match(text, change, threshold=65):
    """Find the best fuzzy match for a change in the text and return the
    matched section.

    Caller needs to account for a potentially None return value (when no
    window scores >= threshold).
    """
    n = change.get("occurrence_index", 0)
    target = change["text"]
    candidates = _sliding_window_candidates(text, target, threshold)
    if not candidates:
        return None
    # Pick top-N match
    candidates.sort(reverse=True)
    _, start_norm, end_norm = candidates[min(n, len(candidates) - 1)]
    return text[start_norm:end_norm]


def render_prompt(text, include_nlp=False, preprocessed_data=None):
    """Fill BASE_PROMPT_TEMPLATE for one text chunk.

    When include_nlp is true and preprocessed_data is provided, entities and
    noun chunks that literally occur in this chunk are embedded as an "NLP
    Insights" section and an evidence block is requested in the output schema.
    """
    classification_json = json.dumps(CLASSIFICATION_INFO, indent=2)
    few_shot_json = json.dumps(FEW_SHOT_EXAMPLES, indent=2)
    if include_nlp and preprocessed_data:
        # Keep only insights relevant to this chunk (substring containment).
        chunk_entities = [
            ent for ent in preprocessed_data["entities"] if ent["text"] in text
        ]
        chunk_nouns = [
            nc for nc in preprocessed_data["noun_chunks"] if nc["text"] in text
        ]
        nlp_insights_json = json.dumps(
            {"entities": chunk_entities, "key_noun_phrases": chunk_nouns}, indent=2
        )
        nlp_section = ", and NLP insights"
        nlp_insights = f"\n\nNLP Insights:\n{nlp_insights_json}"
        evidence_block = ',\n "evidence": {\n "entities_involved": ["relevant named entities"],\n "key_phrases": ["relevant noun phrases or key terms"]\n }'
    else:
        nlp_section = ""
        nlp_insights = ""
        evidence_block = ""
    return BASE_PROMPT_TEMPLATE.format(
        classification_info=classification_json,
        few_shot_examples=few_shot_json,
        nlp_section=nlp_section,
        nlp_insights=nlp_insights,
        text=text,
        evidence_block=evidence_block,
    )


def save_json_to_file(data, output_dir, output_file):
    """Save the JSON data to a file and print the file path."""
    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists guard.
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, output_file)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    # Print the location of the saved file
    print(f"JSON data saved successfully at: {file_path}")


def call_nlp_service(payload, method):
    """POST payload (form-encoded) to the hosted NLP preprocessor endpoint
    `method` and return the decoded JSON response.

    Raises:
        Exception: when the service answers with a non-200 status.
    """
    url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"
    # Make the request. Without a timeout, requests can hang indefinitely;
    # 60 s is generous for the remote service.
    response = requests.post(url, data=payload, timeout=60)
    if response.status_code == 200:
        return response.json()
    raise Exception(f"NLP service error: {response.status_code} - {response.text}")


def lerp_color(value, start_color=(255, 0, 0), end_color=(0, 255, 0)):
    """
    Linearly interpolate between start_color and end_color by value.

    Parameters:
    - value: float between 0 and 1
    - start_color: tuple (r, g, b), default red
    - end_color: tuple (r, g, b), default green

    Returns:
    - CSS rgb color string, e.g. 'rgb(255, 0, 0)'
    """
    r = int(start_color[0] + (end_color[0] - start_color[0]) * value)
    g = int(start_color[1] + (end_color[1] - start_color[1]) * value)
    b = int(start_color[2] + (end_color[2] - start_color[2]) * value)
    return f"rgb({r}, {g}, {b})"