Spaces:

amougou-mbida
/

regulens

Running

App Files Files Community

Maximilian Amougou committed on Jan 12

Commit

09a324c

verified ·

1 Parent(s): a60b2bb

Upload 7 files

Browse files

Files changed (4) hide show

scripts/agentic_pdfeditor.py +14 -6
scripts/llm_nlp_preprocessing.py +3 -3
scripts/pymupdf_nlp_preprocessing.py +4 -4
scripts/utility_functions.py +67 -9

scripts/agentic_pdfeditor.py CHANGED Viewed

@@ -100,10 +100,18 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
     # Sort by length of relevant_text in descending order to avoid overlapping highlights
     changes = sorted(changes, key=lambda c: -len(c.text))
     annotated_areas = {}
     full_text = ""
     for page_num in range(len(doc)):
         page = doc[page_num]
-        full_text += page.get_text()
     for change in changes:
         page_num = int(change.grounding[0].page)
         text = change.text
@@ -114,7 +122,7 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
             results = []
             for pnr in range(len(doc)):  # search all pages
                 annotated_areas.setdefault(f"{pnr}", [])
-                page = doc.load_page(pnr)
                 text_instances = page.search_for(text)
                 for inst in text_instances:
                     page_num = pnr# remove?
@@ -133,7 +141,7 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
                 if best_match and len(best_match) > 0:
                     print("found best fuzzy match: ", best_match)
                     for page_num in range(len(doc)):  # search all pages
-                        page = doc.load_page(page_num)
                         text_instances = page.search_for(best_match)
                         for inst in text_instances:
                             results.append({"page": page_num, "bbox": inst})
@@ -149,10 +157,10 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
                     )
             if results:  # "flattenning" the results
                 page_num = results[0]["page"]
-                doc_page = doc.load_page(page_num)
                 results = [r["bbox"] for r in results if r["page"] == page_num]
         else:
-            doc_page = doc.load_page(page_num)
             annotated_areas.setdefault(f"{page_num}", [])
             # Search for the relevant text on the page
             results = doc_page.search_for(text)
@@ -168,7 +176,7 @@ def agentic_pdf_annotator(changes: list[RegulatoryChange], file_bytes, extractio
             )
             if not results:
                 best_match = get_best_fuzzy_match(
-                    doc_page.get_text(option="text"), change
                 )
                 if best_match and len(best_match) > 0:
                     results = doc_page.search_for(best_match)

     # Sort by length of relevant_text in descending order to avoid overlapping highlights
     changes = sorted(changes, key=lambda c: -len(c.text))
     annotated_areas = {}
+    # OPTIMIZATION: Pre-cache all pages and their text content
+    page_cache = {}
+    page_text_cache = {}
     full_text = ""
     for page_num in range(len(doc)):
         page = doc[page_num]
+        page_cache[page_num] = page
+        page_text = page.get_text()
+        page_text_cache[page_num] = page_text
+        full_text += page_text
     for change in changes:
         page_num = int(change.grounding[0].page)
         text = change.text
             results = []
             for pnr in range(len(doc)):  # search all pages
                 annotated_areas.setdefault(f"{pnr}", [])
+                page = page_cache[pnr]  # Use cached page
                 text_instances = page.search_for(text)
                 for inst in text_instances:
                     page_num = pnr# remove?
                 if best_match and len(best_match) > 0:
                     print("found best fuzzy match: ", best_match)
                     for page_num in range(len(doc)):  # search all pages
+                        page = page_cache[page_num]  # Use cached page
                         text_instances = page.search_for(best_match)
                         for inst in text_instances:
                             results.append({"page": page_num, "bbox": inst})
                     )
             if results:  # "flattenning" the results
                 page_num = results[0]["page"]
+                doc_page = page_cache[page_num]  # Use cached page
                 results = [r["bbox"] for r in results if r["page"] == page_num]
         else:
+            doc_page = page_cache[page_num]  # Use cached page
             annotated_areas.setdefault(f"{page_num}", [])
             # Search for the relevant text on the page
             results = doc_page.search_for(text)
             )
             if not results:
                 best_match = get_best_fuzzy_match(
+                    page_text_cache[page_num], change  # Use cached text
                 )
                 if best_match and len(best_match) > 0:
                     results = doc_page.search_for(best_match)

scripts/llm_nlp_preprocessing.py CHANGED Viewed

@@ -14,8 +14,8 @@ api_key = os.getenv("OPENAI_API_KEY")
 openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
-def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
-    result = call_nlp_service({"text": text}, "preprocess_text_with_nlp_llm")
     return result["chunks"], result["preprocessed_data"]
@@ -64,7 +64,7 @@ async def detect_regulatory_changes(text_content, subtitle):
     """
     # Preprocess text with enhanced NLP
-    chunks, preprocessed_data = preprocess_text_with_nlp(text_content)
     # Classify changes using NLP insights
     results = await search_for_regulatory_changes(chunks, preprocessed_data, subtitle)

 openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
+async def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
+    """Run the remote LLM-oriented NLP preprocessing on ``text``.
+
+    Awaits ``call_nlp_service`` with the method name
+    ``"preprocess_text_with_nlp_llm"`` and a ``{"text": text}`` payload.
+
+    Returns:
+        A ``(chunks, preprocessed_data)`` tuple taken from the ``"chunks"``
+        and ``"preprocessed_data"`` keys of the service reply.
+
+    NOTE(review): ``max_chunk_size`` and ``overlap`` are accepted but not
+    used by the lines visible here - confirm whether the service should
+    receive them.
+    """
+    result = await call_nlp_service({"text": text}, "preprocess_text_with_nlp_llm")
     return result["chunks"], result["preprocessed_data"]
     """
     # Preprocess text with enhanced NLP
+    chunks, preprocessed_data = await preprocess_text_with_nlp(text_content)
     # Classify changes using NLP insights
     results = await search_for_regulatory_changes(chunks, preprocessed_data, subtitle)

scripts/pymupdf_nlp_preprocessing.py CHANGED Viewed

@@ -14,9 +14,9 @@ api_key = os.getenv("OPENAI_API_KEY")
 openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
-def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
     """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
-    return call_nlp_service({"text": text}, "preprocess_text_with_nlp_pymupdf")
 def create_prompt_with_nlp(chunk, preprocessed_data):
@@ -26,10 +26,10 @@ def create_prompt_with_nlp(chunk, preprocessed_data):
 async def classify_changes_with_nlp(text_content, location_info):
     """Classify changes with NLP preprocessing."""
     # Apply NLP preprocessing
-    preprocessed_data = preprocess_text_with_nlp(text_content)
     # Split into chunks (using the same method as your first experiment)
-    result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
     chunks = result["chunks"]
     async def process_chunk(chunk):

 openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
+async def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
     """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
+    # Awaits call_nlp_service with a {"text": text} payload and the method
+    # name "preprocess_text_with_nlp_pymupdf", returning its reply unchanged.
+    # NOTE(review): max_chunk_size and overlap are accepted but not used by
+    # the lines visible here - confirm whether the service should receive them.
+    return await call_nlp_service({"text": text}, "preprocess_text_with_nlp_pymupdf")
 def create_prompt_with_nlp(chunk, preprocessed_data):
 async def classify_changes_with_nlp(text_content, location_info):
     """Classify changes with NLP preprocessing."""
     # Apply NLP preprocessing
+    preprocessed_data = await preprocess_text_with_nlp(text_content)
     # Split into chunks (using the same method as your first experiment)
+    result = await call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
     chunks = result["chunks"]
     async def process_chunk(chunk):

scripts/utility_functions.py CHANGED Viewed

@@ -3,6 +3,11 @@ import html
 import os
 import json
 import re
 import pymupdf
 import pymupdf4llm
 from rapidfuzz import fuzz
@@ -15,6 +20,7 @@ from scripts.regulatory_change_foundation import (
     BASE_PROMPT_TEMPLATE,
 )
 # Define hex colors as RGB tuples (0–1 range)
 color_mapping_old = {
     "addition": (0, 0.4, 0),  # green
@@ -118,7 +124,15 @@ def get_tooltip_text(change):
 def highlight_nth(text, change, skip_failed=False):
     n = change.occurrence_index if hasattr(change, "occurrence_index") else 0
     target = re.sub(r"\\\s+", r".*?", change.text)
-    matches = list(re.finditer(target, text, flags=re.IGNORECASE | re.DOTALL))
     if len(matches) > n:
         match = matches[n]
         start, end = match.start(), match.end()
@@ -244,15 +258,59 @@ def save_json_to_file(data, output_dir, output_file):
     print(f"JSON data saved successfully at: {file_path}")
-def call_nlp_service(payload, method):
-    url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"
-    # Make the request
-    response = requests.post(url, data=payload)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        raise Exception(f"NLP service error: {response.status_code} - {response.text}")
 def lerp_color(value, start_color=(255, 0, 0), end_color=(0, 255, 0)):

 import os
 import json
 import re
+import time
+import random
+import asyncio
+import httpx
+from dotenv import load_dotenv
 import pymupdf
 import pymupdf4llm
 from rapidfuzz import fuzz
     BASE_PROMPT_TEMPLATE,
 )
+load_dotenv()
 # Define hex colors as RGB tuples (0–1 range)
 color_mapping_old = {
     "addition": (0, 0.4, 0),  # green
 def highlight_nth(text, change, skip_failed=False):
     n = change.occurrence_index if hasattr(change, "occurrence_index") else 0
     target = re.sub(r"\\\s+", r".*?", change.text)
+    # OPTIMIZATION: Compile regex once and find only up to n+1 matches (early exit)
+    pattern = re.compile(target, flags=re.IGNORECASE | re.DOTALL)
+    matches = []
+    for match in pattern.finditer(text):
+        matches.append(match)
+        if len(matches) > n:  # Early exit - we have enough matches
+            break
     if len(matches) > n:
         match = matches[n]
         start, end = match.start(), match.end()
     print(f"JSON data saved successfully at: {file_path}")
+# Module-level settings read by call_nlp_service below.
+# MICROSERVICE_KEY is read from the environment and sent as a Bearer token;
+# os.getenv returns None when the variable is unset.
+MICROSERVICE_KEY = os.getenv("MICROSERVICE_KEY")
+nlp_semaphore = asyncio.Semaphore(100)  # Limit to 100 concurrent requests
+timeout = httpx.Timeout(
+    connect=20.0,  # time to establish connection
+    read=60.0,  # time to read the response
+    write=30.0,  # time to send the request
+    pool=80.0,  # time to acquire a connection from the pool
+)
+async def call_nlp_service(payload, method, max_retries=5, base_delay=1.0):
+    url = f"https://amougou-fortiss-nlp-preprocessor.hf.space/{method}"
+    headers = {"Authorization": f"Bearer {MICROSERVICE_KEY}"}
+    async with nlp_semaphore:
+        for attempt in range(max_retries):
+            try:
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    response = await client.post(url, data=payload, headers=headers)
+                # Success
+                if response.status_code == 200:
+                    return response.json()
+                # Rate limited
+                if response.status_code == 429:
+                    if attempt == max_retries - 1:
+                        break
+                    retry_after = response.headers.get("Retry-After")
+                    delay = (
+                        float(retry_after)
+                        if retry_after
+                        else (base_delay * (2**attempt) + random.uniform(0, 0.5))
+                    )
+                    await asyncio.sleep(delay)
+                    continue
+                # Other HTTP errors
+                raise Exception(
+                    f"NLP service error: {response.status_code} - {response.text}"
+                )
+            except (httpx.ConnectTimeout, httpx.ReadTimeout, httpx.NetworkError) as e:
+                # Retry on network issues
+                if attempt == max_retries - 1:
+                    raise Exception(
+                        f"NLP service network error after {max_retries} attempts: {e}"
+                    )
+                delay = base_delay * (2**attempt) + random.uniform(0, 0.5)
+                await asyncio.sleep(delay)
+                continue
+    raise Exception(f"NLP service error: failed after {max_retries} retries")
 def lerp_color(value, start_color=(255, 0, 0), end_color=(0, 255, 0)):