Spaces:

amougou-mbida
/

regulens

Running

App Files Files Community

Maximilian Amougou committed on Jan 12

Commit

d6b760c

verified ·

1 Parent(s): 8c1316d

Upload 6 files

Browse files

Files changed (6) hide show

scripts/llm_nlp_preprocessing.py +49 -41
scripts/llm_no_nlp_preprocessing.py +51 -38
scripts/pymupdf_nlp_preprocessing.py +55 -47
scripts/pymupdf_no_nlp_preprocessing.py +52 -52
scripts/text_extraction_landing_ai.py +47 -36
scripts/utility_functions.py +29 -1

scripts/llm_nlp_preprocessing.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import json
 import os
 from dotenv import load_dotenv
-from openai import OpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import call_nlp_service, render_prompt
@@ -10,7 +11,7 @@ from scripts.utility_functions import call_nlp_service, render_prompt
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
-openai_client = OpenAI(api_key=api_key)
 def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
@@ -22,36 +23,35 @@ def create_prompt(chunk, preprocessed_data):
     return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
-def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
-    results = []
-    for chunk in chunks:
-        response = openai_client.chat.completions.create(
-            model="gpt-4o-mini",
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
-                },
-                {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
-            ],
-            temperature=0.7,
-            max_tokens=1024,
-        )
         try:
             result = json.loads(response.choices[0].message.content)
             if result.get("changes_detected", False):
-                result["location"] = {"subtitle": subtitle}  # Use subtitle as location
                 result["source_text"] = chunk
-                results.append(result)
-        except json.JSONDecodeError:
-            continue
-    return results
-def detect_regulatory_changes(text_content, subtitle):
     """
     Main function to detect regulatory changes from text content.
@@ -67,7 +67,7 @@ def detect_regulatory_changes(text_content, subtitle):
     chunks, preprocessed_data = preprocess_text_with_nlp(text_content)
     # Classify changes using NLP insights
-    results = search_for_regulatory_changes(chunks, preprocessed_data, subtitle)
     return results
@@ -80,21 +80,29 @@ def llm_regulatory_change_detector(hierarchical_structure, progress_callback=Non
         }
         subtitles = {}
-        # Iterate over sections and analyze content
-        total_sections = len(hierarchical_structure["sections"])
-        for idx, section in enumerate(hierarchical_structure["sections"]):
-            # Update progress if callback provided
-            if progress_callback:
-                progress_callback((idx + 1) / total_sections)
             if status_callback:
-                status_callback(f"Analyzing section {idx + 1}/{total_sections}: {section.get('subtitle', 'Untitled')}")
-            subtitle = section["subtitle"]
-            content = section["content"]
-            if isinstance(content, list):
-                content = "\n".join(content)
-            # Detect changes for this subtitle
-            changes = detect_regulatory_changes(content, subtitle)
             # Update analysis summary
             for change in changes:

 import json
 import os
+import asyncio
 from dotenv import load_dotenv
+from openai import AsyncOpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import call_nlp_service, render_prompt
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
+openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
 def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
     return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
+async def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
+    async def process_chunk(chunk):
         try:
+            response = await openai_client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
+                    },
+                    {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
+                ],
+                temperature=0.7,
+                max_tokens=1024,
+            )
             result = json.loads(response.choices[0].message.content)
             if result.get("changes_detected", False):
+                result["location"] = {"subtitle": subtitle}
                 result["source_text"] = chunk
+                return result
+        except (json.JSONDecodeError, Exception):
+            return None
+    tasks = [process_chunk(chunk) for chunk in chunks]
+    results = await asyncio.gather(*tasks)
+    return [r for r in results if r is not None]
+async def detect_regulatory_changes(text_content, subtitle):
     """
     Main function to detect regulatory changes from text content.
     chunks, preprocessed_data = preprocess_text_with_nlp(text_content)
     # Classify changes using NLP insights
+    results = await search_for_regulatory_changes(chunks, preprocessed_data, subtitle)
     return results
         }
         subtitles = {}
+        async def process_all_sections():
+            async def process_section(section):
+                subtitle = section["subtitle"]
+                content = section["content"]
+                if isinstance(content, list):
+                    content = "\n".join(content)
+                # Detect changes for this subtitle
+                changes = await detect_regulatory_changes(content, subtitle)
+                return subtitle, changes
             if status_callback:
+                status_callback(f"Processing all {len(hierarchical_structure['sections'])} sections concurrently...")
+            tasks = [process_section(section) for section in hierarchical_structure["sections"]]
+            results = await asyncio.gather(*tasks)
+            return results
+        # Run async processing
+        section_results = asyncio.run(process_all_sections())
+        # Process results
+        for subtitle, changes in section_results:
             # Update analysis summary
             for change in changes:

scripts/llm_no_nlp_preprocessing.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import json
 import os
 from dotenv import load_dotenv
-from openai import OpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import render_prompt
@@ -10,43 +11,47 @@ from scripts.utility_functions import render_prompt
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
-openai_client = OpenAI(api_key=api_key)
 def create_prompt_without_nlp_insights(text):
     return render_prompt(text, include_nlp=False)
-def classify_changes_without_nlp_insights(text_content, subtitle):
     """Classify changes in text chunks using OpenAI."""
     chunks = text_content.split("\n\n")
-    results = []
-    for chunk in chunks:
-        response = openai_client.chat.completions.create(
-            model="gpt-4o-mini",
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
-                },
-                {"role": "user", "content": create_prompt_without_nlp_insights(chunk)},
-            ],
-            temperature=0.7,
-            max_tokens=1024,
-        )
         try:
             result = json.loads(response.choices[0].message.content)
             if result.get("changes_detected", False):
-                result["location"] = {"subtitle": subtitle}  # Use subtitle as location
                 result["source_text"] = chunk
-                results.append(result)
-        except json.JSONDecodeError:
-            continue
-    return results
 def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
@@ -57,21 +62,29 @@ def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure,
         }
         subtitles = {}
-        # Iterate over sections and analyze content
-        total_sections = len(hierarchical_structure["sections"])
-        for idx, section in enumerate(hierarchical_structure["sections"]):
-            # Update progress if callback provided
-            if progress_callback:
-                progress_callback((idx + 1) / total_sections)
             if status_callback:
-                status_callback(f"Analyzing section {idx + 1}/{total_sections}: {section.get('subtitle', 'Untitled')}")
-            subtitle = section["subtitle"]
-            content = section["content"]
-            if isinstance(content, list):
-                content = "\n".join(content)
-            # Detect changes for this subtitle
-            changes = classify_changes_without_nlp_insights(content, subtitle)
             # Update analysis summary
             for change in changes:

 import json
 import os
+import asyncio
 from dotenv import load_dotenv
+from openai import AsyncOpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import render_prompt
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
+openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
 def create_prompt_without_nlp_insights(text):
     return render_prompt(text, include_nlp=False)
+async def classify_changes_without_nlp_insights(text_content, subtitle):
     """Classify changes in text chunks using OpenAI."""
     chunks = text_content.split("\n\n")
+    async def process_chunk(chunk):
         try:
+            response = await openai_client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
+                    },
+                    {"role": "user", "content": create_prompt_without_nlp_insights(chunk)},
+                ],
+                temperature=0.7,
+                max_tokens=1024,
+            )
             result = json.loads(response.choices[0].message.content)
             if result.get("changes_detected", False):
+                result["location"] = {"subtitle": subtitle}
                 result["source_text"] = chunk
+                return result
+        except (json.JSONDecodeError, Exception):
+            return None
+    tasks = [process_chunk(chunk) for chunk in chunks]
+    results = await asyncio.gather(*tasks)
+    return [r for r in results if r is not None]
+# Async wrapper for backward compatibility
+async def classify_changes_without_nlp_insights_async(text_content, subtitle):
+    return await classify_changes_without_nlp_insights(text_content, subtitle)
 def llm_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
         }
         subtitles = {}
+        async def process_all_sections():
+            async def process_section(section):
+                subtitle = section["subtitle"]
+                content = section["content"]
+                if isinstance(content, list):
+                    content = "\n".join(content)
+                # Detect changes for this subtitle
+                changes = await classify_changes_without_nlp_insights(content, subtitle)
+                return subtitle, changes
             if status_callback:
+                status_callback(f"Processing all {len(hierarchical_structure['sections'])} sections concurrently...")
+            tasks = [process_section(section) for section in hierarchical_structure["sections"]]
+            results = await asyncio.gather(*tasks)
+            return results
+        # Run async processing
+        section_results = asyncio.run(process_all_sections())
+        # Process results
+        for subtitle, changes in section_results:
             # Update analysis summary
             for change in changes:

scripts/pymupdf_nlp_preprocessing.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import json
 import os
 from dotenv import load_dotenv
-from openai import OpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import call_nlp_service, render_prompt
@@ -10,7 +11,7 @@ from scripts.utility_functions import call_nlp_service, render_prompt
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
-openai_client = OpenAI(api_key=api_key)
 def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
@@ -22,7 +23,7 @@ def create_prompt_with_nlp(chunk, preprocessed_data):
     return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
-def classify_changes_with_nlp(text_content, location_info):
     """Classify changes with NLP preprocessing."""
     # Apply NLP preprocessing
     preprocessed_data = preprocess_text_with_nlp(text_content)
@@ -31,34 +32,39 @@ def classify_changes_with_nlp(text_content, location_info):
     result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
     chunks = result["chunks"]
-    results = []
-    for chunk in chunks:
-        response = openai_client.chat.completions.create(
-            model="gpt-4o-mini",
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You are a legal expert analyzing German regulatory changes. Return only JSON.",
-                },
-                {
-                    "role": "user",
-                    "content": create_prompt_with_nlp(chunk, preprocessed_data),
-                },
-            ],
-            temperature=0.7,
-            max_tokens=1024,
-        )
         try:
             result = json.loads(response.choices[0].message.content)
             if result.get("changes_detected", False):
                 result["location"] = location_info
                 result["source_text"] = chunk
-                results.append(result)
-        except json.JSONDecodeError:
-            continue
-    return results if results else None
 def extract_hierarchical_text(block):
@@ -85,38 +91,39 @@ def extract_hierarchical_text(block):
     return "\n\n".join(text_parts)
-def traverse_blocks_with_nlp(blocks, parent=None, results=None, is_top_level=True, progress_callback=None, status_callback=None):
-    """Traverse hierarchy with NLP-enhanced analysis."""
-    if results is None:
-        results = []
-    total_blocks = len(blocks) if is_top_level else 0
-    for idx, block in enumerate(blocks):
-        if is_top_level and progress_callback:
-            progress_callback((idx + 1) / total_blocks)
-        if is_top_level and status_callback:
-            status_callback(f"Processing text block {idx + 1}/{total_blocks} with NLP")
         block["parent"] = parent
         if "children" in block and not block["children"]:  # Leaf node
             text_content = extract_hierarchical_text(block)
             location_info = {
                 "page_number": block["page_number"],
                 "block_text": block["text"],
             }
-            changes = classify_changes_with_nlp(text_content, location_info)
             if changes:
                 for change in changes:
                     change["full_text"] = text_content
-                    results.append(change)
         else:
-            traverse_blocks_with_nlp(
-                block["children"], block, results, is_top_level=False, progress_callback=progress_callback, status_callback=status_callback
-            )
-    return results
 def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
@@ -131,9 +138,10 @@ def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure,
     changes_by_page = {}
     if status_callback:
-        status_callback("Analyzing document structure with NLP...")
-    results = traverse_blocks_with_nlp(hierarchical_structure["blocks"], progress_callback=progress_callback, status_callback=status_callback)
     for change in results:
         analysis_summary["total_changes_detected"] += len(change["classifications"])

 import json
 import os
+import asyncio
 from dotenv import load_dotenv
+from openai import AsyncOpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import call_nlp_service, render_prompt
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
+openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
 def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
     return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)
+async def classify_changes_with_nlp(text_content, location_info):
     """Classify changes with NLP preprocessing."""
     # Apply NLP preprocessing
     preprocessed_data = preprocess_text_with_nlp(text_content)
     result = call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
     chunks = result["chunks"]
+    async def process_chunk(chunk):
         try:
+            response = await openai_client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a legal expert analyzing German regulatory changes. Return only JSON.",
+                    },
+                    {
+                        "role": "user",
+                        "content": create_prompt_with_nlp(chunk, preprocessed_data),
+                    },
+                ],
+                temperature=0.7,
+                max_tokens=1024,
+            )
             result = json.loads(response.choices[0].message.content)
             if result.get("changes_detected", False):
                 result["location"] = location_info
                 result["source_text"] = chunk
+                return result
+        except (json.JSONDecodeError, Exception):
+            return None
+    tasks = [process_chunk(chunk) for chunk in chunks]
+    results = await asyncio.gather(*tasks)
+    filtered_results = [r for r in results if r is not None]
+    return filtered_results if filtered_results else None
+# Async wrapper for backward compatibility
+async def classify_changes_with_nlp_async(text_content, location_info):
+    return await classify_changes_with_nlp(text_content, location_info)
 def extract_hierarchical_text(block):
     return "\n\n".join(text_parts)
+async def traverse_blocks_with_nlp(blocks, parent=None):
+    """Traverse hierarchy with NLP-enhanced analysis using asyncio.gather()."""
+    async def process_block(block, parent):
         block["parent"] = parent
         if "children" in block and not block["children"]:  # Leaf node
             text_content = extract_hierarchical_text(block)
             location_info = {
                 "page_number": block["page_number"],
                 "block_text": block["text"],
             }
+            changes = await classify_changes_with_nlp(text_content, location_info)
             if changes:
                 for change in changes:
                     change["full_text"] = text_content
+                return changes
         else:
+            # Process children recursively
+            return await traverse_blocks_with_nlp(block["children"], block)
+        return []
+    # Process all blocks concurrently
+    tasks = [process_block(block, parent) for block in blocks]
+    results = await asyncio.gather(*tasks)
+    # Flatten results
+    flattened = []
+    for result in results:
+        if isinstance(result, list):
+            flattened.extend(result)
+    return flattened
 def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
     changes_by_page = {}
     if status_callback:
+        status_callback("Analyzing all document blocks concurrently with NLP...")
+    # Run async processing
+    results = asyncio.run(traverse_blocks_with_nlp(hierarchical_structure["blocks"]))
     for change in results:
         analysis_summary["total_changes_detected"] += len(change["classifications"])

scripts/pymupdf_no_nlp_preprocessing.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import json
 import os
 from dotenv import load_dotenv
-from openai import OpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import render_prompt
 from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text
@@ -12,84 +13,83 @@ load_dotenv()
 #nlp = spacy.load("de_core_news_sm")
 api_key = os.getenv("OPENAI_API_KEY")
-openai_client = OpenAI(api_key=api_key)
 def create_prompt_without_nlp_insights(text):
     return render_prompt(text, include_nlp=False)
-def classify_changes_without_nlp_insights(text_content, location_info):
     """Classify changes in text chunks using OpenAI."""
-    response = openai_client.chat.completions.create(
-        model="gpt-4o-mini",
-        messages=[
-            {
-                "role": "system",
-                "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
-            },
-            {
-                "role": "user",
-                "content": create_prompt_without_nlp_insights(text_content),
-            },
-        ],
-        temperature=0.7,
-        max_tokens=1024,
-    )
     try:
         result = json.loads(response.choices[0].message.content)
         if result.get("changes_detected", False):
             result["location"] = location_info
             result["source_text"] = text_content
             return result
-        return None
-    except json.JSONDecodeError:
-        return None
-def traverse_blocks(
-    blocks, parent=None, grandparent=None, results=None, is_top_level=True, progress_callback=None, status_callback=None
-):
-    """Traverse the hierarchical structure in a depth-first manner and analyze leaf nodes."""
-    if results is None:
-        results = []
-    total_blocks = len(blocks) if is_top_level else 0
-    for idx, block in enumerate(blocks):
-        if is_top_level and progress_callback:
-            progress_callback((idx + 1) / total_blocks)
-        if is_top_level and status_callback:
-            status_callback(f"Processing text block {idx + 1}/{total_blocks}")
-        # Add parent and grandparent references to the block for context tracking
         block["parent"] = parent
-        if "children" in block and (
-            not block["children"] or len(block["children"]) == 0
-        ):  # This is a leaf node
             # Extract hierarchical text
             text_content = extract_hierarchical_text(block)
             # Define location info
             location_info = {
                 "page_number": block["page_number"],
                 "block_text": block["text"],
             }
             # Analyze the text for changes
-            changes = classify_changes_without_nlp_insights(text_content, location_info)
             if changes:
                 # Add the full hierarchical text to the result
                 changes["text"] = text_content
-                results.append(changes)
         else:
-            traverse_blocks(
-                block["children"], block, parent, results, is_top_level=False, progress_callback=progress_callback, status_callback=status_callback
-            )
-    return results
 def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
@@ -104,10 +104,10 @@ def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structu
     changes_by_page = {}
     if status_callback:
-        status_callback("Analyzing document structure...")
-    # Traverse the blocks and analyze leaf nodes
-    results = traverse_blocks(hierarchical_structure["blocks"], progress_callback=progress_callback, status_callback=status_callback)
     # Update analysis summary
     for change in results:

 import json
 import os
+import asyncio
 from dotenv import load_dotenv
+from openai import AsyncOpenAI
 from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
 from scripts.utility_functions import render_prompt
 from scripts.pymupdf_nlp_preprocessing import extract_hierarchical_text
 #nlp = spacy.load("de_core_news_sm")
 api_key = os.getenv("OPENAI_API_KEY")
+openai_client = AsyncOpenAI(api_key=api_key, timeout=60)
 def create_prompt_without_nlp_insights(text):
     return render_prompt(text, include_nlp=False)
+async def classify_changes_without_nlp_insights(text_content, location_info):
     """Classify changes in text chunks using OpenAI."""
     try:
+        response = await openai_client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
+                },
+                {
+                    "role": "user",
+                    "content": create_prompt_without_nlp_insights(text_content),
+                },
+            ],
+            temperature=0.7,
+            max_tokens=1024,
+        )
         result = json.loads(response.choices[0].message.content)
         if result.get("changes_detected", False):
             result["location"] = location_info
             result["source_text"] = text_content
             return result
+    except (json.JSONDecodeError, Exception):
+        pass
+    return None
+# Async wrapper for backward compatibility
+async def classify_changes_without_nlp_insights_async(text_content, location_info):
+    return await classify_changes_without_nlp_insights(text_content, location_info)
+async def traverse_blocks(blocks, parent=None):
+    """Traverse the hierarchical structure and analyze leaf nodes using asyncio.gather()."""
+    async def process_block(block, parent):
         block["parent"] = parent
+        if "children" in block and (not block["children"] or len(block["children"]) == 0):  # Leaf node
             # Extract hierarchical text
             text_content = extract_hierarchical_text(block)
             # Define location info
             location_info = {
                 "page_number": block["page_number"],
                 "block_text": block["text"],
             }
             # Analyze the text for changes
+            changes = await classify_changes_without_nlp_insights(text_content, location_info)
             if changes:
                 # Add the full hierarchical text to the result
                 changes["text"] = text_content
+                return [changes]
         else:
+            # Process children recursively
+            return await traverse_blocks(block["children"], block)
+        return []
+    # Process all blocks concurrently
+    tasks = [process_block(block, parent) for block in blocks]
+    results = await asyncio.gather(*tasks)
+    # Flatten results
+    flattened = []
+    for result in results:
+        if isinstance(result, list):
+            flattened.extend(result)
+    return flattened
 def pymupdf_regulatory_change_detector_without_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
     changes_by_page = {}
     if status_callback:
+        status_callback("Analyzing all document blocks concurrently...")
+    # Run async processing
+    results = asyncio.run(traverse_blocks(hierarchical_structure["blocks"]))
     # Update analysis summary
     for change in results:

scripts/text_extraction_landing_ai.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import os
 import json
 import glob
 from agentic_doc.parse import parse
 from scripts.models import RegulatoryChange
-from scripts.pymupdf_nlp_preprocessing import classify_changes_with_nlp
-from scripts.pymupdf_no_nlp_preprocessing import classify_changes_without_nlp_insights
 def extract_document_agentic(
@@ -43,41 +44,51 @@ def extract_document_agentic(
             return result
         if result:
             if "chunks" in result and isinstance(result["chunks"], list):
-                for chunk in result["chunks"]:
-                    if do_nlp_preprocessing:
-                        classification_result = classify_changes_with_nlp(chunk["text"], "")
-                        # flatten into a single json element so it matches non-nlp part
-                        if classification_result and len(classification_result) > 0:
-                            flattened_classifications = {"changes_detected": classification_result[0].get("changes_detected", False), "classifications": []}
-                            for class_res in classification_result:
-                                if class_res.get("changes_detected", False):
-                                    flattened_classifications["classifications"].extend(class_res.get("classifications", []))
-                            classification_result = flattened_classifications
-                    else:
-                        classification_result = classify_changes_without_nlp_insights(
-                            chunk["text"], ""
-                        )
-                    if classification_result and classification_result.get(
-                        "changes_detected", False
-                    ):
-                        subchunks = []
-                        for subchunk in classification_result.get(
-                            "classifications", []
-                        ):
-                            subchunks.append(
-                                {
-                                    "text": subchunk.get("relevant_text", ""),
-                                    "validated": False,
-                                    "confirmed": False,
-                                    "reviewed": False,
-                                    "category": subchunk.get("change", ""),
-                                    "type": subchunk.get("change_type", ""),
-                                    "context": subchunk.get("explanation", ""),
-                                }
                             )
-                        chunk["subchunks"] = subchunks
-                    else:
-                        result["chunks"].remove(chunk)
                 # Create flattened list of subchunks for UI compatibility
                 flattened_changes = []
                 for chunk in result["chunks"]:

 import os
 import json
 import glob
+import asyncio
 from agentic_doc.parse import parse
 from scripts.models import RegulatoryChange
+from scripts.pymupdf_nlp_preprocessing import classify_changes_with_nlp_async
+from scripts.pymupdf_no_nlp_preprocessing import classify_changes_without_nlp_insights_async
 def extract_document_agentic(
             return result
         if result:
             if "chunks" in result and isinstance(result["chunks"], list):
+                # Process all chunks concurrently with asyncio.gather()
+                async def process_all_chunks():
+                    async def process_chunk(chunk):
+                        if do_nlp_preprocessing:
+                            classification_result = await classify_changes_with_nlp_async(chunk["text"], "")
+                            # flatten into a single json element so it matches non-nlp part
+                            if classification_result and len(classification_result) > 0:
+                                flattened_classifications = {"changes_detected": classification_result[0].get("changes_detected", False), "classifications": []}
+                                for class_res in classification_result:
+                                    if class_res.get("changes_detected", False):
+                                        flattened_classifications["classifications"].extend(class_res.get("classifications", []))
+                                classification_result = flattened_classifications
+                        else:
+                            classification_result = await classify_changes_without_nlp_insights_async(
+                                chunk["text"], ""
                             )
+                        if classification_result and classification_result.get("changes_detected", False):
+                            subchunks = []
+                            for subchunk in classification_result.get("classifications", []):
+                                subchunks.append(
+                                    {
+                                        "text": subchunk.get("relevant_text", ""),
+                                        "validated": False,
+                                        "confirmed": False,
+                                        "reviewed": False,
+                                        "category": subchunk.get("change", ""),
+                                        "type": subchunk.get("change_type", ""),
+                                        "context": subchunk.get("explanation", ""),
+                                    }
+                                )
+                            chunk["subchunks"] = subchunks
+                            return chunk, True
+                        return chunk, False
+                    # Process all chunks concurrently
+                    tasks = [process_chunk(chunk) for chunk in result["chunks"]]
+                    results = await asyncio.gather(*tasks)
+                    return results
+                # Run async processing
+                processed_results = asyncio.run(process_all_chunks())
+                # Remove chunks without changes
+                result["chunks"] = [chunk for chunk, has_changes in processed_results if has_changes]
                 # Create flattened list of subchunks for UI compatibility
                 flattened_changes = []
                 for chunk in result["chunks"]:

scripts/utility_functions.py CHANGED Viewed

@@ -288,6 +288,31 @@ def remove_html_comments(text: str) -> str:
     return clean_text
 def highlight_differences_words(text1: str, text2: str):
     """
     Return two HTML strings: highlighted version of text1 and text2.
@@ -352,6 +377,8 @@ def map_categorical_impact_assessment(
     changes: list[RegulatoryChange],
 ) -> list[RegulatoryChange]:
     """Map categorical impact assessment actions based on changetype"""
     action_map = {
         "Textual and Editorial Changes": {
             "actions": [
@@ -397,7 +424,8 @@ def map_categorical_impact_assessment(
             expected_labels = [action["label"] for action in mapped_actions]
             # Only update if the labels don't match
             if current_labels != expected_labels:
-                change.actions = mapped_actions
             # If labels match but user has different completion status, preserve their progress
     return changes

     return clean_text
def normalize_markdown_indentation(content: str) -> str:
    """Normalize excessive indentation to prevent code block interpretation.

    A line whose first non-whitespace character is ``-``, ``*`` or ``+`` is
    treated as a list item. When such a line has more than four leading
    whitespace characters it is re-indented to two spaces per nesting level,
    with at least one and at most two levels. All other lines are passed
    through unchanged.

    Args:
        content: Markdown text. ``None`` or an empty string yields ``""``.

    Returns:
        The text with over-indented list items re-indented; lines are
        re-joined with newline characters.
    """
    if not content:
        return ""

    list_markers = ("-", "*", "+")
    normalized_lines = []
    for line in content.split("\n"):
        stripped = line.lstrip()
        # Number of leading whitespace characters (a tab counts as one).
        leading_spaces = len(line) - len(stripped)
        if stripped.startswith(list_markers) and leading_spaces > 4:
            # Every 6 leading characters count as one nesting level, capped
            # at 2. Clamp to at least 1: a line indented this far was nested,
            # and 5 // 6 == 0 would otherwise flatten it to top level.
            nest_level = max(1, min(leading_spaces // 6, 2))
            line = "  " * nest_level + stripped
        normalized_lines.append(line)
    return "\n".join(normalized_lines)
 def highlight_differences_words(text1: str, text2: str):
     """
     Return two HTML strings: highlighted version of text1 and text2.
     changes: list[RegulatoryChange],
 ) -> list[RegulatoryChange]:
     """Map categorical impact assessment actions based on changetype"""
+    import copy
     action_map = {
         "Textual and Editorial Changes": {
             "actions": [
             expected_labels = [action["label"] for action in mapped_actions]
             # Only update if the labels don't match
+            # Create deep copies to prevent shared references across changes
             if current_labels != expected_labels:
+                change.actions = copy.deepcopy(mapped_actions)
             # If labels match but user has different completion status, preserve their progress
     return changes