File size: 6,259 Bytes
ce77033
 
d6b760c
ce77033
d6b760c
ce77033
 
 
 
 
 
 
 
d6b760c
ce77033
 
09a324c
ce77033
09a324c
ce77033
 
 
 
 
 
d6b760c
ce77033
 
09a324c
ce77033
 
09a324c
ce77033
 
d6b760c
ce77033
d6b760c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce77033
 
 
 
d6b760c
 
 
 
 
 
 
 
ce77033
d6b760c
 
 
ce77033
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6b760c
 
bdc7d9a
d6b760c
ce77033
d6b760c
ce77033
 
 
 
 
 
d6b760c
 
ce77033
 
 
d6b760c
ce77033
d6b760c
 
 
 
 
 
 
 
 
 
 
 
 
 
ce77033
 
bdc7d9a
ce77033
 
 
 
 
 
 
 
 
 
bdc7d9a
d6b760c
bdc7d9a
d6b760c
 
ce77033
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import json
import os
import asyncio
from dotenv import load_dotenv
from openai import AsyncOpenAI
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import call_nlp_service, render_prompt


# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
openai_client = AsyncOpenAI(api_key=api_key, timeout=60)


async def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
    return await call_nlp_service({"text": text}, "preprocess_text_with_nlp_pymupdf")


def create_prompt_with_nlp(chunk, preprocessed_data):
    return render_prompt(chunk, include_nlp=True, preprocessed_data=preprocessed_data)


async def classify_changes_with_nlp(text_content, location_info):
    """Classify changes with NLP preprocessing."""
    # Apply NLP preprocessing
    preprocessed_data = await preprocess_text_with_nlp(text_content)

    # Split into chunks (using the same method as your first experiment)
    result = await call_nlp_service({"text": text_content}, "recursive_character_text_splitter")
    chunks = result["chunks"]

    async def process_chunk(chunk):
        try:
            response = await openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a legal expert analyzing German regulatory changes. Return only JSON.",
                    },
                    {
                        "role": "user",
                        "content": create_prompt_with_nlp(chunk, preprocessed_data),
                    },
                ],
                temperature=0.7,
                max_tokens=1024,
            )
            result = json.loads(response.choices[0].message.content)
            if result.get("changes_detected", False):
                result["location"] = location_info
                result["source_text"] = chunk
                return result
        except (json.JSONDecodeError, Exception):
            return None
    
    tasks = [process_chunk(chunk) for chunk in chunks]
    results = await asyncio.gather(*tasks)
    filtered_results = [r for r in results if r is not None]
    return filtered_results if filtered_results else None

# Async wrapper for backward compatibility
async def classify_changes_with_nlp_async(text_content, location_info):
    return await classify_changes_with_nlp(text_content, location_info)


def extract_hierarchical_text(block):
    """Extract text from a block including its parent and grandparent contexts."""
    text_parts = []

    # Check if block has a grandparent
    if (
        "parent" in block
        and block["parent"] is not None
        and "parent" in block["parent"]
        and block["parent"]["parent"] is not None
    ):
        text_parts.append(block["parent"]["parent"]["text"])

    # Check if block has a parent
    if "parent" in block and block["parent"] is not None:
        text_parts.append(block["parent"]["text"])

    # Add the current block's text
    text_parts.append(block["text"])

    # Join all text parts with newlines between them
    return "\n\n".join(text_parts)


async def traverse_blocks_with_nlp(blocks, parent=None):
    """Traverse hierarchy with NLP-enhanced analysis using asyncio.gather()."""
    
    async def process_block(block, parent):
        block["parent"] = parent
        
        if "children" in block and not block["children"]:  # Leaf node
            text_content = extract_hierarchical_text(block)
            location_info = {
                "page_number": block["page_number"],
                "block_text": block["text"],
            }
            
            changes = await classify_changes_with_nlp(text_content, location_info)
            if changes:
                for change in changes:
                    change["full_text"] = text_content
                return changes
        else:
            # Process children recursively
            return await traverse_blocks_with_nlp(block["children"], block)
        return []
    
    # Process all blocks concurrently
    tasks = [process_block(block, parent) for block in blocks]
    results = await asyncio.gather(*tasks)
    
    # Flatten results
    flattened = []
    for result in results:
        if isinstance(result, list):
            flattened.extend(result)
    return flattened


def pymupdf_regulatory_change_detector_with_nlp_insights(hierarchical_structure, progress_callback=None, status_callback=None):
    """Main function with NLP integration."""
    if not hierarchical_structure:
        return {"error": "No structure provided"}, []

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"addition": 0, "deletion": 0, "modification": 0},
    }
    changes_by_page = {}

    if status_callback:
        status_callback("Analyzing all document blocks concurrently with NLP...")
    
    # Run async processing
    results = asyncio.run(traverse_blocks_with_nlp(hierarchical_structure["blocks"]))

    for change in results:
        analysis_summary["total_changes_detected"] += len(change["classifications"])
        for classification in change["classifications"]:
            analysis_summary["changes_by_type"][classification["change_type"]] += 1

            change_subtype = (
                "context" if classification["change"] in CONTEXT_CATEGORIES else "scope"
            )
            page_num = change["location"]["page_number"]
            changes_by_page.setdefault(page_num, []).append(
                {
                    "change": classification["change"],
                    "change_type": classification["change_type"],
                    "change_subtype": change_subtype,
                    "relevant_text": classification["relevant_text"],
                    "explanation": classification["explanation"],
                    "nlp_evidence": classification["evidence"],
                }
            )

    return {
        "analysis_summary": analysis_summary,
        "changes_by_page": changes_by_page,
    }, results