File size: 4,637 Bytes
ce77033 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import call_nlp_service, render_prompt
# Load environment variables from .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)
def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Chunk and preprocess raw text via the external NLP service.

    NOTE(review): ``max_chunk_size`` and ``overlap`` are accepted but never
    forwarded to the service call — presumably the service applies its own
    defaults; confirm whether these parameters should be part of the payload.

    Args:
        text (str): Raw document text to preprocess.
        max_chunk_size (int): Intended chunk size (currently unused).
        overlap (int): Intended chunk overlap (currently unused).

    Returns:
        tuple: ``(chunks, preprocessed_data)`` exactly as produced by the
        "preprocess_text_with_nlp_llm" service endpoint.
    """
    payload = {"text": text}
    service_result = call_nlp_service(payload, "preprocess_text_with_nlp_llm")
    return service_result["chunks"], service_result["preprocessed_data"]
def create_prompt(chunk, preprocessed_data):
    """Render the user-facing prompt for one text chunk, embedding NLP context."""
    return render_prompt(
        chunk,
        include_nlp=True,
        preprocessed_data=preprocessed_data,
    )
def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
    """Query the LLM once per chunk and collect every detected change.

    Args:
        chunks: Iterable of text chunks to analyze individually.
        preprocessed_data: NLP preprocessing output shared across all chunks.
        subtitle: Section subtitle recorded as the location of each result.

    Returns:
        list[dict]: Parsed LLM responses where ``changes_detected`` was truthy,
        each annotated with its ``location`` and the originating ``source_text``.
        Responses that are not valid JSON are skipped silently.
    """
    system_message = (
        "You are a legal expert specializing in analyzing German regulatory "
        "documents with a focus on identifying regulatory changes. Only "
        "return JSON output."
    )
    detected = []
    for piece in chunks:
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": create_prompt(piece, preprocessed_data)},
            ],
            temperature=0.7,
            max_tokens=1024,
        )
        raw_content = completion.choices[0].message.content
        try:
            parsed = json.loads(raw_content)
        except json.JSONDecodeError:
            # Best-effort: a malformed model reply drops this chunk only,
            # never the whole batch.
            continue
        if parsed.get("changes_detected", False):
            parsed["location"] = {"subtitle": subtitle}  # Use subtitle as location
            parsed["source_text"] = piece
            detected.append(parsed)
    return detected
def detect_regulatory_changes(text_content, subtitle):
    """Detect regulatory changes in a single section's text.

    Pipeline: NLP preprocessing (chunking + enrichment) followed by
    per-chunk LLM classification.

    Args:
        text_content (str): The raw text content to analyze.
        subtitle (str): The subtitle associated with the content.

    Returns:
        list[dict]: Detected changes as produced by
        ``search_for_regulatory_changes``.
    """
    chunks, preprocessed = preprocess_text_with_nlp(text_content)
    return search_for_regulatory_changes(chunks, preprocessed, subtitle)
def llm_regulatory_change_detector(hierarchical_structure):
    """Detect and summarize regulatory changes across a hierarchical document.

    Iterates over every section, runs LLM-based change detection on its
    content, and aggregates both a global summary and a per-subtitle
    grouping of individual classifications.

    Args:
        hierarchical_structure (dict): Parsed document containing a
            ``"sections"`` list; each section has a ``"subtitle"`` and a
            ``"content"`` (a string or a list of strings).

    Returns:
        dict | None: ``{"analysis_summary": ..., "results": {subtitle: [...]}}``,
        or ``None`` when the input is empty/falsy (preserves the original
        contract of falling through without a return value).
    """
    if not hierarchical_structure:
        return None
    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"additions": 0, "deletions": 0, "modifications": 0},
    }
    subtitles = {}
    for section in tqdm(
        hierarchical_structure["sections"], desc="Analyzing Sections"
    ):
        subtitle = section["subtitle"]
        content = section["content"]
        if isinstance(content, list):
            content = "\n".join(content)
        # Detect changes for this subtitle
        changes = detect_regulatory_changes(content, subtitle)
        # setdefault (not assignment) so a repeated subtitle accumulates
        # results instead of silently overwriting earlier ones.
        grouped = subtitles.setdefault(subtitle, [])
        by_type = analysis_summary["changes_by_type"]
        # Single pass: update the summary and group the classification
        # records together (the original iterated `changes` twice).
        for change in changes:
            classifications = change["classifications"]
            analysis_summary["total_changes_detected"] += len(classifications)
            for classification in classifications:
                change_type = classification["change_type"]
                bucket = f"{change_type}s"
                # .get() tolerates an unexpected change type from the LLM
                # instead of raising KeyError and aborting the whole run.
                by_type[bucket] = by_type.get(bucket, 0) + 1
                change_subtype = (
                    "context"
                    if classification["change"] in CONTEXT_CATEGORIES
                    else "scope"
                )
                grouped.append(
                    {
                        "change": classification["change"],
                        "change_type": classification["change_type"],
                        "change_subtype": change_subtype,
                        "relevant_text": classification["relevant_text"],
                        "explanation": classification["explanation"],
                        "nlp_evidence": classification["evidence"],
                    }
                )
    # Combine analysis summary and grouped changes
    return {"analysis_summary": analysis_summary, "results": subtitles}
|