# regulens/scripts/llm_nlp_preprocessing.py
# (Hosting-page metadata preserved as comments: uploaded by amougou-fortiss,
#  "Upload 9 files", commit ce77033, verified.)
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from scripts.regulatory_change_foundation import CONTEXT_CATEGORIES
from scripts.utility_functions import call_nlp_service, render_prompt
# Load environment variables from .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)
def preprocess_text_with_nlp(text, max_chunk_size=512, overlap=50):
    """Chunk *text* and extract NLP features via the external NLP service.

    Returns:
        tuple: (chunks, preprocessed_data) as produced by the service.

    NOTE(review): max_chunk_size and overlap are accepted but never forwarded
    to the service payload — confirm whether the service applies its own
    chunking defaults or these parameters should be sent along.
    """
    payload = {"text": text}
    service_result = call_nlp_service(payload, "preprocess_text_with_nlp_llm")
    return service_result["chunks"], service_result["preprocessed_data"]
def create_prompt(chunk, preprocessed_data):
    """Render the LLM prompt for one chunk, embedding the NLP preprocessing output."""
    prompt = render_prompt(
        chunk,
        include_nlp=True,
        preprocessed_data=preprocessed_data,
    )
    return prompt
def search_for_regulatory_changes(chunks, preprocessed_data, subtitle):
    """Query the LLM chunk-by-chunk for regulatory changes.

    Args:
        chunks (list[str]): Pre-chunked document text.
        preprocessed_data: NLP features injected into each prompt.
        subtitle (str): Section subtitle, recorded as the change location.

    Returns:
        list[dict]: One parsed result per chunk where the model reported
        ``changes_detected``; each is annotated with ``location`` and
        ``source_text``. Chunks whose reply cannot be parsed are skipped
        (deliberate best-effort behavior).
    """
    results = []
    for chunk in chunks:
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a legal expert specializing in analyzing German regulatory documents with a focus on identifying regulatory changes. Only return JSON output.",
                },
                {"role": "user", "content": create_prompt(chunk, preprocessed_data)},
            ],
            temperature=0.7,
            max_tokens=1024,
        )
        content = response.choices[0].message.content
        if not content:
            # The API can return None content (e.g. refusals/content filter);
            # json.loads(None) would raise TypeError, so skip the chunk.
            continue
        # Models often wrap JSON in ```json fences despite the system
        # instruction; strip them so valid payloads are not silently
        # discarded by the JSONDecodeError handler below.
        content = content.strip()
        if content.startswith("```"):
            content = content.strip("`").strip()
            if content.lower().startswith("json"):
                content = content[4:]
        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            continue  # unparsable reply: drop this chunk, keep going
        if result.get("changes_detected", False):
            result["location"] = {"subtitle": subtitle}  # Use subtitle as location
            result["source_text"] = chunk
            results.append(result)
    return results
def detect_regulatory_changes(text_content, subtitle):
    """Run the full detection pipeline on one section's raw text.

    Args:
        text_content (str): The raw text content to analyze.
        subtitle (str): The subtitle associated with the content.

    Returns:
        list: Detected-change dicts produced by the LLM search
        (not a dict — each entry already carries its location/source).
    """
    # Chunk the text and gather NLP features, then classify with the LLM.
    chunks, nlp_data = preprocess_text_with_nlp(text_content)
    return search_for_regulatory_changes(chunks, nlp_data, subtitle)
def llm_regulatory_change_detector(hierarchical_structure):
    """Analyze a parsed document structure and aggregate detected changes.

    Args:
        hierarchical_structure (dict): Expected to contain a "sections" list;
            each section has "subtitle" and "content" (str or list of str).

    Returns:
        dict | None: {"analysis_summary": ..., "results": {subtitle: [...]}},
        or None when *hierarchical_structure* is falsy (preserves the
        original implicit-None behavior, made explicit here).
    """
    if not hierarchical_structure:
        return None

    analysis_summary = {
        "total_changes_detected": 0,
        "changes_by_type": {"additions": 0, "deletions": 0, "modifications": 0},
    }
    subtitles = {}

    # Iterate over sections and analyze content
    for section in tqdm(
        hierarchical_structure["sections"], desc="Analyzing Sections"
    ):
        subtitle = section["subtitle"]
        content = section["content"]
        if isinstance(content, list):
            content = "\n".join(content)

        # Detect changes for this subtitle
        changes = detect_regulatory_changes(content, subtitle)

        # setdefault (instead of unconditional assignment) so a duplicate
        # subtitle appends to, rather than clobbers, earlier results.
        grouped = subtitles.setdefault(subtitle, [])

        # Single pass over the changes: update the summary and group entries.
        for change in changes:
            classifications = change["classifications"]
            analysis_summary["total_changes_detected"] += len(classifications)
            for classification in classifications:
                # Count defensively: an unexpected change_type from the LLM
                # would otherwise raise KeyError and abort the whole run.
                type_key = f"{classification['change_type']}s"
                by_type = analysis_summary["changes_by_type"]
                by_type[type_key] = by_type.get(type_key, 0) + 1
                change_subtype = (
                    "context"
                    if classification["change"] in CONTEXT_CATEGORIES
                    else "scope"
                )
                grouped.append(
                    {
                        "change": classification["change"],
                        "change_type": classification["change_type"],
                        "change_subtype": change_subtype,
                        "relevant_text": classification["relevant_text"],
                        "explanation": classification["explanation"],
                        "nlp_evidence": classification["evidence"],
                    }
                )

    # Combine analysis summary and grouped changes
    return {"analysis_summary": analysis_summary, "results": subtitles}