"""
Legal Document Processing Tool for Smolagents
This tool processes legal documents with specialized models for legal text,
optimizing for citation retention, multilingual support, and performance on
legal-specific retrieval tasks.
Author: Dr. Zhou Wang
"""
from typing import Dict, List, Any, Optional, Union
import os
import re
import time
import tempfile
import numpy as np
# Import Smolagents Tool class
from smolagents import Tool
# Import NLP components
try:
from sklearn.metrics.pairwise import cosine_similarity
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document
    from llama_index.core.node_parser import MarkdownNodeParser, LangchainNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.ingestion import IngestionPipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
except ImportError as exc:
    raise ImportError(
        "Required dependencies not found. Please install with: "
        "pip install llama-index llama-index-embeddings-huggingface "
        "langchain scikit-learn"
    ) from exc
# Model configurations based on research findings
LEGAL_MODELS = {
"legal-bert": {
"name": "nlp-jurisprudence/legal-bert-base-uncased",
"description": "Trained on ECtHR legal documents, specialized in human rights law",
"max_length": 512,
"requires_gpu": True,
},
"multi-qa-mpnet": {
"name": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
"description": "Optimized for legal Q&A retrieval with cross-lingual support",
"max_length": 512,
"requires_gpu": False,
},
"legal-xlm-roberta": {
"name": "joelito/legal-xlm-roberta-base",
"description": "Multilingual legal model with EU legislation and RFC/ISO pattern awareness",
"max_length": 512,
"requires_gpu": True,
},
"multilingual-e5": {
"name": "intfloat/multilingual-e5-base",
"description": "Dense retrieval optimized with citation context preservation",
"max_length": 512,
"requires_gpu": True,
},
"all-mpnet": {
"name": "sentence-transformers/all-mpnet-base-v2",
"description": "General purpose embedding model, good baseline for legal text",
"max_length": 512,
"requires_gpu": False,
},
}
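# Illustrative lookup (a sketch; assumes this module is importable as
# legal_document_tool):
#
#   from legal_document_tool import LEGAL_MODELS
#   for key, cfg in LEGAL_MODELS.items():
#       print(f"{key}: {cfg['name']} - {cfg['description']}")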
class LegalDocumentProcessor:
"""
Processor for legal documents with specialized models,
citation preservation, and benchmarking capabilities.
"""
def __init__(
self,
model_key: str = "legal-xlm-roberta",
use_gpu: bool = False,
chunk_size: int = 512,
chunk_overlap: int = 100,
):
"""
Initialize the legal document processor.
Args:
model_key: Key for the model to use from LEGAL_MODELS dictionary
use_gpu: Whether to use GPU for embeddings (if available)
chunk_size: Size of text chunks for processing
chunk_overlap: Overlap between chunks to preserve context
"""
# Validate and set up model
if model_key not in LEGAL_MODELS:
print(
f"Warning: Model '{model_key}' not found. Using legal-xlm-roberta as default."
)
model_key = "legal-xlm-roberta"
        model_config = LEGAL_MODELS[model_key]
        # "requires_gpu" is advisory (the model benefits from a GPU);
        # device selection follows the caller's use_gpu flag.
        device = "cuda" if use_gpu else "cpu"
# Initialize embedding model
self.embed_model = HuggingFaceEmbedding(
model_name=model_config["name"],
device=device,
tokenizer_kwargs={
"trust_remote_code": True,
"max_length": model_config["max_length"],
"truncation": True,
},
)
# Store model information for reference
self.model_info = model_config
self.model_key = model_key
        # Text splitter tuned for legal documents, with header-, sentence-,
        # and clause-aware separators
self.splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=[
"\n## ",
"\n### ",
"\n#### ", # Headers
"\n\n",
"\n", # Paragraphs
". ",
"! ",
"? ", # Sentences
";",
":", # Clause boundaries
" ", # Last resort
],
)
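        # The separators are ordered by priority: the splitter first tries to
        # break on markdown headers, then paragraphs, sentences, and clause
        # boundaries, falling back to single spaces only when a chunk still
        # exceeds chunk_size.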
# Pattern for removing footers from legal documents
# Separated into individual patterns for better maintainability
self.footer_patterns = [
r"^Page\s\d+(\s+of\s+\d+)?$", # Page numbers
r"^©.*\b(Company|Inc|Ltd)\b.*$", # Copyright lines
r"^All rights reserved.*?$", # Legal boilerplate
r"^-+$", # Separator lines
r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}(:\d{2})?$", # Timestamps
r"(?i)^(confidential|proprietary|internal use only)", # Security tags
]
# Join all patterns with the OR operator
combined_pattern = "|".join(f"({pattern})" for pattern in self.footer_patterns)
# Compile the combined pattern
self.footer_pattern = re.compile(
combined_pattern, flags=re.MULTILINE | re.IGNORECASE
)
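        # Lines the combined pattern is meant to strip (illustrative):
        #   "Page 3 of 10"
        #   "© 2024 Example Company Inc."
        #   "Confidential"
        # Note that sub("") removes the matched text but leaves the newline.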
def remove_footers(self, text: str) -> str:
"""
Remove common document footer patterns from text.
Args:
text: The input text to process
Returns:
Text with footer patterns removed
"""
return self.footer_pattern.sub("", text)
def clean_text(self, text: str) -> str:
"""
Preserve legal citations while cleaning artifacts.
Args:
text: The input text to clean
Returns:
Cleaned text with citations preserved
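        Example (illustrative; processor is a LegalDocumentProcessor instance,
        and output depends on the heuristics below):
            >>> processor.clean_text("See Art. 6, Smith v Jones, RFC1234")
            'See Article 6, Smith v. Jones, RFC 1234'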
"""
# First remove footers
text = self.remove_footers(text)
        # Preserve citation patterns
        # Pattern 1: Bracket standalone footnote numbers (e.g., 98 -> [98]).
        # This is a broad heuristic; digits already adjacent to brackets are
        # skipped so citations like "[2019]" are not double-bracketed.
        cleaned = re.sub(r"(?<=[^\d\[\]])(\d{2,3})(?=[^\d\[\]])", r"[\1]", text)
# Pattern 2: Case citations [2019] UKSC 20
# Already well-structured, so no changes needed
# Pattern 3: Standardize quotation marks
cleaned = cleaned.replace("''", '"').replace("``", '"')
# Pattern 4: Handle section references (§3.1, §123)
cleaned = re.sub(r"§(\d+(\.\d+)?)", r"Section \1", cleaned)
# Pattern 5: Handle legal abbreviations (e.g., Art. -> Article)
cleaned = re.sub(r"\bArt\.\s+(\d+)", r"Article \1", cleaned)
        # Pattern 6: Standardize case names (v, v., vs, vs. -> v.)
        cleaned = re.sub(r"\bvs?\.?\s+(?=[A-Z])", "v. ", cleaned)
# Pattern 7: RFC/ISO pattern standardization (RFC 1234, ISO 9001)
cleaned = re.sub(r"\b(RFC|ISO)\s*[:#]?\s*(\d+)", r"\1 \2", cleaned)
return cleaned
def create_pipeline(self) -> IngestionPipeline:
"""
Create a document processing pipeline.
Returns:
Configured IngestionPipeline object
"""
        # Plain callables are not valid IngestionPipeline transformations, so
        # clean_text is applied up front in process_documents(), and the
        # LangChain splitter is wrapped in a LangchainNodeParser.
        return IngestionPipeline(
            transformations=[
                MarkdownNodeParser(),
                LangchainNodeParser(self.splitter),
                self.embed_model,
            ]
        )
def validate_citation_retention(
self, documents: List[Document]
) -> Dict[str, float]:
"""
Measure semantic similarity of citations before/after text cleaning.
Args:
documents: List of Document objects to validate
Returns:
Dictionary with validation metrics
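        Example (illustrative):
            >>> metrics = processor.validate_citation_retention(docs)
            >>> metrics["citation_retention"]  # similarity as a percentage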
"""
if not documents:
return {"citation_retention": 0.0, "processing_time": 0.0}
start_time = time.time()
# Extract original texts
original_texts = [doc.text for doc in documents[:5]] # Sample for performance
# Apply cleaning
processed_texts = [self.clean_text(text) for text in original_texts]
        # Calculate embeddings through the public embedding API rather than
        # reaching into the private _model attribute
        try:
            orig_embeds = np.array(
                self.embed_model.get_text_embedding_batch(original_texts)
            )
            proc_embeds = np.array(
                self.embed_model.get_text_embedding_batch(processed_texts)
            )
            # Calculate similarity
            similarities = cosine_similarity(orig_embeds, proc_embeds).diagonal()
            avg_similarity = float(np.mean(similarities))
processing_time = time.time() - start_time
return {
"citation_retention": avg_similarity * 100, # As percentage
"processing_time": processing_time,
"sample_size": len(original_texts),
}
except Exception as e:
return {"citation_retention": 0.0, "processing_time": 0.0, "error": str(e)}
def process_documents(self, documents: List[Document]) -> Dict[str, Any]:
"""
Process a list of legal documents.
Args:
documents: List of Document objects to process
Returns:
Dictionary with processing results and stats
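        Example (illustrative; querying requires an LLM to be configured):
            >>> result = processor.process_documents(docs)
            >>> if result["status"] == "success":
            ...     print(result["query_engine"].query("What did the court hold?"))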
"""
if not documents:
return {"status": "error", "message": "No documents provided"}
        try:
            # clean_text is applied here because raw callables are not valid
            # pipeline transformations (see create_pipeline)
            documents = [
                Document(text=self.clean_text(doc.text), metadata=doc.metadata)
                for doc in documents
            ]
            pipeline = self.create_pipeline()
            nodes = pipeline.run(documents=documents)
            # Create vector index
            index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine()
# Return success with stats
return {
"status": "success",
"nodes_count": len(nodes),
"documents_count": len(documents),
"model_used": self.model_key,
"query_engine": query_engine, # This will be used for querying
}
except Exception as e:
return {"status": "error", "message": str(e)}
class LegalDocumentTool(Tool):
"""
Tool for processing legal documents with specialized models and querying capabilities.
"""
name = "legal_document_processor"
description = (
"Processes legal documents with specialized models for legal text, optimizing for "
"citation retention, multilingual support, and performance on legal-specific retrieval tasks. "
"Can process text or file inputs and provide enhanced query capabilities."
)
    inputs = {
        "text": {
            "type": "string",
            "description": "Legal document text to process. Provide either text or file_paths.",
            "nullable": True,
        },
        "file_paths": {
            "type": "string",
            "description": "Comma-separated list of file paths or a directory path containing legal documents. Provide either text or file_paths.",
            "nullable": True,
        },
        "model_key": {
            "type": "string",
            "description": "Legal embedding model to use. Options: legal-bert, multi-qa-mpnet, legal-xlm-roberta, multilingual-e5, all-mpnet. Defaults to legal-xlm-roberta.",
            "nullable": True,
        },
        "query": {
            "type": "string",
            "description": "Optional query to run against the processed documents.",
            "nullable": True,
        },
        "validate_citations": {
            "type": "boolean",
            "description": "Whether to validate citation retention in the processed documents. Defaults to False.",
            "nullable": True,
        },
        "use_gpu": {
            "type": "boolean",
            "description": "Whether to use GPU for embedding calculations if available. Defaults to False.",
            "nullable": True,
        },
    }
output_type = "string"
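    # Example wiring into an agent (a sketch; assumes smolagents' CodeAgent
    # and a configured model object are available):
    #
    #   from smolagents import CodeAgent
    #   agent = CodeAgent(tools=[LegalDocumentTool()], model=model)
    #   agent.run("Summarize the key holdings in ./contracts/")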
def _load_documents(self, input_path: str) -> List[Document]:
"""
Load documents from a file path or directory.
Args:
input_path: Path to a file or directory
Returns:
List of Document objects
"""
        if os.path.isfile(input_path):
            # Load only this file; filtering by extension via required_exts
            # would also pull in sibling files from the same directory
            return SimpleDirectoryReader(
                input_files=[input_path],
                filename_as_id=True,
            ).load_data()
elif os.path.isdir(input_path):
return SimpleDirectoryReader(
input_dir=input_path,
filename_as_id=True,
).load_data()
else:
raise ValueError(f"Path not found: {input_path}")
def _create_document_from_text(self, text: str) -> List[Document]:
"""
Create a Document object from text.
Args:
text: Text content
Returns:
List containing a single Document object
"""
        # Wrap the text directly in a Document; no temp-file round-trip needed
        return [Document(text=text)]
def forward(
self,
text: Optional[str] = None,
file_paths: Optional[str] = None,
model_key: str = "legal-xlm-roberta",
query: Optional[str] = None,
validate_citations: bool = False,
use_gpu: bool = False,
) -> str:
"""
Process legal documents and optionally run a query.
Args:
text: Legal document text to process
file_paths: Comma-separated list of file paths or a directory path
model_key: Legal embedding model to use
query: Optional query to run against the processed documents
validate_citations: Whether to validate citation retention
use_gpu: Whether to use GPU for embeddings
Returns:
Processing results or query response as a string
"""
# Validate inputs
if not text and not file_paths:
return "Error: Either text or file_paths must be provided."
try:
# Initialize processor
processor = LegalDocumentProcessor(
model_key=model_key,
use_gpu=use_gpu,
)
# Load documents
documents = []
if text:
documents.extend(self._create_document_from_text(text))
if file_paths:
# Handle comma-separated paths
paths = [path.strip() for path in file_paths.split(",")]
for path in paths:
try:
docs = self._load_documents(path)
documents.extend(docs)
except Exception as e:
return f"Error loading documents from {path}: {str(e)}"
# Check if we have documents to process
if not documents:
return "Error: No valid documents found."
# Validate citations if requested
validation_results = {}
if validate_citations:
validation_results = processor.validate_citation_retention(documents)
# Process documents
result = processor.process_documents(documents)
if result["status"] != "success":
return f"Processing error: {result['message']}"
            # Build the shared stats and validation sections once
            stats = (
                f"Documents processed: {result['documents_count']}\n"
                f"Text chunks: {result['nodes_count']}\n"
                f"Model used: {result['model_used']}\n"
            )
            validation_section = ""
            if validation_results:
                validation_section = (
                    "\n=== Citation Retention Validation ===\n"
                    f"Citation retention: {validation_results.get('citation_retention', 0):.2f}%\n"
                    f"Processing time: {validation_results.get('processing_time', 0):.2f} seconds\n"
                )
            # Run query if provided
            if query and "query_engine" in result:
                response = result["query_engine"].query(query)
                return f"Query: {query}\n\nResponse: {response}\n\n" + stats + validation_section
            # If no query, return processing stats
            output = "Document processing complete.\n\n" + stats + validation_section
            output += "\nThe documents are now ready for querying. Use the 'query' parameter to run a query."
            return output
except Exception as e:
return f"Error: {str(e)}"