Spaces:
Runtime error
Leonardo
committed on
Update scripts/document_tool.py
Browse files
scripts/document_tool.py  +278 -131
CHANGED
@@ -1,33 +1,35 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2025 The Footscray Coding Collective. All rights reserved.
 """
-
+General Document Processing Tool for Smolagents
 
-This tool processes
-optimizing for
-
+This tool processes various types of documents with domain-specific models,
+optimizing for intelligent document parsing, entity extraction, and
+customized retrieval tasks.
 
-Author:
+Author: Zhou Wang
 """
 
-from typing import Dict, List, Any, Optional, Union
 import os
 import re
-import time
 import tempfile
-import
+import time
+from typing import Any, Dict, List, Optional, Union
+
 import numpy as np
-from tqdm import tqdm
 
 # Import Smolagents Tool class
 from smolagents import Tool
 
 # Import NLP components
 try:
-    from
-    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from llama_index.core import Document, SimpleDirectoryReader, VectorStoreIndex
+    from llama_index.core.ingestion import IngestionPipeline
     from llama_index.core.node_parser import MarkdownNodeParser
     from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-    from
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from sklearn.metrics.pairwise import cosine_similarity
 except ImportError:
     raise ImportError(
         "Required dependencies not found. Please install with: "
@@ -35,89 +37,105 @@ except ImportError:
     )
 
 
-# Model configurations based on
-
-    "legal
-        "name": "
-        "description": "
+# Model configurations based on domain specialization
+DOMAIN_MODELS = {
+    "legal": {
+        "name": "joelito/legal-xlm-roberta-base",
+        "description": "Specialized for legal documents with citation preservation",
         "max_length": 512,
         "requires_gpu": True,
     },
-    "
-        "name": "
-        "description": "
+    "financial": {
+        "name": "thenlper/finetuned-finbert-slot-filling",
+        "description": "Financial document analysis with entity extraction",
         "max_length": 512,
         "requires_gpu": False,
     },
-    "
-        "name": "
-        "description": "
+    "medical": {
+        "name": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
+        "description": "Medical text processing optimized for clinical terms",
         "max_length": 512,
         "requires_gpu": True,
    },
-    "
-        "name": "
-        "description": "
+    "technical": {
+        "name": "allenai/scibert_scivocab_uncased",
+        "description": "Scientific and technical document processing",
         "max_length": 512,
         "requires_gpu": True,
     },
-    "
+    "general": {
         "name": "sentence-transformers/all-mpnet-base-v2",
-        "description": "General purpose embedding model
+        "description": "General purpose embedding model for all document types",
         "max_length": 512,
         "requires_gpu": False,
     },
 }
 
 
-class
+class DocumentProcessor:
     """
-    Processor for
-
+    Processor for documents with domain-specific models,
+    entity preservation, and customizable processing capabilities.
     """
 
     def __init__(
         self,
-
+        domain: str = "general",
+        model_key: Optional[str] = None,
         use_gpu: bool = False,
         chunk_size: int = 512,
         chunk_overlap: int = 100,
+        custom_patterns: Optional[List[str]] = None,
     ):
         """
-        Initialize the
+        Initialize the document processor.
 
         Args:
-
+            domain: Domain specialization ('legal', 'financial', 'medical', 'technical', 'general')
+            model_key: Specific model to use (overrides domain selection)
             use_gpu: Whether to use GPU for embeddings (if available)
             chunk_size: Size of text chunks for processing
             chunk_overlap: Overlap between chunks to preserve context
+            custom_patterns: Additional regex patterns for text cleaning
         """
-        #
-
-
-
-
-
-
+        # Store domain
+        self.domain = domain
+
+        # If model_key provided, use it directly
+        if model_key:
+            model_name = model_key
+            device = "cuda" if use_gpu else "cpu"
+        else:
+            # Otherwise select model based on domain
+            if domain not in DOMAIN_MODELS:
+                print(
+                    f"Warning: Domain '{domain}' not found. Using 'general' as default."
+                )
+                domain = "general"
+
+            model_config = DOMAIN_MODELS[domain]
+            model_name = model_config["name"]
+            device = "cuda" if use_gpu and model_config["requires_gpu"] else "cpu"
 
         # Initialize embedding model
-
-
-
-
-
-
-
-
-
+        try:
+            self.embed_model = HuggingFaceEmbedding(
+                model_name=model_name,
+                device=device,
+                tokenizer_kwargs={
+                    "trust_remote_code": True,
+                    "max_length": 512,
+                    "truncation": True,
+                },
+            )
+
+            # Store model information for reference
+            self.model_name = model_name
+            self.device = device
+        except Exception as e:
+            raise RuntimeError(f"Failed to initialize embedding model: {str(e)}")
 
-        #
+        # Domain-optimized text splitter
         self.splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
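
The model-selection logic above can be exercised on its own. A minimal sketch, assuming the listed dependencies are installed and the models are downloadable (expected values shown as comments):

    # Domain lookup with fallback, mirroring __init__ above.
    proc = DocumentProcessor(domain="medical", use_gpu=False)
    print(proc.model_name)  # microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
    print(proc.device)      # "cpu": medical requires_gpu is True, but use_gpu is False

    # An unknown domain prints a warning and falls back to "general".
    proc = DocumentProcessor(domain="culinary")
    print(proc.model_name)  # sentence-transformers/all-mpnet-base-v2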
@@ -136,9 +154,8 @@ class LegalDocumentProcessor:
             ],
         )
 
-        #
-
-        self.footer_patterns = [
+        # Base cleaning patterns
+        self.cleaning_patterns = [
             r"^Page\s\d+(\s+of\s+\d+)?$",  # Page numbers
             r"^©.*\b(Company|Inc|Ltd)\b.*$",  # Copyright lines
             r"^All rights reserved.*?$",  # Legal boilerplate
@@ -147,61 +164,183 @@ class LegalDocumentProcessor:
             r"(?i)^(confidential|proprietary|internal use only)",  # Security tags
         ]
 
+        # Add custom patterns if provided
+        if custom_patterns:
+            self.cleaning_patterns.extend(custom_patterns)
+
         # Join all patterns with the OR operator
-        combined_pattern = "|".join(
+        combined_pattern = "|".join(
+            f"({pattern})" for pattern in self.cleaning_patterns
+        )
 
         # Compile the combined pattern
-        self.
+        self.cleaning_pattern = re.compile(
             combined_pattern, flags=re.MULTILINE | re.IGNORECASE
         )
 
+        # Initialize domain-specific processors
+        self._init_domain_processors()
+
+    def _init_domain_processors(self):
+        """Initialize domain-specific processors based on selected domain."""
+        # Domain-specific entity patterns
+        self.entity_patterns = {}
+
+        # Set up domain-specific patterns and processors
+        if self.domain == "legal":
+            self.entity_patterns = {
+                "case_citation": r"\[\d{4}\]\s+[A-Z]+\s+\d+",  # [2019] UKSC 20
+                "statute": r"\b(?:Art\.|Section)\s+\d+(\.\d+)?",  # Art. 5, Section 3.1
+                "legal_ref": r"\b[A-Za-z]+\s+v\.?\s+[A-Za-z]+",  # Smith v. Jones
+            }
+            self.process_entities = self._process_legal_entities
+
+        elif self.domain == "financial":
+            self.entity_patterns = {
+                "monetary": r"\$\s*\d+(?:\.\d+)?(?:\s*(?:million|billion|trillion))?",  # $5.2 million
+                "percentage": r"\d+(?:\.\d+)?\s*%",  # 10.5%
+                "date_range": r"(?:Q[1-4]|FY)\s+\d{4}",  # Q2 2023, FY 2022
+            }
+            self.process_entities = self._process_financial_entities
+
+        elif self.domain == "medical":
+            self.entity_patterns = {
+                "dosage": r"\d+(?:\.\d+)?\s*(?:mg|mcg|g|ml|oz)",  # 10mg, 5.5ml
+                "medical_code": r"[A-Z]\d{2}(?:\.\d+)?",  # ICD codes like E11.9
+                "vital_sign": r"\d+(?:\.\d+)?\s*(?:bpm|mmHg|°[CF])",  # 120 bpm, 98.6°F
+            }
+            self.process_entities = self._process_medical_entities
+
+        elif self.domain == "technical":
+            self.entity_patterns = {
+                "version": r"v\d+(?:\.\d+){1,3}",  # v1.2.3
+                "code_ref": r"(?:\w+\.)+\w+\(\)",  # function calls like math.sqrt()
+                "tech_standard": r"(?:RFC|ISO|IEEE)\s*\d+",  # RFC 1918, ISO 9001
+            }
+            self.process_entities = self._process_technical_entities
+
+        else:  # General domain or fallback
+            self.entity_patterns = {
+                "url": r"https?://\S+",  # URLs
+                "email": r"\S+@\S+\.\S+",  # Email addresses
+                "date": r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}",  # Dates
+            }
+            self.process_entities = self._process_general_entities
+
+    def _process_legal_entities(self, text: str) -> str:
+        """Process legal document entities."""
+        # Preserve citation patterns
+        # Pattern 1: Case citations [2019] UKSC 20
+        # Already well-structured, so no changes needed
+
+        # Pattern 2: Standardize section references (§3.1, §123)
+        processed = re.sub(r"§(\d+(\.\d+)?)", r"Section \1", text)
+
+        # Pattern 3: Handle legal abbreviations (e.g., Art. -> Article)
+        processed = re.sub(r"\bArt\.\s+(\d+)", r"Article \1", processed)
+
+        # Pattern 4: Standardize case names with v. and vs.
+        processed = re.sub(r"\bv\s+", r"v. ", processed)
+        processed = re.sub(r"\bvs\s+", r"v. ", processed)
+
+        return processed
+
+    def _process_financial_entities(self, text: str) -> str:
+        """Process financial document entities."""
+        # Pattern 1: Standardize monetary values (capture the full amount,
+        # including thousands separators and decimals)
+        processed = re.sub(
+            r"\$\s*(\d+(?:,\d{3})*(?:\.\d+)?)",
+            lambda m: f"${float(m.group(1).replace(',', ''))}",
+            text,
+        )
+
+        # Pattern 2: Standardize percentage representations
+        processed = re.sub(r"(\d+(?:\.\d+)?)\s*(?:percent|pct)", r"\1%", processed)
+
+        # Pattern 3: Standardize fiscal periods
+        processed = re.sub(r"(?:fiscal year|FY)\s+(\d{4})", r"FY \1", processed)
+
+        # Pattern 4: Standardize quarterly references
+        processed = re.sub(r"(?:quarter|Q)(\d)\s+(\d{4})", r"Q\1 \2", processed)
+
+        return processed
+
+    def _process_medical_entities(self, text: str) -> str:
+        """Process medical document entities."""
+        # Pattern 1: Standardize dosage format (map long unit names to short forms)
+        unit_map = {"milligram": "mg", "mcg": "mcg", "gram": "g", "milliliter": "ml"}
+        processed = re.sub(
+            r"(\d+(?:\.\d+)?)\s*(milligrams?|mcgs?|grams?|milliliters?)",
+            lambda m: f"{m.group(1)} {unit_map[m.group(2).rstrip('s')]}",
+            text,
+        )
+
+        # Pattern 2: Standardize temperature format
+        processed = re.sub(r"(\d+(?:\.\d+)?)\s*degrees?\s*([CF])", r"\1°\2", processed)
+
+        # Pattern 3: Standardize vital signs
+        processed = re.sub(
+            r"(\d+(?:\.\d+)?)\s*(?:beats per minute|BPM)", r"\1 bpm", processed
+        )
+
+        return processed
+
+    def _process_technical_entities(self, text: str) -> str:
+        """Process technical document entities."""
+        # Pattern 1: Standardize version numbers
+        processed = re.sub(r"version\s+(\d+(?:\.\d+){1,3})", r"v\1", text)
+
+        # Pattern 2: RFC/ISO pattern standardization
+        processed = re.sub(r"\b(RFC|ISO|IEEE)\s*[:#]?\s*(\d+)", r"\1 \2", processed)
+
+        # Pattern 3: Standardize code references
+        # This is a simplified example
+        processed = re.sub(r"function\s+(\w+)\s*\(", r"\1(", processed)
+
+        return processed
+
+    def _process_general_entities(self, text: str) -> str:
+        """Process general document entities."""
+        # General cleaning and standardization
+        processed = text
+
+        # URLs preserved as-is
+
+        # Simple date standardization
+        processed = re.sub(
+            r"(\d{1,2})/(\d{1,2})/(\d{2})(?!\d)",
+            r"\1/\2/20\3",  # Assume 2-digit years are 2000s
+            processed,
+        )
+
+        return processed
+
-
+    def remove_boilerplate(self, text: str) -> str:
         """
-        Remove common document
+        Remove common document boilerplate patterns from text.
 
         Args:
             text: The input text to process
 
         Returns:
-            Text with
+            Text with boilerplate patterns removed
         """
-        return self.
+        return self.cleaning_pattern.sub("", text)
 
     def clean_text(self, text: str) -> str:
         """
-
+        Clean text while preserving domain-specific entities.
 
         Args:
             text: The input text to clean
 
         Returns:
-            Cleaned text with
+            Cleaned text with domain entities preserved
         """
-        # First remove
-
-
-        # Preserve citation patterns
-        # Pattern 1: Footnote numbers (e.g., 98, 99, 100)
-        cleaned = re.sub(r"(?<=\D)(\d{2,3})(?=\D)", r"[\1]", text)
-
-        #
-
-
-        # Pattern 3: Standardize quotation marks
-        cleaned = cleaned.replace("''", '"').replace("``", '"')
-
-        # Pattern 4: Handle section references (§3.1, §123)
-        cleaned = re.sub(r"§(\d+(\.\d+)?)", r"Section \1", cleaned)
-
-        # Pattern 5: Handle legal abbreviations (e.g., Art. -> Article)
-        cleaned = re.sub(r"\bArt\.\s+(\d+)", r"Article \1", cleaned)
-
-        # Pattern 6: Standardize case names with v. and vs.
-        cleaned = re.sub(r"\bv\s+", r"v. ", cleaned)
-        cleaned = re.sub(r"\bvs\s+", r"v. ", cleaned)
-
-        # Pattern 7: RFC/ISO pattern standardization (RFC 1234, ISO 9001)
-        cleaned = re.sub(r"\b(RFC|ISO)\s*[:#]?\s*(\d+)", r"\1 \2", cleaned)
+        # First remove boilerplate
+        cleaned = self.remove_boilerplate(text)
+
+        # Then process domain-specific entities
+        cleaned = self.process_entities(cleaned)
 
         return cleaned
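
For reference, the cleaning pipeline above reduces to two steps: a multiline boilerplate substitution, then domain-specific normalization. A self-contained sketch using re only, with an invented sample (the legal substitutions repeat the patterns visible in the hunk):

    import re

    patterns = [r"^Page\s\d+(\s+of\s+\d+)?$", r"^All rights reserved.*?$"]
    cleaner = re.compile("|".join(f"({p})" for p in patterns), flags=re.MULTILINE | re.IGNORECASE)

    text = "Page 3 of 10\nSmith v Jones, see §3.1 and Art. 5.\nAll rights reserved."
    cleaned = cleaner.sub("", text)                               # strip boilerplate lines
    cleaned = re.sub(r"§(\d+(\.\d+)?)", r"Section \1", cleaned)   # §3.1 -> Section 3.1
    cleaned = re.sub(r"\bArt\.\s+(\d+)", r"Article \1", cleaned)  # Art. 5 -> Article 5
    cleaned = re.sub(r"\bv\s+", r"v. ", cleaned)                  # Smith v Jones -> Smith v. Jones
    print(cleaned)  # "\nSmith v. Jones, see Section 3.1 and Article 5.\n"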
@@ -221,11 +360,9 @@ class LegalDocumentProcessor:
             ]
         )
 
-    def
-        self, documents: List[Document]
-    ) -> Dict[str, float]:
+    def validate_entity_retention(self, documents: List[Document]) -> Dict[str, float]:
         """
-        Measure semantic similarity of
+        Measure semantic similarity of entities before/after text cleaning.
 
         Args:
             documents: List of Document objects to validate
@@ -234,7 +371,7 @@ class LegalDocumentProcessor:
             Dictionary with validation metrics
         """
         if not documents:
-            return {"
+            return {"entity_retention": 0.0, "processing_time": 0.0}
 
         start_time = time.time()
@@ -257,16 +394,16 @@ class LegalDocumentProcessor:
             processing_time = time.time() - start_time
 
             return {
-                "
+                "entity_retention": avg_similarity * 100,  # As percentage
                 "processing_time": processing_time,
                 "sample_size": len(original_texts),
             }
         except Exception as e:
-            return {"
+            return {"entity_retention": 0.0, "processing_time": 0.0, "error": str(e)}
 
     def process_documents(self, documents: List[Document]) -> Dict[str, Any]:
         """
-        Process a list of
+        Process a list of documents.
 
         Args:
             documents: List of Document objects to process
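
The body of validate_entity_retention is only partially visible in this diff; the values it returns are consistent with averaging pairwise cosine similarity between embeddings of original and cleaned text. A sketch under that assumption, with random vectors standing in for real embeddings:

    # Assumed mechanics only: embed each original/cleaned pair, then average
    # the diagonal of the cosine-similarity matrix.
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    def retention_score(original_vecs: np.ndarray, cleaned_vecs: np.ndarray) -> float:
        sims = cosine_similarity(original_vecs, cleaned_vecs).diagonal()
        return float(sims.mean()) * 100  # as a percentage, like "entity_retention"

    orig = np.random.rand(4, 768)                  # stand-in embeddings
    clean = orig + np.random.rand(4, 768) * 0.01   # lightly perturbed copies
    print(f"{retention_score(orig, clean):.2f}%")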
@@ -291,48 +428,54 @@
                 "status": "success",
                 "nodes_count": len(nodes),
                 "documents_count": len(documents),
-                "
+                "domain": self.domain,
+                "model_name": self.model_name,
                 "query_engine": query_engine,  # This will be used for querying
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
 
 
-class
+class DocumentProcessorTool(Tool):
     """
-
+    General-purpose document processing tool with domain specialization.
     """
 
-    name = "
+    name = "document_processor"
     description = (
-        "Processes
-        "
-        "
+        "Processes documents with domain-specific models optimized for "
+        "entity preservation and retrieval performance. Supports legal, "
+        "financial, medical, technical and general document types."
     )
     inputs = {
         "text": {
             "type": "string",
-            "description": "
+            "description": "Document text to process. Provide either text or file_paths.",
             "optional": True,
         },
         "file_paths": {
             "type": "string",
-            "description": "Comma-separated list of file paths or a directory path containing
+            "description": "Comma-separated list of file paths or a directory path containing documents. Provide either text or file_paths.",
             "optional": True,
         },
-        "
+        "domain": {
+            "type": "string",
+            "description": "Document domain for specialized processing: legal, financial, medical, technical, or general.",
+            "default": "general",
+        },
+        "model_name": {
             "type": "string",
-            "description": "
-            "
+            "description": "Specific embedding model name to use (optional, overrides domain selection).",
+            "optional": True,
         },
         "query": {
             "type": "string",
             "description": "Optional query to run against the processed documents.",
             "optional": True,
         },
-        "
+        "validate_entities": {
             "type": "boolean",
-            "description": "Whether to validate
+            "description": "Whether to validate entity retention in the processed documents.",
             "default": False,
         },
         "use_gpu": {
@@ -401,25 +544,26 @@ class LegalDocumentTool(Tool):
         # Clean up the temporary file
         os.remove(temp_path)
 
-    @spaces.GPU
     def forward(
         self,
         text: Optional[str] = None,
         file_paths: Optional[str] = None,
-
+        domain: str = "general",
+        model_name: Optional[str] = None,
         query: Optional[str] = None,
-
+        validate_entities: bool = False,
         use_gpu: bool = False,
     ) -> str:
         """
-        Process
+        Process documents and optionally run a query.
 
         Args:
-            text:
+            text: Document text to process
             file_paths: Comma-separated list of file paths or a directory path
-
+            domain: Document domain specialization
+            model_name: Specific embedding model to use
             query: Optional query to run against the processed documents
-
+            validate_entities: Whether to validate entity retention
             use_gpu: Whether to use GPU for embeddings
 
         Returns:
@@ -431,8 +575,9 @@ class LegalDocumentTool(Tool):
 
         try:
             # Initialize processor
-            processor =
-
+            processor = DocumentProcessor(
+                domain=domain,
+                model_key=model_name,
                 use_gpu=use_gpu,
             )
@@ -457,10 +602,10 @@ class LegalDocumentTool(Tool):
             if not documents:
                 return "Error: No valid documents found."
 
-            # Validate
+            # Validate entity retention if requested
             validation_results = {}
-            if
-                validation_results = processor.
+            if validate_entities:
+                validation_results = processor.validate_entity_retention(documents)
 
             # Process documents
             result = processor.process_documents(documents)
@@ -477,12 +622,13 @@ class LegalDocumentTool(Tool):
                 output = f"Query: {query}\n\nResponse: {response}\n\n"
                 output += f"Documents processed: {result['documents_count']}\n"
                 output += f"Text chunks: {result['nodes_count']}\n"
-                output += f"
+                output += f"Domain: {result['domain']}\n"
+                output += f"Model: {result['model_name']}\n"
 
                 # Add validation results if available
                 if validation_results:
-                    output += "\n===
-                    output += f"
+                    output += "\n=== Entity Retention Validation ===\n"
+                    output += f"Entity retention: {validation_results.get('entity_retention', 0):.2f}%\n"
                     output += f"Processing time: {validation_results.get('processing_time', 0):.2f} seconds\n"
 
                 return output
@@ -491,12 +637,13 @@ class LegalDocumentTool(Tool):
             output = "Document processing complete.\n\n"
             output += f"Documents processed: {result['documents_count']}\n"
             output += f"Text chunks: {result['nodes_count']}\n"
-            output += f"
+            output += f"Domain: {result['domain']}\n"
+            output += f"Model: {result['model_name']}\n"
 
             # Add validation results if available
             if validation_results:
-                output += "\n===
-                output += f"
+                output += "\n=== Entity Retention Validation ===\n"
+                output += f"Entity retention: {validation_results.get('entity_retention', 0):.2f}%\n"
                 output += f"Processing time: {validation_results.get('processing_time', 0):.2f} seconds\n"
 
             output += "\nThe documents are now ready for querying. Use the 'query' parameter to run a query."
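
A hypothetical end-to-end call against the updated tool (sample text and query invented; requires the llama_index, langchain and sklearn dependencies from the try/except block at the top of the file):

    # Process a short financial snippet and query it in one call.
    tool = DocumentProcessorTool()
    report = tool.forward(
        text="Q2 2023 revenue rose 10.5 percent to $5.2 million.",
        domain="financial",
        query="How did revenue change in Q2 2023?",
        validate_entities=True,
    )
    print(report)  # query response, counts, domain/model info, retention metrics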