Spaces: Build error
Update utils/document_processor.py
utils/document_processor.py  CHANGED  (+359 -136)
@@ -13,24 +13,98 @@ import spacy
 import nltk
 from nltk.tokenize import sent_tokenize
 from nltk.corpus import stopwords
+from pathlib import Path
+import shutil

 class DocumentProcessor:
-    def __init__(self,
-        """Initialize Document Processor with
-
+    def __init__(self, base_path: str = None):
+        """Initialize Document Processor with proper data directory handling.
+
+        Args:
+            base_path: Optional base path override. If None, will use appropriate
+                path based on environment (local vs HF Spaces)
+        """
+        # Set up base paths
+        self.base_path = self._setup_data_directories(base_path)
+        self.ontology_path = os.path.join(self.base_path, "legal_ontology.json")
+
+        # Ensure ontology exists
+        self._ensure_ontology_exists()
+
+        # Load ontology
+        self.ontology = self._load_ontology()

         # Initialize NLP components
+        self._setup_nlp()
+
+        # Create processing directories
+        self.processed_path = os.path.join(self.base_path, "processed")
+        self.temp_path = os.path.join(self.base_path, "temp")
+        os.makedirs(self.processed_path, exist_ok=True)
+        os.makedirs(self.temp_path, exist_ok=True)
+
+    def _setup_data_directories(self, base_path: Optional[str] = None) -> str:
+        """Set up data directories with HF Spaces compatibility."""
+        if base_path:
+            data_path = base_path
+        else:
+            # Check if running in Hugging Face Spaces
+            if os.environ.get('SPACE_ID'):
+                # Use the persistent storage in HF Spaces
+                data_path = "/data"
+            else:
+                # Local development path
+                data_path = os.path.join(os.getcwd(), "data")
+
+        # Create necessary subdirectories
+        subdirs = ["ontology", "processed", "temp", "indexes"]
+        for subdir in subdirs:
+            os.makedirs(os.path.join(data_path, subdir), exist_ok=True)
+
+        return data_path
+
+    def _ensure_ontology_exists(self):
+        """Ensure the legal ontology file exists, create if not."""
+        if not os.path.exists(self.ontology_path):
+            default_ontology = {
+                "@graph": [
+                    {
+                        "@id": "concept:Contract",
+                        "@type": "vocab:LegalConcept",
+                        "rdfs:label": "Contract",
+                        "rdfs:comment": "A legally binding agreement between parties",
+                        "vocab:relatedConcepts": ["Offer", "Acceptance", "Consideration"]
+                    },
+                    {
+                        "@id": "concept:Judgment",
+                        "@type": "vocab:LegalConcept",
+                        "rdfs:label": "Judgment",
+                        "rdfs:comment": "A court's final determination of the rights and obligations",
+                        "vocab:relatedConcepts": ["Court Order", "Decision", "Ruling"]
+                    }
+                ]
+            }
+
+            with open(self.ontology_path, 'w') as f:
+                json.dump(default_ontology, f, indent=2)
+
+    def _setup_nlp(self):
+        """Initialize NLP components with error handling."""
+        # Setup spaCy
         try:
             self.nlp = spacy.load("en_core_web_sm")
         except OSError:
             spacy.cli.download("en_core_web_sm")
             self.nlp = spacy.load("en_core_web_sm")

-        #
+        # Setup NLTK
         try:
             nltk.data.find('tokenizers/punkt')
         except LookupError:
             nltk.download('punkt')
+        try:
+            nltk.data.find('corpora/stopwords')
+        except LookupError:
             nltk.download('stopwords')

         self.stop_words = set(stopwords.words('english'))

@@ -38,168 +112,209 @@ class DocumentProcessor:
     def process_and_tag_document(self, file) -> Tuple[str, List[Dict], Dict]:
         """Process document with enhanced metadata extraction and chunking."""
         try:
+            # Generate unique document ID
+            doc_id = datetime.now().strftime('%Y%m%d_%H%M%S')
+
+            # Create document directory
+            doc_dir = os.path.join(self.processed_path, doc_id)
+            os.makedirs(doc_dir, exist_ok=True)
+
+            # Save original file
+            original_path = os.path.join(doc_dir, "original" + Path(file.name).suffix)
+            with open(original_path, 'wb') as f:
+                f.write(file.getvalue())
+
             # Extract text and perform initial processing
-            text, chunks = self.process_document(
+            text, chunks = self.process_document(original_path)

             # Extract and enrich metadata
             metadata = self._extract_metadata(text, file.name)
+            metadata['doc_id'] = doc_id
+            metadata['original_path'] = original_path
+
+            # Save processed text
+            text_path = os.path.join(doc_dir, "processed.txt")
+            with open(text_path, 'w', encoding='utf-8') as f:
+                f.write(text)
+
+            # Save chunks
+            chunks_path = os.path.join(doc_dir, "chunks.json")
+            with open(chunks_path, 'w') as f:
+                json.dump(chunks, f, indent=2)

-            #
-
+            # Save metadata
+            metadata_path = os.path.join(doc_dir, "metadata.json")
+            with open(metadata_path, 'w') as f:
+                json.dump(metadata, f, indent=2)
+
+            return text, chunks, metadata

-            return text, enhanced_chunks, metadata
         except Exception as e:
             print(f"Error processing document: {e}")
             raise

-    def
-        """
-
-
-
-
-
-
-
+    def process_document(self, file_path: str) -> Tuple[str, List[Dict]]:
+        """Process a document and return its text and chunks."""
+        file_type = Path(file_path).suffix.lower()
+
+        if file_type == '.pdf':
+            text = self._process_pdf(file_path)
+        elif file_type == '.docx':
+            text = self._process_docx(file_path)
+        elif file_type in ['.txt', '.csv']:
+            text = self._process_text(file_path)
+        else:
+            raise ValueError(f"Unsupported file type: {file_type}")
+
+        # Create chunks with enhanced metadata
+        chunks = self._create_enhanced_chunks(text)
+        return text, chunks
+
+    def _process_pdf(self, file_path: str) -> str:
+        """Extract text from PDF with OCR fallback."""
+        reader = pypdf.PdfReader(file_path)
+        text = ""
+
+        for page_num, page in enumerate(reader.pages, 1):
+            page_text = page.extract_text()

-
-
+            if page_text.strip():
+                text += f"\n--- Page {page_num} ---\n{page_text}"
+            else:
+                # Perform OCR if text extraction fails
+                with open(file_path, 'rb') as f:
+                    images = convert_from_bytes(f.read())
+                page_text = pytesseract.image_to_string(images[page_num - 1])
+                text += f"\n--- Page {page_num} (OCR) ---\n{page_text}"
+
+        return text
+
+    def _process_docx(self, file_path: str) -> str:
+        """Process DOCX files with metadata."""
+        doc = docx.Document(file_path)
+        text = ""
+
+        # Process document sections
+        for para in doc.paragraphs:
+            if para.text.strip():
+                text += para.text + "\n"
+
+        return text
+
+    def _process_text(self, file_path: str) -> str:
+        """Process text files with encoding detection."""
+        try:
+            with open(file_path, 'rb') as f:
+                raw_data = f.read()

-        #
-
+            # Detect encoding
+            result = chardet.detect(raw_data)
+            encoding = result['encoding'] if result['confidence'] > 0.7 else 'utf-8'

-        #
-
-
-
-
-            'ontology_links': ontology_links,
-            'metadata': metadata
-        })
-
-        return enhanced_chunks
+            # Decode text
+            return raw_data.decode(encoding)
+        except Exception as e:
+            print(f"Error processing text file: {e}")
+            return ""

-    def
-        """
+    def _create_enhanced_chunks(self, text: str) -> List[Dict]:
+        """Create enhanced chunks with NLP analysis."""
         # Split into sentences
         sentences = sent_tokenize(text)

         chunks = []
         current_chunk = []
         current_length = 0
+        chunk_size = 500  # Approximate target chunk size

         for sentence in sentences:
             sentence_length = len(sentence)

             if current_length + sentence_length > chunk_size and current_chunk:
-                #
+                # Process current chunk
                 chunk_text = ' '.join(current_chunk)
-                chunks.append(
-                    'chunk_id': len(chunks),
-                    'text': chunk_text,
-                    'start_idx': text.index(current_chunk[0]),
-                    'end_idx': text.index(current_chunk[-1]) + len(current_chunk[-1])
-                })
+                chunks.append(self._process_chunk(chunk_text, len(chunks)))
                 current_chunk = []
                 current_length = 0

             current_chunk.append(sentence)
             current_length += sentence_length

-        #
+        # Process final chunk
         if current_chunk:
             chunk_text = ' '.join(current_chunk)
-            chunks.append(
-                'chunk_id': len(chunks),
-                'text': chunk_text,
-                'start_idx': text.index(current_chunk[0]),
-                'end_idx': text.index(current_chunk[-1]) + len(current_chunk[-1])
-            })
+            chunks.append(self._process_chunk(chunk_text, len(chunks)))

         return chunks

+    def _process_chunk(self, text: str, chunk_id: int) -> Dict:
+        """Process a single chunk with NLP analysis."""
+        doc = self.nlp(text)
+
+        return {
+            'chunk_id': chunk_id,
+            'text': text,
+            'entities': [(ent.text, ent.label_) for ent in doc.ents],
+            'noun_phrases': [chunk.text for chunk in doc.noun_chunks],
+            'word_count': len([token for token in doc if not token.is_space]),
+            'sentence_count': len(list(doc.sents)),
+            'ontology_links': self._link_to_ontology(text)
+        }
+
     def _extract_metadata(self, text: str, file_name: str) -> Dict:
-        """
-        # Process text with spaCy
+        """Extract enhanced metadata from document."""
         doc = self.nlp(text[:10000])  # Process first 10k chars for efficiency

         metadata = {
-            '
-            '
-            'jurisdiction': self._infer_jurisdiction(text, doc),
+            'filename': file_name,
+            'file_type': Path(file_name).suffix.lower(),
             'processed_at': datetime.now().isoformat(),
-            '
-            '
+            'word_count': len([token for token in doc if not token.is_space]),
+            'sentence_count': len(list(doc.sents)),
+            'entities': self._extract_entities(doc),
+            'document_type': self._infer_document_type(text),
+            'language_stats': self._get_language_stats(doc),
             'citations': self._extract_citations(text),
-            '
-            '
+            'dates': self._extract_dates(text),
+            'key_phrases': [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1][:10],
+            'ontology_concepts': self._link_to_ontology(text)
         }

         return metadata

-    def
-        """
-
+    def _extract_entities(self, doc) -> Dict[str, List[str]]:
+        """Extract named entities from text."""
+        entities = {}
+        for ent in doc.ents:
+            if ent.label_ not in entities:
+                entities[ent.label_] = []
+            if ent.text not in entities[ent.label_]:
+                entities[ent.label_].append(ent.text)
+        return entities
+
+    def _infer_document_type(self, text: str) -> str:
+        """Infer document type using rule-based classification."""
         type_patterns = {
-            '
-
-
-
-            'contract': {
-                'keywords': ['agreement', 'contract', 'party', 'clause', 'terms'],
-                'weight': 1.2
-            },
-            'legislation': {
-                'keywords': ['act', 'statute', 'regulation', 'law', 'provision'],
-                'weight': 1.3
-            },
-            'memo': {
-                'keywords': ['memorandum', 'memo', 'note', 'circular'],
-                'weight': 1.0
-            }
+            'contract': ['agreement', 'parties', 'obligations', 'terms and conditions'],
+            'judgment': ['court', 'judge', 'ruling', 'ordered', 'judgment'],
+            'legislation': ['act', 'statute', 'regulation', 'amended', 'parliament'],
+            'memo': ['memorandum', 'memo', 'note', 'meeting minutes']
         }

-        # Calculate scores for each type
-        scores = {}
         text_lower = text.lower()
+        scores = {doc_type: sum(1 for pattern in patterns if pattern in text_lower)
+                  for doc_type, patterns in type_patterns.items()}

-
-
-            for keyword in pattern['keywords']:
-                count = text_lower.count(keyword)
-                score += count * pattern['weight']
-            scores[doc_type] = score
+        if not scores or max(scores.values()) == 0:
+            return 'unknown'

-
-        if scores:
-            max_score = max(scores.values())
-            if max_score > 0:
-                return max(scores.items(), key=lambda x: x[1])[0]
-
-        return 'unknown'
-
-    def _extract_key_entities(self, doc: spacy.tokens.Doc) -> Dict[str, List[str]]:
-        """Extract and categorize key entities from text."""
-        entities = {
-            'PERSON': set(),
-            'ORG': set(),
-            'GPE': set(),
-            'LAW': set(),
-            'DATE': set()
-        }
-
-        for ent in doc.ents:
-            if ent.label_ in entities:
-                entities[ent.label_].add(ent.text)
-
-        return {k: list(v) for k, v in entities.items()}
+        return max(scores.items(), key=lambda x: x[1])[0]

     def _extract_citations(self, text: str) -> List[Dict]:
-        """Extract legal citations
+        """Extract legal citations."""
         citation_patterns = [
             r'\[\d{4}\]\s+\w+\s+\d+',  # [2021] EWHC 123
             r'\d+\s+U\.S\.\s+\d+',  # 123 U.S. 456
-            r'\(\d{4}\)\s+\d+\s+\w+\s+\d+'
+            r'\(\d{4}\)\s+\d+\s+\w+\s+\d+'  # (2021) 12 ABC 345
         ]

         citations = []

@@ -214,8 +329,23 @@

         return citations

-    def
-        """
+    def _extract_dates(self, text: str) -> List[str]:
+        """Extract dates from text."""
+        date_patterns = [
+            r'\d{1,2}/\d{1,2}/\d{2,4}',
+            r'\d{1,2}-\d{1,2}-\d{2,4}',
+            r'\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}'
+        ]
+
+        dates = []
+        for pattern in date_patterns:
+            matches = re.finditer(pattern, text, re.IGNORECASE)
+            dates.extend(match.group() for match in matches)
+
+        return dates
+
+    def _get_language_stats(self, doc) -> Dict:
+        """Get language statistics from document."""
         return {
             'sentence_count': len(list(doc.sents)),
             'word_count': len([token for token in doc if not token.is_space]),

@@ -223,16 +353,27 @@
                 for sent in doc.sents) / len(list(doc.sents)) if doc.sents else 0
         }

+    def _load_ontology(self) -> Dict:
+        """Load legal ontology from file."""
+        try:
+            if os.path.exists(self.ontology_path):
+                with open(self.ontology_path, 'r') as f:
+                    return json.load(f)
+            return {"@graph": []}
+        except Exception as e:
+            print(f"Error loading ontology: {e}")
+            return {"@graph": []}
+
     def _link_to_ontology(self, text: str) -> List[Dict]:
-        """
+        """Link text to ontology concepts."""
         relevant_concepts = []
         text_lower = text.lower()

-        for concept in self.ontology
-            if
+        for concept in self.ontology.get("@graph", []):
+            if "rdfs:label" not in concept:
                 continue

-            label = concept[
+            label = concept["rdfs:label"].lower()
             if label in text_lower:
                 # Get surrounding context
                 start_idx = text_lower.index(label)

@@ -244,29 +385,111 @@
                     'type': concept.get('@type', 'Unknown'),
                     'description': concept.get('rdfs:comment', ''),
                     'context': text[context_start:context_end].strip(),
-                    '
+                    'location': {'start': start_idx, 'end': start_idx + len(label)}
                 })

         return relevant_concepts

-    def
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def cleanup(self):
+        """Clean up temporary files."""
+        try:
+            shutil.rmtree(self.temp_path)
+            os.makedirs(self.temp_path, exist_ok=True)
+        except Exception as e:
+            print(f"Error cleaning up temporary files: {e}")
+
+    def get_document_path(self, doc_id: str) -> Optional[str]:
+        """Get the path to a processed document."""
+        doc_dir = os.path.join(self.processed_path, doc_id)
+        if not os.path.exists(doc_dir):
+            return None
+        return doc_dir
+
+    def get_document_metadata(self, doc_id: str) -> Optional[Dict]:
+        """Get metadata for a processed document."""
+        doc_dir = self.get_document_path(doc_id)
+        if not doc_dir:
+            return None
+
+        metadata_path = os.path.join(doc_dir, "metadata.json")
+        try:
+            with open(metadata_path, 'r') as f:
+                return json.load(f)
+        except Exception as e:
+            print(f"Error loading metadata for document {doc_id}: {e}")
+            return None
+
+    def get_document_chunks(self, doc_id: str) -> Optional[List[Dict]]:
+        """Get chunks for a processed document."""
+        doc_dir = self.get_document_path(doc_id)
+        if not doc_dir:
+            return None
+
+        chunks_path = os.path.join(doc_dir, "chunks.json")
+        try:
+            with open(chunks_path, 'r') as f:
+                return json.load(f)
+        except Exception as e:
+            print(f"Error loading chunks for document {doc_id}: {e}")
+            return None
+
+    def reprocess_document(self, doc_id: str) -> Optional[Tuple[str, List[Dict], Dict]]:
+        """Reprocess an existing document."""
+        doc_dir = self.get_document_path(doc_id)
+        if not doc_dir:
+            return None
+
+        # Locate the saved original file (stored as "original<ext>")
+        originals = list(Path(doc_dir).glob("original.*"))
+        if not originals:
+            return None
+        original_path = str(originals[0])
+
+        try:
+            # Process the original file again
+            text, chunks = self.process_document(original_path)
+
+            # Update metadata
+            metadata = self._extract_metadata(text, os.path.basename(original_path))
+            metadata['doc_id'] = doc_id
+            metadata['original_path'] = original_path
+            metadata['reprocessed_at'] = datetime.now().isoformat()
+
+            # Save updated files
+            text_path = os.path.join(doc_dir, "processed.txt")
+            with open(text_path, 'w', encoding='utf-8') as f:
+                f.write(text)
+
+            chunks_path = os.path.join(doc_dir, "chunks.json")
+            with open(chunks_path, 'w') as f:
+                json.dump(chunks, f, indent=2)
+
+            metadata_path = os.path.join(doc_dir, "metadata.json")
+            with open(metadata_path, 'w') as f:
+                json.dump(metadata, f, indent=2)
+
+            return text, chunks, metadata
+
+        except Exception as e:
+            print(f"Error reprocessing document {doc_id}: {e}")
+            return None
+
+    def delete_document(self, doc_id: str) -> bool:
+        """Delete a processed document and its files."""
+        doc_dir = self.get_document_path(doc_id)
+        if not doc_dir:
+            return False
+
+        try:
+            shutil.rmtree(doc_dir)
+            return True
+        except Exception as e:
+            print(f"Error deleting document {doc_id}: {e}")
+            return False
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit with cleanup."""
+        self.cleanup()
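
For reference, a minimal usage sketch of the updated DocumentProcessor. It assumes an upload object exposing .name and .getvalue(), which is what process_and_tag_document reads (Streamlit's UploadedFile has this shape); the FakeUpload stand-in, file name, and sample bytes below are hypothetical:

# Minimal usage sketch for the updated DocumentProcessor.
# FakeUpload is a hypothetical stand-in for an upload object with
# .name and .getvalue(), e.g. a Streamlit UploadedFile.
from utils.document_processor import DocumentProcessor


class FakeUpload:
    """Hypothetical stand-in for a Streamlit-style uploaded file."""

    def __init__(self, name: str, data: bytes):
        self.name = name
        self._data = data

    def getvalue(self) -> bytes:
        return self._data


upload = FakeUpload("sample_contract.txt", b"This agreement is made between the parties ...")

# __enter__/__exit__ were added in this commit, so the processor can be
# used as a context manager; __exit__ calls cleanup() on the temp directory.
with DocumentProcessor() as processor:
    text, chunks, metadata = processor.process_and_tag_document(upload)
    print(metadata["doc_id"], metadata["document_type"], len(chunks))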
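
And a sketch of reading a processed document back from disk via the accessor methods added here; the doc_id value is a hypothetical example of the timestamp IDs that process_and_tag_document generates:

# Sketch: retrieve previously processed artifacts by doc_id.
from utils.document_processor import DocumentProcessor

processor = DocumentProcessor()
doc_id = "20240101_120000"  # hypothetical; real IDs come from metadata["doc_id"]

metadata = processor.get_document_metadata(doc_id)
chunks = processor.get_document_chunks(doc_id)

if metadata is None:
    print(f"No processed document found for {doc_id}")
else:
    print(metadata["filename"], metadata["document_type"])
    for chunk in (chunks or [])[:3]:
        # Each chunk carries the NLP annotations added by _process_chunk.
        print(chunk["chunk_id"], chunk["word_count"], chunk["entities"][:5])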