Legal_AI_Agent

Build error

App Files Files Community

cryogenic22 committed on Dec 10, 2024

Commit

1628132

verified ·

1 Parent(s): 8a2ab7f

Create utils/document_processor.py

Browse files

Files changed (1) hide show

utils/document_processor.py +74 -0

utils/document_processor.py ADDED Viewed

	@@ -0,0 +1,74 @@

+# utils/document_processor.py
import re
from typing import List, Dict, Tuple

import docx
import fitz
import streamlit as st
from sentence_transformers import SentenceTransformer


class DocumentProcessor:
    """Extract text from uploaded documents and split it into embedded chunks.

    PDF files are read with ``fitz``, DOCX files with ``docx``, and any other
    file is decoded as text.  The extracted text is split on newlines into
    paragraphs, the paragraphs are packed into chunks, and every chunk is
    embedded with ``self.embedder``.
    """

    def __init__(self, embedder=None):
        """Set up the embedding model used for chunk embeddings.

        Args:
            embedder: Optional object exposing ``encode(text)`` whose result
                provides ``tolist()``.  When omitted, a
                ``SentenceTransformer('all-MiniLM-L6-v2')`` is created the
                first time and stored in ``st.session_state.embedder``; later
                instances reuse the stored model instead of loading it again.
        """
        if embedder is None:
            if 'embedder' not in st.session_state:
                st.session_state.embedder = SentenceTransformer('all-MiniLM-L6-v2')
            embedder = st.session_state.embedder
        self.embedder = embedder

    def process_document(self, file) -> Tuple[str, List[Dict]]:
        """Extract the text of *file* and split it into chunks.

        Args:
            file: Uploaded-file-like object with a ``name`` attribute and a
                ``getvalue()`` method returning the raw bytes.

        Returns:
            A ``(text, chunks)`` tuple: the full extracted text and the list
            of chunk dictionaries produced by ``_create_chunks``.
        """
        # Only treat the trailing component as an extension when the name
        # really contains a dot; otherwise a file literally named "pdf" or
        # "docx" would be routed to the binary parsers.
        _, dot, suffix = file.name.rpartition('.')
        file_type = suffix.lower() if dot else ''

        if file_type == 'pdf':
            text = self._process_pdf(file)
        elif file_type == 'docx':
            text = self._process_docx(file)
        else:
            # Everything else is assumed to be text in the default encoding.
            text = file.getvalue().decode()

        return text, self._create_chunks(text)

    def _process_pdf(self, file) -> str:
        """Return the concatenated text of every page of a PDF upload."""
        pdf_bytes = file.getvalue()
        # The context manager closes the document once the pages are read,
        # instead of leaving it open until garbage collection.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)

    def _process_docx(self, file) -> str:
        """Return the paragraphs of a DOCX upload joined by newlines."""
        doc = docx.Document(file)
        return "\n".join(para.text for para in doc.paragraphs)

    def _create_chunks(self, text: str, chunk_size: int = 1000) -> List[Dict]:
        """Pack the non-empty lines of *text* into chunks.

        Lines are stripped, blank ones are dropped, and the remaining
        paragraphs are joined with ``"\\n"`` for as long as the joined text
        stays within ``chunk_size`` characters.  Paragraphs are never split:
        one that is longer than ``chunk_size`` becomes a chunk on its own.

        Args:
            text: Full document text.
            chunk_size: Maximum chunk length in characters, including the
                newlines that join paragraphs.

        Returns:
            A list of chunk dictionaries (see ``_create_chunk_dict``); empty
            when *text* contains no non-blank line.
        """
        stripped = (line.strip() for line in text.split('\n'))
        paragraphs = [para for para in stripped if para]

        chunks = []
        pending = []      # paragraphs of the chunk currently being built
        pending_len = 0   # len("\n".join(pending))
        for para in paragraphs:
            # The +1 is the newline that would join this paragraph onto the
            # pending chunk; leaving it out lets chunks exceed chunk_size.
            joined_len = pending_len + 1 + len(para) if pending else len(para)
            if pending and joined_len > chunk_size:
                chunks.append(self._create_chunk_dict("\n".join(pending)))
                pending = [para]
                pending_len = len(para)
            else:
                pending.append(para)
                pending_len = joined_len

        if pending:
            chunks.append(self._create_chunk_dict("\n".join(pending)))
        return chunks

    def _create_chunk_dict(self, text: str) -> Dict:
        """Wrap *text* in a chunk dictionary.

        Returns:
            ``{"text": text, "metadata": {"length": ..., "embedding": ...}}``
            where ``length`` is ``len(text)`` and ``embedding`` is
            ``self.embedder.encode(text)`` converted with ``tolist()``.
        """
        return {
            "text": text,
            "metadata": {
                "length": len(text),
                "embedding": self.embedder.encode(text).tolist()
            }
        }