Legal_AI_Agent

Build error

App Files Community

cryogenic22 committed on Dec 10, 2024

Commit

82781b0

verified ·

1 Parent(s): 9c6318f

Update utils/document_processor.py

Browse files

Files changed (1) hide show

utils/document_processor.py +51 -85

utils/document_processor.py CHANGED Viewed

@@ -1,107 +1,73 @@
-# utils/document_processor.py
-import streamlit as st
-import fitz
-import docx
-from typing import List, Dict, Tuple
-import re
-import io
 from PIL import Image
 class DocumentProcessor:
     def __init__(self):
-        pass  # Removed embedder dependency for simpler processing
     def process_document(self, file) -> Tuple[str, List[Dict]]:
-        """Process document and return text and chunks"""
-        try:
-            # Extract text based on file type
-            file_type = file.name.split('.')[-1].lower()
-            if file_type == 'pdf':
-                text = self._process_pdf(file)
-            elif file_type == 'docx':
-                text = self._process_docx(file)
-            else:
-                text = self._process_text(file)
-            # Create chunks
-            chunks = self._create_chunks(text)
-            return text, chunks
-        except Exception as e:
-            st.error(f"Error processing document: {str(e)}")
-            return "", []
     def _process_pdf(self, file) -> str:
-        """Process PDF file"""
         try:
-            pdf_bytes = file.getvalue()
-            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             text = ""
-            for page in doc:
-                text += page.get_text()
             return text
         except Exception as e:
-            st.error(f"Error processing PDF: {str(e)}")
             return ""
-    def _process_docx(self, file) -> str:
-        """Process DOCX file"""
         try:
-            doc = docx.Document(io.BytesIO(file.getvalue()))
-            text = []
-            for para in doc.paragraphs:
-                text.append(para.text)
-            return "\n".join(text)
         except Exception as e:
-            st.error(f"Error processing DOCX: {str(e)}")
             return ""
-    def _process_text(self, file) -> str:
-        """Process text file"""
         try:
-            return file.getvalue().decode('utf-8')
         except Exception as e:
-            st.error(f"Error processing text file: {str(e)}")
             return ""
-    def _create_chunks(self, text: str, chunk_size: int = 1000) -> List[Dict]:
-        """Create chunks from text"""
-        if not text:
-            return []
-        # Split into paragraphs
-        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
-        chunks = []
-        current_chunk = ""
-        for para in paragraphs:
-            if len(current_chunk) + len(para) > chunk_size and current_chunk:
-                chunks.append({
-                    "text": current_chunk,
-                    "metadata": {
-                        "length": len(current_chunk)
-                    }
-                })
-                current_chunk = para
-            else:
-                current_chunk += "\n" + para if current_chunk else para
-        if current_chunk:
-            chunks.append({
-                "text": current_chunk,
-                "metadata": {
-                    "length": len(current_chunk)
-                }
-            })
-        return chunks
-    def _create_chunk_dict(self, text: str) -> Dict:
-        """Create a chunk dictionary with metadata"""
-        return {
-            "text": text,
-            "metadata": {
-                "length": len(text),
-                "embedding": self.embedder.encode(text).tolist()
-            }
-        }

+import os
+import pytesseract
+from pytesseract import Output
 from PIL import Image
+import pypdf
+from pdf2image import convert_from_bytes
+import docx
+from typing import Tuple, List, Dict
+import streamlit as st
 class DocumentProcessor:
     def __init__(self):
+        pass
     def process_document(self, file) -> Tuple[str, List[Dict]]:
+        """Process a document and return its text and chunks."""
+        file_type = file.name.split(".")[-1].lower()
+        if file_type == "pdf":
+            text = self._process_pdf(file)
+        elif file_type == "docx":
+            text = self._process_docx(file)
+        elif file_type in ["txt", "csv"]:
+            text = file.read().decode("utf-8")
+        else:
+            raise ValueError(f"Unsupported file type: {file_type}")
+        chunks = self._chunk_text(text)
+        return text, chunks
     def _process_pdf(self, file) -> str:
+        """Extract text from a PDF, including OCR for scanned PDFs."""
         try:
+            reader = pypdf.PdfReader(file)
             text = ""
+            for page in reader.pages:
+                page_text = page.extract_text()
+                if not page_text.strip():  # Fallback to OCR if text is empty
+                    st.warning("Detected a scanned PDF. Performing OCR...")
+                    pdf_bytes = file.read()
+                    text += self._perform_ocr(pdf_bytes)
+                else:
+                    text += page_text
             return text
         except Exception as e:
+            st.error(f"Error processing PDF: {e}")
             return ""
+    def _perform_ocr(self, pdf_bytes: bytes) -> str:
+        """Perform OCR on scanned PDF pages."""
         try:
+            images = convert_from_bytes(pdf_bytes)
+            text = ""
+            for image in images:
+                text += pytesseract.image_to_string(image, config="--psm 6")
+            return text
         except Exception as e:
+            st.error(f"Error performing OCR: {e}")
             return ""
+    def _process_docx(self, file) -> str:
+        """Extract text from DOCX files."""
         try:
+            doc = docx.Document(file)
+            return "\n".join(para.text for para in doc.paragraphs)
         except Exception as e:
+            st.error(f"Error processing DOCX: {e}")
             return ""
+    def _chunk_text(self, text: str, chunk_size: int = 500) -> List[Dict]:
+        """Split text into smaller chunks for vectorization."""
+        return [{"chunk_id": idx, "text": text[i:i + chunk_size]}
+                for idx, i in enumerate(range(0, len(text), chunk_size))]