Legal_AI_Agent

Build error

App Files Files Community

cryogenic22 committed on Dec 10, 2024

Commit

65e8156

verified ·

1 Parent(s): d2a1c88

Update utils/document_processor.py

Browse files

Files changed (1) hide show

utils/document_processor.py +30 -39

utils/document_processor.py CHANGED Viewed

@@ -1,18 +1,11 @@
-import os
-import pytesseract
-from pytesseract import Output
-from PIL import Image
 import pypdf
-from pdf2image import convert_from_bytes
 import docx
-from typing import Tuple, List, Dict
-import streamlit as st
 class DocumentProcessor:
-    def __init__(self):
-        pass
     def process_document(self, file) -> Tuple[str, List[Dict]]:
         """Process a document and return its text and chunks."""
         file_type = file.name.split(".")[-1].lower()
@@ -21,7 +14,7 @@ class DocumentProcessor:
         elif file_type == "docx":
             text = self._process_docx(file)
         elif file_type in ["txt", "csv"]:
-            text = file.read().decode("utf-8")
         else:
             raise ValueError(f"Unsupported file type: {file_type}")
@@ -30,41 +23,39 @@ class DocumentProcessor:
     def _process_pdf(self, file) -> str:
         """Extract text from a PDF, including OCR for scanned PDFs."""
-        try:
-            reader = pypdf.PdfReader(file)
-            text = ""
-            for page in reader.pages:
-                page_text = page.extract_text()
-                if not page_text.strip():  # Fallback to OCR if text is empty
-                    st.warning("Detected a scanned PDF. Performing OCR...")
-                    pdf_bytes = file.read()
-                    text += self._perform_ocr(pdf_bytes)
-                else:
-                    text += page_text
-            return text
-        except Exception as e:
-            st.error(f"Error processing PDF: {e}")
-            return ""
     def _perform_ocr(self, pdf_bytes: bytes) -> str:
         """Perform OCR on scanned PDF pages."""
-        try:
-            images = convert_from_bytes(pdf_bytes)
-            text = ""
-            for image in images:
-                text += pytesseract.image_to_string(image, config="--psm 6")
-            return text
-        except Exception as e:
-            st.error(f"Error performing OCR: {e}")
-            return ""
     def _process_docx(self, file) -> str:
         """Extract text from DOCX files."""
         try:
-            doc = docx.Document(file)
-            return "\n".join(para.text for para in doc.paragraphs)
         except Exception as e:
-            st.error(f"Error processing DOCX: {e}")
             return ""
     def _chunk_text(self, text: str, chunk_size: int = 500) -> List[Dict]:

+import chardet
 import pypdf
 import docx
+from pdf2image import convert_from_bytes
+import pytesseract
+from PIL import Image
 class DocumentProcessor:
     def process_document(self, file) -> Tuple[str, List[Dict]]:
         """Process a document and return its text and chunks."""
         file_type = file.name.split(".")[-1].lower()
         elif file_type == "docx":
             text = self._process_docx(file)
         elif file_type in ["txt", "csv"]:
+            text = self._process_text(file)
         else:
             raise ValueError(f"Unsupported file type: {file_type}")
     def _process_pdf(self, file) -> str:
         """Extract text from a PDF, including OCR for scanned PDFs."""
+        reader = pypdf.PdfReader(file)
+        text = ""
+        for page in reader.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text
+            else:
+                st.warning("Detected a scanned PDF. Performing OCR...")
+                pdf_bytes = file.read()
+                text += self._perform_ocr(pdf_bytes)
+        return text
     def _perform_ocr(self, pdf_bytes: bytes) -> str:
         """Perform OCR on scanned PDF pages."""
+        images = convert_from_bytes(pdf_bytes)
+        text = ""
+        for image in images:
+            text += pytesseract.image_to_string(image, config="--psm 6")
+        return text
     def _process_docx(self, file) -> str:
         """Extract text from DOCX files."""
+        doc = docx.Document(file)
+        return "\n".join(para.text for para in doc.paragraphs)
+    def _process_text(self, file) -> str:
+        """Process plain text files with unknown encoding."""
         try:
+            raw_data = file.read()
+            detected_encoding = chardet.detect(raw_data)["encoding"]
+            return raw_data.decode(detected_encoding)
         except Exception as e:
+            st.error(f"Error processing text file: {e}")
             return ""
     def _chunk_text(self, text: str, chunk_size: int = 500) -> List[Dict]: