Legal_AI_Agent

Build error

App Files Files Community

cryogenic22 committed on Dec 16, 2024

Commit

ca49412

verified ·

1 Parent(s): 694fa5c

Update utils/document_processor.py

Browse files

Files changed (1) hide show

utils/document_processor.py +61 -37

utils/document_processor.py CHANGED Viewed

@@ -153,47 +153,71 @@ class DocumentProcessor:
             raise ValueError(f"Unsupported file type: {file_type}")
         chunks = self._create_chunks(text)
         return text, chunks
-    def _process_pdf(self, file_path: str) -> str:
         try:
-            # Try some common Poppler installation paths
-            poppler_paths = [
-                "/usr/bin",
-                "/usr/local/bin",
-                "/opt/poppler/bin",
-                "/Library/Frameworks/Poppler.framework/Versions/Current/bin",  # for macOS
-            ]
-            # Find the first valid Poppler path
-            for poppler_dir in poppler_paths:
-                if os.path.exists(os.path.join(poppler_dir, "pdftoppm")):
-                    break
-            else:
-                raise ValueError("Poppler not found in any of the common installation paths.")
-            # Update the PATH and LD_LIBRARY_PATH environment variables
-            os.environ["PATH"] = f"{poppler_dir}:{os.environ['PATH']}"
-            os.environ["LD_LIBRARY_PATH"] = f"{poppler_dir}:{os.environ.get('LD_LIBRARY_PATH', '')}"
-            # Test the Poppler installation
             try:
-                subprocess.check_output(["pdftoppm", "-v"])
-                st.info("Poppler is installed and in the PATH.")
             except (subprocess.CalledProcessError, FileNotFoundError):
-                st.error("Unable to find Poppler. Please check the installation.")
-                raise
-            # Process the PDF file using pdf2image and Tesseract OCR
-            images = convert_from_bytes(open(file_path, 'rb').read())
-            text = ""
-            for page_num, image in enumerate(images, 1):
-                st.info(f"Performing OCR on page {page_num}...")
-                page_text = pytesseract.image_to_string(image)
-                text += f"\n--- Page {page_num} ---\n{page_text}"
-            return text
-        except Exception as e:
-            st.error(f"Error processing PDF: {str(e)}")
-            raise

             raise ValueError(f"Unsupported file type: {file_type}")
         chunks = self._create_chunks(text)
         return text, chunks
+    def process_pdf(self, file_path: str) -> Optional[str]:
         try:
+            # First verify if poppler is installed via package manager
             try:
+                subprocess.check_output(['pdftoppm', '-v'], stderr=subprocess.STDOUT)
+                st.success("✓ Poppler found on system")
             except (subprocess.CalledProcessError, FileNotFoundError):
+                # If not in default path, check common installation directories
+                poppler_paths = [
+                    "/usr/bin",
+                    "/usr/local/bin",
+                    "/opt/poppler/bin",
+                    "/app/.apt/usr/bin",  # Common HF Spaces path
+                    os.path.expanduser("~/.local/bin")
+                ]
+                for poppler_dir in poppler_paths:
+                    if os.path.exists(os.path.join(poppler_dir, "pdftoppm")):
+                        # Update PATH
+                        os.environ["PATH"] = f"{poppler_dir}:{os.environ.get('PATH', '')}"
+                        st.success(f"✓ Found Poppler in {poppler_dir}")
+                        break
+                else:
+                    st.error("❌ Poppler not found. Please ensure 'poppler-utils' is in packages.txt")
+                    return None
+            # Attempt to read and convert the PDF
+            try:
+                with open(file_path, 'rb') as pdf_file:
+                    pdf_bytes = pdf_file.read()
+                # Convert PDF to images
+                images = convert_from_bytes(
+                    pdf_bytes,
+                    dpi=300,  # Increase DPI for better OCR quality
+                    fmt='png'
+                )
+                # Process each page
+                text = ""
+                total_pages = len(images)
+                for page_num, image in enumerate(images, 1):
+                    st.progress(page_num / total_pages)
+                    st.info(f"📄 Processing page {page_num}/{total_pages}")
+                    # Perform OCR with custom configuration
+                    page_text = pytesseract.image_to_string(
+                        image,
+                        config='--psm 3 --oem 3'  # Use default page segmentation and OCR Engine Mode
+                    )
+                    text += f"\n{'='*20} Page {page_num} {'='*20}\n{page_text}\n"
+                return text.strip()
+            except Exception as e:
+                st.error(f"Error processing PDF content: {str(e)}")
+                return None
+    except Exception as e:
+        st.error(f"Unexpected error: {str(e)}")
+        return None