Legal_AI_Agent

Build error

App Files Files Community

cryogenic22 committed on Dec 11, 2024

Commit

cd6801c

verified ·

1 Parent(s): 4a25702

Update utils/document_processor.py

Browse files

Files changed (1) hide show

utils/document_processor.py +31 -1

utils/document_processor.py CHANGED Viewed

@@ -154,8 +154,36 @@ class DocumentProcessor:
         chunks = self._create_chunks(text)
         return text, chunks
-    def _process_pdf(self, file_path: str) -> str:
         try:
             images = convert_from_bytes(open(file_path, 'rb').read())
             text = ""
             for page_num, image in enumerate(images, 1):
@@ -167,6 +195,8 @@ class DocumentProcessor:
             st.error(f"Error processing PDF: {str(e)}")
             raise
     def _process_docx(self, file_path: str) -> str:
         """Extract text from DOCX files."""
         try:

         chunks = self._create_chunks(text)
         return text, chunks
+     def __process_pdf(self, file_path: str) -> str:
         try:
+            # Try some common Poppler installation paths
+            poppler_paths = [
+                "/usr/bin",
+                "/usr/local/bin",
+                "/opt/poppler/bin",
+                "/Library/Frameworks/Poppler.framework/Versions/Current/bin",  # for macOS
+            ]
+            # Find the first valid Poppler path
+            for poppler_dir in poppler_paths:
+                if os.path.exists(os.path.join(poppler_dir, "pdftoppm")):
+                    break
+            else:
+                raise ValueError("Poppler not found in any of the common installation paths.")
+            # Update the PATH and LD_LIBRARY_PATH environment variables
+            os.environ["PATH"] = f"{poppler_dir}:{os.environ['PATH']}"
+            os.environ["LD_LIBRARY_PATH"] = f"{poppler_dir}:{os.environ.get('LD_LIBRARY_PATH', '')}"
+            # Test the Poppler installation
+            try:
+                subprocess.check_output(["pdftoppm", "-v"])
+                st.info("Poppler is installed and in the PATH.")
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                st.error("Unable to find Poppler. Please check the installation.")
+                raise
+            # Process the PDF file using pdf2image and Tesseract OCR
             images = convert_from_bytes(open(file_path, 'rb').read())
             text = ""
             for page_num, image in enumerate(images, 1):
             st.error(f"Error processing PDF: {str(e)}")
             raise
     def _process_docx(self, file_path: str) -> str:
         """Extract text from DOCX files."""
         try: