Spaces:

omgy
/

vero_ps

Sleeping

App Files Community

omgy committed on Dec 7, 2025

Commit

19de0ae

verified ·

1 Parent(s): f6ed292

Update document_converter.py

Browse files

Files changed (1) hide show

document_converter.py +9 -51

document_converter.py CHANGED Viewed

@@ -96,64 +96,23 @@ class DocumentConverter:
     def _extract_from_pdf(self, file_content: bytes) -> str:
         """
-        Helper to pull text from PDF.
-        Strategy:
-        - First try PyPDF2 with strict=False (handles most normal PDFs).
-        - Skip pages that fail to decode.
-        - If PyPDF2 raises PdfReadError (e.g., EOF marker not found),
-          try a naive fallback that treats the bytes as text and filters
-          printable characters.
-        """
-        # --- First attempt: normal PyPDF2 read ---
         try:
-            # strict=False makes PyPDF2 more forgiving about slightly broken PDFs
             pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content), strict=False)
-        except PdfReadError as e:
             # Very likely a corrupted or badly exported PDF
-            print(f"PyPDF2 PdfReadError: {e}. Trying naive fallback text extraction.", file=sys.stderr)
-            # --- Fallback: naive "best effort" text extraction from raw bytes ---
-            try:
-                # Decode raw bytes to string using latin-1 (1:1 byte→char mapping),
-                # then keep only printable characters and whitespace.
-                raw = file_content.decode('latin-1', errors='ignore')
-                filtered_chars = []
-                for ch in raw:
-                    code = ord(ch)
-                    # Keep basic printable ASCII + common whitespace
-                    if ch in "\n\r\t":
-                        filtered_chars.append(ch)
-                    elif 32 <= code <= 126:
-                        filtered_chars.append(ch)
-                    else:
-                        # Replace non-printable with space
-                        filtered_chars.append(" ")
-                filtered = "".join(filtered_chars)
-                # Collapse excessive spaces
-                filtered = re.sub(r'[ \t]{2,}', ' ', filtered)
-                # Collapse excessive blank lines
-                filtered = re.sub(r'\n{3,}', '\n\n', filtered)
-                if filtered.strip():
-                    print("Using naive PDF text fallback due to PdfReadError.", file=sys.stderr)
-                    return filtered
-            except Exception as e2:
-                print(f"Naive PDF fallback also failed: {e2}", file=sys.stderr)
-            # If we get here, we genuinely couldn't salvage text
             raise ValueError(
-                "Failed to open PDF: the file appears to be corrupted or missing its EOF marker. "
-                "Please try downloading/exporting the PDF again, or re-save it with a PDF printer."
             )
         except Exception as e:
             raise ValueError(f"Failed to open PDF: {str(e)}")
-        # --- Normal per-page extraction path ---
         text_parts = []
         total_pages = len(pdf_reader.pages)
@@ -178,10 +137,9 @@ class DocumentConverter:
                     text_parts.append(safe_text)
         if not text_parts:
-            # If literally nothing could be extracted, then bubble a clean error
             raise ValueError(
                 "Could not extract readable text from this PDF. "
-                "The file likely uses a non-standard encoding, is image-only, or is heavily corrupted."
             )
         return "\n\n".join(text_parts)

     def _extract_from_pdf(self, file_content: bytes) -> str:
         """
+        Helper to pull text from PDF, skipping pages that fail to decode.
+        - Uses strict=False to handle slightly broken PDFs.
+        - If PdfReadError (e.g., EOF marker missing), treat as corrupted.
+        """
         try:
+            # strict=False makes PyPDF2 more forgiving about minor issues
             pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content), strict=False)
+        except PdfReadError:
             # Very likely a corrupted or badly exported PDF
             raise ValueError(
+                "This PDF appears to be corrupted or incomplete (EOF marker missing). "
+                "Please re-download or re-export the file and try again."
             )
         except Exception as e:
             raise ValueError(f"Failed to open PDF: {str(e)}")
         text_parts = []
         total_pages = len(pdf_reader.pages)
                     text_parts.append(safe_text)
         if not text_parts:
             raise ValueError(
                 "Could not extract readable text from this PDF. "
+                "It may be image-only, use a non-standard encoding, or be corrupted."
             )
         return "\n\n".join(text_parts)