Update document_converter.py
Browse files- document_converter.py +9 -51
document_converter.py
CHANGED
|
@@ -96,64 +96,23 @@ class DocumentConverter:
|
|
| 96 |
|
| 97 |
def _extract_from_pdf(self, file_content: bytes) -> str:
|
| 98 |
"""
|
| 99 |
-
Helper to pull text from PDF.
|
| 100 |
-
|
| 101 |
-
Strategy:
|
| 102 |
-
- First try PyPDF2 with strict=False (handles most normal PDFs).
|
| 103 |
-
- Skip pages that fail to decode.
|
| 104 |
-
- If PyPDF2 raises PdfReadError (e.g., EOF marker not found),
|
| 105 |
-
try a naive fallback that treats the bytes as text and filters
|
| 106 |
-
printable characters.
|
| 107 |
-
"""
|
| 108 |
|
| 109 |
-
|
|
|
|
|
|
|
| 110 |
try:
|
| 111 |
-
# strict=False makes PyPDF2 more forgiving about
|
| 112 |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content), strict=False)
|
| 113 |
-
except PdfReadError as e:
|
| 114 |
# Very likely a corrupted or badly exported PDF
|
| 115 |
-
print(f"PyPDF2 PdfReadError: {e}. Trying naive fallback text extraction.", file=sys.stderr)
|
| 116 |
-
|
| 117 |
-
# --- Fallback: naive "best effort" text extraction from raw bytes ---
|
| 118 |
-
try:
|
| 119 |
-
# Decode raw bytes to string using latin-1 (1:1 byte→char mapping),
|
| 120 |
-
# then keep only printable characters and whitespace.
|
| 121 |
-
raw = file_content.decode('latin-1', errors='ignore')
|
| 122 |
-
filtered_chars = []
|
| 123 |
-
for ch in raw:
|
| 124 |
-
code = ord(ch)
|
| 125 |
-
# Keep basic printable ASCII + common whitespace
|
| 126 |
-
if ch in "\n\r\t":
|
| 127 |
-
filtered_chars.append(ch)
|
| 128 |
-
elif 32 <= code <= 126:
|
| 129 |
-
filtered_chars.append(ch)
|
| 130 |
-
else:
|
| 131 |
-
# Replace non-printable with space
|
| 132 |
-
filtered_chars.append(" ")
|
| 133 |
-
|
| 134 |
-
filtered = "".join(filtered_chars)
|
| 135 |
-
# Collapse excessive spaces
|
| 136 |
-
filtered = re.sub(r'[ \t]{2,}', ' ', filtered)
|
| 137 |
-
# Collapse excessive blank lines
|
| 138 |
-
filtered = re.sub(r'\n{3,}', '\n\n', filtered)
|
| 139 |
-
|
| 140 |
-
if filtered.strip():
|
| 141 |
-
print("Using naive PDF text fallback due to PdfReadError.", file=sys.stderr)
|
| 142 |
-
return filtered
|
| 143 |
-
|
| 144 |
-
except Exception as e2:
|
| 145 |
-
print(f"Naive PDF fallback also failed: {e2}", file=sys.stderr)
|
| 146 |
-
|
| 147 |
-
# If we get here, we genuinely couldn't salvage text
|
| 148 |
raise ValueError(
|
| 149 |
-
"
|
| 150 |
-
"Please
|
| 151 |
)
|
| 152 |
-
|
| 153 |
except Exception as e:
|
| 154 |
raise ValueError(f"Failed to open PDF: {str(e)}")
|
| 155 |
|
| 156 |
-
# --- Normal per-page extraction path ---
|
| 157 |
text_parts = []
|
| 158 |
total_pages = len(pdf_reader.pages)
|
| 159 |
|
|
@@ -178,10 +137,9 @@ class DocumentConverter:
|
|
| 178 |
text_parts.append(safe_text)
|
| 179 |
|
| 180 |
if not text_parts:
|
| 181 |
-
# If literally nothing could be extracted, then bubble a clean error
|
| 182 |
raise ValueError(
|
| 183 |
"Could not extract readable text from this PDF. "
|
| 184 |
-
"
|
| 185 |
)
|
| 186 |
|
| 187 |
return "\n\n".join(text_parts)
|
|
|
|
| 96 |
|
| 97 |
def _extract_from_pdf(self, file_content: bytes) -> str:
|
| 98 |
"""
|
| 99 |
+
Helper to pull text from PDF, skipping pages that fail to decode.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
- Uses strict=False to handle slightly broken PDFs.
|
| 102 |
+
- If PdfReadError (e.g., EOF marker missing), treat as corrupted.
|
| 103 |
+
"""
|
| 104 |
try:
|
| 105 |
+
# strict=False makes PyPDF2 more forgiving about minor issues
|
| 106 |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content), strict=False)
|
| 107 |
+
except PdfReadError:
|
| 108 |
# Very likely a corrupted or badly exported PDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
raise ValueError(
|
| 110 |
+
"This PDF appears to be corrupted or incomplete (EOF marker missing). "
|
| 111 |
+
"Please re-download or re-export the file and try again."
|
| 112 |
)
|
|
|
|
| 113 |
except Exception as e:
|
| 114 |
raise ValueError(f"Failed to open PDF: {str(e)}")
|
| 115 |
|
|
|
|
| 116 |
text_parts = []
|
| 117 |
total_pages = len(pdf_reader.pages)
|
| 118 |
|
|
|
|
| 137 |
text_parts.append(safe_text)
|
| 138 |
|
| 139 |
if not text_parts:
|
|
|
|
| 140 |
raise ValueError(
|
| 141 |
"Could not extract readable text from this PDF. "
|
| 142 |
+
"It may be image-only, use a non-standard encoding, or be corrupted."
|
| 143 |
)
|
| 144 |
|
| 145 |
return "\n\n".join(text_parts)
|