omgy committed on
Commit
19de0ae
·
verified ·
1 Parent(s): f6ed292

Update document_converter.py

Browse files
Files changed (1) hide show
  1. document_converter.py +9 -51
document_converter.py CHANGED
@@ -96,64 +96,23 @@ class DocumentConverter:
96
 
97
  def _extract_from_pdf(self, file_content: bytes) -> str:
98
  """
99
- Helper to pull text from PDF.
100
-
101
- Strategy:
102
- - First try PyPDF2 with strict=False (handles most normal PDFs).
103
- - Skip pages that fail to decode.
104
- - If PyPDF2 raises PdfReadError (e.g., EOF marker not found),
105
- try a naive fallback that treats the bytes as text and filters
106
- printable characters.
107
- """
108
 
109
- # --- First attempt: normal PyPDF2 read ---
 
 
110
  try:
111
- # strict=False makes PyPDF2 more forgiving about slightly broken PDFs
112
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content), strict=False)
113
- except PdfReadError as e:
114
  # Very likely a corrupted or badly exported PDF
115
- print(f"PyPDF2 PdfReadError: {e}. Trying naive fallback text extraction.", file=sys.stderr)
116
-
117
- # --- Fallback: naive "best effort" text extraction from raw bytes ---
118
- try:
119
- # Decode raw bytes to string using latin-1 (1:1 byte→char mapping),
120
- # then keep only printable characters and whitespace.
121
- raw = file_content.decode('latin-1', errors='ignore')
122
- filtered_chars = []
123
- for ch in raw:
124
- code = ord(ch)
125
- # Keep basic printable ASCII + common whitespace
126
- if ch in "\n\r\t":
127
- filtered_chars.append(ch)
128
- elif 32 <= code <= 126:
129
- filtered_chars.append(ch)
130
- else:
131
- # Replace non-printable with space
132
- filtered_chars.append(" ")
133
-
134
- filtered = "".join(filtered_chars)
135
- # Collapse excessive spaces
136
- filtered = re.sub(r'[ \t]{2,}', ' ', filtered)
137
- # Collapse excessive blank lines
138
- filtered = re.sub(r'\n{3,}', '\n\n', filtered)
139
-
140
- if filtered.strip():
141
- print("Using naive PDF text fallback due to PdfReadError.", file=sys.stderr)
142
- return filtered
143
-
144
- except Exception as e2:
145
- print(f"Naive PDF fallback also failed: {e2}", file=sys.stderr)
146
-
147
- # If we get here, we genuinely couldn't salvage text
148
  raise ValueError(
149
- "Failed to open PDF: the file appears to be corrupted or missing its EOF marker. "
150
- "Please try downloading/exporting the PDF again, or re-save it with a PDF printer."
151
  )
152
-
153
  except Exception as e:
154
  raise ValueError(f"Failed to open PDF: {str(e)}")
155
 
156
- # --- Normal per-page extraction path ---
157
  text_parts = []
158
  total_pages = len(pdf_reader.pages)
159
 
@@ -178,10 +137,9 @@ class DocumentConverter:
178
  text_parts.append(safe_text)
179
 
180
  if not text_parts:
181
- # If literally nothing could be extracted, then bubble a clean error
182
  raise ValueError(
183
  "Could not extract readable text from this PDF. "
184
- "The file likely uses a non-standard encoding, is image-only, or is heavily corrupted."
185
  )
186
 
187
  return "\n\n".join(text_parts)
 
96
 
97
  def _extract_from_pdf(self, file_content: bytes) -> str:
98
  """
99
+ Helper to pull text from PDF, skipping pages that fail to decode.
 
 
 
 
 
 
 
 
100
 
101
+ - Uses strict=False to handle slightly broken PDFs.
102
+ - If PdfReadError (e.g., EOF marker missing), treat as corrupted.
103
+ """
104
  try:
105
+ # strict=False makes PyPDF2 more forgiving about minor issues
106
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content), strict=False)
107
+ except PdfReadError:
108
  # Very likely a corrupted or badly exported PDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  raise ValueError(
110
+ "This PDF appears to be corrupted or incomplete (EOF marker missing). "
111
+ "Please re-download or re-export the file and try again."
112
  )
 
113
  except Exception as e:
114
  raise ValueError(f"Failed to open PDF: {str(e)}")
115
 
 
116
  text_parts = []
117
  total_pages = len(pdf_reader.pages)
118
 
 
137
  text_parts.append(safe_text)
138
 
139
  if not text_parts:
 
140
  raise ValueError(
141
  "Could not extract readable text from this PDF. "
142
+ "It may be image-only, use a non-standard encoding, or be corrupted."
143
  )
144
 
145
  return "\n\n".join(text_parts)