cryogenic22 committed on
Commit
ca49412
·
verified ·
1 Parent(s): 694fa5c

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +61 -37
utils/document_processor.py CHANGED
@@ -153,47 +153,71 @@ class DocumentProcessor:
153
  raise ValueError(f"Unsupported file type: {file_type}")
154
  chunks = self._create_chunks(text)
155
  return text, chunks
156
-
157
def _process_pdf(self, file_path: str) -> str:
    """Extract text from a PDF by rendering each page to an image and running OCR.

    The method looks for the ``pdftoppm`` binary in a fixed list of
    directories, puts the first match at the front of ``PATH`` and
    ``LD_LIBRARY_PATH``, checks that ``pdftoppm -v`` can be executed, and
    then converts the PDF with ``convert_from_bytes`` and OCRs every page
    with ``pytesseract.image_to_string``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The OCR text of all pages, each preceded by a
        ``--- Page N ---`` marker line.

    Raises:
        ValueError: If ``pdftoppm`` is not present in any candidate directory.
        Exception: Any other failure is reported via ``st.error`` and re-raised.
    """
    try:
        # Try some common Poppler installation paths
        poppler_paths = [
            "/usr/bin",
            "/usr/local/bin",
            "/opt/poppler/bin",
            "/Library/Frameworks/Poppler.framework/Versions/Current/bin",  # for macOS
        ]

        # Find the first directory that actually contains pdftoppm
        poppler_dir = next(
            (
                candidate
                for candidate in poppler_paths
                if os.path.exists(os.path.join(candidate, "pdftoppm"))
            ),
            None,
        )
        if poppler_dir is None:
            raise ValueError("Poppler not found in any of the common installation paths.")

        # Put the Poppler directory first in PATH and LD_LIBRARY_PATH.
        # Existing entries are kept in order; empty entries and earlier
        # copies of poppler_dir are dropped so repeated calls do not grow
        # the variables and no empty entry (which would mean "current
        # directory") is introduced when a variable was previously unset.
        for var_name in ("PATH", "LD_LIBRARY_PATH"):
            existing = [
                entry
                for entry in os.environ.get(var_name, "").split(os.pathsep)
                if entry and entry != poppler_dir
            ]
            os.environ[var_name] = os.pathsep.join([poppler_dir, *existing])

        # Test the Poppler installation
        try:
            subprocess.check_output(["pdftoppm", "-v"])
        except (subprocess.CalledProcessError, FileNotFoundError):
            st.error("Unable to find Poppler. Please check the installation.")
            raise
        else:
            st.info("Poppler is installed and in the PATH.")

        # Process the PDF file using pdf2image and Tesseract OCR.
        # The context manager guarantees the file handle is closed.
        with open(file_path, "rb") as pdf_file:
            images = convert_from_bytes(pdf_file.read())

        page_texts = []
        for page_num, image in enumerate(images, 1):
            st.info(f"Performing OCR on page {page_num}...")
            page_text = pytesseract.image_to_string(image)
            page_texts.append(f"\n--- Page {page_num} ---\n{page_text}")
        return "".join(page_texts)
    except Exception as e:
        # UI boundary: surface the problem to the user, then propagate it.
        st.error(f"Error processing PDF: {str(e)}")
        raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
 
199
 
 
153
  raise ValueError(f"Unsupported file type: {file_type}")
154
  chunks = self._create_chunks(text)
155
  return text, chunks
156
+
157
+
158
def process_pdf(self, file_path: str) -> Optional[str]:
    """Extract text from a PDF by rendering each page to an image and running OCR.

    The method first tries to run ``pdftoppm -v``. If that fails it looks
    for the binary in a fixed list of directories and prepends the first
    match to ``PATH``. The PDF is then rendered at 300 DPI with
    ``convert_from_bytes`` and every page is passed to
    ``pytesseract.image_to_string``.

    Progress and errors are reported through Streamlit (``st``); errors are
    not raised to the caller.

    NOTE(review): this method was renamed from ``_process_pdf`` in this
    commit; confirm that every caller uses the new name and handles a
    ``None`` result.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The OCR text of all pages with a ``==== Page N ====`` style header
        before each page and surrounding whitespace stripped, or ``None``
        if Poppler cannot be located or processing fails.
    """
    try:
        # First verify if poppler is installed via package manager
        try:
            subprocess.check_output(['pdftoppm', '-v'], stderr=subprocess.STDOUT)
        except (subprocess.CalledProcessError, FileNotFoundError):
            # If not in default path, check common installation directories
            poppler_paths = [
                "/usr/bin",
                "/usr/local/bin",
                "/opt/poppler/bin",
                "/app/.apt/usr/bin",  # Common HF Spaces path
                os.path.expanduser("~/.local/bin"),
            ]
            poppler_dir = next(
                (
                    candidate
                    for candidate in poppler_paths
                    if os.path.exists(os.path.join(candidate, "pdftoppm"))
                ),
                None,
            )
            if poppler_dir is None:
                st.error("❌ Poppler not found. Please ensure 'poppler-utils' is in packages.txt")
                return None

            # Put poppler_dir first in PATH. Empty entries and earlier
            # copies of poppler_dir are dropped so repeated calls do not
            # keep growing PATH or add an empty ("current directory") entry.
            existing = [
                entry
                for entry in os.environ.get("PATH", "").split(os.pathsep)
                if entry and entry != poppler_dir
            ]
            os.environ["PATH"] = os.pathsep.join([poppler_dir, *existing])
            st.success(f"✓ Found Poppler in {poppler_dir}")
        else:
            st.success("Poppler found on system")

        # Attempt to read and convert the PDF
        try:
            with open(file_path, 'rb') as pdf_file:
                pdf_bytes = pdf_file.read()

            # Convert PDF to images
            images = convert_from_bytes(
                pdf_bytes,
                dpi=300,  # Increase DPI for better OCR quality
                fmt='png',
            )

            total_pages = len(images)
            # Create ONE progress bar and update it per page; calling
            # st.progress() inside the loop would render a new bar each time.
            progress_bar = st.progress(0) if total_pages else None

            page_texts = []
            for page_num, image in enumerate(images, 1):
                progress_bar.progress(page_num / total_pages)
                st.info(f"📄 Processing page {page_num}/{total_pages}")

                # Perform OCR with custom configuration
                page_text = pytesseract.image_to_string(
                    image,
                    config='--psm 3 --oem 3',  # Use default page segmentation and OCR Engine Mode
                )
                page_texts.append(
                    f"\n{'=' * 20} Page {page_num} {'=' * 20}\n{page_text}\n"
                )

            return "".join(page_texts).strip()

        except Exception as e:
            # UI boundary: report the failure instead of propagating it.
            st.error(f"Error processing PDF content: {str(e)}")
            return None

    except Exception as e:
        # Last-resort guard for failures outside the conversion step
        # (e.g. while probing for Poppler).
        st.error(f"Unexpected error: {str(e)}")
        return None
221
 
222
 
223