cryogenic22 committed on
Commit
cd6801c
·
verified ·
1 Parent(s): 4a25702

Update utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +31 -1
utils/document_processor.py CHANGED
@@ -154,8 +154,36 @@ class DocumentProcessor:
154
  chunks = self._create_chunks(text)
155
  return text, chunks
156
 
157
- def _process_pdf(self, file_path: str) -> str:
158
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  images = convert_from_bytes(open(file_path, 'rb').read())
160
  text = ""
161
  for page_num, image in enumerate(images, 1):
@@ -167,6 +195,8 @@ class DocumentProcessor:
167
  st.error(f"Error processing PDF: {str(e)}")
168
  raise
169
 
 
 
170
  def _process_docx(self, file_path: str) -> str:
171
  """Extract text from DOCX files."""
172
  try:
 
154
  chunks = self._create_chunks(text)
155
  return text, chunks
156
 
157
+ def __process_pdf(self, file_path: str) -> str:
158
  try:
159
+ # Try some common Poppler installation paths
160
+ poppler_paths = [
161
+ "/usr/bin",
162
+ "/usr/local/bin",
163
+ "/opt/poppler/bin",
164
+ "/Library/Frameworks/Poppler.framework/Versions/Current/bin", # for macOS
165
+ ]
166
+
167
+ # Find the first valid Poppler path
168
+ for poppler_dir in poppler_paths:
169
+ if os.path.exists(os.path.join(poppler_dir, "pdftoppm")):
170
+ break
171
+ else:
172
+ raise ValueError("Poppler not found in any of the common installation paths.")
173
+
174
+ # Update the PATH and LD_LIBRARY_PATH environment variables
175
+ os.environ["PATH"] = f"{poppler_dir}:{os.environ['PATH']}"
176
+ os.environ["LD_LIBRARY_PATH"] = f"{poppler_dir}:{os.environ.get('LD_LIBRARY_PATH', '')}"
177
+
178
+ # Test the Poppler installation
179
+ try:
180
+ subprocess.check_output(["pdftoppm", "-v"])
181
+ st.info("Poppler is installed and in the PATH.")
182
+ except (subprocess.CalledProcessError, FileNotFoundError):
183
+ st.error("Unable to find Poppler. Please check the installation.")
184
+ raise
185
+
186
+ # Process the PDF file using pdf2image and Tesseract OCR
187
  images = convert_from_bytes(open(file_path, 'rb').read())
188
  text = ""
189
  for page_num, image in enumerate(images, 1):
 
195
  st.error(f"Error processing PDF: {str(e)}")
196
  raise
197
 
198
+
199
+
200
  def _process_docx(self, file_path: str) -> str:
201
  """Extract text from DOCX files."""
202
  try: