Spaces:
Build error
Build error
Update utils/document_processor.py
Browse files- utils/document_processor.py +61 -37
utils/document_processor.py
CHANGED
|
@@ -153,47 +153,71 @@ class DocumentProcessor:
|
|
| 153 |
raise ValueError(f"Unsupported file type: {file_type}")
|
| 154 |
chunks = self._create_chunks(text)
|
| 155 |
return text, chunks
|
| 156 |
-
|
| 157 |
-
|
|
|
|
| 158 |
try:
|
| 159 |
-
#
|
| 160 |
-
poppler_paths = [
|
| 161 |
-
"/usr/bin",
|
| 162 |
-
"/usr/local/bin",
|
| 163 |
-
"/opt/poppler/bin",
|
| 164 |
-
"/Library/Frameworks/Poppler.framework/Versions/Current/bin", # for macOS
|
| 165 |
-
]
|
| 166 |
-
|
| 167 |
-
# Find the first valid Poppler path
|
| 168 |
-
for poppler_dir in poppler_paths:
|
| 169 |
-
if os.path.exists(os.path.join(poppler_dir, "pdftoppm")):
|
| 170 |
-
break
|
| 171 |
-
else:
|
| 172 |
-
raise ValueError("Poppler not found in any of the common installation paths.")
|
| 173 |
-
|
| 174 |
-
# Update the PATH and LD_LIBRARY_PATH environment variables
|
| 175 |
-
os.environ["PATH"] = f"{poppler_dir}:{os.environ['PATH']}"
|
| 176 |
-
os.environ["LD_LIBRARY_PATH"] = f"{poppler_dir}:{os.environ.get('LD_LIBRARY_PATH', '')}"
|
| 177 |
-
|
| 178 |
-
# Test the Poppler installation
|
| 179 |
try:
|
| 180 |
-
subprocess.check_output([
|
| 181 |
-
st.
|
| 182 |
except (subprocess.CalledProcessError, FileNotFoundError):
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
-
#
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
|
| 199 |
|
|
|
|
| 153 |
raise ValueError(f"Unsupported file type: {file_type}")
|
| 154 |
chunks = self._create_chunks(text)
|
| 155 |
return text, chunks
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def process_pdf(self, file_path: str) -> Optional[str]:
    """Extract text from a PDF by rendering its pages to images and running OCR.

    The method first makes sure the Poppler ``pdftoppm`` binary can be
    found, prepending a known installation directory to ``PATH`` if needed.
    It then converts the PDF bytes to page images with ``convert_from_bytes``
    and runs ``pytesseract.image_to_string`` on each page. Progress and
    errors are reported through Streamlit (``st``).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The OCR text of all pages, each preceded by a
        ``==================== Page N ====================`` banner, with
        leading/trailing whitespace stripped; ``None`` if Poppler cannot be
        located or any error occurs (the error is shown via ``st.error``).
    """
    try:
        # First verify if poppler is installed via package manager
        try:
            subprocess.check_output(['pdftoppm', '-v'], stderr=subprocess.STDOUT)
            st.success("✓ Poppler found on system")
        except (subprocess.CalledProcessError, FileNotFoundError):
            # If not in default path, check common installation directories
            poppler_paths = [
                "/usr/bin",
                "/usr/local/bin",
                "/opt/poppler/bin",
                "/app/.apt/usr/bin",  # Common HF Spaces path
                os.path.expanduser("~/.local/bin"),
            ]

            for poppler_dir in poppler_paths:
                # isfile (not exists) so a directory named "pdftoppm" is not
                # mistaken for the binary.
                if os.path.isfile(os.path.join(poppler_dir, "pdftoppm")):
                    # Prepend to PATH. Only append the previous value when it
                    # is non-empty: a trailing separator would implicitly add
                    # the current directory to the search path.
                    current_path = os.environ.get("PATH", "")
                    os.environ["PATH"] = (
                        f"{poppler_dir}{os.pathsep}{current_path}"
                        if current_path
                        else poppler_dir
                    )
                    st.success(f"✓ Found Poppler in {poppler_dir}")
                    break
            else:
                # Loop finished without a break: no candidate directory matched.
                st.error("❌ Poppler not found. Please ensure 'poppler-utils' is in packages.txt")
                return None

        # Attempt to read and convert the PDF
        try:
            with open(file_path, 'rb') as pdf_file:
                pdf_bytes = pdf_file.read()

            # Convert PDF to images
            images = convert_from_bytes(
                pdf_bytes,
                dpi=300,  # Increase DPI for better OCR quality
                fmt='png',
            )

            total_pages = len(images)
            if total_pages == 0:
                # Nothing to OCR; same result the page loop would produce.
                return ""

            # Create the progress bar and status line once and update them in
            # place, instead of adding a new widget to the page for every
            # processed PDF page.
            progress_bar = st.progress(0.0)
            status = st.empty()

            # Process each page
            sections = []
            for page_num, image in enumerate(images, 1):
                progress_bar.progress(page_num / total_pages)
                status.info(f"📄 Processing page {page_num}/{total_pages}")

                # Perform OCR with custom configuration
                page_text = pytesseract.image_to_string(
                    image,
                    config='--psm 3 --oem 3',  # Use default page segmentation and OCR Engine Mode
                )

                sections.append(
                    f"\n{'='*20} Page {page_num} {'='*20}\n{page_text}\n"
                )

            return "".join(sections).strip()

        except Exception as e:
            st.error(f"Error processing PDF content: {str(e)}")
            return None

    except Exception as e:
        # Boundary handler for failures during the Poppler lookup itself
        # (e.g. an OSError other than FileNotFoundError).
        st.error(f"Unexpected error: {str(e)}")
        return None
|
| 221 |
|
| 222 |
|
| 223 |
|