Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,7 +23,6 @@ import asyncio
|
|
| 23 |
from functools import lru_cache
|
| 24 |
import hashlib
|
| 25 |
from concurrent.futures import ThreadPoolExecutor
|
| 26 |
-
import pdfplumber
|
| 27 |
|
| 28 |
# ========== CONFIGURATION ==========
|
| 29 |
PROFILES_DIR = "student_profiles"
|
|
@@ -197,20 +196,16 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
| 197 |
|
| 198 |
try:
|
| 199 |
if file_ext == '.pdf':
|
| 200 |
-
# First try pdfplumber
|
| 201 |
try:
|
| 202 |
-
with pdfplumber.open(file_path) as pdf:
|
| 203 |
-
text = "\n".join([page.extract_text() for page in pdf.pages])
|
| 204 |
-
if not text.strip():
|
| 205 |
-
raise ValueError("pdfplumber returned empty text - the PDF may be image-based")
|
| 206 |
-
except Exception as e:
|
| 207 |
-
logging.warning(f"pdfplumber failed: {str(e)}. Trying PyMuPDF fallback...")
|
| 208 |
doc = fitz.open(file_path)
|
| 209 |
for page in doc:
|
| 210 |
text += page.get_text("text") + '\n'
|
| 211 |
if not text.strip():
|
| 212 |
-
raise ValueError("PyMuPDF returned empty text -
|
| 213 |
-
|
|
|
|
|
|
|
| 214 |
|
| 215 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
| 216 |
text = extract_text_with_ocr(file_path)
|
|
|
|
| 23 |
from functools import lru_cache
|
| 24 |
import hashlib
|
| 25 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
| 26 |
|
| 27 |
# ========== CONFIGURATION ==========
|
| 28 |
PROFILES_DIR = "student_profiles"
|
|
|
|
| 196 |
|
| 197 |
try:
|
| 198 |
if file_ext == '.pdf':
|
| 199 |
+
# First try PyMuPDF for text extraction
|
| 200 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
doc = fitz.open(file_path)
|
| 202 |
for page in doc:
|
| 203 |
text += page.get_text("text") + '\n'
|
| 204 |
if not text.strip():
|
| 205 |
+
raise ValueError("PyMuPDF returned empty text - the PDF may be image-based")
|
| 206 |
+
except Exception as e:
|
| 207 |
+
logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
|
| 208 |
+
text = extract_text_from_pdf_with_ocr(file_path)
|
| 209 |
|
| 210 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
| 211 |
text = extract_text_with_ocr(file_path)
|