Spaces:

Dannyar608
/

Final_project

Runtime error

Dannyar608 committed on May 16, 2025

Commit

058e198

verified ·

1 Parent(s): 2084b35

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -23,7 +23,6 @@ import asyncio
 from functools import lru_cache
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
-import pdfplumber
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
@@ -197,20 +196,16 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
     try:
         if file_ext == '.pdf':
-            # First try pdfplumber for better text extraction
             try:
-                with pdfplumber.open(file_path) as pdf:
-                    text = "\n".join([page.extract_text() for page in pdf.pages])
-                if not text.strip():
-                    raise ValueError("pdfplumber returned empty text - the PDF may be image-based")
-            except Exception as e:
-                logging.warning(f"pdfplumber failed: {str(e)}. Trying PyMuPDF fallback...")
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
-                    raise ValueError("PyMuPDF returned empty text - trying OCR fallback...")
-                    text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)

 from functools import lru_cache
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
     try:
         if file_ext == '.pdf':
+            # First try PyMuPDF for text extraction
             try:
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
+                    raise ValueError("PyMuPDF returned empty text - the PDF may be image-based")
+            except Exception as e:
+                logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
+                text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)