Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 2, 2025

Commit

c8b00d0

1 Parent(s): ca91382

Count pages fix attempt #2

Browse files

Files changed (1) hide show

app.py +31 -14

app.py CHANGED Viewed

@@ -110,35 +110,50 @@ def process_document(file):
         filename = os.path.basename(file.name)
         print(f"Processing: {filename}")
-        # Count pages if PDF
         total_pages = 1
         if filename.lower().endswith('.pdf'):
             try:
                 doc = fitz.open(file.name)
-                # Handle different PyMuPDF versions
                 try:
-                    total_pages = doc.page_count  # Newer versions
                 except AttributeError:
-                    total_pages = len(doc)  # Older versions or alternative
                 doc.close()
             except Exception as e:
-                print(f"Could not count PDF pages: {e}")
-        # Run OCR
-        result = ocr.ocr(file.name, cls=True)
         # Extract text
         extracted_text = ""
         pages_processed = 0
-        for page_idx, page_result in enumerate(result):
-            if page_result:
-                pages_processed += 1
-                for line in page_result:
-                    if len(line) >= 2 and line[1][1] > 0.5:  # Confidence > 50%
-                        extracted_text += line[1][0] + "\n"
         processing_time = time.time() - start_time
         # Create summary
         summary = f"""
@@ -165,8 +180,10 @@ def process_document(file):
     except Exception as e:
         error_msg = f"Error processing file: {str(e)}"
         print(f"Processing error: {e}")
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):
     """Process API-style requests (for integration with your Vercel app)"""
     try:

         filename = os.path.basename(file.name)
         print(f"Processing: {filename}")
+        # Count pages if PDF with better error handling
         total_pages = 1
         if filename.lower().endswith('.pdf'):
             try:
                 doc = fitz.open(file.name)
+                # Try multiple ways to get page count
                 try:
+                    total_pages = doc.page_count  # PyMuPDF >= 1.23.0
                 except AttributeError:
+                    try:
+                        total_pages = doc.pageCount  # PyMuPDF < 1.23.0
+                    except AttributeError:
+                        total_pages = len(doc)  # Fallback method
+                print(f"PDF has {total_pages} pages")
                 doc.close()
             except Exception as e:
+                print(f"Could not count PDF pages, assuming 1 page: {e}")
+                total_pages = 1
+        # Run OCR with better error handling
+        print("Starting OCR processing...")
+        try:
+            result = ocr.ocr(file.name, cls=True)
+        except Exception as ocr_error:
+            print(f"OCR processing failed: {ocr_error}")
+            return f"OCR Error: {str(ocr_error)}", "", json.dumps({"success": False, "error": str(ocr_error)})
         # Extract text
         extracted_text = ""
         pages_processed = 0
+        if result:
+            for page_idx, page_result in enumerate(result):
+                if page_result:
+                    pages_processed += 1
+                    print(f"Processing page {page_idx + 1}")
+                    for line in page_result:
+                        if len(line) >= 2 and line[1][1] > 0.5:  # Confidence > 50%
+                            extracted_text += line[1][0] + "\n"
+        else:
+            print("OCR returned no results")
         processing_time = time.time() - start_time
+        print(f"Processing completed in {processing_time:.2f} seconds")
         # Create summary
         summary = f"""
     except Exception as e:
         error_msg = f"Error processing file: {str(e)}"
         print(f"Processing error: {e}")
+        import traceback
+        traceback.print_exc()  # Print full stack trace for debugging
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):
     """Process API-style requests (for integration with your Vercel app)"""
     try: