Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Community

mbuckle committed on Jun 2, 2025

Commit

7829b2b

1 Parent(s): c8b00d0

Count fix - debug version

Browse files

Files changed (1) hide show

app.py +55 -30

app.py CHANGED Viewed

@@ -100,42 +100,73 @@ except Exception as e:
     sys.exit(1)
 def process_document(file):
-    """Process uploaded document with PaddleOCR"""
     if file is None:
         return "No file uploaded", "", ""
     start_time = time.time()
     try:
-        filename = os.path.basename(file.name)
         print(f"Processing: {filename}")
-        # Count pages if PDF with better error handling
         total_pages = 1
         if filename.lower().endswith('.pdf'):
             try:
-                doc = fitz.open(file.name)
-                # Try multiple ways to get page count
-                try:
-                    total_pages = doc.page_count  # PyMuPDF >= 1.23.0
-                except AttributeError:
-                    try:
-                        total_pages = doc.pageCount  # PyMuPDF < 1.23.0
-                    except AttributeError:
-                        total_pages = len(doc)  # Fallback method
-                print(f"PDF has {total_pages} pages")
                 doc.close()
             except Exception as e:
-                print(f"Could not count PDF pages, assuming 1 page: {e}")
                 total_pages = 1
-        # Run OCR with better error handling
-        print("Starting OCR processing...")
-        try:
-            result = ocr.ocr(file.name, cls=True)
-        except Exception as ocr_error:
-            print(f"OCR processing failed: {ocr_error}")
-            return f"OCR Error: {str(ocr_error)}", "", json.dumps({"success": False, "error": str(ocr_error)})
         # Extract text
         extracted_text = ""
@@ -145,17 +176,12 @@ def process_document(file):
             for page_idx, page_result in enumerate(result):
                 if page_result:
                     pages_processed += 1
-                    print(f"Processing page {page_idx + 1}")
                     for line in page_result:
-                        if len(line) >= 2 and line[1][1] > 0.5:  # Confidence > 50%
                             extracted_text += line[1][0] + "\n"
-        else:
-            print("OCR returned no results")
         processing_time = time.time() - start_time
-        print(f"Processing completed in {processing_time:.2f} seconds")
-        # Create summary
         summary = f"""
 📄 **File**: {filename}
 📊 **Pages Processed**: {pages_processed}/{total_pages}
@@ -164,7 +190,6 @@ def process_document(file):
 🔧 **OCR Engine**: PaddleOCR
         """
-        # For API compatibility, also return JSON format
         api_response = json.dumps({
             "success": True,
             "text": extracted_text,
@@ -179,9 +204,9 @@ def process_document(file):
     except Exception as e:
         error_msg = f"Error processing file: {str(e)}"
-        print(f"Processing error: {e}")
         import traceback
-        traceback.print_exc()  # Print full stack trace for debugging
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):

     sys.exit(1)
 def process_document(file):
+    """Process uploaded document with PaddleOCR - Debug Version"""
     if file is None:
         return "No file uploaded", "", ""
     start_time = time.time()
     try:
+        # Debug file object
+        print(f"File object type: {type(file)}")
+        print(f"File object attributes: {dir(file)}")
+        # Try different ways to get filename
+        try:
+            filename = os.path.basename(file.name)
+        except AttributeError:
+            try:
+                filename = file.orig_name if hasattr(file, 'orig_name') else 'unknown.pdf'
+            except:
+                filename = 'unknown.pdf'
         print(f"Processing: {filename}")
+        # Try different ways to access file path
+        file_path = None
+        if hasattr(file, 'name'):
+            file_path = file.name
+        elif hasattr(file, 'path'):
+            file_path = file.path
+        elif hasattr(file, 'file'):
+            file_path = file.file.name if hasattr(file.file, 'name') else None
+        if not file_path:
+            return "Error: Could not access file path", "", json.dumps({"success": False, "error": "File path not accessible"})
+        print(f"File path: {file_path}")
+        # Count pages if PDF
         total_pages = 1
         if filename.lower().endswith('.pdf'):
             try:
+                print(f"Opening PDF: {file_path}")
+                doc = fitz.open(file_path)
+                # Debug document object
+                print(f"Document object type: {type(doc)}")
+                print(f"Document attributes: {[attr for attr in dir(doc) if not attr.startswith('_')]}")
+                # Try all possible ways to get page count
+                if hasattr(doc, 'page_count'):
+                    total_pages = doc.page_count
+                    print(f"Used page_count: {total_pages}")
+                elif hasattr(doc, 'pageCount'):
+                    total_pages = doc.pageCount
+                    print(f"Used pageCount: {total_pages}")
+                else:
+                    total_pages = len(doc)
+                    print(f"Used len(): {total_pages}")
                 doc.close()
             except Exception as e:
+                print(f"PDF page counting error: {e}")
                 total_pages = 1
+        # Run OCR
+        print(f"Running OCR on: {file_path}")
+        result = ocr.ocr(file_path, cls=True)
+        print(f"OCR result type: {type(result)}")
         # Extract text
         extracted_text = ""
             for page_idx, page_result in enumerate(result):
                 if page_result:
                     pages_processed += 1
                     for line in page_result:
+                        if len(line) >= 2 and line[1][1] > 0.5:
                             extracted_text += line[1][0] + "\n"
         processing_time = time.time() - start_time
         summary = f"""
 📄 **File**: {filename}
 📊 **Pages Processed**: {pages_processed}/{total_pages}
 🔧 **OCR Engine**: PaddleOCR
         """
         api_response = json.dumps({
             "success": True,
             "text": extracted_text,
     except Exception as e:
         error_msg = f"Error processing file: {str(e)}"
+        print(f"Full error: {e}")
         import traceback
+        traceback.print_exc()
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):