Spaces:
Sleeping
Sleeping
Count pages fix attempt #2
Browse files
app.py
CHANGED
|
@@ -110,35 +110,50 @@ def process_document(file):
|
|
| 110 |
filename = os.path.basename(file.name)
|
| 111 |
print(f"Processing: {filename}")
|
| 112 |
|
| 113 |
-
# Count pages if PDF
|
| 114 |
total_pages = 1
|
| 115 |
if filename.lower().endswith('.pdf'):
|
| 116 |
try:
|
| 117 |
doc = fitz.open(file.name)
|
| 118 |
-
#
|
| 119 |
try:
|
| 120 |
-
total_pages = doc.page_count #
|
| 121 |
except AttributeError:
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
doc.close()
|
| 124 |
except Exception as e:
|
| 125 |
-
print(f"Could not count PDF pages: {e}")
|
|
|
|
| 126 |
|
| 127 |
-
# Run OCR
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
# Extract text
|
| 131 |
extracted_text = ""
|
| 132 |
pages_processed = 0
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
processing_time = time.time() - start_time
|
|
|
|
| 142 |
|
| 143 |
# Create summary
|
| 144 |
summary = f"""
|
|
@@ -165,8 +180,10 @@ def process_document(file):
|
|
| 165 |
except Exception as e:
|
| 166 |
error_msg = f"Error processing file: {str(e)}"
|
| 167 |
print(f"Processing error: {e}")
|
|
|
|
|
|
|
| 168 |
return error_msg, "", json.dumps({"success": False, "error": str(e)})
|
| 169 |
-
|
| 170 |
def process_api_request(api_data):
|
| 171 |
"""Process API-style requests (for integration with your Vercel app)"""
|
| 172 |
try:
|
|
|
|
| 110 |
filename = os.path.basename(file.name)
|
| 111 |
print(f"Processing: {filename}")
|
| 112 |
|
| 113 |
+
# Count pages if PDF with better error handling
|
| 114 |
total_pages = 1
|
| 115 |
if filename.lower().endswith('.pdf'):
|
| 116 |
try:
|
| 117 |
doc = fitz.open(file.name)
|
| 118 |
+
# Try multiple ways to get page count
|
| 119 |
try:
|
| 120 |
+
total_pages = doc.page_count # PyMuPDF >= 1.23.0
|
| 121 |
except AttributeError:
|
| 122 |
+
try:
|
| 123 |
+
total_pages = doc.pageCount # PyMuPDF < 1.23.0
|
| 124 |
+
except AttributeError:
|
| 125 |
+
total_pages = len(doc) # Fallback method
|
| 126 |
+
print(f"PDF has {total_pages} pages")
|
| 127 |
doc.close()
|
| 128 |
except Exception as e:
|
| 129 |
+
print(f"Could not count PDF pages, assuming 1 page: {e}")
|
| 130 |
+
total_pages = 1
|
| 131 |
|
| 132 |
+
# Run OCR with better error handling
|
| 133 |
+
print("Starting OCR processing...")
|
| 134 |
+
try:
|
| 135 |
+
result = ocr.ocr(file.name, cls=True)
|
| 136 |
+
except Exception as ocr_error:
|
| 137 |
+
print(f"OCR processing failed: {ocr_error}")
|
| 138 |
+
return f"OCR Error: {str(ocr_error)}", "", json.dumps({"success": False, "error": str(ocr_error)})
|
| 139 |
|
| 140 |
# Extract text
|
| 141 |
extracted_text = ""
|
| 142 |
pages_processed = 0
|
| 143 |
|
| 144 |
+
if result:
|
| 145 |
+
for page_idx, page_result in enumerate(result):
|
| 146 |
+
if page_result:
|
| 147 |
+
pages_processed += 1
|
| 148 |
+
print(f"Processing page {page_idx + 1}")
|
| 149 |
+
for line in page_result:
|
| 150 |
+
if len(line) >= 2 and line[1][1] > 0.5: # Confidence > 50%
|
| 151 |
+
extracted_text += line[1][0] + "\n"
|
| 152 |
+
else:
|
| 153 |
+
print("OCR returned no results")
|
| 154 |
|
| 155 |
processing_time = time.time() - start_time
|
| 156 |
+
print(f"Processing completed in {processing_time:.2f} seconds")
|
| 157 |
|
| 158 |
# Create summary
|
| 159 |
summary = f"""
|
|
|
|
| 180 |
except Exception as e:
|
| 181 |
error_msg = f"Error processing file: {str(e)}"
|
| 182 |
print(f"Processing error: {e}")
|
| 183 |
+
import traceback
|
| 184 |
+
traceback.print_exc() # Print full stack trace for debugging
|
| 185 |
return error_msg, "", json.dumps({"success": False, "error": str(e)})
|
| 186 |
+
|
| 187 |
def process_api_request(api_data):
|
| 188 |
"""Process API-style requests (for integration with your Vercel app)"""
|
| 189 |
try:
|