mbuckle committed on
Commit
c8b00d0
·
1 Parent(s): ca91382

Count pages fix attempt #2

Browse files
Files changed (1) hide show
  1. app.py +31 -14
app.py CHANGED
@@ -110,35 +110,50 @@ def process_document(file):
110
  filename = os.path.basename(file.name)
111
  print(f"Processing: {filename}")
112
 
113
- # Count pages if PDF
114
  total_pages = 1
115
  if filename.lower().endswith('.pdf'):
116
  try:
117
  doc = fitz.open(file.name)
118
- # Handle different PyMuPDF versions
119
  try:
120
- total_pages = doc.page_count # Newer versions
121
  except AttributeError:
122
- total_pages = len(doc) # Older versions or alternative
 
 
 
 
123
  doc.close()
124
  except Exception as e:
125
- print(f"Could not count PDF pages: {e}")
 
126
 
127
- # Run OCR
128
- result = ocr.ocr(file.name, cls=True)
 
 
 
 
 
129
 
130
  # Extract text
131
  extracted_text = ""
132
  pages_processed = 0
133
 
134
- for page_idx, page_result in enumerate(result):
135
- if page_result:
136
- pages_processed += 1
137
- for line in page_result:
138
- if len(line) >= 2 and line[1][1] > 0.5: # Confidence > 50%
139
- extracted_text += line[1][0] + "\n"
 
 
 
 
140
 
141
  processing_time = time.time() - start_time
 
142
 
143
  # Create summary
144
  summary = f"""
@@ -165,8 +180,10 @@ def process_document(file):
165
  except Exception as e:
166
  error_msg = f"Error processing file: {str(e)}"
167
  print(f"Processing error: {e}")
 
 
168
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
169
-
170
  def process_api_request(api_data):
171
  """Process API-style requests (for integration with your Vercel app)"""
172
  try:
 
110
  filename = os.path.basename(file.name)
111
  print(f"Processing: {filename}")
112
 
113
+ # Count pages if PDF with better error handling
114
  total_pages = 1
115
  if filename.lower().endswith('.pdf'):
116
  try:
117
  doc = fitz.open(file.name)
118
+ # Try multiple ways to get page count
119
  try:
120
+ total_pages = doc.page_count # PyMuPDF >= 1.23.0
121
  except AttributeError:
122
+ try:
123
+ total_pages = doc.pageCount # PyMuPDF < 1.23.0
124
+ except AttributeError:
125
+ total_pages = len(doc) # Fallback method
126
+ print(f"PDF has {total_pages} pages")
127
  doc.close()
128
  except Exception as e:
129
+ print(f"Could not count PDF pages, assuming 1 page: {e}")
130
+ total_pages = 1
131
 
132
+ # Run OCR with better error handling
133
+ print("Starting OCR processing...")
134
+ try:
135
+ result = ocr.ocr(file.name, cls=True)
136
+ except Exception as ocr_error:
137
+ print(f"OCR processing failed: {ocr_error}")
138
+ return f"OCR Error: {str(ocr_error)}", "", json.dumps({"success": False, "error": str(ocr_error)})
139
 
140
  # Extract text
141
  extracted_text = ""
142
  pages_processed = 0
143
 
144
+ if result:
145
+ for page_idx, page_result in enumerate(result):
146
+ if page_result:
147
+ pages_processed += 1
148
+ print(f"Processing page {page_idx + 1}")
149
+ for line in page_result:
150
+ if len(line) >= 2 and line[1][1] > 0.5: # Confidence > 50%
151
+ extracted_text += line[1][0] + "\n"
152
+ else:
153
+ print("OCR returned no results")
154
 
155
  processing_time = time.time() - start_time
156
+ print(f"Processing completed in {processing_time:.2f} seconds")
157
 
158
  # Create summary
159
  summary = f"""
 
180
  except Exception as e:
181
  error_msg = f"Error processing file: {str(e)}"
182
  print(f"Processing error: {e}")
183
+ import traceback
184
+ traceback.print_exc() # Print full stack trace for debugging
185
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
186
+
187
  def process_api_request(api_data):
188
  """Process API-style requests (for integration with your Vercel app)"""
189
  try: