mbuckle committed on
Commit
7829b2b
·
1 Parent(s): c8b00d0

Count fix - debug version

Browse files
Files changed (1) hide show
  1. app.py +55 -30
app.py CHANGED
@@ -100,42 +100,73 @@ except Exception as e:
100
  sys.exit(1)
101
 
102
  def process_document(file):
103
- """Process uploaded document with PaddleOCR"""
104
  if file is None:
105
  return "No file uploaded", "", ""
106
 
107
  start_time = time.time()
108
 
109
  try:
110
- filename = os.path.basename(file.name)
 
 
 
 
 
 
 
 
 
 
 
 
111
  print(f"Processing: {filename}")
112
 
113
- # Count pages if PDF with better error handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  total_pages = 1
115
  if filename.lower().endswith('.pdf'):
116
  try:
117
- doc = fitz.open(file.name)
118
- # Try multiple ways to get page count
119
- try:
120
- total_pages = doc.page_count # PyMuPDF >= 1.23.0
121
- except AttributeError:
122
- try:
123
- total_pages = doc.pageCount # PyMuPDF < 1.23.0
124
- except AttributeError:
125
- total_pages = len(doc) # Fallback method
126
- print(f"PDF has {total_pages} pages")
 
 
 
 
 
 
 
 
127
  doc.close()
128
  except Exception as e:
129
- print(f"Could not count PDF pages, assuming 1 page: {e}")
130
  total_pages = 1
131
 
132
- # Run OCR with better error handling
133
- print("Starting OCR processing...")
134
- try:
135
- result = ocr.ocr(file.name, cls=True)
136
- except Exception as ocr_error:
137
- print(f"OCR processing failed: {ocr_error}")
138
- return f"OCR Error: {str(ocr_error)}", "", json.dumps({"success": False, "error": str(ocr_error)})
139
 
140
  # Extract text
141
  extracted_text = ""
@@ -145,17 +176,12 @@ def process_document(file):
145
  for page_idx, page_result in enumerate(result):
146
  if page_result:
147
  pages_processed += 1
148
- print(f"Processing page {page_idx + 1}")
149
  for line in page_result:
150
- if len(line) >= 2 and line[1][1] > 0.5: # Confidence > 50%
151
  extracted_text += line[1][0] + "\n"
152
- else:
153
- print("OCR returned no results")
154
 
155
  processing_time = time.time() - start_time
156
- print(f"Processing completed in {processing_time:.2f} seconds")
157
 
158
- # Create summary
159
  summary = f"""
160
  📄 **File**: {filename}
161
  📊 **Pages Processed**: {pages_processed}/{total_pages}
@@ -164,7 +190,6 @@ def process_document(file):
164
  πŸ”§ **OCR Engine**: PaddleOCR
165
  """
166
 
167
- # For API compatibility, also return JSON format
168
  api_response = json.dumps({
169
  "success": True,
170
  "text": extracted_text,
@@ -179,9 +204,9 @@ def process_document(file):
179
 
180
  except Exception as e:
181
  error_msg = f"Error processing file: {str(e)}"
182
- print(f"Processing error: {e}")
183
  import traceback
184
- traceback.print_exc() # Print full stack trace for debugging
185
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
186
 
187
  def process_api_request(api_data):
 
100
  sys.exit(1)
101
 
102
  def process_document(file):
103
+ """Process uploaded document with PaddleOCR - Debug Version"""
104
  if file is None:
105
  return "No file uploaded", "", ""
106
 
107
  start_time = time.time()
108
 
109
  try:
110
+ # Debug file object
111
+ print(f"File object type: {type(file)}")
112
+ print(f"File object attributes: {dir(file)}")
113
+
114
+ # Try different ways to get filename
115
+ try:
116
+ filename = os.path.basename(file.name)
117
+ except AttributeError:
118
+ try:
119
+ filename = file.orig_name if hasattr(file, 'orig_name') else 'unknown.pdf'
120
+ except:
121
+ filename = 'unknown.pdf'
122
+
123
  print(f"Processing: {filename}")
124
 
125
+ # Try different ways to access file path
126
+ file_path = None
127
+ if hasattr(file, 'name'):
128
+ file_path = file.name
129
+ elif hasattr(file, 'path'):
130
+ file_path = file.path
131
+ elif hasattr(file, 'file'):
132
+ file_path = file.file.name if hasattr(file.file, 'name') else None
133
+
134
+ if not file_path:
135
+ return "Error: Could not access file path", "", json.dumps({"success": False, "error": "File path not accessible"})
136
+
137
+ print(f"File path: {file_path}")
138
+
139
+ # Count pages if PDF
140
  total_pages = 1
141
  if filename.lower().endswith('.pdf'):
142
  try:
143
+ print(f"Opening PDF: {file_path}")
144
+ doc = fitz.open(file_path)
145
+
146
+ # Debug document object
147
+ print(f"Document object type: {type(doc)}")
148
+ print(f"Document attributes: {[attr for attr in dir(doc) if not attr.startswith('_')]}")
149
+
150
+ # Try all possible ways to get page count
151
+ if hasattr(doc, 'page_count'):
152
+ total_pages = doc.page_count
153
+ print(f"Used page_count: {total_pages}")
154
+ elif hasattr(doc, 'pageCount'):
155
+ total_pages = doc.pageCount
156
+ print(f"Used pageCount: {total_pages}")
157
+ else:
158
+ total_pages = len(doc)
159
+ print(f"Used len(): {total_pages}")
160
+
161
  doc.close()
162
  except Exception as e:
163
+ print(f"PDF page counting error: {e}")
164
  total_pages = 1
165
 
166
+ # Run OCR
167
+ print(f"Running OCR on: {file_path}")
168
+ result = ocr.ocr(file_path, cls=True)
169
+ print(f"OCR result type: {type(result)}")
 
 
 
170
 
171
  # Extract text
172
  extracted_text = ""
 
176
  for page_idx, page_result in enumerate(result):
177
  if page_result:
178
  pages_processed += 1
 
179
  for line in page_result:
180
+ if len(line) >= 2 and line[1][1] > 0.5:
181
  extracted_text += line[1][0] + "\n"
 
 
182
 
183
  processing_time = time.time() - start_time
 
184
 
 
185
  summary = f"""
186
  📄 **File**: {filename}
187
  📊 **Pages Processed**: {pages_processed}/{total_pages}
 
190
  🔧 **OCR Engine**: PaddleOCR
191
  """
192
 
 
193
  api_response = json.dumps({
194
  "success": True,
195
  "text": extracted_text,
 
204
 
205
  except Exception as e:
206
  error_msg = f"Error processing file: {str(e)}"
207
+ print(f"Full error: {e}")
208
  import traceback
209
+ traceback.print_exc()
210
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
211
 
212
  def process_api_request(api_data):