Spaces:
Sleeping
Sleeping
Count fix - debug version
Browse files
app.py
CHANGED
|
@@ -100,42 +100,73 @@ except Exception as e:
|
|
| 100 |
sys.exit(1)
|
| 101 |
|
| 102 |
def process_document(file):
|
| 103 |
-
"""Process uploaded document with PaddleOCR"""
|
| 104 |
if file is None:
|
| 105 |
return "No file uploaded", "", ""
|
| 106 |
|
| 107 |
start_time = time.time()
|
| 108 |
|
| 109 |
try:
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
print(f"Processing: {filename}")
|
| 112 |
|
| 113 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
total_pages = 1
|
| 115 |
if filename.lower().endswith('.pdf'):
|
| 116 |
try:
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
doc.close()
|
| 128 |
except Exception as e:
|
| 129 |
-
print(f"
|
| 130 |
total_pages = 1
|
| 131 |
|
| 132 |
-
# Run OCR
|
| 133 |
-
print("
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
except Exception as ocr_error:
|
| 137 |
-
print(f"OCR processing failed: {ocr_error}")
|
| 138 |
-
return f"OCR Error: {str(ocr_error)}", "", json.dumps({"success": False, "error": str(ocr_error)})
|
| 139 |
|
| 140 |
# Extract text
|
| 141 |
extracted_text = ""
|
|
@@ -145,17 +176,12 @@ def process_document(file):
|
|
| 145 |
for page_idx, page_result in enumerate(result):
|
| 146 |
if page_result:
|
| 147 |
pages_processed += 1
|
| 148 |
-
print(f"Processing page {page_idx + 1}")
|
| 149 |
for line in page_result:
|
| 150 |
-
if len(line) >= 2 and line[1][1] > 0.5:
|
| 151 |
extracted_text += line[1][0] + "\n"
|
| 152 |
-
else:
|
| 153 |
-
print("OCR returned no results")
|
| 154 |
|
| 155 |
processing_time = time.time() - start_time
|
| 156 |
-
print(f"Processing completed in {processing_time:.2f} seconds")
|
| 157 |
|
| 158 |
-
# Create summary
|
| 159 |
summary = f"""
|
| 160 |
π **File**: {filename}
|
| 161 |
π **Pages Processed**: {pages_processed}/{total_pages}
|
|
@@ -164,7 +190,6 @@ def process_document(file):
|
|
| 164 |
🔧 **OCR Engine**: PaddleOCR
|
| 165 |
"""
|
| 166 |
|
| 167 |
-
# For API compatibility, also return JSON format
|
| 168 |
api_response = json.dumps({
|
| 169 |
"success": True,
|
| 170 |
"text": extracted_text,
|
|
@@ -179,9 +204,9 @@ def process_document(file):
|
|
| 179 |
|
| 180 |
except Exception as e:
|
| 181 |
error_msg = f"Error processing file: {str(e)}"
|
| 182 |
-
print(f"
|
| 183 |
import traceback
|
| 184 |
-
traceback.print_exc()
|
| 185 |
return error_msg, "", json.dumps({"success": False, "error": str(e)})
|
| 186 |
|
| 187 |
def process_api_request(api_data):
|
|
|
|
| 100 |
sys.exit(1)
|
| 101 |
|
| 102 |
def process_document(file):
|
| 103 |
+
"""Process uploaded document with PaddleOCR - Debug Version"""
|
| 104 |
if file is None:
|
| 105 |
return "No file uploaded", "", ""
|
| 106 |
|
| 107 |
start_time = time.time()
|
| 108 |
|
| 109 |
try:
|
| 110 |
+
# Debug file object
|
| 111 |
+
print(f"File object type: {type(file)}")
|
| 112 |
+
print(f"File object attributes: {dir(file)}")
|
| 113 |
+
|
| 114 |
+
# Try different ways to get filename
|
| 115 |
+
try:
|
| 116 |
+
filename = os.path.basename(file.name)
|
| 117 |
+
except AttributeError:
|
| 118 |
+
try:
|
| 119 |
+
filename = file.orig_name if hasattr(file, 'orig_name') else 'unknown.pdf'
|
| 120 |
+
except:
|
| 121 |
+
filename = 'unknown.pdf'
|
| 122 |
+
|
| 123 |
print(f"Processing: {filename}")
|
| 124 |
|
| 125 |
+
# Try different ways to access file path
|
| 126 |
+
file_path = None
|
| 127 |
+
if hasattr(file, 'name'):
|
| 128 |
+
file_path = file.name
|
| 129 |
+
elif hasattr(file, 'path'):
|
| 130 |
+
file_path = file.path
|
| 131 |
+
elif hasattr(file, 'file'):
|
| 132 |
+
file_path = file.file.name if hasattr(file.file, 'name') else None
|
| 133 |
+
|
| 134 |
+
if not file_path:
|
| 135 |
+
return "Error: Could not access file path", "", json.dumps({"success": False, "error": "File path not accessible"})
|
| 136 |
+
|
| 137 |
+
print(f"File path: {file_path}")
|
| 138 |
+
|
| 139 |
+
# Count pages if PDF
|
| 140 |
total_pages = 1
|
| 141 |
if filename.lower().endswith('.pdf'):
|
| 142 |
try:
|
| 143 |
+
print(f"Opening PDF: {file_path}")
|
| 144 |
+
doc = fitz.open(file_path)
|
| 145 |
+
|
| 146 |
+
# Debug document object
|
| 147 |
+
print(f"Document object type: {type(doc)}")
|
| 148 |
+
print(f"Document attributes: {[attr for attr in dir(doc) if not attr.startswith('_')]}")
|
| 149 |
+
|
| 150 |
+
# Try all possible ways to get page count
|
| 151 |
+
if hasattr(doc, 'page_count'):
|
| 152 |
+
total_pages = doc.page_count
|
| 153 |
+
print(f"Used page_count: {total_pages}")
|
| 154 |
+
elif hasattr(doc, 'pageCount'):
|
| 155 |
+
total_pages = doc.pageCount
|
| 156 |
+
print(f"Used pageCount: {total_pages}")
|
| 157 |
+
else:
|
| 158 |
+
total_pages = len(doc)
|
| 159 |
+
print(f"Used len(): {total_pages}")
|
| 160 |
+
|
| 161 |
doc.close()
|
| 162 |
except Exception as e:
|
| 163 |
+
print(f"PDF page counting error: {e}")
|
| 164 |
total_pages = 1
|
| 165 |
|
| 166 |
+
# Run OCR
|
| 167 |
+
print(f"Running OCR on: {file_path}")
|
| 168 |
+
result = ocr.ocr(file_path, cls=True)
|
| 169 |
+
print(f"OCR result type: {type(result)}")
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
# Extract text
|
| 172 |
extracted_text = ""
|
|
|
|
| 176 |
for page_idx, page_result in enumerate(result):
|
| 177 |
if page_result:
|
| 178 |
pages_processed += 1
|
|
|
|
| 179 |
for line in page_result:
|
| 180 |
+
if len(line) >= 2 and line[1][1] > 0.5:
|
| 181 |
extracted_text += line[1][0] + "\n"
|
|
|
|
|
|
|
| 182 |
|
| 183 |
processing_time = time.time() - start_time
|
|
|
|
| 184 |
|
|
|
|
| 185 |
summary = f"""
|
| 186 |
π **File**: {filename}
|
| 187 |
π **Pages Processed**: {pages_processed}/{total_pages}
|
|
|
|
| 190 |
🔧 **OCR Engine**: PaddleOCR
|
| 191 |
"""
|
| 192 |
|
|
|
|
| 193 |
api_response = json.dumps({
|
| 194 |
"success": True,
|
| 195 |
"text": extracted_text,
|
|
|
|
| 204 |
|
| 205 |
except Exception as e:
|
| 206 |
error_msg = f"Error processing file: {str(e)}"
|
| 207 |
+
print(f"Full error: {e}")
|
| 208 |
import traceback
|
| 209 |
+
traceback.print_exc()
|
| 210 |
return error_msg, "", json.dumps({"success": False, "error": str(e)})
|
| 211 |
|
| 212 |
def process_api_request(api_data):
|