Update app.py
Browse files
app.py
CHANGED
|
@@ -1,291 +1,290 @@
|
|
| 1 |
-
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 2 |
-
import pytesseract
|
| 3 |
-
import cv2
|
| 4 |
-
import os
|
| 5 |
-
from PIL import Image
|
| 6 |
-
import json
|
| 7 |
-
import unicodedata
|
| 8 |
-
from pdf2image import convert_from_bytes
|
| 9 |
-
from pypdf import PdfReader
|
| 10 |
-
import numpy as np
|
| 11 |
-
from typing import List
|
| 12 |
-
import io
|
| 13 |
-
import logging
|
| 14 |
-
import time
|
| 15 |
-
import asyncio
|
| 16 |
-
import psutil
|
| 17 |
-
import cachetools
|
| 18 |
-
import hashlib
|
| 19 |
-
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
| 20 |
-
import torch
|
| 21 |
-
|
| 22 |
-
app = FastAPI()
|
| 23 |
-
|
| 24 |
-
# Configure logging
|
| 25 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 26 |
-
logger = logging.getLogger(__name__)
|
| 27 |
-
|
| 28 |
-
# Set Tesseract path
|
| 29 |
-
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
| 30 |
-
|
| 31 |
-
# Load Qwen2-VL-
|
| 32 |
-
model_name = "Qwen/Qwen2
|
| 33 |
-
try:
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
mem_info
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
text_hash
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
"
|
| 124 |
-
"
|
| 125 |
-
"
|
| 126 |
-
"
|
| 127 |
-
"
|
| 128 |
-
"
|
| 129 |
-
"
|
| 130 |
-
"
|
| 131 |
-
"
|
| 132 |
-
"
|
| 133 |
-
"
|
| 134 |
-
"
|
| 135 |
-
"
|
| 136 |
-
"invoice
|
| 137 |
-
"
|
| 138 |
-
"
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
"""
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
"
|
| 166 |
-
"
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
file_ext
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
"
|
| 185 |
-
"
|
| 186 |
-
|
| 187 |
-
})
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
"
|
| 202 |
-
"
|
| 203 |
-
|
| 204 |
-
})
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
text
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
images
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
page_text
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
"
|
| 245 |
-
"
|
| 246 |
-
|
| 247 |
-
})
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
"
|
| 259 |
-
"
|
| 260 |
-
|
| 261 |
-
})
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
raw_text =
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
"
|
| 280 |
-
"
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
|
| 291 |
return output_json
|
|
|
|
| 1 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 2 |
+
import pytesseract
|
| 3 |
+
import cv2
|
| 4 |
+
import os
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import json
|
| 7 |
+
import unicodedata
|
| 8 |
+
from pdf2image import convert_from_bytes
|
| 9 |
+
from pypdf import PdfReader
|
| 10 |
+
import numpy as np
|
| 11 |
+
from typing import List
|
| 12 |
+
import io
|
| 13 |
+
import logging
|
| 14 |
+
import time
|
| 15 |
+
import asyncio
|
| 16 |
+
import psutil
|
| 17 |
+
import cachetools
|
| 18 |
+
import hashlib
|
| 19 |
+
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
| 20 |
+
import torch
|
| 21 |
+
|
| 22 |
+
# FastAPI application; the /ocr endpoint is registered further down.
app = FastAPI()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Load Qwen2-VL-2B-Instruct model on CPU
# NOTE(review): device_map="auto" may place the model on a GPU when one is
# available, and float16 inference on CPU is slow/limited for many ops — the
# "on CPU" intent above does not obviously match these settings; confirm.
model_name = "Qwen/Qwen2-VL-2B-Instruct"
try:
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )
    processor = AutoProcessor.from_pretrained(model_name)
    logger.info("Qwen2-VL-2B-Instruct model loaded successfully")
except Exception as e:
    logger.error(f"Failed to load Qwen2-VL-2B-Instruct model: {str(e)}")
    # NOTE(review): this raise happens at module import time, where an
    # HTTPException cannot be rendered as an HTTP response; a RuntimeError
    # would signal the startup failure more accurately — confirm and change.
    raise HTTPException(status_code=500, detail="Failed to load Qwen2-VL-2B-Instruct model")

# In-memory caches (1-hour TTL)
# raw_text_cache:        MD5(file bytes) -> OCR/embedded raw text
# structured_data_cache: MD5(raw text)   -> dict parsed from the model output
raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
|
| 49 |
+
|
| 50 |
+
def log_memory_usage():
    """Return a human-readable string with the current process RSS in MB.

    Despite the name, this does not log anything itself; callers embed the
    returned string into their own log messages.
    """
    rss_bytes = psutil.Process().memory_info().rss
    return f"Memory usage: {rss_bytes / 1024 / 1024:.2f} MB"
|
| 55 |
+
|
| 56 |
+
def get_file_hash(file_bytes):
    """Return the hex MD5 digest of the given file content (bytes).

    Used purely as a cache key, not for security.
    """
    digest = hashlib.md5(file_bytes)
    return digest.hexdigest()
|
| 59 |
+
|
| 60 |
+
def get_text_hash(raw_text):
    """Return the hex MD5 digest of *raw_text* encoded as UTF-8.

    Used purely as a cache key, not for security.
    """
    encoded = raw_text.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()
|
| 63 |
+
|
| 64 |
+
async def process_image(img_bytes, filename, idx):
    """Run Tesseract OCR (eng+ara) on a single image (JPG/JPEG/PNG).

    Args:
        img_bytes: Raw image file content.
        filename: Originating file name, used only for log context.
        idx: Index of the image within the request, used only for log context.

    Returns:
        The recognized text followed by a newline, or "" on any failure
        (failures are logged, never raised).
    """
    start_time = time.time()
    # BUG FIX: the log messages previously contained the literal "(unknown)"
    # where the (otherwise unused) filename parameter was clearly meant to be
    # interpolated.
    logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
    try:
        # Force 3-channel RGB: RGBA/palette/grayscale inputs (e.g. many PNGs)
        # would otherwise make cv2.cvtColor(..., COLOR_RGB2BGR) fail on a
        # non-3-channel array.
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        # Tesseract is fed a grayscale image re-expanded to RGB.
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
        page_text = pytesseract.image_to_string(img_pil, config=custom_config)
        logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
        return ""
|
| 80 |
+
|
| 81 |
+
async def process_pdf_page(img, page_idx):
    """Process a single PDF page with OCR.

    Takes an already-rendered PIL image for one page and returns the
    recognized eng+ara text plus a trailing newline, or "" on failure
    (failures are logged, never raised).
    """
    started = time.time()
    logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
    try:
        # Round-trip through OpenCV to produce a grayscale-then-RGB image
        # for Tesseract.
        bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        ocr_input = Image.fromarray(cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB))
        tess_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
        page_text = pytesseract.image_to_string(ocr_input, config=tess_config)
        elapsed = time.time() - started
        logger.info(f"Completed OCR for PDF page {page_idx}, took {elapsed:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}, {log_memory_usage()}")
        return ""
|
| 96 |
+
|
| 97 |
+
async def process_with_qwen(filename: str, raw_text: str):
    """Extract structured invoice fields from raw OCR text via Qwen2-VL.

    Args:
        filename: Originating file name, used only for log context.
        raw_text: Raw invoice text (truncated to 10,000 chars before prompting).

    Returns:
        A dict parsed from the model's JSON output, or {"error": ...} on any
        failure. Successful results are cached keyed by MD5 of the raw text.
    """
    start_time = time.time()
    # BUG FIX: the log messages previously contained the literal "(unknown)"
    # where the (otherwise unused) filename parameter was clearly meant to be
    # interpolated.
    logger.info(f"Starting Qwen processing for {filename}, {log_memory_usage()}")

    # Check structured data cache
    text_hash = get_text_hash(raw_text)
    if text_hash in structured_data_cache:
        logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
        return structured_data_cache[text_hash]

    # Truncate text for Qwen to bound prompt size
    if len(raw_text) > 10000:
        raw_text = raw_text[:10000]
        logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")

    try:
        prompt = f"""
You are an intelligent invoice data extractor. Given raw text from an invoice in any language, extract key business fields in the specified JSON format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g., USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers). Output only the JSON object.

Raw text:
{raw_text}

Output JSON:
{{
    "currency": "",
    "Discount_Percentage": "",
    "Due_Date": "",
    "Email_Client": "",
    "Name_Client": "",
    "Products": [],
    "Remise": "",
    "Subtotal": "",
    "Tax": "",
    "Tax_Percentage": "",
    "Tel_Client": "",
    "billing address": "",
    "header": "",
    "invoice date": "",
    "invoice number": "",
    "shipping address": "",
    "total": ""
}}
"""
        messages = [{"role": "user", "content": prompt}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text], return_tensors="pt")
        # Inference only: no_grad avoids building an autograd graph and
        # reduces peak memory during generation.
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=512)
        llm_output = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

        # Extract the first '{' ... last '}' span as the JSON payload.
        json_start = llm_output.find("{")
        json_end = llm_output.rfind("}")
        # BUG FIX: previously `json_end = rfind("}") + 1`, so a miss produced
        # 0 and the `json_end == -1` sentinel check could never fire; the
        # subsequent slice then yielded "" and json.loads raised an opaque
        # error instead of this explicit one.
        if json_start == -1 or json_end == -1:
            raise ValueError("No valid JSON found in Qwen output")
        json_str = llm_output[json_start:json_end + 1]
        structured_data = json.loads(json_str)
        structured_data_cache[text_hash] = structured_data
        logger.info(f"Qwen processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return structured_data
    except Exception as e:
        logger.error(f"Qwen processing failed for {filename}: {str(e)}, {log_memory_usage()}")
        return {"error": f"Qwen processing failed: {str(e)}"}
|
| 160 |
+
|
| 161 |
+
@app.post("/ocr")
async def extract_and_structure(files: List[UploadFile] = File(...)):
    """OCR each uploaded file (PDF/JPG/JPEG/PNG) and structure it with Qwen.

    For PDFs, embedded text is tried first; pages are rasterized and OCR'd
    only when no embedded text is found. Raw text is cached by file hash and
    structured output by text hash (1-hour TTL each).

    Returns:
        {"success": bool, "message": str, "data": [
            {"filename": str, "structured_data": dict, "error": str}, ...]}
        success is False only when every file failed.
    """
    output_json = {
        "success": True,
        "message": "",
        "data": []
    }
    success_count = 0
    fail_count = 0
    # Hoisted out of the per-file loop: the accepted extensions never change.
    valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}

    logger.info(f"Starting processing for {len(files)} files, {log_memory_usage()}")

    for file in files:
        total_start_time = time.time()
        logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")

        # Validate file format by extension
        file_ext = os.path.splitext(file.filename.lower())[1]
        if file_ext not in valid_extensions:
            fail_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": {"error": f"Unsupported file format: {file_ext}"},
                "error": f"Unsupported file format: {file_ext}"
            })
            logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
            continue

        # Read file into memory
        try:
            file_start_time = time.time()
            file_bytes = await file.read()
            file_stream = io.BytesIO(file_bytes)
            file_hash = get_file_hash(file_bytes)
            logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
        except Exception as e:
            fail_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": {"error": f"Failed to read file: {str(e)}"},
                "error": f"Failed to read file: {str(e)}"
            })
            logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
            continue

        # Check raw text cache before doing any extraction work
        raw_text = ""
        if file_hash in raw_text_cache:
            raw_text = raw_text_cache[file_hash]
            logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
        else:
            if file_ext == '.pdf':
                # Try extracting embedded text first (much cheaper than OCR)
                try:
                    extract_start_time = time.time()
                    reader = PdfReader(file_stream)
                    for page in reader.pages:
                        text = page.extract_text()
                        if text:
                            raw_text += text + "\n"
                    logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    # Best-effort: fall through to OCR below
                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")

                # If no embedded text, rasterize and OCR every page
                if not raw_text.strip():
                    try:
                        convert_start_time = time.time()
                        images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")

                        ocr_start_time = time.time()
                        page_texts = []
                        for i, img in enumerate(images):
                            page_text = await process_pdf_page(img, i)
                            page_texts.append(page_text)
                        raw_text = "".join(page_texts)
                        logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                    except Exception as e:
                        fail_count += 1
                        output_json["data"].append({
                            "filename": file.filename,
                            "structured_data": {"error": f"OCR failed: {str(e)}"},
                            "error": f"OCR failed: {str(e)}"
                        })
                        logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                        continue
            else:  # JPG/JPEG/PNG
                try:
                    ocr_start_time = time.time()
                    raw_text = await process_image(file_bytes, file.filename, 0)
                    logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    fail_count += 1
                    output_json["data"].append({
                        "filename": file.filename,
                        "structured_data": {"error": f"Image OCR failed: {str(e)}"},
                        "error": f"Image OCR failed: {str(e)}"
                    })
                    logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                    continue

        # Normalize text (NFKC folds ligatures/width variants from OCR output)
        # BUG FIX: removed the former `raw_text = raw_text.encode().decode('utf-8')`
        # round trip — it is an identity on any valid str, and its only possible
        # effect was raising here, which skipped populating the cache.
        try:
            normalize_start_time = time.time()
            raw_text = unicodedata.normalize('NFKC', raw_text)
            raw_text_cache[file_hash] = raw_text
            logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
        except Exception as e:
            logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")

        # Process with Qwen
        # NOTE(review): a Qwen failure returns {"error": ...} but is still
        # counted in success_count — confirm this is the intended accounting.
        structured_data = await process_with_qwen(file.filename, raw_text)
        success_count += 1
        output_json["data"].append({
            "filename": file.filename,
            "structured_data": structured_data,
            "error": ""
        })

        logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")

    output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
    if fail_count > 0 and success_count == 0:
        output_json["success"] = False

    logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
    return output_json
|