RajanMalaviya committed on
Commit
3687ca5
·
verified ·
1 Parent(s): f6801bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +289 -290
app.py CHANGED
@@ -1,291 +1,290 @@
1
- from fastapi import FastAPI, File, UploadFile, HTTPException
2
- import pytesseract
3
- import cv2
4
- import os
5
- from PIL import Image
6
- import json
7
- import unicodedata
8
- from pdf2image import convert_from_bytes
9
- from pypdf import PdfReader
10
- import numpy as np
11
- from typing import List
12
- import io
13
- import logging
14
- import time
15
- import asyncio
16
- import psutil
17
- import cachetools
18
- import hashlib
19
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
20
- import torch
21
-
22
app = FastAPI()

# Configure logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Load the Qwen vision-language model with 4-bit quantization.
# NOTE(review): model_name points at a Qwen2.5-VL checkpoint, while the class
# used below (and the log messages) are for Qwen2-VL. Confirm which model is
# intended; the checkpoint name is left untouched here.
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
try:
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        low_cpu_mem_usage=True
    )
    processor = AutoProcessor.from_pretrained(model_name)
    logger.info("Qwen2-VL-7B-Instruct model loaded successfully")
except Exception as e:
    logger.error(f"Failed to load Qwen2-VL-7B-Instruct model: {str(e)}")
    # This code runs at import time, outside any request, so an HTTPException
    # would never be converted into an HTTP response. Abort startup with a
    # plain error that keeps the original cause attached.
    raise RuntimeError("Failed to load Qwen2-VL-7B-Instruct model") from e

# In-memory caches (1-hour TTL, at most 100 entries each).
# raw_text_cache is keyed by the MD5 of the uploaded file's bytes;
# structured_data_cache is keyed by the MD5 of the extracted text.
raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
50
-
51
def log_memory_usage():
    """Return the current process's resident memory as a formatted string.

    Despite the name, nothing is logged here: the returned text is meant to
    be embedded in the caller's own log message.
    """
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 / 1024
    return f"Memory usage: {rss_megabytes:.2f} MB"
56
-
57
def get_file_hash(file_bytes):
    """Return the hexadecimal MD5 digest of a file's raw bytes."""
    digest = hashlib.md5()
    digest.update(file_bytes)
    return digest.hexdigest()
60
-
61
def get_text_hash(raw_text):
    """Return the hexadecimal MD5 digest of a string's UTF-8 encoding."""
    encoded = raw_text.encode('utf-8')
    digest = hashlib.md5()
    digest.update(encoded)
    return digest.hexdigest()
64
-
65
async def process_image(img_bytes, filename, idx):
    """Run Tesseract OCR on one encoded image (JPG/JPEG/PNG).

    Args:
        img_bytes: Raw bytes of the encoded image file.
        filename: Name of the uploaded file; used only in log messages.
        idx: Index of the image within the upload; used only in log messages.

    Returns:
        The recognised text followed by a newline, or "" if decoding or OCR
        raised an exception (the error is logged, not propagated).
    """
    start_time = time.time()
    logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
    try:
        with Image.open(io.BytesIO(img_bytes)) as img:
            # Grayscale and palette images decode to 2-D arrays, which
            # cv2.COLOR_RGB2BGR rejects; without this conversion such files
            # ended up in the except branch and produced no text at all.
            rgb_pixels = np.array(img.convert("RGB"))
        img_cv = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        # Tesseract is handed a 3-channel image whose channels all carry the
        # grayscale values.
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
        page_text = pytesseract.image_to_string(img_pil, config=custom_config)
        logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        # Best effort: the caller receives an empty string on any failure.
        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
        return ""
81
-
82
async def process_pdf_page(img, page_idx):
    """OCR one rendered PDF page image and return its text.

    Returns the recognised text with a trailing newline, or "" when any step
    raises (the failure is logged instead of propagated).
    """
    began_at = time.time()
    logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
    try:
        pixels = np.array(img)
        bgr = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
        grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        # Expand the single gray channel back to three channels for Tesseract.
        three_channel = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
        ocr_input = Image.fromarray(three_channel)
        recognised = pytesseract.image_to_string(
            ocr_input,
            config="--oem 1 --psm 6 -l eng+ara",  # Reduced for performance
        )
        logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - began_at:.2f} seconds, {log_memory_usage()}")
        return recognised + "\n"
    except Exception as exc:
        logger.error(f"OCR failed for PDF page {page_idx}: {str(exc)}, {log_memory_usage()}")
        return ""
97
-
98
async def process_with_qwen(filename: str, raw_text: str):
    """Extract structured invoice fields from raw text with the loaded Qwen model.

    Results are cached in ``structured_data_cache`` under the MD5 of the
    untruncated ``raw_text``; only successful extractions are cached.

    Args:
        filename: Name of the source file; used only in log messages.
        raw_text: Text extracted from the document. Only the first 10000
            characters are sent to the model.

    Returns:
        The object parsed from the model's JSON answer, or
        ``{"error": "Qwen processing failed: ..."}`` if generation or
        parsing raised.
    """
    start_time = time.time()
    logger.info(f"Starting Qwen processing for {filename}, {log_memory_usage()}")

    # Check structured data cache. A single lookup avoids the window in which
    # a TTL entry can expire between a membership test and the read.
    text_hash = get_text_hash(raw_text)
    cached = structured_data_cache.get(text_hash)
    if cached is not None:
        logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
        return cached

    # Truncate text for Qwen
    if len(raw_text) > 10000:
        raw_text = raw_text[:10000]
        logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")

    try:
        # The prompt lines are flush-left on purpose: whitespace inside the
        # literal is part of the text sent to the model.
        prompt = f"""
You are an intelligent invoice data extractor. Given raw text from an invoice in any language, extract key business fields in the specified JSON format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g., USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers). Output only the JSON object.

Raw text:
{raw_text}

Output JSON:
{{
"currency": "",
"Discount_Percentage": "",
"Due_Date": "",
"Email_Client": "",
"Name_Client": "",
"Products": [],
"Remise": "",
"Subtotal": "",
"Tax": "",
"Tax_Percentage": "",
"Tel_Client": "",
"billing address": "",
"header": "",
"invoice date": "",
"invoice number": "",
"shipping address": "",
"total": ""
}}
"""
        messages = [{"role": "user", "content": prompt}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Tensors must live on the same device as the model's weights.
        inputs = processor(text=[text], return_tensors="pt").to(model.device)
        output_ids = model.generate(**inputs, max_new_tokens=512)

        # generate() returns the prompt tokens followed by the completion.
        # The prompt itself contains a "{ ... }" template, so decoding the
        # full sequence would make the brace search below span prompt and
        # answer. Decode only the newly generated tokens.
        prompt_length = inputs["input_ids"].shape[1]
        completion_ids = output_ids[:, prompt_length:]
        llm_output = processor.batch_decode(completion_ids, skip_special_tokens=True)[0]

        # Extract JSON from output: outermost "{" ... "}" span.
        json_start = llm_output.find("{")
        json_end = llm_output.rfind("}")
        if json_start == -1 or json_end < json_start:
            raise ValueError("No valid JSON found in Qwen output")
        json_str = llm_output[json_start:json_end + 1]
        structured_data = json.loads(json_str)
        structured_data_cache[text_hash] = structured_data
        logger.info(f"Qwen processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return structured_data
    except Exception as e:
        # Failures are reported in-band so one bad file does not abort a batch.
        logger.error(f"Qwen processing failed for {filename}: {str(e)}, {log_memory_usage()}")
        return {"error": f"Qwen processing failed: {str(e)}"}
161
-
162
def _failure_record(filename, message):
    """Build the per-file result entry reported when a file cannot be processed."""
    return {
        "filename": filename,
        "structured_data": {"error": message},
        "error": message,
    }


@app.post("/ocr")
async def extract_and_structure(files: List[UploadFile] = File(...)):
    """Extract text from uploaded PDFs/images and structure it with Qwen.

    For each file: validate the extension, read the bytes, obtain raw text
    (cache, embedded PDF text, or OCR), normalise it, then ask the model for
    structured invoice data.

    Args:
        files: Uploaded files; supported extensions are .pdf, .jpg, .jpeg
            and .png.

    Returns:
        A dict with ``success`` (False only when every file failed),
        ``message`` (summary counts) and ``data`` (one entry per file with
        ``filename``, ``structured_data`` and ``error``).
    """
    output_json = {
        "success": True,
        "message": "",
        "data": []
    }
    success_count = 0
    fail_count = 0
    valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}

    logger.info(f"Starting processing for {len(files)} files, {log_memory_usage()}")

    for file in files:
        total_start_time = time.time()
        logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")

        # Validate file format. An upload without a filename has no extension
        # and is reported as unsupported instead of crashing the request.
        file_ext = os.path.splitext((file.filename or "").lower())[1]
        if file_ext not in valid_extensions:
            fail_count += 1
            output_json["data"].append(
                _failure_record(file.filename, f"Unsupported file format: {file_ext}")
            )
            logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
            continue

        # Read file into memory
        try:
            file_start_time = time.time()
            file_bytes = await file.read()
            file_hash = get_file_hash(file_bytes)
            logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
        except Exception as e:
            fail_count += 1
            output_json["data"].append(
                _failure_record(file.filename, f"Failed to read file: {str(e)}")
            )
            logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
            continue

        # Check raw text cache
        raw_text = raw_text_cache.get(file_hash)
        if raw_text is not None:
            logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
        else:
            raw_text = ""
            if file_ext == '.pdf':
                # Try extracting embedded text
                try:
                    extract_start_time = time.time()
                    reader = PdfReader(io.BytesIO(file_bytes))
                    for page in reader.pages:
                        text = page.extract_text()
                        if text:
                            raw_text += text + "\n"
                    logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    # Not fatal: fall through to OCR below if nothing was extracted.
                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")

                # If no embedded text, perform OCR
                if not raw_text.strip():
                    try:
                        convert_start_time = time.time()
                        images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")

                        ocr_start_time = time.time()
                        page_texts = []
                        for i, img in enumerate(images):
                            page_text = await process_pdf_page(img, i)
                            page_texts.append(page_text)
                        raw_text = "".join(page_texts)
                        logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                    except Exception as e:
                        fail_count += 1
                        output_json["data"].append(
                            _failure_record(file.filename, f"OCR failed: {str(e)}")
                        )
                        logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                        continue
            else:  # JPG/JPEG/PNG
                try:
                    ocr_start_time = time.time()
                    raw_text = await process_image(file_bytes, file.filename, 0)
                    logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    fail_count += 1
                    output_json["data"].append(
                        _failure_record(file.filename, f"Image OCR failed: {str(e)}")
                    )
                    logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                    continue

            # Normalize text
            try:
                normalize_start_time = time.time()
                raw_text = unicodedata.normalize('NFKC', raw_text)
                # The UTF-8 round trip raises on unencodable code points
                # (e.g. lone surrogates), in which case the text is not cached.
                raw_text = raw_text.encode().decode('utf-8')
                raw_text_cache[file_hash] = raw_text
                logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
            except Exception as e:
                logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")

        # Process with Qwen
        structured_data = await process_with_qwen(file.filename, raw_text)

        # process_with_qwen reports failure in-band as a dict whose only key
        # is "error"; count that as a failure instead of a success with an
        # empty error field.
        if isinstance(structured_data, dict) and set(structured_data) == {"error"}:
            fail_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": structured_data,
                "error": str(structured_data["error"])
            })
        else:
            success_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": structured_data,
                "error": ""
            })

        logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")

    output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
    if fail_count > 0 and success_count == 0:
        output_json["success"] = False

    logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
    return output_json
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ import pytesseract
3
+ import cv2
4
+ import os
5
+ from PIL import Image
6
+ import json
7
+ import unicodedata
8
+ from pdf2image import convert_from_bytes
9
+ from pypdf import PdfReader
10
+ import numpy as np
11
+ from typing import List
12
+ import io
13
+ import logging
14
+ import time
15
+ import asyncio
16
+ import psutil
17
+ import cachetools
18
+ import hashlib
19
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
20
+ import torch
21
+
22
app = FastAPI()

# Configure logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Load Qwen2-VL-2B-Instruct in float16. device_map="auto" leaves device
# placement to the loader, so the weights are not necessarily on the CPU.
model_name = "Qwen/Qwen2-VL-2B-Instruct"
try:
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )
    processor = AutoProcessor.from_pretrained(model_name)
    logger.info("Qwen2-VL-2B-Instruct model loaded successfully")
except Exception as e:
    logger.error(f"Failed to load Qwen2-VL-2B-Instruct model: {str(e)}")
    # This code runs at import time, outside any request, so an HTTPException
    # would never be converted into an HTTP response. Abort startup with a
    # plain error that keeps the original cause attached.
    raise RuntimeError("Failed to load Qwen2-VL-2B-Instruct model") from e

# In-memory caches (1-hour TTL, at most 100 entries each).
# raw_text_cache is keyed by the MD5 of the uploaded file's bytes;
# structured_data_cache is keyed by the MD5 of the extracted text.
raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
49
+
50
def log_memory_usage():
    """Return the current process's resident memory as a formatted string.

    Despite the name, nothing is logged here: the returned text is meant to
    be embedded in the caller's own log message.
    """
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 / 1024
    return f"Memory usage: {rss_megabytes:.2f} MB"
55
+
56
def get_file_hash(file_bytes):
    """Return the hexadecimal MD5 digest of a file's raw bytes."""
    digest = hashlib.md5()
    digest.update(file_bytes)
    return digest.hexdigest()
59
+
60
def get_text_hash(raw_text):
    """Return the hexadecimal MD5 digest of a string's UTF-8 encoding."""
    encoded = raw_text.encode('utf-8')
    digest = hashlib.md5()
    digest.update(encoded)
    return digest.hexdigest()
63
+
64
async def process_image(img_bytes, filename, idx):
    """Run Tesseract OCR on one encoded image (JPG/JPEG/PNG).

    Args:
        img_bytes: Raw bytes of the encoded image file.
        filename: Name of the uploaded file; used only in log messages.
        idx: Index of the image within the upload; used only in log messages.

    Returns:
        The recognised text followed by a newline, or "" if decoding or OCR
        raised an exception (the error is logged, not propagated).
    """
    start_time = time.time()
    logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
    try:
        with Image.open(io.BytesIO(img_bytes)) as img:
            # Grayscale and palette images decode to 2-D arrays, which
            # cv2.COLOR_RGB2BGR rejects; without this conversion such files
            # ended up in the except branch and produced no text at all.
            rgb_pixels = np.array(img.convert("RGB"))
        img_cv = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        # Tesseract is handed a 3-channel image whose channels all carry the
        # grayscale values.
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
        page_text = pytesseract.image_to_string(img_pil, config=custom_config)
        logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        # Best effort: the caller receives an empty string on any failure.
        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
        return ""
80
+
81
async def process_pdf_page(img, page_idx):
    """OCR one rendered PDF page image and return its text.

    Returns the recognised text with a trailing newline, or "" when any step
    raises (the failure is logged instead of propagated).
    """
    began_at = time.time()
    logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
    try:
        pixels = np.array(img)
        bgr = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
        grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        # Expand the single gray channel back to three channels for Tesseract.
        three_channel = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
        ocr_input = Image.fromarray(three_channel)
        recognised = pytesseract.image_to_string(
            ocr_input,
            config="--oem 1 --psm 6 -l eng+ara",  # Reduced for performance
        )
        logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - began_at:.2f} seconds, {log_memory_usage()}")
        return recognised + "\n"
    except Exception as exc:
        logger.error(f"OCR failed for PDF page {page_idx}: {str(exc)}, {log_memory_usage()}")
        return ""
96
+
97
async def process_with_qwen(filename: str, raw_text: str):
    """Extract structured invoice fields from raw text with the loaded Qwen model.

    Results are cached in ``structured_data_cache`` under the MD5 of the
    untruncated ``raw_text``; only successful extractions are cached.

    Args:
        filename: Name of the source file; used only in log messages.
        raw_text: Text extracted from the document. Only the first 10000
            characters are sent to the model.

    Returns:
        The object parsed from the model's JSON answer, or
        ``{"error": "Qwen processing failed: ..."}`` if generation or
        parsing raised.
    """
    start_time = time.time()
    logger.info(f"Starting Qwen processing for {filename}, {log_memory_usage()}")

    # Check structured data cache. A single lookup avoids the window in which
    # a TTL entry can expire between a membership test and the read.
    text_hash = get_text_hash(raw_text)
    cached = structured_data_cache.get(text_hash)
    if cached is not None:
        logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
        return cached

    # Truncate text for Qwen
    if len(raw_text) > 10000:
        raw_text = raw_text[:10000]
        logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")

    try:
        # The prompt lines are flush-left on purpose: whitespace inside the
        # literal is part of the text sent to the model.
        prompt = f"""
You are an intelligent invoice data extractor. Given raw text from an invoice in any language, extract key business fields in the specified JSON format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g., USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers). Output only the JSON object.

Raw text:
{raw_text}

Output JSON:
{{
"currency": "",
"Discount_Percentage": "",
"Due_Date": "",
"Email_Client": "",
"Name_Client": "",
"Products": [],
"Remise": "",
"Subtotal": "",
"Tax": "",
"Tax_Percentage": "",
"Tel_Client": "",
"billing address": "",
"header": "",
"invoice date": "",
"invoice number": "",
"shipping address": "",
"total": ""
}}
"""
        messages = [{"role": "user", "content": prompt}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Tensors must live on the same device as the model's weights.
        inputs = processor(text=[text], return_tensors="pt").to(model.device)
        output_ids = model.generate(**inputs, max_new_tokens=512)

        # generate() returns the prompt tokens followed by the completion.
        # The prompt itself contains a "{ ... }" template, so decoding the
        # full sequence would make the brace search below span prompt and
        # answer. Decode only the newly generated tokens.
        prompt_length = inputs["input_ids"].shape[1]
        completion_ids = output_ids[:, prompt_length:]
        llm_output = processor.batch_decode(completion_ids, skip_special_tokens=True)[0]

        # Extract JSON from output: outermost "{" ... "}" span.
        json_start = llm_output.find("{")
        json_end = llm_output.rfind("}")
        if json_start == -1 or json_end < json_start:
            raise ValueError("No valid JSON found in Qwen output")
        json_str = llm_output[json_start:json_end + 1]
        structured_data = json.loads(json_str)
        structured_data_cache[text_hash] = structured_data
        logger.info(f"Qwen processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return structured_data
    except Exception as e:
        # Failures are reported in-band so one bad file does not abort a batch.
        logger.error(f"Qwen processing failed for {filename}: {str(e)}, {log_memory_usage()}")
        return {"error": f"Qwen processing failed: {str(e)}"}
160
+
161
def _failure_record(filename, message):
    """Build the per-file result entry reported when a file cannot be processed."""
    return {
        "filename": filename,
        "structured_data": {"error": message},
        "error": message,
    }


@app.post("/ocr")
async def extract_and_structure(files: List[UploadFile] = File(...)):
    """Extract text from uploaded PDFs/images and structure it with Qwen.

    For each file: validate the extension, read the bytes, obtain raw text
    (cache, embedded PDF text, or OCR), normalise it, then ask the model for
    structured invoice data.

    Args:
        files: Uploaded files; supported extensions are .pdf, .jpg, .jpeg
            and .png.

    Returns:
        A dict with ``success`` (False only when every file failed),
        ``message`` (summary counts) and ``data`` (one entry per file with
        ``filename``, ``structured_data`` and ``error``).
    """
    output_json = {
        "success": True,
        "message": "",
        "data": []
    }
    success_count = 0
    fail_count = 0
    valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}

    logger.info(f"Starting processing for {len(files)} files, {log_memory_usage()}")

    for file in files:
        total_start_time = time.time()
        logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")

        # Validate file format. An upload without a filename has no extension
        # and is reported as unsupported instead of crashing the request.
        file_ext = os.path.splitext((file.filename or "").lower())[1]
        if file_ext not in valid_extensions:
            fail_count += 1
            output_json["data"].append(
                _failure_record(file.filename, f"Unsupported file format: {file_ext}")
            )
            logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
            continue

        # Read file into memory
        try:
            file_start_time = time.time()
            file_bytes = await file.read()
            file_hash = get_file_hash(file_bytes)
            logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
        except Exception as e:
            fail_count += 1
            output_json["data"].append(
                _failure_record(file.filename, f"Failed to read file: {str(e)}")
            )
            logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
            continue

        # Check raw text cache
        raw_text = raw_text_cache.get(file_hash)
        if raw_text is not None:
            logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
        else:
            raw_text = ""
            if file_ext == '.pdf':
                # Try extracting embedded text
                try:
                    extract_start_time = time.time()
                    reader = PdfReader(io.BytesIO(file_bytes))
                    for page in reader.pages:
                        text = page.extract_text()
                        if text:
                            raw_text += text + "\n"
                    logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    # Not fatal: fall through to OCR below if nothing was extracted.
                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")

                # If no embedded text, perform OCR
                if not raw_text.strip():
                    try:
                        convert_start_time = time.time()
                        images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")

                        ocr_start_time = time.time()
                        page_texts = []
                        for i, img in enumerate(images):
                            page_text = await process_pdf_page(img, i)
                            page_texts.append(page_text)
                        raw_text = "".join(page_texts)
                        logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                    except Exception as e:
                        fail_count += 1
                        output_json["data"].append(
                            _failure_record(file.filename, f"OCR failed: {str(e)}")
                        )
                        logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                        continue
            else:  # JPG/JPEG/PNG
                try:
                    ocr_start_time = time.time()
                    raw_text = await process_image(file_bytes, file.filename, 0)
                    logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    fail_count += 1
                    output_json["data"].append(
                        _failure_record(file.filename, f"Image OCR failed: {str(e)}")
                    )
                    logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                    continue

            # Normalize text
            try:
                normalize_start_time = time.time()
                raw_text = unicodedata.normalize('NFKC', raw_text)
                # The UTF-8 round trip raises on unencodable code points
                # (e.g. lone surrogates), in which case the text is not cached.
                raw_text = raw_text.encode().decode('utf-8')
                raw_text_cache[file_hash] = raw_text
                logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
            except Exception as e:
                logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")

        # Process with Qwen
        structured_data = await process_with_qwen(file.filename, raw_text)

        # process_with_qwen reports failure in-band as a dict whose only key
        # is "error"; count that as a failure instead of a success with an
        # empty error field.
        if isinstance(structured_data, dict) and set(structured_data) == {"error"}:
            fail_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": structured_data,
                "error": str(structured_data["error"])
            })
        else:
            success_count += 1
            output_json["data"].append({
                "filename": file.filename,
                "structured_data": structured_data,
                "error": ""
            })

        logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")

    output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
    if fail_count > 0 and success_count == 0:
        output_json["success"] = False

    logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
    return output_json