Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,7 +34,7 @@ if not api_key:
|
|
| 34 |
logger.error("GOOGLE_API_KEY not set")
|
| 35 |
raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
|
| 36 |
genai.configure(api_key=api_key)
|
| 37 |
-
model = genai.GenerativeModel("gemini-
|
| 38 |
|
| 39 |
# Set Tesseract path
|
| 40 |
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
|
@@ -49,28 +49,45 @@ def log_memory_usage():
|
|
| 49 |
mem_info = process.memory_info()
|
| 50 |
return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"
|
| 51 |
|
| 52 |
-
def get_file_hash(
|
| 53 |
-
"""Generate MD5 hash of
|
| 54 |
-
return hashlib.md5(
|
| 55 |
|
| 56 |
def get_text_hash(raw_text):
|
| 57 |
"""Generate MD5 hash of raw text."""
|
| 58 |
return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
|
| 59 |
|
| 60 |
-
async def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
"""Process a single PDF page with OCR."""
|
| 62 |
start_time = time.time()
|
| 63 |
-
logger.info(f"Starting OCR for page {page_idx}, {log_memory_usage()}")
|
| 64 |
try:
|
| 65 |
img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
| 66 |
gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
|
| 67 |
img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
|
| 68 |
-
custom_config = r'--oem 1 --psm 6 -l eng+ara
|
| 69 |
page_text = pytesseract.image_to_string(img_pil, config=custom_config)
|
| 70 |
-
logger.info(f"Completed OCR for page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
|
| 71 |
return page_text + "\n"
|
| 72 |
except Exception as e:
|
| 73 |
-
logger.error(f"OCR failed for page {page_idx}: {str(e)}, {log_memory_usage()}")
|
| 74 |
return ""
|
| 75 |
|
| 76 |
async def process_with_gemini(filename: str, raw_text: str):
|
|
@@ -91,17 +108,14 @@ async def process_with_gemini(filename: str, raw_text: str):
|
|
| 91 |
|
| 92 |
try:
|
| 93 |
prompt = f"""
|
| 94 |
-
You are an intelligent invoice data extractor.
|
| 95 |
-
Given raw text from an invoice in any language,
|
| 96 |
-
extract key business fields in the specified JSON format.
|
| 97 |
-
Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'/'VAT'). Extract currency from symbol or acronym as well.
|
| 98 |
-
The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'.
|
| 99 |
-
If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers).
|
| 100 |
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
Output JSON:
|
| 104 |
{{
|
|
|
|
| 105 |
"Discount_Percentage": "",
|
| 106 |
"Due_Date": "",
|
| 107 |
"Email_Client": "",
|
|
@@ -117,7 +131,6 @@ async def process_with_gemini(filename: str, raw_text: str):
|
|
| 117 |
"invoice date": "",
|
| 118 |
"invoice number": "",
|
| 119 |
"shipping address": "",
|
| 120 |
-
"currency": "",
|
| 121 |
"total": ""
|
| 122 |
}}
|
| 123 |
"""
|
|
@@ -150,33 +163,34 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
|
|
| 150 |
total_start_time = time.time()
|
| 151 |
logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
|
| 152 |
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
fail_count += 1
|
| 155 |
output_json["data"].append({
|
| 156 |
"filename": file.filename,
|
| 157 |
-
"
|
| 158 |
-
"
|
| 159 |
-
"error": "File is not a PDF"
|
| 160 |
})
|
| 161 |
-
logger.error(f"
|
| 162 |
continue
|
| 163 |
|
| 164 |
-
# Read
|
| 165 |
try:
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
file_hash = get_file_hash(
|
| 170 |
-
logger.info(f"Read
|
| 171 |
except Exception as e:
|
| 172 |
fail_count += 1
|
| 173 |
output_json["data"].append({
|
| 174 |
"filename": file.filename,
|
| 175 |
-
"
|
| 176 |
-
"
|
| 177 |
-
"error": f"Failed to read PDF: {str(e)}"
|
| 178 |
})
|
| 179 |
-
logger.error(f"Failed to read
|
| 180 |
continue
|
| 181 |
|
| 182 |
# Check raw text cache
|
|
@@ -185,44 +199,55 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
|
|
| 185 |
raw_text = raw_text_cache[file_hash]
|
| 186 |
logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
|
| 187 |
else:
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
extract_start_time = time.time()
|
| 191 |
-
reader = PdfReader(pdf_stream)
|
| 192 |
-
for page in reader.pages:
|
| 193 |
-
text = page.extract_text()
|
| 194 |
-
if text:
|
| 195 |
-
raw_text += text + "\n"
|
| 196 |
-
logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
|
| 197 |
-
except Exception as e:
|
| 198 |
-
logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
|
| 199 |
-
|
| 200 |
-
# If no embedded text, perform OCR
|
| 201 |
-
if not raw_text.strip():
|
| 202 |
try:
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
ocr_start_time = time.time()
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
for i, img in enumerate(images):
|
| 211 |
-
page_text = await process_page(img, i)
|
| 212 |
-
page_texts.append(page_text)
|
| 213 |
-
# tasks = [process_page(img, i) for i, img in enumerate(images)]
|
| 214 |
-
# page_texts = await asyncio.gather(*tasks)
|
| 215 |
-
raw_text = "".join(page_texts)
|
| 216 |
-
logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
|
| 217 |
except Exception as e:
|
| 218 |
fail_count += 1
|
| 219 |
output_json["data"].append({
|
| 220 |
"filename": file.filename,
|
| 221 |
-
"
|
| 222 |
-
"
|
| 223 |
-
"error": f"OCR failed: {str(e)}"
|
| 224 |
})
|
| 225 |
-
logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
|
| 226 |
continue
|
| 227 |
|
| 228 |
# Normalize text
|
|
@@ -240,14 +265,13 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
|
|
| 240 |
success_count += 1
|
| 241 |
output_json["data"].append({
|
| 242 |
"filename": file.filename,
|
| 243 |
-
"raw_text": raw_text,
|
| 244 |
"structured_data": structured_data,
|
| 245 |
"error": ""
|
| 246 |
})
|
| 247 |
|
| 248 |
logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
|
| 249 |
|
| 250 |
-
output_json["message"] = f"Processed {len(files)}
|
| 251 |
if fail_count > 0 and success_count == 0:
|
| 252 |
output_json["success"] = False
|
| 253 |
|
|
|
|
| 34 |
logger.error("GOOGLE_API_KEY not set")
|
| 35 |
raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
|
| 36 |
genai.configure(api_key=api_key)
|
| 37 |
+
model = genai.GenerativeModel("gemini-1.5-pro")
|
| 38 |
|
| 39 |
# Set Tesseract path
|
| 40 |
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
|
|
|
| 49 |
mem_info = process.memory_info()
|
| 50 |
return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"
|
| 51 |
|
| 52 |
+
def get_file_hash(file_bytes):
|
| 53 |
+
"""Generate MD5 hash of file content."""
|
| 54 |
+
return hashlib.md5(file_bytes).hexdigest()
|
| 55 |
|
| 56 |
def get_text_hash(raw_text):
|
| 57 |
"""Generate MD5 hash of raw text."""
|
| 58 |
return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
|
| 59 |
|
| 60 |
+
async def process_image(img_bytes, filename, idx):
|
| 61 |
+
"""Process a single image (JPG/JPEG/PNG) with OCR."""
|
| 62 |
+
start_time = time.time()
|
| 63 |
+
logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
|
| 64 |
+
try:
|
| 65 |
+
img = Image.open(io.BytesIO(img_bytes))
|
| 66 |
+
img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
| 67 |
+
gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
|
| 68 |
+
img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
|
| 69 |
+
custom_config = r'--oem 1 --psm 6 -l eng+ara' # Reduced for performance
|
| 70 |
+
page_text = pytesseract.image_to_string(img_pil, config=custom_config)
|
| 71 |
+
logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
|
| 72 |
+
return page_text + "\n"
|
| 73 |
+
except Exception as e:
|
| 74 |
+
logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
|
| 75 |
+
return ""
|
| 76 |
+
|
| 77 |
+
async def process_pdf_page(img, page_idx):
|
| 78 |
"""Process a single PDF page with OCR."""
|
| 79 |
start_time = time.time()
|
| 80 |
+
logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
|
| 81 |
try:
|
| 82 |
img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
| 83 |
gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
|
| 84 |
img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
|
| 85 |
+
custom_config = r'--oem 1 --psm 6 -l eng+ara' # Reduced for performance
|
| 86 |
page_text = pytesseract.image_to_string(img_pil, config=custom_config)
|
| 87 |
+
logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
|
| 88 |
return page_text + "\n"
|
| 89 |
except Exception as e:
|
| 90 |
+
logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}, {log_memory_usage()}")
|
| 91 |
return ""
|
| 92 |
|
| 93 |
async def process_with_gemini(filename: str, raw_text: str):
|
|
|
|
| 108 |
|
| 109 |
try:
|
| 110 |
prompt = f"""
|
| 111 |
+
You are an intelligent invoice data extractor. Given raw text from an invoice in any language and extract key business fields in the specified JSON format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g., USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
Raw text:
|
| 114 |
+
{raw_text}
|
| 115 |
|
| 116 |
Output JSON:
|
| 117 |
{{
|
| 118 |
+
"currency": "",
|
| 119 |
"Discount_Percentage": "",
|
| 120 |
"Due_Date": "",
|
| 121 |
"Email_Client": "",
|
|
|
|
| 131 |
"invoice date": "",
|
| 132 |
"invoice number": "",
|
| 133 |
"shipping address": "",
|
|
|
|
| 134 |
"total": ""
|
| 135 |
}}
|
| 136 |
"""
|
|
|
|
| 163 |
total_start_time = time.time()
|
| 164 |
logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
|
| 165 |
|
| 166 |
+
# Validate file format
|
| 167 |
+
valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
|
| 168 |
+
file_ext = os.path.splitext(file.filename.lower())[1]
|
| 169 |
+
if file_ext not in valid_extensions:
|
| 170 |
fail_count += 1
|
| 171 |
output_json["data"].append({
|
| 172 |
"filename": file.filename,
|
| 173 |
+
"structured_data": {"error": f"Unsupported file format: {file_ext}"},
|
| 174 |
+
"error": f"Unsupported file format: {file_ext}"
|
|
|
|
| 175 |
})
|
| 176 |
+
logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
|
| 177 |
continue
|
| 178 |
|
| 179 |
+
# Read file into memory
|
| 180 |
try:
|
| 181 |
+
file_start_time = time.time()
|
| 182 |
+
file_bytes = await file.read()
|
| 183 |
+
file_stream = io.BytesIO(file_bytes)
|
| 184 |
+
file_hash = get_file_hash(file_bytes)
|
| 185 |
+
logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
|
| 186 |
except Exception as e:
|
| 187 |
fail_count += 1
|
| 188 |
output_json["data"].append({
|
| 189 |
"filename": file.filename,
|
| 190 |
+
"structured_data": {"error": f"Failed to read file: {str(e)}"},
|
| 191 |
+
"error": f"Failed to read file: {str(e)}"
|
|
|
|
| 192 |
})
|
| 193 |
+
logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
|
| 194 |
continue
|
| 195 |
|
| 196 |
# Check raw text cache
|
|
|
|
| 199 |
raw_text = raw_text_cache[file_hash]
|
| 200 |
logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
|
| 201 |
else:
|
| 202 |
+
if file_ext == '.pdf':
|
| 203 |
+
# Try extracting embedded text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
try:
|
| 205 |
+
extract_start_time = time.time()
|
| 206 |
+
reader = PdfReader(file_stream)
|
| 207 |
+
for page in reader.pages:
|
| 208 |
+
text = page.extract_text()
|
| 209 |
+
if text:
|
| 210 |
+
raw_text += text + "\n"
|
| 211 |
+
logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
|
| 212 |
+
except Exception as e:
|
| 213 |
+
logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
|
| 214 |
|
| 215 |
+
# If no embedded text, perform OCR
|
| 216 |
+
if not raw_text.strip():
|
| 217 |
+
try:
|
| 218 |
+
convert_start_time = time.time()
|
| 219 |
+
images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
|
| 220 |
+
logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
|
| 221 |
+
|
| 222 |
+
ocr_start_time = time.time()
|
| 223 |
+
page_texts = []
|
| 224 |
+
for i, img in enumerate(images):
|
| 225 |
+
page_text = await process_pdf_page(img, i)
|
| 226 |
+
page_texts.append(page_text)
|
| 227 |
+
raw_text = "".join(page_texts)
|
| 228 |
+
logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
|
| 229 |
+
except Exception as e:
|
| 230 |
+
fail_count += 1
|
| 231 |
+
output_json["data"].append({
|
| 232 |
+
"filename": file.filename,
|
| 233 |
+
"structured_data": {"error": f"OCR failed: {str(e)}"},
|
| 234 |
+
"error": f"OCR failed: {str(e)}"
|
| 235 |
+
})
|
| 236 |
+
logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
|
| 237 |
+
continue
|
| 238 |
+
else: # JPG/JPEG/PNG
|
| 239 |
+
try:
|
| 240 |
ocr_start_time = time.time()
|
| 241 |
+
raw_text = await process_image(file_bytes, file.filename, 0)
|
| 242 |
+
logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
except Exception as e:
|
| 244 |
fail_count += 1
|
| 245 |
output_json["data"].append({
|
| 246 |
"filename": file.filename,
|
| 247 |
+
"structured_data": {"error": f"Image OCR failed: {str(e)}"},
|
| 248 |
+
"error": f"Image OCR failed: {str(e)}"
|
|
|
|
| 249 |
})
|
| 250 |
+
logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
|
| 251 |
continue
|
| 252 |
|
| 253 |
# Normalize text
|
|
|
|
| 265 |
success_count += 1
|
| 266 |
output_json["data"].append({
|
| 267 |
"filename": file.filename,
|
|
|
|
| 268 |
"structured_data": structured_data,
|
| 269 |
"error": ""
|
| 270 |
})
|
| 271 |
|
| 272 |
logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
|
| 273 |
|
| 274 |
+
output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
|
| 275 |
if fail_count > 0 and success_count == 0:
|
| 276 |
output_json["success"] = False
|
| 277 |
|