Sathvik-kota commited on
Commit
b5c6c29
·
verified ·
1 Parent(s): be53ce2

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. Dockerfile +30 -17
  2. app.py +561 -805
Dockerfile CHANGED
@@ -1,38 +1,51 @@
1
- # Use a standard Python 3.11 slim image
2
  FROM python:3.11-slim
3
 
 
4
  RUN apt-get update && \
5
- apt-get install -y tesseract-ocr poppler-utils libglib2.0-0 libsm6 libxext6 libxrender1 libgl1 && \
6
- apt-get clean && rm -rf /var/lib/apt/lists/*
7
-
8
- # Set up a new user named "user" with user ID 1000 (required by HF Spaces)
 
 
 
 
 
 
 
 
 
 
 
 
9
  RUN useradd -m -u 1000 user
10
 
11
- # Switch to the "user" user
12
  USER user
13
 
14
- # Set home to the user's home directory
15
  ENV HOME=/home/user \
16
- PATH=/home/user/.local/bin:$PATH
 
 
17
 
18
- # Set the working directory to the user's home directory
19
  WORKDIR $HOME/app
20
 
21
- # Copy the requirements file first to leverage Docker layer caching
22
  COPY --chown=user requirements.txt .
23
 
24
- # Install all your Python dependencies
25
- RUN pip install --no-cache-dir --upgrade pip && \
26
  pip install --no-cache-dir -r requirements.txt
27
 
28
- # Copy all your project files into the container with proper ownership
29
  COPY --chown=user . .
30
 
31
- # Make our startup script executable
32
- RUN chmod +x ./start.sh
33
 
34
- # Hugging Face Spaces will set PORT env; default to 7860 if not set
35
  EXPOSE 7860
36
 
37
- # Run the startup script when the container starts
 
 
 
38
  CMD ["./start.sh"]
 
 
1
  FROM python:3.11-slim
2
 
3
+ # Install system dependencies for OCR, PDF processing, and OpenCV
4
  RUN apt-get update && \
5
+ apt-get install -y --no-install-recommends \
6
+ tesseract-ocr \
7
+ libtesseract-dev \
8
+ poppler-utils \
9
+ libglib2.0-0 \
10
+ libsm6 \
11
+ libxext6 \
12
+ libxrender1 \
13
+ libgl1 \
14
+ libgomp1 \
15
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
16
+
17
+ # Set Tesseract data directory
18
+ ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
19
+
20
+ # Create non-root user (required by HF Spaces)
21
  RUN useradd -m -u 1000 user
22
 
 
23
  USER user
24
 
 
25
  ENV HOME=/home/user \
26
+ PATH=/home/user/.local/bin:$PATH \
27
+ PYTHONUNBUFFERED=1 \
28
+ PYTHONDONTWRITEBYTECODE=1
29
 
 
30
  WORKDIR $HOME/app
31
 
32
+ # Copy and install requirements
33
  COPY --chown=user requirements.txt .
34
 
35
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
 
36
  pip install --no-cache-dir -r requirements.txt
37
 
38
+ # Copy project files
39
  COPY --chown=user . .
40
 
41
+ # Make startup script executable
42
+ RUN chmod +x ./start.sh 2>/dev/null || true
43
 
44
+ # Expose port (HF Spaces will override via PORT env var)
45
  EXPOSE 7860
46
 
47
+ # Health check
48
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
49
+ CMD python -c "import requests; requests.get('http://localhost:${PORT:-7860}/', timeout=5)" || exit 1
50
+
51
  CMD ["./start.sh"]
app.py CHANGED
@@ -1,23 +1,12 @@
1
- # Enhanced Bill Extraction API (Improved Name Detection)
2
- # Focused on: Accurate item name extraction with intelligent cleaning
3
- #
4
- # Improvements:
5
- # 1. Advanced name normalization and cleaning
6
- # 2. OCR error correction for common names
7
- # 3. Smart multi-word item detection
8
- # 4. Context-aware name validation
9
- # 5. Medical/pharmacy/retail term recognition
10
- # 6. Remove junk characters and formatting
11
- # 7. Consolidate similar names (fuzzy matching)
12
-
13
  import os
14
  import re
15
  import json
16
  import logging
17
  from io import BytesIO
18
  from typing import List, Dict, Any, Optional, Tuple
19
- from dataclasses import dataclass, asdict, field
20
  from difflib import SequenceMatcher
 
21
 
22
  from fastapi import FastAPI
23
  from pydantic import BaseModel
@@ -29,980 +18,747 @@ import cv2
29
  import pytesseract
30
  from pytesseract import Output
31
 
32
- try:
33
- import boto3
34
- except Exception:
35
- boto3 = None
36
-
37
- try:
38
- from google.cloud import vision
39
- except Exception:
40
- vision = None
41
 
42
- # -------------------------------------------------------------------------
43
- # Configuration
44
- # -------------------------------------------------------------------------
45
  OCR_ENGINE = os.getenv("OCR_ENGINE", "tesseract").lower()
46
- AWS_REGION = os.getenv("AWS_REGION", "us-east-1")
47
- TESSERACT_PSM = os.getenv("TESSERACT_PSM", "6")
48
-
49
- logging.basicConfig(level=logging.INFO)
50
- logger = logging.getLogger("bill-extractor-improved")
51
-
52
- _textract_client = None
53
- _vision_client = None
54
 
55
- def textract_client():
56
- global _textract_client
57
- if _textract_client is None:
58
- if boto3 is None:
59
- raise RuntimeError("boto3 not installed")
60
- _textract_client = boto3.client("textract", region_name=AWS_REGION)
61
- return _textract_client
62
 
63
- def vision_client():
64
- global _vision_client
65
- if _vision_client is None:
66
- if vision is None:
67
- raise RuntimeError("google-cloud-vision not installed")
68
- _vision_client = vision.ImageAnnotatorClient()
69
- return _vision_client
70
-
71
- # -------------------------------------------------------------------------
72
- # Enhanced Name Correction Dictionary
73
- # -------------------------------------------------------------------------
74
- OCR_CORRECTIONS = {
75
- # Medical terms
76
- "consuitation": "Consultation",
77
- "consulation": "Consultation",
78
- "consultatior": "Consultation",
79
- "consultaion": "Consultation",
80
  "consultion": "Consultation",
81
- "consultaon": "Consultation",
82
- "consuftation": "Consultation",
83
-
84
- # Lab tests
85
  "cbc": "Complete Blood Count (CBC)",
86
  "lft": "Liver Function Test (LFT)",
87
  "rft": "Renal Function Test (RFT)",
88
- "thyroid": "Thyroid Profile",
89
- "lipid": "Lipid Profile",
90
- "sugar": "Blood Sugar Test",
91
- "glucose": "Blood Glucose",
92
- "haemoglobin": "Hemoglobin",
93
- "hemoglobin": "Hemoglobin",
94
-
95
- # Procedures
96
  "xray": "X-Ray",
97
  "x-ray": "X-Ray",
98
- "xra": "X-Ray",
99
- "ctscan": "CT Scan",
100
- "ct-scan": "CT Scan",
101
- "ultrasound": "Ultrasound",
102
  "mri": "MRI Scan",
103
- "ecg": "ECG",
104
- "ekg": "ECG",
105
-
106
- # Medicines
107
- "amoxicilin": "Amoxicillin",
108
- "amoxicilen": "Amoxicillin",
109
- "antibiotic": "Antibiotic",
110
- "paracetamol": "Paracetamol",
111
- "cough-syrup": "Cough Syrup",
112
- "coughsyrup": "Cough Syrup",
113
-
114
- # Pharmacy
115
- "strip": "Strip",
116
  "tablet": "Tablet",
117
- "capsuie": "Capsule",
118
  "capsule": "Capsule",
119
- "bottle": "Bottle",
120
- "ml": "ml",
121
-
122
- # Pharmacy/Retail
123
- "pack": "Pack",
124
- "box": "Box",
125
- "blister": "Blister",
126
- "nos": "Nos",
127
- "pcs": "Pcs",
128
  }
129
 
130
- # Medical/pharmacy keywords to recognize item types
131
- MEDICAL_KEYWORDS = {
132
- "consultation", "check-up", "checkup", "visit", "appointment",
133
- "diagnosis", "treatment", "examination", "exam",
134
- }
135
-
136
- LAB_TEST_KEYWORDS = {
137
- "test", "cbc", "lft", "rft", "blood", "urine", "stool", "sample",
138
- "profile", "thyroid", "lipid", "glucose", "hemoglobin", "sugar",
139
- "covid", "screening", "culture", "pathology",
140
- }
141
-
142
- PROCEDURE_KEYWORDS = {
143
- "xray", "x-ray", "scan", "ultrasound", "ct", "mri", "echo", "ecg",
144
- "procedure", "surgery", "operation", "imaging", "radiography",
145
- "endoscopy", "colonoscopy", "sonography",
146
- }
147
-
148
- MEDICINE_KEYWORDS = {
149
- "tablet", "capsule", "strip", "bottle", "syrup", "cream", "ointment",
150
- "injection", "medicine", "drug", "antibiotic", "paracetamol",
151
- "aspirin", "cough", "vitamin", "supplement",
152
- }
153
-
154
- # -------------------------------------------------------------------------
155
- # Data Models
156
- # -------------------------------------------------------------------------
157
  @dataclass
158
- class BillLineItem:
159
- """Represents a single line item in a bill"""
160
- item_name: str
161
  item_quantity: float = 1.0
162
- item_rate: float = 0.0
163
- item_amount: float = 0.0
164
- # Internal fields (not exported)
165
- confidence: float = field(default=1.0, repr=False)
166
- source_row: str = field(default="", repr=False)
167
- is_description_continuation: bool = field(default=False, repr=False)
168
- name_confidence: float = field(default=1.0, repr=False) # Name-specific confidence
169
-
170
- def to_dict(self) -> Dict[str, Any]:
171
- """Export only public fields"""
172
  return {
173
- "item_name": self.item_name,
174
  "item_quantity": self.item_quantity,
175
- "item_rate": self.item_rate,
176
- "item_amount": self.item_amount,
177
  }
178
 
179
  @dataclass
180
- class BillTotal:
181
- """Subtotal and total information"""
182
- subtotal_amount: Optional[float] = None
183
- tax_amount: Optional[float] = None
184
- discount_amount: Optional[float] = None
185
- final_total_amount: Optional[float] = None
186
-
187
- def to_dict(self) -> Dict[str, Any]:
188
- return {k: v for k, v in asdict(self).items() if v is not None}
 
 
 
 
189
 
190
  @dataclass
191
- class ExtractedPage:
192
- """Page-level extraction result"""
193
- page_no: int
194
- page_type: str
195
- line_items: List[BillLineItem]
196
- bill_totals: BillTotal
197
- page_confidence: float = field(default=1.0, repr=False)
198
-
199
- def to_dict(self) -> Dict[str, Any]:
200
- """Export clean output"""
201
  return {
202
- "page_no": self.page_no,
203
- "page_type": self.page_type,
204
- "line_items": [item.to_dict() for item in self.line_items],
205
- "bill_totals": self.bill_totals.to_dict(),
206
  }
207
 
208
- # -------------------------------------------------------------------------
209
- # Advanced Name Cleaning & Validation
210
- # -------------------------------------------------------------------------
211
- def correct_ocr_errors(text: str) -> str:
212
- """Correct common OCR errors in text"""
213
- text_lower = text.lower().strip()
214
-
215
- # Check dictionary
216
- if text_lower in OCR_CORRECTIONS:
217
- return OCR_CORRECTIONS[text_lower]
218
-
219
- # Try substring match for common errors
220
- for wrong, correct in OCR_CORRECTIONS.items():
221
- if wrong in text_lower:
222
- text = text.replace(wrong, correct)
223
- text = text.replace(wrong.upper(), correct.upper())
224
-
225
- return text
226
 
227
- def normalize_name(s: str) -> str:
228
- """Deep normalization of item names"""
229
- if not s:
230
- return "UNKNOWN"
231
-
232
- # 1. Strip and basic cleanup
233
- s = s.strip()
234
-
235
- # 2. Remove extra spaces
236
- s = re.sub(r'\s+', ' ', s)
237
-
238
- # 3. Fix common separators
239
- s = s.replace('|', ' ')
240
- s = s.replace('||', ' ')
241
- s = s.replace('/', ' / ')
242
- s = re.sub(r'\s+/\s+', ' / ', s)
243
-
244
- # 4. Remove leading/trailing junk
245
- s = s.strip(' -:,.=()[]{}|\\/')
246
-
247
- # 5. OCR error correction
248
- s = correct_ocr_errors(s)
249
-
250
- # 6. Capitalize properly
251
- s = capitalize_name(s)
252
-
253
- # 7. Remove duplicate words
254
- words = s.split()
255
- seen = set()
256
- unique_words = []
257
- for word in words:
258
- word_lower = word.lower()
259
- if word_lower not in seen or len(seen) < 3: # Allow some repetition
260
- unique_words.append(word)
261
- seen.add(word_lower)
262
- s = ' '.join(unique_words)
263
-
264
- # 8. Final trim
265
- s = s.strip()
266
-
267
- return s if s else "UNKNOWN"
268
-
269
- def capitalize_name(s: str) -> str:
270
- """Intelligent capitalization for names"""
271
- if not s:
272
- return s
273
-
274
- # Special cases (all caps)
275
- all_caps = ["CBC", "LFT", "RFT", "ECG", "EKG", "MRI", "CT", "COVID", "GST", "SGST", "CGST"]
276
- for term in all_caps:
277
- pattern = re.compile(r'\b' + term.lower() + r'\b', re.I)
278
- s = pattern.sub(term, s)
279
-
280
- # Title case for regular terms
281
- words = s.split()
282
- result = []
283
- for word in words:
284
- # Don't capitalize small words between
285
- if word.lower() in ["for", "the", "and", "or", "in", "of", "to", "a", "an", "ml", "mg", "mg/ml"]:
286
- if result: # Not first word
287
- result.append(word.lower())
288
- else:
289
- result.append(word.capitalize())
290
- else:
291
- result.append(word.capitalize())
292
-
293
- return ' '.join(result)
294
-
295
- def validate_name(name: str, context_amount: float = 0) -> Tuple[str, float]:
296
- """
297
- Validate and enhance name with context awareness.
298
- Returns: (validated_name, confidence_score)
299
- """
300
- if not name or name == "UNKNOWN":
301
- return "UNKNOWN", 0.0
302
-
303
- name_lower = name.lower()
304
- confidence = 0.85 # Default
305
-
306
- # Medical consultation context
307
- if any(kw in name_lower for kw in MEDICAL_KEYWORDS):
308
- confidence = 0.95
309
- if context_amount > 0 and context_amount < 2000:
310
- confidence = 0.98 # Typical consultation price range
311
-
312
- # Lab test context
313
- elif any(kw in name_lower for kw in LAB_TEST_KEYWORDS):
314
- confidence = 0.92
315
- if context_amount > 0 and context_amount < 5000:
316
- confidence = 0.96
317
-
318
- # Procedure context
319
- elif any(kw in name_lower for kw in PROCEDURE_KEYWORDS):
320
- confidence = 0.90
321
- if context_amount > 0 and context_amount < 10000:
322
- confidence = 0.94
323
-
324
- # Medicine context
325
- elif any(kw in name_lower for kw in MEDICINE_KEYWORDS):
326
- confidence = 0.88
327
- if context_amount > 0 and context_amount < 500:
328
- confidence = 0.92
329
-
330
- # Length penalty (too short = less confident)
331
- if len(name) < 3:
332
- confidence *= 0.7
333
- # Length bonus (reasonable length)
334
- elif 5 <= len(name) <= 50:
335
- confidence = min(1.0, confidence + 0.05)
336
-
337
- # Remove redundant text
338
- name = remove_redundant_text(name)
339
-
340
- return name, min(1.0, confidence)
341
 
342
- def remove_redundant_text(name: str) -> str:
343
- """Remove redundant or unnecessary words"""
344
- if not name:
345
- return name
346
-
347
- name_lower = name.lower()
348
-
349
- # Remove common redundant patterns
350
- patterns = [
351
- r'\b(item|name|description|service|product)\b',
352
- r'\b(ref|reference)\s*:?\s*',
353
- r'\b(qty|quantity)\b',
354
- r'\b(unit|units)\b',
355
- r'^-+\s*|-+$', # Leading/trailing dashes
356
- r'\s+x\s+$', # Trailing "x"
357
- r'\s+,\s*$', # Trailing comma
358
- ]
359
-
360
- for pattern in patterns:
361
- name = re.sub(pattern, '', name, flags=re.I)
362
-
363
- return name.strip()
364
-
365
- def merge_similar_names(items: List[BillLineItem], similarity_threshold: float = 0.85) -> List[BillLineItem]:
366
- """
367
- Merge items with very similar names.
368
- Example: "Consultation" and "Consultation for checkup" → "Consultation for checkup"
369
- """
370
- if len(items) <= 1:
371
- return items
372
-
373
- merged = []
374
- used_indices = set()
375
-
376
- for i, item1 in enumerate(items):
377
- if i in used_indices:
378
- continue
379
-
380
- # Find similar items
381
- similar_group = [item1]
382
- for j, item2 in enumerate(items[i+1:], start=i+1):
383
- if j in used_indices:
384
- continue
385
-
386
- # Calculate similarity
387
- sim = SequenceMatcher(None,
388
- item1.item_name.lower(),
389
- item2.item_name.lower()).ratio()
390
-
391
- if sim > similarity_threshold:
392
- # Keep the longer, more detailed name
393
- if len(item2.item_name) > len(item1.item_name):
394
- similar_group = [item2] + similar_group
395
- similar_group.append(item2)
396
- used_indices.add(j)
397
-
398
- # Use the best (longest/most detailed) name
399
- best_item = max(similar_group, key=lambda x: (len(x.item_name), x.name_confidence))
400
- merged.append(best_item)
401
- used_indices.add(i)
402
-
403
- return merged
404
-
405
- # -------------------------------------------------------------------------
406
- # Regular Expressions (Enhanced)
407
- # -------------------------------------------------------------------------
408
- NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
409
-
410
- TOTAL_KEYWORDS = re.compile(
411
- r"\b(grand\s+total|net\s+payable|total\s+(?:amount|due)|amount\s+payable|bill\s+amount|"
412
- r"final\s+(?:amount|total)|balance\s+due|amount\s+due|total\s+payable|payable)\b",
413
- re.I
414
- )
415
- SUBTOTAL_KEYWORDS = re.compile(
416
- r"\b(sub\s*[\-\s]?total|subtotal|sub\s+total|items\s+total|line\s+items\s+total)\b",
417
- re.I
418
- )
419
- TAX_KEYWORDS = re.compile(
420
- r"\b(tax|gst|vat|sgst|cgst|igst|sales\s+tax|service\s+tax)\b",
421
- re.I
422
- )
423
- DISCOUNT_KEYWORDS = re.compile(
424
- r"\b(discount|rebate|deduction)\b",
425
- re.I
426
- )
427
- FOOTER_KEYWORDS = re.compile(
428
- r"(page|printed\s+on|printed|date|time|signature|authorized|terms|conditions)",
429
- re.I
430
- )
431
-
432
- # -------------------------------------------------------------------------
433
- # Text Cleaning & Normalization
434
- # -------------------------------------------------------------------------
435
- def sanitize_ocr_text(s: Optional[str]) -> str:
436
- """Clean OCR text"""
437
- if not s:
438
- return ""
439
- s = s.replace("\u2014", "-").replace("\u2013", "-")
440
- s = s.replace("\u00A0", " ")
441
- s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
442
- s = s.replace("\r\n", "\n").replace("\r", "\n")
443
- s = re.sub(r"[ \t]+", " ", s)
444
- s = re.sub(r"\b(qiy|qty|oty|gty)\b", "qty", s, flags=re.I)
445
- s = re.sub(r"\b(deseription|descriptin|desription)\b", "description", s, flags=re.I)
446
- return s.strip()
447
-
448
- def normalize_num_str(s: Optional[str], allow_zero: bool = False) -> Optional[float]:
449
- """Robust number parsing"""
450
- if s is None:
451
  return None
452
- s = str(s).strip()
453
- if s == "":
 
454
  return None
455
 
456
- negative = False
457
- if s.startswith("(") and s.endswith(")"):
458
- negative = True
459
- s = s[1:-1]
460
 
461
- s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
462
- s = s.replace(",", "")
463
 
464
- if s in ("", "-", "+"):
465
  return None
466
 
467
  try:
468
- val = float(s)
469
- val = -val if negative else val
470
- if val == 0 and not allow_zero:
 
471
  return None
472
- return val
 
473
  except Exception:
474
  return None
475
 
476
- def is_numeric_token(t: Optional[str]) -> bool:
477
- """Check if token is numeric"""
478
- return bool(t and NUM_RE.search(str(t)))
479
-
480
- # -------------------------------------------------------------------------
481
- # Item Fingerprinting
482
- # -------------------------------------------------------------------------
483
- def item_fingerprint(item: BillLineItem) -> Tuple[str, float]:
484
- """Create fingerprint for deduplication"""
485
- name_norm = re.sub(r"\s+", " ", item.item_name.lower()).strip()[:100]
486
- amount_rounded = round(float(item.item_amount), 2)
487
- return (name_norm, amount_rounded)
488
-
489
- def dedupe_items_advanced(items: List[BillLineItem]) -> List[BillLineItem]:
490
- """Remove duplicates with improved name handling"""
491
- if not items:
492
- return []
493
-
494
- seen: Dict[Tuple, BillLineItem] = {}
495
- for item in items:
496
- fp = item_fingerprint(item)
497
- if fp not in seen or item.confidence > seen[fp].confidence:
498
- seen[fp] = item
499
-
500
- final = list(seen.values())
501
 
502
- # Merge similar names
503
- final = merge_similar_names(final, similarity_threshold=0.85)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
 
505
- return final
506
-
507
- # -------------------------------------------------------------------------
508
- # Total Detection
509
- # -------------------------------------------------------------------------
510
- FINAL_TOTAL_KEYWORDS = re.compile(
511
- r"\b(grand\s+total|final\s+(?:total|amount)|total\s+(?:due|payable|amount)|"
512
- r"net\s+payable|amount\s+(?:due|payable)|balance\s+due|payable)\b",
513
- re.I
514
- )
515
 
516
- def detect_totals_in_rows(rows: List[List[Dict[str, Any]]]) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
517
- """Scan rows for subtotal, tax, discount, final total"""
518
- subtotal = None
519
- tax = None
520
- discount = None
521
- final_total = None
522
-
523
- for row in rows:
524
- row_text = " ".join([c["text"] for c in row])
525
- row_lower = row_text.lower()
526
- tokens = row_text.split()
527
-
528
- amounts = []
529
- for t in tokens:
530
- if is_numeric_token(t):
531
- v = normalize_num_str(t, allow_zero=True)
532
- if v is not None:
533
- amounts.append(v)
534
-
535
- if not amounts:
536
- continue
537
-
538
- amount = max(amounts)
539
-
540
- if FINAL_TOTAL_KEYWORDS.search(row_lower):
541
- final_total = amount
542
- elif SUBTOTAL_KEYWORDS.search(row_lower):
543
- subtotal = amount
544
- elif TAX_KEYWORDS.search(row_lower):
545
- tax = amount
546
- elif DISCOUNT_KEYWORDS.search(row_lower):
547
- discount = amount
548
-
549
- return subtotal, tax, discount, final_total
550
-
551
- # -------------------------------------------------------------------------
552
- # Image Preprocessing
553
- # -------------------------------------------------------------------------
554
- def pil_to_cv2(img: Image.Image) -> Any:
555
- arr = np.array(img)
556
- if arr.ndim == 2:
557
- return arr
558
- return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
559
 
560
- def preprocess_image_for_tesseract(pil_img: Image.Image, target_w: int = 1500) -> Any:
561
- """Enhanced preprocessing"""
562
- pil_img = pil_img.convert("RGB")
563
- w, h = pil_img.size
564
 
565
- if w < target_w:
566
- scale = target_w / float(w)
567
- pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
568
 
569
- cv_img = pil_to_cv2(pil_img)
 
 
 
 
 
570
 
571
- if cv_img.ndim == 3:
572
- gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
 
 
573
  else:
574
- gray = cv_img
575
 
576
  gray = cv2.fastNlMeansDenoising(gray, h=10)
577
 
578
  try:
579
- bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
580
- cv2.THRESH_BINARY, 41, 15)
581
  except Exception:
582
- _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
583
 
584
  kernel = np.ones((2, 2), np.uint8)
585
- bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)
586
- bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
587
 
588
- return bw
 
589
 
590
- def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
591
- """Extract OCR cells from image"""
 
 
592
  try:
593
- o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config=f"--psm {TESSERACT_PSM}")
594
- except Exception:
595
- o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
 
 
 
 
 
596
 
597
  cells = []
598
- n = len(o.get("text", []))
599
- for i in range(n):
600
- raw = o["text"][i]
601
- if raw is None:
 
 
602
  continue
603
- txt = str(raw).strip()
604
- if not txt:
 
605
  continue
606
 
607
  try:
608
- conf_raw = o.get("conf", [])[i]
609
- conf = float(conf_raw) if conf_raw not in (None, "", "-1") else -1.0
610
  except Exception:
611
- conf = -1.0
612
 
613
- left = int(o.get("left", [0])[i])
614
- top = int(o.get("top", [0])[i])
615
- width = int(o.get("width", [0])[i])
616
- height = int(o.get("height", [0])[i])
617
- center_y = top + height / 2.0
618
  center_x = left + width / 2.0
 
 
 
619
 
620
  cells.append({
621
- "text": txt,
622
- "conf": max(0.0, conf) / 100.0,
623
  "left": left, "top": top, "width": width, "height": height,
624
  "center_x": center_x, "center_y": center_y
625
  })
626
 
 
 
 
 
 
 
627
  return cells
628
 
629
- def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
630
- """Group cells by horizontal position (rows)"""
631
- if not cells:
 
 
632
  return []
633
 
634
- sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
635
- rows = []
636
- current = [sorted_cells[0]]
637
- last_y = sorted_cells[0]["center_y"]
 
638
 
639
- for c in sorted_cells[1:]:
640
- if abs(c["center_y"] - last_y) <= y_tolerance:
641
- current.append(c)
642
- last_y = (last_y * (len(current) - 1) + c["center_y"]) / len(current)
643
  else:
644
- rows.append(sorted(current, key=lambda cc: cc["left"]))
645
- current = [c]
646
- last_y = c["center_y"]
 
 
 
647
 
648
- if current:
649
- rows.append(sorted(current, key=lambda cc: cc["left"]))
 
 
 
 
650
 
651
- return rows
652
 
653
- # -------------------------------------------------------------------------
654
- # Column Detection
655
- # -------------------------------------------------------------------------
656
- def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
657
- """Detect x-positions of numeric columns"""
658
- xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
659
- if not xs:
 
 
660
  return []
661
 
662
- xs = sorted(set(xs))
663
- if len(xs) == 1:
664
- return xs
665
-
666
- gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
667
- mean_gap = float(np.mean(gaps))
668
- std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
669
- gap_thresh = max(35.0, mean_gap + 0.7 * std_gap)
670
-
671
- clusters = []
672
- curr = [xs[0]]
673
- for i, g in enumerate(gaps):
674
- if g > gap_thresh and len(clusters) < (max_columns - 1):
675
- clusters.append(curr)
676
- curr = [xs[i+1]]
 
 
 
 
 
 
677
  else:
678
- curr.append(xs[i+1])
679
- clusters.append(curr)
680
 
681
- centers = [float(np.median(c)) for c in clusters]
682
- if len(centers) > max_columns:
683
- centers = centers[-max_columns:]
684
 
685
- return sorted(centers)
 
 
 
 
 
 
 
686
 
687
- def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
688
- """Find closest column index for token"""
689
- if not column_centers:
690
- return None
691
- distances = [abs(token_x - cx) for cx in column_centers]
692
- return int(np.argmin(distances))
 
 
693
 
694
- # -------------------------------------------------------------------------
695
- # Row Parsing (Improved Name Handling)
696
- # -------------------------------------------------------------------------
697
- def parse_rows_with_columns(
698
- rows: List[List[Dict[str, Any]]],
699
- page_cells: List[Dict[str, Any]],
700
- page_text: str = ""
701
- ) -> List[BillLineItem]:
702
- """Parse rows into line items with improved name detection"""
703
- items = []
704
- column_centers = detect_numeric_columns(page_cells, max_columns=6)
705
-
706
- for row in rows:
707
- tokens = [c["text"] for c in row]
708
- row_text = " ".join(tokens)
709
- row_lower = row_text.lower()
710
 
711
- if FOOTER_KEYWORDS.search(row_lower) and not any(is_numeric_token(t) for t in tokens):
 
 
 
712
  continue
713
 
714
- if not any(is_numeric_token(t) for t in tokens):
 
 
715
  continue
716
 
717
- numeric_values = []
718
- for t in tokens:
719
- if is_numeric_token(t):
720
- v = normalize_num_str(t, allow_zero=False)
721
- if v is not None:
722
- numeric_values.append(float(v))
 
723
 
724
- if not numeric_values:
 
 
725
  continue
726
 
727
- numeric_values = sorted(list(set(numeric_values)), reverse=True)
728
 
729
- if column_centers:
730
- left_text_parts = []
731
- numeric_buckets = {i: [] for i in range(len(column_centers))}
 
 
 
 
 
 
 
732
 
733
- for c in row:
734
- t = c["text"]
735
- cx = c["center_x"]
736
- conf = c.get("conf", 1.0)
737
 
738
- if is_numeric_token(t):
739
- col_idx = assign_token_to_column(cx, column_centers)
740
- if col_idx is None:
741
- col_idx = len(column_centers) - 1
742
- numeric_buckets[col_idx].append((t, conf))
743
  else:
744
- left_text_parts.append(t)
745
 
746
- raw_name = " ".join(left_text_parts).strip()
747
 
748
- # IMPROVED NAME NORMALIZATION
749
- item_name = normalize_name(raw_name) if raw_name else "UNKNOWN"
750
- name_confidence_score = 0.85
 
751
 
752
- # Validate with context
753
- num_cols = len(column_centers)
754
- amount = None
755
- rate = None
756
- qty = None
757
 
758
- if num_cols >= 1:
759
- bucket = numeric_buckets.get(num_cols - 1, [])
760
  if bucket:
761
- amt_str = bucket[-1][0]
762
- amount = normalize_num_str(amt_str, allow_zero=False)
763
-
764
- if amount is None:
765
- for v in numeric_values:
766
- if v > 0:
767
- amount = v
768
- break
769
 
770
- if num_cols >= 2:
771
- bucket = numeric_buckets.get(num_cols - 2, [])
772
  if bucket:
773
- rate = normalize_num_str(bucket[-1][0], allow_zero=False)
774
 
775
- if num_cols >= 3:
776
- bucket = numeric_buckets.get(num_cols - 3, [])
777
  if bucket:
778
- qty = normalize_num_str(bucket[-1][0], allow_zero=False)
779
 
780
- if amount and not qty and not rate and numeric_values:
781
- for cand in numeric_values:
782
- if cand <= 0.1 or cand >= amount:
783
- continue
784
- ratio = amount / cand
785
- r = round(ratio)
786
- if 1 <= r <= 100 and abs(ratio - r) <= 0.15 * r:
787
- qty = float(r)
788
- rate = cand
789
  break
790
 
791
- if qty and rate is None and amount and amount != 0:
792
- rate = amount / qty
793
- elif rate and qty is None and amount and amount != 0:
794
- qty = amount / rate
795
- elif amount and qty and rate is None:
796
- rate = amount / qty if qty != 0 else 0.0
 
 
797
 
798
- if qty is None:
799
- qty = 1.0
800
- if rate is None:
801
- rate = 0.0
802
- if amount is None:
803
- amount = qty * rate if qty and rate else 0.0
804
 
805
- if amount > 0:
806
- confidence = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85
807
-
808
- # ★ VALIDATE NAME WITH CONTEXT
809
- validated_name, name_conf = validate_name(item_name, context_amount=amount)
810
-
811
- items.append(BillLineItem(
812
- item_name=validated_name,
813
- item_quantity=float(qty),
814
- item_rate=float(round(rate, 2)),
815
- item_amount=float(round(amount, 2)),
816
- confidence=min(1.0, max(0.0, confidence)),
817
- source_row=row_text,
818
- name_confidence=name_conf,
819
- ))
820
  else:
821
- numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
822
- if not numeric_idxs:
 
 
 
823
  continue
824
 
825
- last = numeric_idxs[-1]
826
- amount = normalize_num_str(tokens[last], allow_zero=False)
827
- if amount is None:
 
 
 
828
  continue
829
 
830
- raw_name = " ".join(tokens[:last]).strip()
 
 
 
 
 
 
831
 
832
- # IMPROVED NAME NORMALIZATION
833
- name = normalize_name(raw_name) if raw_name else "UNKNOWN"
834
- validated_name, name_conf = validate_name(name, context_amount=amount)
835
 
836
- confidence = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85
837
- items.append(BillLineItem(
838
- item_name=validated_name,
839
  item_quantity=1.0,
840
- item_rate=0.0,
841
- item_amount=float(round(amount, 2)),
842
- confidence=min(1.0, max(0.0, confidence)),
843
- source_row=row_text,
844
- name_confidence=name_conf,
845
  ))
 
 
 
 
846
 
847
- return items
848
 
849
- # -------------------------------------------------------------------------
850
- # Tesseract OCR Pipeline
851
- # -------------------------------------------------------------------------
852
- def ocr_with_tesseract(file_bytes: bytes) -> List[ExtractedPage]:
853
- """Tesseract pipeline"""
854
- pages_out = []
 
 
 
 
855
 
856
  try:
857
- images = convert_from_bytes(file_bytes)
858
- except Exception:
 
 
859
  try:
860
- im = Image.open(BytesIO(file_bytes))
861
- images = [im]
862
- except Exception as e:
863
- logger.exception("Tesseract: file open failed: %s", e)
 
864
  return []
865
 
866
- for idx, pil_img in enumerate(images, start=1):
 
 
 
 
867
  try:
868
- proc = preprocess_image_for_tesseract(pil_img)
869
- cells = image_to_tsv_cells(proc)
870
- rows = group_cells_into_rows(cells, y_tolerance=12)
 
 
 
 
871
 
872
- page_text = " ".join([" ".join([c["text"] for c in r]) for r in rows])
 
 
 
 
 
 
 
 
 
 
 
 
873
 
874
- subtotal, tax, discount, final_total = detect_totals_in_rows(rows)
 
 
875
 
876
- items = parse_rows_with_columns(rows, cells, page_text)
 
 
 
 
 
 
 
 
 
877
 
878
- items = dedupe_items_advanced(items)
 
 
879
 
880
- filtered_items = []
881
- for item in items:
882
- name_lower = item.item_name.lower()
883
-
884
- if TOTAL_KEYWORDS.search(name_lower) or SUBTOTAL_KEYWORDS.search(name_lower):
885
- continue
886
-
887
- if item.item_amount > 0:
888
- filtered_items.append(item)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
889
 
890
- bill_totals = BillTotal(
891
- subtotal_amount=subtotal,
892
- tax_amount=tax,
893
- discount_amount=discount,
894
- final_total_amount=final_total,
 
 
 
 
 
 
 
 
 
 
895
  )
896
 
897
- page_conf = np.mean([item.confidence for item in filtered_items]) if filtered_items else 0.8
 
898
 
899
- pages_out.append(ExtractedPage(
900
- page_no=idx,
901
- page_type="Bill Detail",
902
- line_items=filtered_items,
903
- bill_totals=bill_totals,
904
- page_confidence=page_conf,
905
- ))
906
-
907
- except Exception as e:
908
- logger.exception(f"Tesseract page {idx} failed: %s", e)
909
- pages_out.append(ExtractedPage(
910
- page_no=idx,
911
- page_type="Bill Detail",
912
- line_items=[],
913
- bill_totals=BillTotal(),
914
- page_confidence=0.0,
915
  ))
916
 
917
- return pages_out
 
 
 
 
 
 
918
 
919
- # -------------------------------------------------------------------------
920
- # FastAPI App
921
- # -------------------------------------------------------------------------
922
- app = FastAPI(title="Enhanced Bill Extractor (Improved Names)")
923
 
924
- class BillRequest(BaseModel):
925
  document: str
926
 
927
- class BillResponse(BaseModel):
928
  is_success: bool
929
  error: Optional[str] = None
930
  data: Dict[str, Any]
931
  token_usage: Dict[str, int]
 
932
 
933
- @app.post("/extract-bill-data", response_model=BillResponse)
934
- async def extract_bill_data(payload: BillRequest):
935
  """Main extraction endpoint"""
936
- doc_url = payload.document
937
- file_bytes = None
938
 
939
- if doc_url.startswith("file://"):
940
- local_path = doc_url.replace("file://", "")
941
  try:
942
- with open(local_path, "rb") as f:
943
- file_bytes = f.read()
 
944
  except Exception as e:
945
- return BillResponse(
 
946
  is_success=False,
947
- error=f"Local file read failed: {e}",
948
  data={"pagewise_line_items": [], "total_item_count": 0},
949
- token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
950
  )
951
  else:
952
  try:
953
- headers = {"User-Agent": "Mozilla/5.0"}
954
- resp = requests.get(doc_url, headers=headers, timeout=30)
955
- if resp.status_code != 200:
956
- return BillResponse(
957
  is_success=False,
958
- error=f"Download failed (status={resp.status_code})",
959
  data={"pagewise_line_items": [], "total_item_count": 0},
960
- token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
961
  )
962
- file_bytes = resp.content
963
  except Exception as e:
964
- return BillResponse(
965
  is_success=False,
966
  error=f"HTTP error: {e}",
967
  data={"pagewise_line_items": [], "total_item_count": 0},
968
- token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
969
  )
970
 
971
- if not file_bytes:
972
- return BillResponse(
973
  is_success=False,
974
- error="No file bytes",
975
  data={"pagewise_line_items": [], "total_item_count": 0},
976
- token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
977
  )
978
 
979
- logger.info(f"Processing with engine: {OCR_ENGINE}")
980
- try:
981
- if OCR_ENGINE == "tesseract":
982
- pages = ocr_with_tesseract(file_bytes)
983
- else:
984
- pages = ocr_with_tesseract(file_bytes)
985
- except Exception as e:
986
- logger.exception("OCR failed: %s", e)
987
- pages = []
988
 
989
- total_items = sum(len(p.line_items) for p in pages)
990
- pages_dict = [p.to_dict() for p in pages]
 
 
 
991
 
992
- return BillResponse(
993
  is_success=True,
994
  data={
995
- "pagewise_line_items": pages_dict,
996
  "total_item_count": total_items,
997
  },
998
- token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
 
999
  )
1000
 
1001
  @app.get("/")
1002
- def health():
1003
  return {
1004
- "status": "ok",
1005
- "engine": OCR_ENGINE,
1006
- "message": "Enhanced Bill Extractor (Improved Name Detection)",
1007
- "hint": "POST /extract-bill-data with {'document': '<url or file://path>'}",
1008
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import json
4
  import logging
5
  from io import BytesIO
6
  from typing import List, Dict, Any, Optional, Tuple
7
+ from dataclasses import dataclass, field
8
  from difflib import SequenceMatcher
9
+ from collections import defaultdict
10
 
11
  from fastapi import FastAPI
12
  from pydantic import BaseModel
 
18
  import pytesseract
19
  from pytesseract import Output
20
 
21
# ============================================================================
# ENHANCED LOGGING CONFIGURATION
# ============================================================================
# Root logger at DEBUG so the per-row extraction trace shows up in the
# container logs; the module logger carries a distinctive name for filtering.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.DEBUG, format=_LOG_FORMAT)
logger = logging.getLogger("bill-extractor-debug")
 
29
 
30
# ============================================================================
# CONFIGURATION
# ============================================================================
# Both knobs are environment-overridable.  OCR_ENGINE is lower-cased so the
# comparison in the pipeline is case-insensitive; PSM defaults to 6
# ("assume a single uniform block of text"), which suits tabular bills.
OCR_ENGINE = os.getenv("OCR_ENGINE", "tesseract").lower()
TESSERACT_PAGE_SEGMENTATION_MODE = os.getenv("TESSERACT_PSM", "6")

logger.info(f"OCR Engine: {OCR_ENGINE}")
logger.info(f"Tesseract PSM: {TESSERACT_PAGE_SEGMENTATION_MODE}")
 
 
 
 
 
38
 
39
# ============================================================================
# MEDICAL TERMINOLOGY MAPPING
# ============================================================================
# Canonical expansions for common abbreviations and OCR misspellings seen on
# medical bills.  Keys are matched case-insensitively as whole words during
# description normalization; values are the display form.
MEDICAL_TERMINOLOGY_MAPPING = {
    # Consultations / visits
    "consultation": "Consultation",
    "consultion": "Consultation",   # common OCR misread
    "consult": "Consultation",
    "check": "Check-up",
    "checkup": "Check-up",
    "visit": "Patient Visit",
    # Lab panels
    "cbc": "Complete Blood Count (CBC)",
    "lft": "Liver Function Test (LFT)",
    "rft": "Renal Function Test (RFT)",
    "kft": "Kidney Function Test (KFT)",
    # Imaging
    "xray": "X-Ray",
    "x-ray": "X-Ray",
    "ct": "CT Scan",
    "mri": "MRI Scan",
    "ultrasound": "Ultrasound (USG)",
    "usg": "Ultrasound (USG)",
    # Pharmacy
    "tablet": "Tablet",
    "capsule": "Capsule",
    "injection": "Injection",
    "inj": "Injection",
}
64
 
65
+ # ============================================================================
66
+ # DATA MODELS
67
+ # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
@dataclass
class LineItemForBill:
    """One billed line item recovered from an OCR'd bill row."""
    item_description: str
    item_quantity: float = 1.0
    unit_price_per_item: float = 0.0
    total_item_amount: float = 0.0
    # Internal bookkeeping — excluded from repr and from the API payload.
    ocr_confidence_score: float = field(default=1.0, repr=False)
    raw_row_text: str = field(default="", repr=False)

    def convert_to_output_dict(self) -> Dict[str, Any]:
        """Map internal field names onto the public API item schema."""
        payload = {
            "item_name": self.item_description,
            "item_quantity": self.item_quantity,
            "item_rate": self.unit_price_per_item,
            "item_amount": self.total_item_amount,
        }
        return payload
84
 
85
@dataclass
class BillSummaryTotals:
    """Page-level totals; None means 'not detected on this page'."""
    subtotal_sum: Optional[float] = None
    tax_amount_gst: Optional[float] = None
    discount_total: Optional[float] = None
    final_bill_amount: Optional[float] = None

    def convert_to_output_dict(self) -> Dict[str, Any]:
        """Emit only the totals that were actually detected."""
        candidates = {
            "subtotal_amount": self.subtotal_sum,
            "tax_amount": self.tax_amount_gst,
            "discount_amount": self.discount_total,
            "final_total_amount": self.final_bill_amount,
        }
        return {key: value for key, value in candidates.items() if value is not None}
99
 
100
@dataclass
class ExtractedBillPage:
    """Extraction result for a single rendered page of the bill."""
    page_number: int
    page_classification: str
    extracted_items: List[LineItemForBill]
    bill_summary: BillSummaryTotals
    # Diagnostics only — not part of the public payload.
    page_extraction_confidence: float = field(default=0.85, repr=False)
    debug_info: Dict[str, Any] = field(default_factory=dict, repr=False)

    def convert_to_output_dict(self) -> Dict[str, Any]:
        """Serialize this page into the API's page schema."""
        serialized_items = [entry.convert_to_output_dict() for entry in self.extracted_items]
        return {
            "page_no": self.page_number,
            "page_type": self.page_classification,
            "line_items": serialized_items,
            "bill_totals": self.bill_summary.convert_to_output_dict(),
        }
116
 
117
# ============================================================================
# TEXT PROCESSING
# ============================================================================
# Matches signed integer/decimal tokens with loose comma grouping,
# e.g. "1,234.50", "-42", "+7.5".
NUMERIC_PATTERN = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")

def is_token_numeric(token: Optional[str]) -> bool:
    """Return True when *token* is non-empty and contains a numeric substring."""
    if not token:
        return False
    return NUMERIC_PATTERN.search(str(token)) is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
def parse_numeric_string(text_input: Optional[str], allow_zero_values: bool = False) -> Optional[float]:
    """Parse a currency-like token into a float.

    Handles thousands separators ("1,234.50") and accountant-style
    parentheses for negatives ("(100)" -> -100.0).  Returns None for
    anything unparseable — and for zero unless *allow_zero_values* is
    set, since a zero amount on a bill row is almost always a misread.
    """
    if text_input is None:
        return None

    token = str(text_input).strip()
    if not token:
        return None

    # "(123)" is accounting notation for -123.
    negative = token.startswith("(") and token.endswith(")")
    if negative:
        token = token[1:-1]

    # Drop currency symbols / stray letters, then thousands separators.
    token = re.sub(r"[^\d\-\+\,\.\(\)]", "", token)
    token = token.replace(",", "")

    if token in ("", "-", "+"):
        return None

    try:
        parsed = float(token)
    except Exception:
        return None

    if negative:
        parsed = -parsed
    if parsed == 0 and not allow_zero_values:
        return None
    return parsed
154
 
155
def comprehensive_text_normalization(raw_text: str) -> str:
    """Clean an OCR'd item description for display.

    Collapses whitespace, strips table artefacts (pipes, stray
    punctuation), expands known medical abbreviations/misspellings via
    MEDICAL_TERMINOLOGY_MAPPING, and title-cases the result while keeping
    short connective words lowercase.

    Returns the sentinel "UNKNOWN" when nothing usable remains, so
    callers can uniformly skip description-less rows.
    """
    if not raw_text:
        return "UNKNOWN"

    normalized_text = raw_text.strip()
    normalized_text = re.sub(r'\s+', ' ', normalized_text)
    # Table column separators leak through OCR as pipes.  (The old second
    # pass replacing '||' was dead code — no pipes survive this one.)
    normalized_text = normalized_text.replace('|', ' ')
    normalized_text = normalized_text.strip(' -:,.=()[]{}|\\/')

    # FIX: input made purely of punctuation used to fall through and
    # return "" — but callers compare against "UNKNOWN" to skip rows.
    if not normalized_text.strip():
        return "UNKNOWN"

    # Expand known medical abbreviations / common OCR misspellings.
    text_lower = normalized_text.lower()
    for incorrect_term, correct_term in MEDICAL_TERMINOLOGY_MAPPING.items():
        if incorrect_term in text_lower:
            pattern = re.compile(r'\b' + re.escape(incorrect_term) + r'\b', re.IGNORECASE)
            normalized_text = pattern.sub(correct_term, normalized_text)

    # Title-case, keeping short connective words lowercase (unless they
    # open the string).
    result_words = []
    for word in normalized_text.split():
        if word.lower() in ["for", "the", "and", "or", "in", "of", "to", "a", "an", "ml", "mg"] and result_words:
            result_words.append(word.lower())
        else:
            result_words.append(word.capitalize())

    return ' '.join(result_words).strip()
 
 
 
 
 
 
 
 
 
184
 
185
+ # ============================================================================
186
+ # IMAGE PREPROCESSING
187
+ # ============================================================================
188
def pil_to_cv2(pil_image: Image.Image) -> Any:
    """Convert a PIL image to an OpenCV array (BGR, or 2-D if already grayscale)."""
    pixel_array = np.array(pil_image)
    if pixel_array.ndim != 2:
        # PIL delivers RGB; OpenCV expects BGR channel order.
        pixel_array = cv2.cvtColor(pixel_array, cv2.COLOR_RGB2BGR)
    return pixel_array
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
def preprocess_bill_image_for_ocr(pil_image: Image.Image, target_width: int = 1500) -> Any:
    """Upscale, denoise and binarize a bill page for Tesseract.

    Returns a single-channel binary OpenCV image sized at least
    *target_width* pixels wide so glyphs are large enough for reliable OCR.
    """
    logger.debug(f"Preprocessing image: original size {pil_image.size}")

    pil_image = pil_image.convert("RGB")
    width, height = pil_image.size

    # Upscale small scans, preserving aspect ratio.
    if width < target_width:
        scale_factor = target_width / float(width)
        new_width = int(width * scale_factor)
        new_height = int(height * scale_factor)
        pil_image = pil_image.resize((new_width, new_height), Image.LANCZOS)
        logger.debug(f"Image scaled to {new_width}x{new_height}")

    cv_image = pil_to_cv2(pil_image)
    gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) if cv_image.ndim == 3 else cv_image

    # Non-local-means denoising before thresholding.
    gray = cv2.fastNlMeansDenoising(gray, h=10)

    try:
        # Adaptive threshold copes with uneven lighting across the page.
        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 41, 15)
    except Exception:
        # Fallback: global Otsu threshold.
        _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Close small gaps in strokes, then remove speckle noise.
    kernel = np.ones((2, 2), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    logger.debug("Image preprocessing complete")
    return binary
228
 
229
def extract_ocr_cells_from_image(cv_image: Any) -> List[Dict[str, Any]]:
    """Run Tesseract over *cv_image* and return word-level cells.

    Each cell dict carries the recognized text, a confidence in [0, 1],
    the bounding box, and its center point (used downstream for
    row/column clustering).
    """
    logger.info(f"Starting OCR extraction with PSM {TESSERACT_PAGE_SEGMENTATION_MODE}")

    try:
        ocr_data = pytesseract.image_to_data(
            cv_image,
            output_type=Output.DICT,
            config=f"--psm {TESSERACT_PAGE_SEGMENTATION_MODE}"
        )
    except Exception as e:
        logger.error(f"OCR with PSM failed: {e}, trying default")
        ocr_data = pytesseract.image_to_data(cv_image, output_type=Output.DICT)

    cells = []
    text_count = len(ocr_data.get("text", []))
    logger.info(f"OCR extracted {text_count} cells")

    for i in range(text_count):
        raw_text = ocr_data["text"][i]
        if raw_text is None:
            continue

        text_string = str(raw_text).strip()
        if not text_string:
            continue

        # Tesseract reports -1 for non-word boxes — as an int or a string
        # depending on version.  The old check only matched the string
        # "-1", so an int -1 slipped through and was later clamped to 0.0
        # confidence instead of the intended neutral 0.6 fallback.
        try:
            confidence = float(ocr_data.get("conf", [])[i])
        except Exception:
            confidence = -1.0
        if confidence < 0:
            confidence = 0.6

        left = int(ocr_data.get("left", [0])[i])
        top = int(ocr_data.get("top", [0])[i])
        width = int(ocr_data.get("width", [0])[i])
        height = int(ocr_data.get("height", [0])[i])

        center_x = left + width / 2.0
        center_y = top + height / 2.0

        # Tesseract confidences are 0-100; values already <= 1 are assumed
        # pre-normalized.
        confidence_normalized = max(0.0, min(1.0, confidence / 100.0)) if confidence > 1 else max(0.0, min(1.0, confidence))

        cells.append({
            "text": text_string,
            "conf": confidence_normalized,
            "left": left, "top": top, "width": width, "height": height,
            "center_x": center_x, "center_y": center_y
        })

    logger.info(f"Extracted {len(cells)} valid cells")

    # Debug: log the first few cells for trace inspection.
    for i, cell in enumerate(cells[:10]):
        logger.debug(f"Cell {i}: '{cell['text']}' at ({cell['center_x']:.0f}, {cell['center_y']:.0f}) conf={cell['conf']:.2f}")

    return cells
285
 
286
def group_ocr_cells_into_rows(cells_list: List[Dict[str, Any]], vertical_tolerance_pixels: int = 12) -> List[List[Dict[str, Any]]]:
    """Cluster OCR cells into visual rows by vertical proximity.

    Cells are sorted top-to-bottom, then greedily assigned to the current
    row while their center-Y stays within the tolerance of the row's
    running mean.  Each returned row is sorted left-to-right.
    """
    logger.info(f"Grouping {len(cells_list)} cells into rows with tolerance {vertical_tolerance_pixels}")

    if not cells_list:
        logger.warning("No cells to group!")
        return []

    ordered = sorted(cells_list, key=lambda c: (c["center_y"], c["center_x"]))

    rows = []
    active_row = [ordered[0]]
    running_y = ordered[0]["center_y"]

    for cell in ordered[1:]:
        if abs(cell["center_y"] - running_y) <= vertical_tolerance_pixels:
            active_row.append(cell)
            # Incremental mean of the row's center-Y.
            running_y = (running_y * (len(active_row) - 1) + cell["center_y"]) / len(active_row)
        else:
            rows.append(sorted(active_row, key=lambda c: c["center_x"]))
            active_row = [cell]
            running_y = cell["center_y"]

    if active_row:
        rows.append(sorted(active_row, key=lambda c: c["center_x"]))

    logger.info(f"Created {len(rows)} rows")

    # Debug: log statistics for the first few rows.
    for i, row in enumerate(rows[:5]):
        row_text = " ".join([c["text"] for c in row])
        logger.debug(f"Row {i}: {len(row)} cells | Y={row[0]['center_y']:.0f} | '{row_text[:60]}...'")

    return rows
319
 
320
def detect_numeric_column_positions(cells_list: List[Dict[str, Any]], maximum_expected_columns: int = 6) -> List[float]:
    """Estimate the X centers of the bill's numeric columns.

    Collects the X positions of every numeric-looking token on the page
    and clusters them by splitting at unusually large horizontal gaps.
    Returns at most *maximum_expected_columns* centers, sorted left to
    right (callers treat the rightmost as the amount column).
    """
    logger.info("Detecting numeric column positions...")

    xs = [c["center_x"] for c in cells_list if is_token_numeric(c["text"])]

    logger.info(f"Found {len(xs)} numeric tokens")

    if not xs:
        logger.warning("No numeric columns detected!")
        return []

    xs = sorted(set(xs))

    if len(xs) <= 1:
        logger.info(f"Only {len(xs)} numeric column(s)")
        return xs

    gaps = [right - left for left, right in zip(xs, xs[1:])]

    mean_gap = float(np.mean(gaps))
    std_dev_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
    # Split threshold: at least 35px, or statistically large for this page.
    gap_threshold = max(35.0, mean_gap + 0.7 * std_dev_gap)

    logger.debug(f"Gap stats: mean={mean_gap:.1f}, std={std_dev_gap:.1f}, threshold={gap_threshold:.1f}")

    clusters = []
    bucket = [xs[0]]
    for i, gap in enumerate(gaps):
        # Cap the number of splits so we never exceed the column budget.
        if gap > gap_threshold and len(clusters) < (maximum_expected_columns - 1):
            clusters.append(bucket)
            bucket = [xs[i + 1]]
        else:
            bucket.append(xs[i + 1])
    clusters.append(bucket)

    centers = [float(np.median(group)) for group in clusters]

    # Keep the right-most columns — amounts live on the right of a bill.
    if len(centers) > maximum_expected_columns:
        centers = centers[-maximum_expected_columns:]
    centers = sorted(centers)

    logger.info(f"Detected {len(centers)} columns at positions: {[f'{c:.0f}' for c in centers]}")

    return centers
366
 
367
# ============================================================================
# ROW PARSING
# ============================================================================
# Keyword patterns used to classify bill rows.  Each is applied
# independently (case-insensitively) to the joined row text; rows that hit
# total/subtotal patterns are excluded from line items, and footer rows
# without numbers are dropped entirely.
TOTAL_ROW_KEYWORDS = re.compile(r"\b(grand\s+total|final\s+total|total\s+(?:amount|due|payable|bill)|net\s+(?:amount|payable)|amount\s+(?:due|payable)|balance\s+due|payable)\b", re.I)
SUBTOTAL_ROW_KEYWORDS = re.compile(r"\b(sub\s*[\-\s]?total|subtotal|sub\s+total|items\s+total|net\s+amount|amount)\b", re.I)
TAX_ROW_KEYWORDS = re.compile(r"\b(tax|gst|cgst|sgst|igst|vat|sales\s+tax|service\s+tax)\b", re.I)
DISCOUNT_ROW_KEYWORDS = re.compile(r"\b(discount|rebate|deduction|reduction)\b", re.I)
FOOTER_ROW_KEYWORDS = re.compile(r"(page|printed|date|time|signature|authorized|terms|conditions|note)", re.I)
375
 
376
def parse_rows_into_line_items(row_groups: List[List[Dict[str, Any]]], all_page_cells: List[Dict[str, Any]]) -> List[LineItemForBill]:
    """Turn OCR rows into billed line items.

    Uses the page-wide numeric column layout (amount = rightmost column,
    rate = second from right, quantity = third from right) when columns
    can be detected; otherwise falls back to "last number in the row is
    the amount".  Rows that look like totals, footers, or carry no
    positive amount are skipped.
    """
    logger.info(f"Parsing {len(row_groups)} rows into line items...")

    extracted_items = []
    # FIX: this call previously passed `max_columns=6`, but the function
    # declares the parameter as `maximum_expected_columns`, so every call
    # raised TypeError.
    numeric_column_positions = detect_numeric_column_positions(all_page_cells, maximum_expected_columns=6)

    logger.info(f"Columns detected: {len(numeric_column_positions)}")

    skipped_rows = 0

    for row_idx, row in enumerate(row_groups):
        row_tokens = [cell["text"] for cell in row]
        full_row_text = " ".join(row_tokens)
        row_text_lower = full_row_text.lower()

        # Skip footer/non-data rows (footer keyword and no numbers at all).
        if FOOTER_ROW_KEYWORDS.search(row_text_lower) and not any(is_token_numeric(t) for t in row_tokens):
            logger.debug(f"Row {row_idx}: Skipped (footer keyword)")
            skipped_rows += 1
            continue

        if not any(is_token_numeric(t) for t in row_tokens):
            logger.debug(f"Row {row_idx}: Skipped (no numeric tokens)")
            skipped_rows += 1
            continue

        # Extract numeric values from the row's tokens.
        numeric_values_in_row = []
        for token in row_tokens:
            if is_token_numeric(token):
                value = parse_numeric_string(token, allow_zero_values=False)
                if value is not None:
                    numeric_values_in_row.append(value)

        if not numeric_values_in_row:
            logger.debug(f"Row {row_idx}: Skipped (no numeric values)")
            skipped_rows += 1
            continue

        # Largest value first: used as the amount fallback below.
        numeric_values_in_row = sorted(list(set(numeric_values_in_row)), reverse=True)

        # Totals/subtotals are handled separately, not as line items.
        if TOTAL_ROW_KEYWORDS.search(row_text_lower) or SUBTOTAL_ROW_KEYWORDS.search(row_text_lower):
            logger.debug(f"Row {row_idx}: Skipped (total row)")
            skipped_rows += 1
            continue

        if numeric_column_positions:
            # Column-aware parse: split tokens into description vs numeric
            # buckets keyed by nearest detected column.
            description_parts = []
            numeric_column_buckets = defaultdict(list)

            for cell in row:
                token_text = cell["text"]
                horizontal_pos = cell["center_x"]
                token_confidence = cell.get("conf", 1.0)

                if is_token_numeric(token_text):
                    distances = [abs(horizontal_pos - col_center) for col_center in numeric_column_positions]
                    column_index = int(np.argmin(distances))
                    numeric_column_buckets[column_index].append((token_text, token_confidence))
                else:
                    description_parts.append(token_text)

            item_description = comprehensive_text_normalization(" ".join(description_parts))

            if item_description == "UNKNOWN":
                logger.debug(f"Row {row_idx}: Skipped (no description)")
                skipped_rows += 1
                continue

            num_columns = len(numeric_column_positions)
            item_amount = None
            item_rate = None
            item_quantity = None

            # Column convention, right to left: amount | rate | quantity.
            if num_columns >= 1:
                bucket = numeric_column_buckets.get(num_columns - 1, [])
                if bucket:
                    item_amount = parse_numeric_string(bucket[-1][0], allow_zero_values=False)

            if num_columns >= 2:
                bucket = numeric_column_buckets.get(num_columns - 2, [])
                if bucket:
                    item_rate = parse_numeric_string(bucket[-1][0], allow_zero_values=False)

            if num_columns >= 3:
                bucket = numeric_column_buckets.get(num_columns - 3, [])
                if bucket:
                    item_quantity = parse_numeric_string(bucket[-1][0], allow_zero_values=False)

            # Fallback amount: largest positive number seen in the row.
            if item_amount is None:
                for value in numeric_values_in_row:
                    if value > 0:
                        item_amount = value
                        break

            if item_quantity is None:
                item_quantity = 1.0
            if item_rate is None:
                item_rate = 0.0
            if item_amount is None or item_amount <= 0:
                logger.debug(f"Row {row_idx}: Skipped (invalid amount)")
                skipped_rows += 1
                continue

            ocr_score = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85

            extracted_items.append(LineItemForBill(
                item_description=item_description,
                item_quantity=float(item_quantity),
                unit_price_per_item=float(round(item_rate, 2)),
                total_item_amount=float(round(item_amount, 2)),
                ocr_confidence_score=min(1.0, max(0.0, ocr_score)),
                raw_row_text=full_row_text,
            ))

            logger.info(f"Row {row_idx}: ✓ Extracted '{item_description}' x{item_quantity} @ ₹{item_rate} = ₹{item_amount}")

        else:
            # Fallback: no column layout — last numeric token is the amount.
            numeric_indices = [i for i, t in enumerate(row_tokens) if is_token_numeric(t)]
            if not numeric_indices:
                logger.debug(f"Row {row_idx}: Skipped (fallback, no numeric)")
                skipped_rows += 1
                continue

            last_numeric_idx = numeric_indices[-1]
            item_amount = parse_numeric_string(row_tokens[last_numeric_idx], allow_zero_values=False)

            if item_amount is None or item_amount <= 0:
                logger.debug(f"Row {row_idx}: Skipped (invalid amount)")
                skipped_rows += 1
                continue

            description_text = " ".join(row_tokens[:last_numeric_idx]).strip()
            item_description = comprehensive_text_normalization(description_text)

            if item_description == "UNKNOWN":
                logger.debug(f"Row {row_idx}: Skipped (no description)")
                skipped_rows += 1
                continue

            ocr_score = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85

            extracted_items.append(LineItemForBill(
                item_description=item_description,
                item_quantity=1.0,
                unit_price_per_item=0.0,
                total_item_amount=float(round(item_amount, 2)),
                ocr_confidence_score=min(1.0, max(0.0, ocr_score)),
                raw_row_text=full_row_text,
            ))

            logger.info(f"Row {row_idx} (fallback): ✓ Extracted '{item_description}' = ₹{item_amount}")

    logger.info(f"Parsing complete: {len(extracted_items)} items extracted, {skipped_rows} rows skipped")

    return extracted_items
535
 
536
# ============================================================================
# MAIN EXTRACTION PIPELINE
# ============================================================================
def extract_bill_data_from_pdf(pdf_bytes: bytes) -> List[ExtractedBillPage]:
    """Main extraction pipeline with comprehensive debug logging.

    Renders the PDF (or a single image payload) to page images, then per
    page: preprocess -> OCR cells -> row grouping -> line-item parsing.
    Always returns one ExtractedBillPage per rendered page; per-page
    failures are recorded in that page's debug_info instead of raising.
    """
    logger.info("=" * 80)
    logger.info("STARTING BILL EXTRACTION")
    logger.info("=" * 80)

    extracted_pages: List[ExtractedBillPage] = []

    try:
        page_images = convert_from_bytes(pdf_bytes)
        logger.info(f"PDF converted to {len(page_images)} images")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        # The payload may be a plain image rather than a PDF.
        try:
            single_image = Image.open(BytesIO(pdf_bytes))
            page_images = [single_image]
            logger.info("Using fallback: Opened as single image")
        except Exception as e2:
            logger.error(f"Fallback also failed: {e2}")
            return []

    for page_index, pil_page_image in enumerate(page_images, start=1):
        logger.info(f"\n{'='*80}")
        logger.info(f"PROCESSING PAGE {page_index}")
        logger.info(f"{'='*80}")

        try:
            logger.info("Step 1: Preprocessing image...")
            preprocessed_image = preprocess_bill_image_for_ocr(pil_page_image)

            logger.info("Step 2: Extracting OCR cells...")
            page_cells = extract_ocr_cells_from_image(preprocessed_image)

            if not page_cells:
                logger.error("❌ No OCR cells extracted! Possible causes:")
                logger.error(" - Tesseract not installed or misconfigured")
                logger.error(" - Image quality too poor")
                logger.error(" - PSM mode incompatible with document layout")
                extracted_pages.append(ExtractedBillPage(
                    page_number=page_index,
                    page_classification="Error",
                    extracted_items=[],
                    bill_summary=BillSummaryTotals(),
                    debug_info={"error": "No OCR cells extracted"}
                ))
                continue

            logger.info("Step 3: Grouping cells into rows...")
            page_rows = group_ocr_cells_into_rows(page_cells, vertical_tolerance_pixels=12)

            if not page_rows:
                logger.error("❌ No rows created!")
                extracted_pages.append(ExtractedBillPage(
                    page_number=page_index,
                    page_classification="Error",
                    extracted_items=[],
                    bill_summary=BillSummaryTotals(),
                    debug_info={"error": "No rows created"}
                ))
                continue

            logger.info("Step 4: Parsing rows into items...")
            page_items = parse_rows_into_line_items(page_rows, page_cells)

            if not page_items:
                logger.warning("⚠️ No items extracted. Trying alternative PSM modes...")
                # NOTE(review): this probe only *logs* whether another PSM
                # mode yields more cells — the alternative results are never
                # fed back into the parse.  Kept as-is to preserve behavior.
                for alt_psm in ["1", "3", "11"]:
                    logger.info(f" Trying PSM {alt_psm}...")
                    try:
                        ocr_data = pytesseract.image_to_data(
                            preprocessed_image,
                            output_type=Output.DICT,
                            config=f"--psm {alt_psm}"
                        )
                        cells_alt = []
                        for i in range(len(ocr_data.get("text", []))):
                            text_str = str(ocr_data["text"][i]).strip()
                            if text_str:
                                cells_alt.append({"text": text_str})

                        if len(cells_alt) > len(page_cells):
                            logger.info(f" ✓ PSM {alt_psm} got more cells ({len(cells_alt)} vs {len(page_cells)})")
                            break
                    except Exception as alt_e:
                        logger.debug(f" PSM {alt_psm} failed: {alt_e}")

            logger.info(f"\nStep 5: Item verification")
            logger.info(f" Extracted items: {len(page_items)}")
            for i, item in enumerate(page_items[:5], 1):
                logger.info(f" Item {i}: {item.item_description} | ₹{item.total_item_amount}")

            bill_summary = BillSummaryTotals()
            page_avg_confidence = np.mean([item.ocr_confidence_score for item in page_items]) if page_items else 0.7

            extracted_pages.append(ExtractedBillPage(
                page_number=page_index,
                page_classification="Bill Detail" if page_items else "No Items",
                extracted_items=page_items,
                bill_summary=bill_summary,
                page_extraction_confidence=page_avg_confidence,
                debug_info={
                    "ocr_cells_count": len(page_cells),
                    "rows_count": len(page_rows),
                    "items_count": len(page_items),
                }
            ))
            logger.info(f"✓ PAGE {page_index} COMPLETE: {len(page_items)} items extracted")

        except Exception as page_error:
            logger.exception(f"❌ PAGE {page_index} FAILED")
            extracted_pages.append(ExtractedBillPage(
                page_number=page_index,
                page_classification="Error",
                extracted_items=[],
                bill_summary=BillSummaryTotals(),
                debug_info={"error": str(page_error)}
            ))

    logger.info("\n" + "="*80)
    logger.info("EXTRACTION COMPLETE")
    logger.info("="*80)
    logger.info(f"Total pages: {len(extracted_pages)}")
    logger.info(f"Total items: {sum(len(p.extracted_items) for p in extracted_pages)}")

    return extracted_pages
673
 
674
# ============================================================================
# FASTAPI APPLICATION
# ============================================================================
app = FastAPI(title="Bill Extractor - Debug Version")

class BillExtractionRequest(BaseModel):
    # Either an http(s) URL to download, or "file://<path>" for a local file.
    document: str

class BillExtractionResponse(BaseModel):
    # True when the pipeline ran; per-page errors surface in debug_info.
    is_success: bool
    error: Optional[str] = None
    data: Dict[str, Any]
    # Kept for API compatibility with the previous LLM-based extractor;
    # this OCR pipeline consumes no tokens.
    token_usage: Dict[str, int]
    debug_info: Optional[Dict[str, Any]] = None
 
689
@app.post("/extract-bill-data", response_model=BillExtractionResponse)
async def api_extract_bill_data(request: BillExtractionRequest):
    """Main extraction endpoint"""

    def failure(message: str) -> BillExtractionResponse:
        # Every early-exit error response shares this exact shape.
        return BillExtractionResponse(
            is_success=False,
            error=message,
            data={"pagewise_line_items": [], "total_item_count": 0},
            token_usage={"total_tokens": 0},
        )

    document_source = request.document
    file_content_bytes = None

    if document_source.startswith("file://"):
        # Local-file mode, mainly useful for testing.
        local_file_path = document_source.replace("file://", "")
        try:
            with open(local_file_path, "rb") as f:
                file_content_bytes = f.read()
            logger.info(f"Loaded file: {local_file_path} ({len(file_content_bytes)} bytes)")
        except Exception as e:
            logger.error(f"File read error: {e}")
            return failure(f"File read error: {e}")
    else:
        try:
            response = requests.get(document_source, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
            if response.status_code != 200:
                return failure(f"HTTP {response.status_code}")
            file_content_bytes = response.content
        except Exception as e:
            return failure(f"HTTP error: {e}")

    if not file_content_bytes:
        return failure("Empty file")

    # Run the OCR pipeline.
    extracted_pages = extract_bill_data_from_pdf(file_content_bytes)

    total_items = sum(len(p.extracted_items) for p in extracted_pages)
    pages_output = [p.convert_to_output_dict() for p in extracted_pages]

    debug_info = {
        "total_pages": len(extracted_pages),
        "total_items": total_items,
        "pages_debug": [p.debug_info for p in extracted_pages],
    }

    return BillExtractionResponse(
        is_success=True,
        data={
            "pagewise_line_items": pages_output,
            "total_item_count": total_items,
        },
        token_usage={"total_tokens": 0},
        debug_info=debug_info,
    )
757
 
758
@app.get("/")
def health_check():
    """Liveness endpoint for the Space; identifies the debug build."""
    status_payload = {
        "status": "healthy",
        "mode": "DEBUG",
        "message": "Bill Extractor Debug Version - Check logs for detailed output"
    }
    return status_payload