Spaces:

Rogue2003
/

Receipt_Agent

Sleeping

App Files Files Community

Raghu committed on Dec 7, 2025

Commit

eb79113

1 Parent(s): 6fad358

Re-enable LayoutLMv3 field extractor with cached weights

Browse files

Files changed (1) hide show

app.py +141 -7

app.py CHANGED Viewed

@@ -15,7 +15,12 @@ import re
 from PIL import Image, ImageDraw
 from datetime import datetime
 from torchvision import transforms, models
-from transformers import ViTForImageClassification, ViTImageProcessor
 from sklearn.ensemble import IsolationForest
 import warnings
 warnings.filterwarnings('ignore')
@@ -421,6 +426,126 @@ class ReceiptOCR:
         return match.group() if match else None
 # ============================================================================
 # Anomaly Detection
 # ============================================================================
@@ -512,6 +637,13 @@ except Exception as e:
     print(f"Warning: Could not load OCR: {e}")
     receipt_ocr = None
 anomaly_detector = AnomalyDetector()
 print("\n" + "="*50)
@@ -608,19 +740,21 @@ def process_receipt(image):
     fields = {}
     fields_html = ""
     try:
-        if receipt_ocr and ocr_results:
             fields = receipt_ocr.postprocess_receipt(ocr_results)
         fields_html = "<div style='padding: 16px; background: #f8f9fa; border-radius: 12px;'><h4>Extracted Fields</h4>"
         for name, value in [('Vendor', fields.get('vendor')), ('Date', fields.get('date')),
-                           ('Total', f"${fields.get('total')}" if fields.get('total') else None),
                            ('Time', fields.get('time'))]:
-            display = value or '<span style="color: #adb5bd;">Not found</span>'
-            fields_html += f"<div style='padding: 8px; background: white; border-radius: 6px; margin: 4px 0;'><b>{name}:</b> {display}</div>"
-        fields_html += "</div>"
         results['fields'] = fields
     except Exception as e:
-        fields_html = f"<div style='color: red;'>Extraction error: {e}</div>"
     # 4. Anomaly Detection
     anomaly_html = ""

 from PIL import Image, ImageDraw
 from datetime import datetime
 from torchvision import transforms, models
+from transformers import (
+    ViTForImageClassification,
+    ViTImageProcessor,
+    LayoutLMv3ForTokenClassification,
+    LayoutLMv3Processor,
+)
 from sklearn.ensemble import IsolationForest
 import warnings
 warnings.filterwarnings('ignore')
         return match.group() if match else None
+# ============================================================================
+# LayoutLMv3 Field Extractor
+# ============================================================================
+class LayoutLMFieldExtractor:
+    """LayoutLMv3-based field extractor using fine-tuned weights if available."""
+    def __init__(self, model_path=None):
+        self.model_path = model_path or os.path.join(MODELS_DIR, 'layoutlm_extractor.pt')
+        self.id2label = {
+            0: 'O',
+            1: 'B-VENDOR', 2: 'I-VENDOR',
+            3: 'B-DATE', 4: 'I-DATE',
+            5: 'B-TOTAL', 6: 'I-TOTAL',
+            7: 'B-TIME', 8: 'I-TIME'
+        }
+        self.label2id = {v: k for k, v in self.id2label.items()}
+        self.processor = None
+        self.model = None
+    def load(self):
+        print("Loading LayoutLMv3 extractor...")
+        self.processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
+        self.model = LayoutLMv3ForTokenClassification.from_pretrained(
+            "microsoft/layoutlmv3-base",
+            num_labels=len(self.id2label),
+            id2label=self.id2label,
+            label2id=self.label2id,
+        )
+        if os.path.exists(self.model_path):
+            checkpoint = torch.load(self.model_path, map_location=DEVICE)
+            if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
+                checkpoint = checkpoint['model_state_dict']
+            if isinstance(checkpoint, dict):
+                missing, unexpected = self.model.load_state_dict(checkpoint, strict=False)
+                print(f"Loaded LayoutLM weights; missing={len(missing)}, unexpected={len(unexpected)}")
+        self.model = self.model.to(DEVICE)
+        self.model.eval()
+        print("LayoutLMv3 ready")
+        return self
+    def _prepare_boxes(self, ocr_results, image_size):
+        """Convert absolute pixel boxes to LayoutLM 0-1000 format."""
+        width, height = image_size
+        boxes = []
+        words = []
+        for r in ocr_results:
+            bbox = r.get("bbox", [0, 0, width, height])
+            x0, y0, x1, y1 = bbox
+            boxes.append([
+                int(1000 * x0 / width),
+                int(1000 * y0 / height),
+                int(1000 * x1 / width),
+                int(1000 * y1 / height),
+            ])
+            words.append(r.get("text", ""))
+        return words, boxes
+    def predict_fields(self, image, ocr_results=None):
+        if self.model is None:
+            self.load()
+        if not isinstance(image, Image.Image):
+            image = Image.fromarray(image)
+        image = image.convert("RGB")
+        if ocr_results:
+            words, boxes = self._prepare_boxes(ocr_results, image.size)
+            encoding = self.processor(
+                image,
+                words=words,
+                boxes=boxes,
+                return_tensors="pt",
+                truncation=True,
+                padding="max_length",
+                max_length=512,
+            )
+        else:
+            encoding = self.processor(image, return_tensors="pt")
+        encoding = {k: v.to(DEVICE) for k, v in encoding.items()}
+        with torch.no_grad():
+            outputs = self.model(**encoding)
+            logits = outputs.logits[0]
+            preds = logits.argmax(-1).cpu().tolist()
+            tokens = self.processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].cpu())
+        entities = {"VENDOR": [], "DATE": [], "TOTAL": [], "TIME": []}
+        current = {"label": None, "tokens": []}
+        for token, pred in zip(tokens, preds):
+            label = self.id2label.get(pred, "O")
+            if token in ["[PAD]", "[CLS]", "[SEP]"]:
+                continue
+            if label.startswith("B-"):
+                # flush previous
+                if current["label"] and current["tokens"]:
+                    entities[current["label"]].append(" ".join(current["tokens"]))
+                current = {"label": label[2:], "tokens": [token]}
+            elif label.startswith("I-") and current["label"] == label[2:]:
+                current["tokens"].append(token)
+            else:
+                if current["label"] and current["tokens"]:
+                    entities[current["label"]].append(" ".join(current["tokens"]))
+                current = {"label": None, "tokens": []}
+        if current["label"] and current["tokens"]:
+            entities[current["label"]].append(" ".join(current["tokens"]))
+        def pick_first(key):
+            vals = entities.get(key, [])
+            return vals[0].replace("▁", " ").strip() if vals else None
+        return {
+            "vendor": pick_first("VENDOR"),
+            "date": pick_first("DATE"),
+            "total": pick_first("TOTAL"),
+            "time": pick_first("TIME"),
+        }
 # ============================================================================
 # Anomaly Detection
 # ============================================================================
     print(f"Warning: Could not load OCR: {e}")
     receipt_ocr = None
+try:  # Build and load the LayoutLMv3 extractor once, alongside the other models.
+    layoutlm_extractor = LayoutLMFieldExtractor()
+    layoutlm_extractor.load()
+except Exception as e:  # Best-effort: any load failure only disables this extractor.
+    print(f"Warning: Could not load LayoutLMv3 extractor: {e}")
+    layoutlm_extractor = None  # field extraction then uses receipt_ocr when available
 anomaly_detector = AnomalyDetector()
 print("\n" + "="*50)
     fields = {}
     fields_html = ""
     try:
+        if layoutlm_extractor:
+            fields = layoutlm_extractor.predict_fields(image, ocr_results)
+        elif receipt_ocr and ocr_results:
             fields = receipt_ocr.postprocess_receipt(ocr_results)
         fields_html = "<div style='padding: 16px; background: #f8f9fa; border-radius: 12px;'><h4>Extracted Fields</h4>"
         for name, value in [('Vendor', fields.get('vendor')), ('Date', fields.get('date')),
+                           ('Total', f\"${fields.get('total')}\" if fields.get('total') else None),
                            ('Time', fields.get('time'))]:
+            display = value or '<span style=\"color: #adb5bd;\">Not found</span>'
+            fields_html += f\"<div style='padding: 8px; background: white; border-radius: 6px; margin: 4px 0;'><b>{name}:</b> {display}</div>\"
+        fields_html += \"</div>\"
         results['fields'] = fields
     except Exception as e:
+        fields_html = f\"<div style='color: red;'>Extraction error: {e}</div>\"
     # 4. Anomaly Detection
     anomaly_html = ""