Spaces:

satvaSolutions
/

pdf-ocr

Sleeping

App Files Files Community

shubhjo committed on Jun 11, 2025

Commit

14f9439

verified ·

1 Parent(s): c423c13

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -68

app.py CHANGED Viewed

@@ -107,74 +107,80 @@ async def process_with_gemini(filename: str, raw_text: str):
         logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
     try:
-        prompt = f"""
-        You are an intelligent invoice data extractor. Given raw text from an invoice in any language and extract key business fields in the specified JSON format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g., USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers).
-        Raw text:
-        {raw_text}
-        Output JSON:
-        {{
-          "invoice": {{
-            "invoice_number": "",
-            "invoice_date": "YYYY-MM-DD" or null,
-            "due_date": "YYYY-MM-DD"or null,
-            "purchase_order_number": "",
-            "vendor": {{
-              "vendor_id": "",
-              "name": "",
-              "address": {{
-                "line1": "",
-                "line2": "",
-                "city": "",
-                "state": "",
-                "postal_code": "",
-                "country": ""
-              }},
-              "contact": {{
-                "email": "",
-                "phone": ""
-              }},
-              "tax_id": ""
-            }},
-            "buyer": {{
-              "buyer_id": "",
-              "name": "",
-              "address": {{
-                "line1": "",
-                "line2": "",
-                "city": "",
-                "state": "",
-                "postal_code": "",
-                "country": ""
-              }},
-              "contact": {{
-                "email": "",
-                "phone": ""
-              }},
-              "tax_id": ""
-            }},
-            "items": [
-              {{
-                "item_id": "",
-                "description": "",
-                "quantity": 0,
-                "unit_of_measure": "",
-                "unit_price": 0,
-                "total_price": 0,
-                "tax_rate": 0,
-                "tax_amount": 0,
-                "discount": 0,
-                "net_amount": 0
-              }}
-            ],
-            "sub_total": 0,
-            "tax_total": 0,
-            "discount_total": 0,
-            "total_amount": 0,
-            "currency": ""
-        }}
         """
         response = model.generate_content(prompt)
         llm_output = response.text

         logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
     try:
+        prompt = f"""You are an intelligent invoice data extractor. Given raw text from an invoice (in English or other languages),
+        extract key business fields into the specified JSON format. Return each field along with an estimated accuracy score between 0 and 1.
+- Accuracy reflects your confidence in the correctness of each field.
+- Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS').
+- Detect currency from symbols ($, ₹, €) or keywords (USD, INR, EUR); default to USD if unclear.
+- The 'items' list may have multiple entries, each with detailed attributes.
+- If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
+Raw text:
+{raw_text}
+Output JSON:
+{{
+  "invoice": {{
+    "invoice_number": {{"value": "", "accuracy": 0.0}},
+    "invoice_date": {{"value": "YYYY-MM-DD", "accuracy": 0.0}},
+    "due_date": {{"value": "YYYY-MM-DD", "accuracy": 0.0}},
+    "purchase_order_number": {{"value": "", "accuracy": 0.0}},
+    "vendor": {{
+      "vendor_id": {{"value": "", "accuracy": 0.0}},
+      "name": {{"value": "", "accuracy": 0.0}},
+      "address": {{
+        "line1": {{"value": "", "accuracy": 0.0}},
+        "line2": {{"value": "", "accuracy": 0.0}},
+        "city": {{"value": "", "accuracy": 0.0}},
+        "state": {{"value": "", "accuracy": 0.0}},
+        "postal_code": {{"value": "", "accuracy": 0.0}},
+        "country": {{"value": "", "accuracy": 0.0}}
+      }},
+      "contact": {{
+        "email": {{"value": "", "accuracy": 0.0}},
+        "phone": {{"value": "", "accuracy": 0.0}}
+      }},
+      "tax_id": {{"value": "", "accuracy": 0.0}}
+    }},
+    "buyer": {{
+      "buyer_id": {{"value": "", "accuracy": 0.0}},
+      "name": {{"value": "", "accuracy": 0.0}},
+      "address": {{
+        "line1": {{"value": "", "accuracy": 0.0}},
+        "line2": {{"value": "", "accuracy": 0.0}},
+        "city": {{"value": "", "accuracy": 0.0}},
+        "state": {{"value": "", "accuracy": 0.0}},
+        "postal_code": {{"value": "", "accuracy": 0.0}},
+        "country": {{"value": "", "accuracy": 0.0}}
+      }},
+      "contact": {{
+        "email": {{"value": "", "accuracy": 0.0}},
+        "phone": {{"value": "", "accuracy": 0.0}}
+      }},
+      "tax_id": {{"value": "", "accuracy": 0.0}}
+    }},
+    "items": [
+      {{
+        "item_id": {{"value": "", "accuracy": 0.0}},
+        "description": {{"value": "", "accuracy": 0.0}},
+        "quantity": {{"value": 0, "accuracy": 0.0}},
+        "unit_of_measure": {{"value": "", "accuracy": 0.0}},
+        "unit_price": {{"value": 0, "accuracy": 0.0}},
+        "total_price": {{"value": 0, "accuracy": 0.0}},
+        "tax_rate": {{"value": 0, "accuracy": 0.0}},
+        "tax_amount": {{"value": 0, "accuracy": 0.0}},
+        "discount": {{"value": 0, "accuracy": 0.0}},
+        "net_amount": {{"value": 0, "accuracy": 0.0}}
+      }}
+    ],
+    "sub_total": {{"value": 0, "accuracy": 0.0}},
+    "tax_total": {{"value": 0, "accuracy": 0.0}},
+    "discount_total": {{"value": 0, "accuracy": 0.0}},
+    "total_amount": {{"value": 0, "accuracy": 0.0}},
+    "currency": {{"value": "", "accuracy": 0.0}}
+  }}
+}}
         """
         response = model.generate_content(prompt)
         llm_output = response.text