shubhjo committed on
Commit
14f9439
·
verified ·
1 Parent(s): c423c13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -68
app.py CHANGED
@@ -107,74 +107,80 @@ async def process_with_gemini(filename: str, raw_text: str):
107
  logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
108
 
109
  try:
110
- prompt = f"""
111
- You are an intelligent invoice data extractor. Given raw text from an invoice in any language and extract key business fields in the specified JSON format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g., USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers).
112
-
113
- Raw text:
114
- {raw_text}
115
-
116
- Output JSON:
117
- {{
118
- "invoice": {{
119
- "invoice_number": "",
120
- "invoice_date": "YYYY-MM-DD" or null,
121
- "due_date": "YYYY-MM-DD"or null,
122
- "purchase_order_number": "",
123
- "vendor": {{
124
- "vendor_id": "",
125
- "name": "",
126
- "address": {{
127
- "line1": "",
128
- "line2": "",
129
- "city": "",
130
- "state": "",
131
- "postal_code": "",
132
- "country": ""
133
- }},
134
- "contact": {{
135
- "email": "",
136
- "phone": ""
137
- }},
138
- "tax_id": ""
139
- }},
140
- "buyer": {{
141
- "buyer_id": "",
142
- "name": "",
143
- "address": {{
144
- "line1": "",
145
- "line2": "",
146
- "city": "",
147
- "state": "",
148
- "postal_code": "",
149
- "country": ""
150
- }},
151
- "contact": {{
152
- "email": "",
153
- "phone": ""
154
- }},
155
- "tax_id": ""
156
- }},
157
- "items": [
158
- {{
159
- "item_id": "",
160
- "description": "",
161
- "quantity": 0,
162
- "unit_of_measure": "",
163
- "unit_price": 0,
164
- "total_price": 0,
165
- "tax_rate": 0,
166
- "tax_amount": 0,
167
- "discount": 0,
168
- "net_amount": 0
169
- }}
170
- ],
171
- "sub_total": 0,
172
- "tax_total": 0,
173
- "discount_total": 0,
174
- "total_amount": 0,
175
- "currency": ""
176
-
177
- }}
 
 
 
 
 
 
178
  """
179
  response = model.generate_content(prompt)
180
  llm_output = response.text
 
107
  logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
108
 
109
  try:
110
+ prompt = f"""You are an intelligent invoice data extractor. Given raw text from an invoice (in English or other languages),
111
+ extract key business fields into the specified JSON format. Return each field along with an estimated accuracy score between 0 and 1.
112
+
113
+ - Accuracy reflects your confidence in the correctness of each field.
114
+ - Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS').
115
+ - Detect currency from symbols ($, ₹, €) or keywords (USD, INR, EUR); default to USD if unclear.
116
+ - The 'items' list may have multiple entries, each with detailed attributes.
117
+ - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
118
+
119
+ Raw text:
120
+ {raw_text}
121
+
122
+ Output JSON:
123
+ {{
124
+ "invoice": {{
125
+ "invoice_number": {{"value": "", "accuracy": 0.0}},
126
+ "invoice_date": {{"value": "YYYY-MM-DD", "accuracy": 0.0}},
127
+ "due_date": {{"value": "YYYY-MM-DD", "accuracy": 0.0}},
128
+ "purchase_order_number": {{"value": "", "accuracy": 0.0}},
129
+ "vendor": {{
130
+ "vendor_id": {{"value": "", "accuracy": 0.0}},
131
+ "name": {{"value": "", "accuracy": 0.0}},
132
+ "address": {{
133
+ "line1": {{"value": "", "accuracy": 0.0}},
134
+ "line2": {{"value": "", "accuracy": 0.0}},
135
+ "city": {{"value": "", "accuracy": 0.0}},
136
+ "state": {{"value": "", "accuracy": 0.0}},
137
+ "postal_code": {{"value": "", "accuracy": 0.0}},
138
+ "country": {{"value": "", "accuracy": 0.0}}
139
+ }},
140
+ "contact": {{
141
+ "email": {{"value": "", "accuracy": 0.0}},
142
+ "phone": {{"value": "", "accuracy": 0.0}}
143
+ }},
144
+ "tax_id": {{"value": "", "accuracy": 0.0}}
145
+ }},
146
+ "buyer": {{
147
+ "buyer_id": {{"value": "", "accuracy": 0.0}},
148
+ "name": {{"value": "", "accuracy": 0.0}},
149
+ "address": {{
150
+ "line1": {{"value": "", "accuracy": 0.0}},
151
+ "line2": {{"value": "", "accuracy": 0.0}},
152
+ "city": {{"value": "", "accuracy": 0.0}},
153
+ "state": {{"value": "", "accuracy": 0.0}},
154
+ "postal_code": {{"value": "", "accuracy": 0.0}},
155
+ "country": {{"value": "", "accuracy": 0.0}}
156
+ }},
157
+ "contact": {{
158
+ "email": {{"value": "", "accuracy": 0.0}},
159
+ "phone": {{"value": "", "accuracy": 0.0}}
160
+ }},
161
+ "tax_id": {{"value": "", "accuracy": 0.0}}
162
+ }},
163
+ "items": [
164
+ {{
165
+ "item_id": {{"value": "", "accuracy": 0.0}},
166
+ "description": {{"value": "", "accuracy": 0.0}},
167
+ "quantity": {{"value": 0, "accuracy": 0.0}},
168
+ "unit_of_measure": {{"value": "", "accuracy": 0.0}},
169
+ "unit_price": {{"value": 0, "accuracy": 0.0}},
170
+ "total_price": {{"value": 0, "accuracy": 0.0}},
171
+ "tax_rate": {{"value": 0, "accuracy": 0.0}},
172
+ "tax_amount": {{"value": 0, "accuracy": 0.0}},
173
+ "discount": {{"value": 0, "accuracy": 0.0}},
174
+ "net_amount": {{"value": 0, "accuracy": 0.0}}
175
+ }}
176
+ ],
177
+ "sub_total": {{"value": 0, "accuracy": 0.0}},
178
+ "tax_total": {{"value": 0, "accuracy": 0.0}},
179
+ "discount_total": {{"value": 0, "accuracy": 0.0}},
180
+ "total_amount": {{"value": 0, "accuracy": 0.0}},
181
+ "currency": {{"value": "", "accuracy": 0.0}}
182
+ }}
183
+ }}
184
  """
185
  response = model.generate_content(prompt)
186
  llm_output = response.text