Seth0330 committed on
Commit
dd68ed4
·
verified ·
1 Parent(s): e31081a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -76
app.py CHANGED
@@ -100,89 +100,40 @@ def query_llm(model_choice, prompt):
100
  st.error(f"🌐 Connection Failed: {str(e)}")
101
  return None
102
 
103
- def clean_json_response(text):
104
- """Improved JSON extraction with comprehensive error handling"""
105
  if not text:
106
  return None
107
 
108
- # First attempt to parse directly
109
- try:
110
- data = json.loads(text)
111
- return data
112
- except json.JSONDecodeError:
113
- pass
114
 
115
- # Try to extract JSON from potential markdown
116
- json_match = re.search(r'```(?:json)?\n({.*?})\n```', text, re.DOTALL)
117
- if json_match:
118
- try:
119
- return json.loads(json_match.group(1))
120
- except json.JSONDecodeError:
121
- pass
122
-
123
- # Try to find any JSON-like structure
124
- try:
125
- start_idx = text.find('{')
126
- end_idx = text.rfind('}') + 1
127
- if start_idx != -1 and end_idx != 0:
128
- return json.loads(text[start_idx:end_idx])
129
- except:
130
- pass
131
 
132
- # Final fallback - manual reconstruction
133
  try:
134
- if '"invoice_header":' in text and '"line_items":' in text:
135
- header_part = text.split('"line_items":')[0]
136
- line_items_part = text.split('"line_items":')[1]
137
-
138
- # Ensure proper closing of JSON
139
- if not header_part.strip().endswith('{'):
140
- header_part += '{'
141
-
142
- if not line_items_part.strip().endswith('}}'):
143
- line_items_part = line_items_part.split('}')[0] + ']}}'
144
-
145
- reconstructed = header_part + '"line_items":' + line_items_part
146
- return json.loads(reconstructed)
147
- except Exception as e:
148
- st.warning(f"Could not fully reconstruct JSON: {str(e)}")
149
  return None
150
 
151
- return None
152
-
153
  def get_extraction_prompt(model_choice, text):
154
  """Return the appropriate prompt based on model choice"""
155
- base_prompt = """Extract complete invoice information and return a VALID JSON object with these fields:
156
- {
157
- "invoice_header": {
158
- "invoice_number": "string",
159
- "invoice_date": "YYYY-MM-DD",
160
- "po_number": "string or null",
161
- "invoice_value": "string with currency",
162
- "supplier_name": "string or null",
163
- "customer_name": "string or null"
164
- },
165
- "line_items": [
166
- {
167
- "item_number": "string or null",
168
- "description": "string",
169
- "quantity": "number",
170
- "unit_price": "string with currency",
171
- "total_price": "string with currency"
172
- }
173
- ]
174
- }
175
- Rules:
176
- 1. Return ONLY valid JSON (no additional text or markdown)
177
- 2. Use null for missing fields
178
- 3. Date format must be YYYY-MM-DD
179
- 4. All currency values must include currency symbol or code
180
- 5. Include all line items found in the invoice
181
- 6. For line items, quantity should be a number, prices as strings with currency
182
- 7. Do not include any explanations or notes
183
- Invoice Text:
184
- """ + text
185
-
186
  if model_choice == "DeepSeek v3":
187
  return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
188
  {{
@@ -228,7 +179,36 @@ Invoice Text:
228
  """ + text
229
 
230
  else: # For Llama 4 and Mistral
231
- return base_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
  def format_currency(value):
234
  """Helper function to format currency values consistently"""
@@ -241,7 +221,7 @@ def format_currency(value):
241
  def display_line_items(line_items, model_choice="DeepSeek v3"):
242
  """Display line items in a formatted table"""
243
  if not line_items:
244
- st.info("No line items found in this invoice. This may be due to incomplete data from the API.")
245
  return
246
 
247
  st.subheader("📋 Line Items")
@@ -331,7 +311,7 @@ def extract_invoice_info(model_choice, text):
331
  if not result:
332
  return None
333
 
334
- parsed_data = clean_json_response(result)
335
  if not parsed_data:
336
  st.error("Failed to parse JSON. Raw response:")
337
  st.code(result)
 
100
  st.error(f"🌐 Connection Failed: {str(e)}")
101
  return None
102
 
103
+ def clean_json_response(text, model_choice):
104
+ """Improved JSON extraction with model-specific handling"""
105
  if not text:
106
  return None
107
 
108
+ # Handle Mistral's markdown response
109
+ if model_choice == "Mistral Small":
110
+ json_match = re.search(r'```(?:json)?\n({.*?})\n```', text, re.DOTALL)
111
+ if json_match:
112
+ text = json_match.group(1)
 
113
 
114
+ # Handle Llama's truncated response
115
+ if model_choice == "Llama 4 Mavericks":
116
+ if '"line_items":' in text and not text.strip().endswith('}}'):
117
+ text = text.split('"line_items":')[0] + '"line_items": []}}'
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ # Try parsing the cleaned JSON
120
  try:
121
+ data = json.loads(text)
122
+
123
+ # Ensure proper structure exists
124
+ if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
125
+ if "invoice_header" not in data:
126
+ data["invoice_header"] = {}
127
+ if "line_items" not in data:
128
+ data["line_items"] = []
129
+
130
+ return data
131
+ except json.JSONDecodeError as e:
132
+ st.warning(f"JSON parsing failed: {str(e)}")
 
 
 
133
  return None
134
 
 
 
135
  def get_extraction_prompt(model_choice, text):
136
  """Return the appropriate prompt based on model choice"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  if model_choice == "DeepSeek v3":
138
  return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
139
  {{
 
179
  """ + text
180
 
181
  else: # For Llama 4 and Mistral
182
+ return f"""Extract complete invoice information and return a VALID JSON object with these fields:
183
+ {{
184
+ "invoice_header": {{
185
+ "invoice_number": "string",
186
+ "invoice_date": "YYYY-MM-DD",
187
+ "po_number": "string or null",
188
+ "invoice_value": "string with currency",
189
+ "supplier_name": "string or null",
190
+ "customer_name": "string or null"
191
+ }},
192
+ "line_items": [
193
+ {{
194
+ "item_number": "string or null",
195
+ "description": "string",
196
+ "quantity": "number",
197
+ "unit_price": "string with currency",
198
+ "total_price": "string with currency"
199
+ }}
200
+ ]
201
+ }}
202
+ Rules:
203
+ 1. Return ONLY valid JSON (no additional text or markdown)
204
+ 2. Use null for missing fields
205
+ 3. Date format must be YYYY-MM-DD
206
+ 4. All currency values must include currency symbol or code
207
+ 5. Include all line items found in the invoice
208
+ 6. For line items, quantity should be a number, prices as strings with currency
209
+ 7. Do not include any explanations or notes
210
+ Invoice Text:
211
+ """ + text
212
 
213
  def format_currency(value):
214
  """Helper function to format currency values consistently"""
 
221
  def display_line_items(line_items, model_choice="DeepSeek v3"):
222
  """Display line items in a formatted table"""
223
  if not line_items:
224
+ st.info("No line items found in this invoice.")
225
  return
226
 
227
  st.subheader("📋 Line Items")
 
311
  if not result:
312
  return None
313
 
314
+ parsed_data = clean_json_response(result, model_choice)
315
  if not parsed_data:
316
  st.error("Failed to parse JSON. Raw response:")
317
  st.code(result)