Seth0330 committed on
Commit
88f23c6
·
verified ·
1 Parent(s): 73e2df7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -47
app.py CHANGED
@@ -107,17 +107,32 @@ def clean_json_response(text, model_choice):
107
 
108
  # Handle Mistral's markdown response
109
  if model_choice == "Mistral Small":
110
- # Remove all markdown formatting
111
- text = text.replace('```json', '').replace('```', '').strip()
112
-
113
- # Handle Llama's truncated response
 
 
 
 
 
 
 
114
  if model_choice == "Llama 4 Mavericks":
115
- if '"line_items":' in text and not text.strip().endswith('}}'):
116
- # Check if we have at least a complete header
117
- if '"invoice_header":' in text:
118
- # Return with empty line items
119
- text = text.split('"line_items":')[0] + '"line_items": []}'
120
-
 
 
 
 
 
 
 
 
121
  # Try parsing the cleaned JSON
122
  try:
123
  data = json.loads(text)
@@ -131,21 +146,68 @@ def clean_json_response(text, model_choice):
131
 
132
  return data
133
  except json.JSONDecodeError as e:
134
- # Try one more time with strict=False for Llama
135
- if model_choice == "Llama 4 Mavericks":
136
- try:
 
 
 
137
  # Find the last complete JSON object
138
  end_pos = text.rfind('}')
139
  if end_pos != -1:
140
- return json.loads(text[:end_pos+1])
141
- except:
142
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- st.warning(f"JSON parsing failed: {str(e)}")
145
  return None
146
 
147
  def get_extraction_prompt(model_choice, text):
148
  """Return the appropriate prompt based on model choice"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  if model_choice == "DeepSeek v3":
150
  return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
151
  {{
@@ -191,36 +253,7 @@ Invoice Text:
191
  """ + text
192
 
193
  else: # For Llama 4 and Mistral
194
- return f"""Extract complete invoice information and return a VALID JSON object with these fields:
195
- {{
196
- "invoice_header": {{
197
- "invoice_number": "string",
198
- "invoice_date": "YYYY-MM-DD",
199
- "po_number": "string or null",
200
- "invoice_value": "string with currency",
201
- "supplier_name": "string or null",
202
- "customer_name": "string or null"
203
- }},
204
- "line_items": [
205
- {{
206
- "item_number": "string or null",
207
- "description": "string",
208
- "quantity": "number",
209
- "unit_price": "string with currency",
210
- "total_price": "string with currency"
211
- }}
212
- ]
213
- }}
214
- Rules:
215
- 1. Return ONLY valid JSON (no additional text or markdown)
216
- 2. Use null for missing fields
217
- 3. Date format must be YYYY-MM-DD
218
- 4. All currency values must include currency symbol or code
219
- 5. Include all line items found in the invoice
220
- 6. For line items, quantity should be a number, prices as strings with currency
221
- 7. Do not include any explanations or notes
222
- Invoice Text:
223
- """ + text
224
 
225
  def format_currency(value):
226
  """Helper function to format currency values consistently"""
 
107
 
108
  # Handle Mistral's markdown response
109
  if model_choice == "Mistral Small":
110
+ try:
111
+ # Extract JSON from between ```json and ```
112
+ json_start = text.find('{')
113
+ json_end = text.rfind('}') + 1
114
+ if json_start != -1 and json_end != 0:
115
+ text = text[json_start:json_end]
116
+ except Exception as e:
117
+ st.warning(f"Error processing Mistral response: {str(e)}")
118
+ return None
119
+
120
+ # Handle Llama's response
121
  if model_choice == "Llama 4 Mavericks":
122
+ # Check if response is complete
123
+ if not text.strip().endswith('}}'):
124
+ # Try to complete the JSON structure
125
+ if '"line_items":' in text:
126
+ # Case 1: Line items started but not finished
127
+ if '"line_items": [' in text:
128
+ text = text.split('"line_items": [')[0] + '"line_items": []}'
129
+ # Case 2: Just the line_items key exists
130
+ else:
131
+ text = text.split('"line_items":')[0] + '"line_items": []}'
132
+ # Ensure proper closing
133
+ if not text.endswith('}'):
134
+ text += '}'
135
+
136
  # Try parsing the cleaned JSON
137
  try:
138
  data = json.loads(text)
 
146
 
147
  return data
148
  except json.JSONDecodeError as e:
149
+ st.warning(f"JSON parsing failed: {str(e)}")
150
+ st.warning(f"Attempting to repair JSON for {model_choice}...")
151
+
152
+ # Final attempt to repair
153
+ try:
154
+ if model_choice == "Llama 4 Mavericks":
155
  # Find the last complete JSON object
156
  end_pos = text.rfind('}')
157
  if end_pos != -1:
158
+ repaired = text[:end_pos+1]
159
+ data = json.loads(repaired)
160
+ if "line_items" not in data:
161
+ data["line_items"] = []
162
+ return data
163
+
164
+ elif model_choice == "Mistral Small":
165
+ # Remove all non-JSON content
166
+ json_start = text.find('{')
167
+ json_end = text.rfind('}') + 1
168
+ if json_start != -1 and json_end != 0:
169
+ repaired = text[json_start:json_end]
170
+ return json.loads(repaired)
171
+
172
+ except Exception as e:
173
+ st.error(f"Failed to repair JSON: {str(e)}")
174
+ return None
175
 
 
176
  return None
177
 
178
  def get_extraction_prompt(model_choice, text):
179
  """Return the appropriate prompt based on model choice"""
180
+ base_prompt = """Extract complete invoice information and return a VALID JSON object with these fields:
181
+ {
182
+ "invoice_header": {
183
+ "invoice_number": "string",
184
+ "invoice_date": "YYYY-MM-DD",
185
+ "po_number": "string or null",
186
+ "invoice_value": "string with currency",
187
+ "supplier_name": "string or null",
188
+ "customer_name": "string or null"
189
+ },
190
+ "line_items": [
191
+ {
192
+ "item_number": "string or null",
193
+ "description": "string",
194
+ "quantity": "number",
195
+ "unit_price": "string with currency",
196
+ "total_price": "string with currency"
197
+ }
198
+ ]
199
+ }
200
+ Rules:
201
+ 1. Return ONLY valid JSON (no additional text or markdown)
202
+ 2. Use null for missing fields
203
+ 3. Date format must be YYYY-MM-DD
204
+ 4. All currency values must include currency symbol or code
205
+ 5. Include all line items found in the invoice
206
+ 6. For line items, quantity should be a number, prices as strings with currency
207
+ 7. Do not include any explanations or notes
208
+ Invoice Text:
209
+ """ + text
210
+
211
  if model_choice == "DeepSeek v3":
212
  return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
213
  {{
 
253
  """ + text
254
 
255
  else: # For Llama 4 and Mistral
256
+ return base_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  def format_currency(value):
259
  """Helper function to format currency values consistently"""