Seth0330 committed on
Commit
b23347b
·
verified ·
1 Parent(s): 88f23c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -97
app.py CHANGED
@@ -100,114 +100,89 @@ def query_llm(model_choice, prompt):
100
  st.error(f"🌐 Connection Failed: {str(e)}")
101
  return None
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def clean_json_response(text, model_choice):
104
- """Improved JSON extraction with model-specific handling"""
105
  if not text:
106
  return None
107
 
108
- # Handle Mistral's markdown response
 
 
109
  if model_choice == "Mistral Small":
110
- try:
111
- # Extract JSON from between ```json and ```
112
- json_start = text.find('{')
113
- json_end = text.rfind('}') + 1
114
- if json_start != -1 and json_end != 0:
115
- text = text[json_start:json_end]
116
- except Exception as e:
117
- st.warning(f"Error processing Mistral response: {str(e)}")
118
- return None
 
 
 
119
 
120
- # Handle Llama's response
121
- if model_choice == "Llama 4 Mavericks":
122
- # Check if response is complete
123
- if not text.strip().endswith('}}'):
124
- # Try to complete the JSON structure
125
- if '"line_items":' in text:
126
- # Case 1: Line items started but not finished
127
- if '"line_items": [' in text:
128
- text = text.split('"line_items": [')[0] + '"line_items": []}'
129
- # Case 2: Just the line_items key exists
 
 
130
  else:
131
- text = text.split('"line_items":')[0] + '"line_items": []}'
132
- # Ensure proper closing
133
- if not text.endswith('}'):
134
- text += '}'
135
 
136
- # Try parsing the cleaned JSON
137
- try:
138
- data = json.loads(text)
139
-
140
- # Ensure proper structure exists
141
- if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
142
- if "invoice_header" not in data:
143
- data["invoice_header"] = {}
144
- if "line_items" not in data:
145
- data["line_items"] = []
146
-
147
- return data
148
- except json.JSONDecodeError as e:
149
- st.warning(f"JSON parsing failed: {str(e)}")
150
- st.warning(f"Attempting to repair JSON for {model_choice}...")
151
-
152
- # Final attempt to repair
153
  try:
154
- if model_choice == "Llama 4 Mavericks":
155
- # Find the last complete JSON object
156
- end_pos = text.rfind('}')
157
- if end_pos != -1:
158
- repaired = text[:end_pos+1]
159
- data = json.loads(repaired)
160
- if "line_items" not in data:
161
- data["line_items"] = []
162
- return data
163
-
164
- elif model_choice == "Mistral Small":
165
- # Remove all non-JSON content
166
- json_start = text.find('{')
167
- json_end = text.rfind('}') + 1
168
- if json_start != -1 and json_end != 0:
169
- repaired = text[json_start:json_end]
170
- return json.loads(repaired)
171
-
172
- except Exception as e:
173
- st.error(f"Failed to repair JSON: {str(e)}")
174
- return None
175
-
176
- return None
177
 
178
  def get_extraction_prompt(model_choice, text):
179
  """Return the appropriate prompt based on model choice"""
180
- base_prompt = """Extract complete invoice information and return a VALID JSON object with these fields:
181
- {
182
- "invoice_header": {
183
- "invoice_number": "string",
184
- "invoice_date": "YYYY-MM-DD",
185
- "po_number": "string or null",
186
- "invoice_value": "string with currency",
187
- "supplier_name": "string or null",
188
- "customer_name": "string or null"
189
- },
190
- "line_items": [
191
- {
192
- "item_number": "string or null",
193
- "description": "string",
194
- "quantity": "number",
195
- "unit_price": "string with currency",
196
- "total_price": "string with currency"
197
- }
198
- ]
199
- }
200
- Rules:
201
- 1. Return ONLY valid JSON (no additional text or markdown)
202
- 2. Use null for missing fields
203
- 3. Date format must be YYYY-MM-DD
204
- 4. All currency values must include currency symbol or code
205
- 5. Include all line items found in the invoice
206
- 6. For line items, quantity should be a number, prices as strings with currency
207
- 7. Do not include any explanations or notes
208
- Invoice Text:
209
- """ + text
210
-
211
  if model_choice == "DeepSeek v3":
212
  return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
213
  {{
@@ -253,7 +228,36 @@ Invoice Text:
253
  """ + text
254
 
255
  else: # For Llama 4 and Mistral
256
- return base_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  def format_currency(value):
259
  """Helper function to format currency values consistently"""
 
100
  st.error(f"🌐 Connection Failed: {str(e)}")
101
  return None
102
 
def find_json_end(text):
    """Return the index just past the first balanced top-level JSON object.

    Scans ``text`` for the first ``{`` and returns the position immediately
    after the ``}`` that closes it, so ``text[:result]`` ends with a complete
    ``{...}`` group. Returns ``-1`` when no balanced object is found (for
    example when the response was truncated mid-object).

    Braces that appear inside JSON string values are not structural and are
    ignored, including strings that contain escaped quotes. A stray ``}`` that
    appears before any ``{`` is ignored as well.
    """
    depth = 0          # Number of currently open '{' groups.
    in_string = False  # True while scanning inside a "..." JSON string.
    escaped = False    # True when the previous string character was a backslash.

    for index, char in enumerate(text):
        if in_string:
            if escaped:
                # This character is consumed by the preceding backslash.
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"' and depth:
            # Only track strings inside an object; quotes in free text that
            # precedes the JSON must not hide the opening brace.
            in_string = True
        elif char == '{':
            depth += 1
        elif char == '}' and depth:
            depth -= 1
            if depth == 0:
                return index + 1

    return -1
115
+
def clean_json_response(text, model_choice):
    """Extract and parse the JSON object embedded in an LLM response.

    Several extraction strategies are tried in order (fenced Markdown block,
    outermost ``{...}`` span, first complete object); the first candidate
    that parses as a JSON object wins. For "Llama 4 Mavericks" a last-resort
    repair keeps the header of a truncated response and substitutes an empty
    ``line_items`` list.

    Args:
        text: Raw response text returned by the model. May be empty or None.
        model_choice: Display name of the model that produced ``text``.

    Returns:
        The parsed dict, or None when ``text`` is empty or nothing could be
        parsed. For "Llama 4 Mavericks" and "Mistral Small" the result always
        has ``invoice_header`` and ``line_items`` keys.
    """
    if not text:
        return None

    original_text = text  # Shown verbatim to the user if every strategy fails.

    if model_choice == "Mistral Small":
        # Strip the Markdown code-fence markers around the answer.
        text = re.sub(r'^```json|```$', '', text, flags=re.MULTILINE).strip()

    needs_invoice_keys = model_choice in ("Llama 4 Mavericks", "Mistral Small")

    for candidate in _json_candidates(text):
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            continue
        if needs_invoice_keys:
            data.setdefault("invoice_header", {})
            data.setdefault("line_items", [])
        return data

    # Final fallback - manual reconstruction for Llama: keep everything before
    # the (possibly truncated) line items and close the object ourselves.
    if model_choice == "Llama 4 Mavericks" and '"invoice_header":' in text:
        head = text.split('"line_items":', 1)[0]
        # Drop the dangling separator so we do not emit a trailing comma.
        head = head.rstrip().rstrip(',').rstrip()
        try:
            data = json.loads(head + ', "line_items": []}')
        except json.JSONDecodeError:
            data = None
        if isinstance(data, dict):
            return data

    st.error(f"Failed to parse JSON after multiple attempts for {model_choice}")
    st.code(f"Original response:\n{original_text}")
    return None


def _json_candidates(text):
    """Yield substrings of ``text`` that may be a complete JSON object."""
    # 1. JSON wrapped in a Markdown code fence.
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        yield fenced.group(1)

    start = text.find('{')
    if start < 0:
        return

    # 2. Everything between the first '{' and the last '}'.
    end = text.rfind('}') + 1
    if end > start:
        yield text[start:end]

    # 3. The first complete object, ignoring whatever follows it. The JSON
    #    decoder understands strings, so braces inside values are harmless.
    remainder = text[start:]
    try:
        _, consumed = json.JSONDecoder().raw_decode(remainder)
    except json.JSONDecodeError:
        return
    yield remainder[:consumed]
 
 
 
 
 
 
 
 
 
 
183
 
184
  def get_extraction_prompt(model_choice, text):
185
  """Return the appropriate prompt based on model choice"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  if model_choice == "DeepSeek v3":
187
  return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
188
  {{
 
228
  """ + text
229
 
230
  else: # For Llama 4 and Mistral
231
+ return f"""Extract complete invoice information and return a VALID JSON object with these fields:
232
+ {{
233
+ "invoice_header": {{
234
+ "invoice_number": "string",
235
+ "invoice_date": "YYYY-MM-DD",
236
+ "po_number": "string or null",
237
+ "invoice_value": "string with currency",
238
+ "supplier_name": "string or null",
239
+ "customer_name": "string or null"
240
+ }},
241
+ "line_items": [
242
+ {{
243
+ "item_number": "string or null",
244
+ "description": "string",
245
+ "quantity": "number",
246
+ "unit_price": "string with currency",
247
+ "total_price": "string with currency"
248
+ }}
249
+ ]
250
+ }}
251
+ Rules:
252
+ 1. Return ONLY valid JSON (no additional text or markdown)
253
+ 2. Use null for missing fields
254
+ 3. Date format must be YYYY-MM-DD
255
+ 4. All currency values must include currency symbol or code
256
+ 5. Include all line items found in the invoice
257
+ 6. For line items, quantity should be a number, prices as strings with currency
258
+ 7. Do not include any explanations or notes
259
+ Invoice Text:
260
+ """ + text
261
 
262
  def format_currency(value):
263
  """Helper function to format currency values consistently"""