Seth0330 committed on
Commit
4a0b9bf
·
verified ·
1 Parent(s): 7154f00

Update backend/app/openrouter_client.py

Browse files
Files changed (1) hide show
  1. backend/app/openrouter_client.py +133 -6
backend/app/openrouter_client.py CHANGED
@@ -251,7 +251,7 @@ async def extract_fields_from_document(
251
  "X-Title": "Document Capture Demo",
252
  }
253
 
254
- async with httpx.AsyncClient(timeout=120) as client:
255
  resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
256
  resp.raise_for_status()
257
  data = resp.json()
@@ -262,8 +262,17 @@ async def extract_fields_from_document(
262
 
263
  content = data["choices"][0]["message"]["content"]
264
 
265
- # Log the raw response for debugging (first 500 chars)
266
- print(f"[DEBUG] OpenRouter response preview: {str(content)[:500]}")
 
 
 
 
 
 
 
 
 
267
 
268
  # content may be a string or a list of content blocks
269
  if isinstance(content, list):
@@ -283,6 +292,7 @@ async def extract_fields_from_document(
283
  return parsed
284
  except json.JSONDecodeError as e:
285
  print(f"[DEBUG] Direct JSON parse failed: {e}")
 
286
  # Try to extract JSON from markdown code blocks
287
  json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
288
  if json_match:
@@ -296,21 +306,138 @@ async def extract_fields_from_document(
296
  # Try to find JSON object in the text (look for {...})
297
  json_match = re.search(r'\{.*\}', text, re.DOTALL)
298
  if json_match:
 
299
  try:
300
- parsed = json.loads(json_match.group(0))
301
  print(f"[DEBUG] Successfully parsed JSON from regex match")
302
  return parsed
303
  except json.JSONDecodeError as e3:
304
  print(f"[DEBUG] Regex match parse failed: {e3}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  # If all parsing fails, return a default structure with the raw text
307
  print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  return {
309
  "doc_type": "other",
310
  "confidence": 50.0,
311
  "fields": {
312
- "raw_response": text[:1000], # First 1000 chars for debugging
313
- "error": "Could not parse JSON from model response",
314
  "note": "Check server logs for full response"
315
  }
316
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  "X-Title": "Document Capture Demo",
252
  }
253
 
254
+ async with httpx.AsyncClient(timeout=180) as client: # Increased timeout for long responses
255
  resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
256
  resp.raise_for_status()
257
  data = resp.json()
 
262
 
263
  content = data["choices"][0]["message"]["content"]
264
 
265
+ # Check if response was truncated
266
+ finish_reason = data["choices"][0].get("finish_reason", "")
267
+ if finish_reason == "length":
268
+ print(f"[WARNING] Response was truncated due to token limit (finish_reason: {finish_reason})")
269
+
270
+ # Log the raw response for debugging (first 1000 chars and last 500 chars)
271
+ content_str = str(content)
272
+ print(f"[DEBUG] OpenRouter response preview (first 1000 chars): {content_str[:1000]}")
273
+ if len(content_str) > 1000:
274
+ print(f"[DEBUG] OpenRouter response preview (last 500 chars): {content_str[-500:]}")
275
+ print(f"[DEBUG] Total response length: {len(content_str)} characters")
276
 
277
  # content may be a string or a list of content blocks
278
  if isinstance(content, list):
 
292
  return parsed
293
  except json.JSONDecodeError as e:
294
  print(f"[DEBUG] Direct JSON parse failed: {e}")
295
+
296
  # Try to extract JSON from markdown code blocks
297
  json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
298
  if json_match:
 
306
  # Try to find JSON object in the text (look for {...})
307
  json_match = re.search(r'\{.*\}', text, re.DOTALL)
308
  if json_match:
309
+ json_str = json_match.group(0)
310
  try:
311
+ parsed = json.loads(json_str)
312
  print(f"[DEBUG] Successfully parsed JSON from regex match")
313
  return parsed
314
  except json.JSONDecodeError as e3:
315
  print(f"[DEBUG] Regex match parse failed: {e3}")
316
+ # Try to fix truncated JSON by closing unclosed strings/objects
317
+ try:
318
+ fixed_json = _fix_truncated_json(json_str)
319
+ parsed = json.loads(fixed_json)
320
+ print(f"[DEBUG] Successfully parsed fixed truncated JSON")
321
+ return parsed
322
+ except Exception as e4:
323
+ print(f"[DEBUG] Failed to fix truncated JSON: {e4}")
324
+
325
+ # Last resort: try to extract what we can from the partial JSON
326
+ try:
327
+ partial_data = _extract_partial_json(text)
328
+ if partial_data:
329
+ print(f"[DEBUG] Extracted partial data from truncated JSON")
330
+ return partial_data
331
+ except Exception as e5:
332
+ print(f"[DEBUG] Failed to extract partial JSON: {e5}")
333
 
334
  # If all parsing fails, return a default structure with the raw text
335
  print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
336
+ # Try to extract at least the full_text if it's visible (even if truncated)
337
+ # Look for "full_text": "..." pattern, handling escaped characters and truncation
338
+ full_text_match = re.search(r'"full_text"\s*:\s*"(.*?)(?:"\s*[,}]|$)', text, re.DOTALL)
339
+ if full_text_match:
340
+ try:
341
+ # Get the matched text (may be truncated)
342
+ full_text_raw = full_text_match.group(1)
343
+ # Unescape common sequences
344
+ full_text = (full_text_raw
345
+ .replace('\\n', '\n')
346
+ .replace('\\"', '"')
347
+ .replace('\\\\', '\\')
348
+ .replace('\\t', '\t')
349
+ .replace('\\r', '\r'))
350
+
351
+ # Try to extract other fields too
352
+ doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
353
+ confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
354
+
355
+ result = {
356
+ "doc_type": doc_type_match.group(1) if doc_type_match else "other",
357
+ "confidence": float(confidence_match.group(1)) if confidence_match else 90.0,
358
+ "full_text": full_text,
359
+ "fields": {
360
+ "full_text": full_text,
361
+ "note": "Response may have been truncated, but full_text was extracted"
362
+ }
363
+ }
364
+ print(f"[INFO] Extracted full_text ({len(full_text)} chars) from truncated JSON")
365
+ return result
366
+ except Exception as e:
367
+ print(f"[DEBUG] Failed to extract full_text from truncated JSON: {e}")
368
+ pass
369
+
370
  return {
371
  "doc_type": "other",
372
  "confidence": 50.0,
373
  "fields": {
374
+ "raw_response": text[:2000], # First 2000 chars for debugging
375
+ "error": "Could not parse JSON from model response (may be truncated)",
376
  "note": "Check server logs for full response"
377
  }
378
  }
379
+
380
+
381
def _fix_truncated_json(json_str: str) -> str:
    """Best-effort repair of a JSON document that was cut off mid-stream.

    The input is scanned once while tracking string/escape state and a stack
    of the containers that are currently open.  The repair then:

    * drops a dangling backslash if the cut happened in the middle of an
      escape sequence, and closes the unterminated string;
    * removes a trailing structural comma, or supplies ``null`` after a
      trailing structural colon;
    * appends the missing ``]`` / ``}`` closers in correct nesting order.

    Args:
        json_str: Possibly truncated JSON text.

    Returns:
        The repaired text.  Input that is already balanced is returned
        unchanged.  The result is not guaranteed to be valid JSON (for
        example a truncated literal such as ``tru`` or a key without a value
        is not repaired); callers should still guard ``json.loads``.
    """
    # Stack of closing characters for every container that is still open.
    # A stack (rather than two counters) preserves nesting order, so that
    # '{"a": [{"b": 1' is closed with '}]}' and not ']}}'.
    closers = []
    in_string = False
    escape_next = False

    for char in json_str:
        if in_string:
            if escape_next:
                escape_next = False
            elif char == '\\':
                escape_next = True
            elif char == '"':
                in_string = False
            # Braces/brackets inside a string are data, not structure.
            continue

        if char == '"':
            in_string = True
        elif char == '{':
            closers.append('}')
        elif char == '[':
            closers.append(']')
        elif char in '}]':
            # Ignore stray or mismatched closers instead of corrupting the stack.
            if closers and closers[-1] == char:
                closers.pop()

    if in_string:
        if escape_next:
            # Cut right after a backslash: appending '"' would produce an
            # escaped quote and leave the string open, so drop the backslash.
            json_str = json_str[:-1]
        json_str = json_str.rstrip() + '"'
    else:
        stripped = json_str.rstrip()
        if stripped.endswith(','):
            # '[1, 2,' -> '[1, 2'
            json_str = stripped[:-1]
        elif stripped.endswith(':'):
            # '{"a":' -> '{"a": null'
            json_str = stripped + ' null'

    # Close the innermost container first.
    return json_str + ''.join(reversed(closers))
411
+
412
+
413
def _extract_partial_json(text: str) -> Dict[str, Any]:
    """Salvage known fields from a partial (possibly truncated) JSON response.

    Uses regular expressions instead of a JSON parser so that it still works
    when the document is cut off.  Looks for ``doc_type``, ``confidence`` and
    ``full_text``.

    Args:
        text: Raw model response text.

    Returns:
        A dict with ``doc_type`` (default ``"other"``), ``confidence``
        (default ``0.0``) and ``fields``.  When a ``full_text`` value is
        found, its unescaped content is stored both at the top level and
        under ``fields["full_text"]``.  A dict is always returned, even when
        nothing matched.
    """
    result: Dict[str, Any] = {
        "doc_type": "other",
        "confidence": 0.0,
        "fields": {},
    }

    # Try to extract doc_type
    doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
    if doc_type_match:
        result["doc_type"] = doc_type_match.group(1)

    # Try to extract confidence
    confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
    if confidence_match:
        result["confidence"] = float(confidence_match.group(1))

    # Try to extract full_text (even if truncated).  The character class
    # excludes backslashes so that every escape sequence is consumed as a
    # unit; otherwise the capture would stop at the first escaped quote.
    # A lone trailing backslash (cut mid-escape) is left out of the capture.
    full_text_match = re.search(
        r'"full_text"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)', text, re.DOTALL
    )
    if full_text_match:
        raw = full_text_match.group(1)
        try:
            # Let the JSON decoder handle every escape (including \uXXXX) in
            # one pass; strict=False tolerates raw newlines/tabs in the value.
            full_text = json.loads('"' + raw + '"', strict=False)
        except ValueError:
            # E.g. an incomplete \uXXXX at the truncation point.  Fall back to
            # a single-pass substitution so that an escaped backslash followed
            # by 'n' is not misread as a newline; unknown escapes are kept.
            simple_escapes = {
                'n': '\n', 't': '\t', 'r': '\r', 'b': '\b', 'f': '\f',
                '"': '"', '\\': '\\', '/': '/',
            }
            full_text = re.sub(
                r'\\(.)',
                lambda m: simple_escapes.get(m.group(1), m.group(0)),
                raw,
                flags=re.DOTALL,
            )
        result["full_text"] = full_text
        result["fields"]["full_text"] = full_text

    return result