Spaces:
Sleeping
Sleeping
Update backend/app/openrouter_client.py
Browse files- backend/app/openrouter_client.py +133 -6
backend/app/openrouter_client.py
CHANGED
|
@@ -251,7 +251,7 @@ async def extract_fields_from_document(
|
|
| 251 |
"X-Title": "Document Capture Demo",
|
| 252 |
}
|
| 253 |
|
| 254 |
-
async with httpx.AsyncClient(timeout=
|
| 255 |
resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
|
| 256 |
resp.raise_for_status()
|
| 257 |
data = resp.json()
|
|
@@ -262,8 +262,17 @@ async def extract_fields_from_document(
|
|
| 262 |
|
| 263 |
content = data["choices"][0]["message"]["content"]
|
| 264 |
|
| 265 |
-
#
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
# content may be a string or a list of content blocks
|
| 269 |
if isinstance(content, list):
|
|
@@ -283,6 +292,7 @@ async def extract_fields_from_document(
|
|
| 283 |
return parsed
|
| 284 |
except json.JSONDecodeError as e:
|
| 285 |
print(f"[DEBUG] Direct JSON parse failed: {e}")
|
|
|
|
| 286 |
# Try to extract JSON from markdown code blocks
|
| 287 |
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
|
| 288 |
if json_match:
|
|
@@ -296,21 +306,138 @@ async def extract_fields_from_document(
|
|
| 296 |
# Try to find JSON object in the text (look for {...})
|
| 297 |
json_match = re.search(r'\{.*\}', text, re.DOTALL)
|
| 298 |
if json_match:
|
|
|
|
| 299 |
try:
|
| 300 |
-
parsed = json.loads(
|
| 301 |
print(f"[DEBUG] Successfully parsed JSON from regex match")
|
| 302 |
return parsed
|
| 303 |
except json.JSONDecodeError as e3:
|
| 304 |
print(f"[DEBUG] Regex match parse failed: {e3}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
# If all parsing fails, return a default structure with the raw text
|
| 307 |
print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
return {
|
| 309 |
"doc_type": "other",
|
| 310 |
"confidence": 50.0,
|
| 311 |
"fields": {
|
| 312 |
-
"raw_response": text[:
|
| 313 |
-
"error": "Could not parse JSON from model response",
|
| 314 |
"note": "Check server logs for full response"
|
| 315 |
}
|
| 316 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
"X-Title": "Document Capture Demo",
|
| 252 |
}
|
| 253 |
|
| 254 |
+
async with httpx.AsyncClient(timeout=180) as client: # Increased timeout for long responses
|
| 255 |
resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
|
| 256 |
resp.raise_for_status()
|
| 257 |
data = resp.json()
|
|
|
|
| 262 |
|
| 263 |
content = data["choices"][0]["message"]["content"]
|
| 264 |
|
| 265 |
+
# Check if response was truncated
|
| 266 |
+
finish_reason = data["choices"][0].get("finish_reason", "")
|
| 267 |
+
if finish_reason == "length":
|
| 268 |
+
print(f"[WARNING] Response was truncated due to token limit (finish_reason: {finish_reason})")
|
| 269 |
+
|
| 270 |
+
# Log the raw response for debugging (first 1000 chars and last 500 chars)
|
| 271 |
+
content_str = str(content)
|
| 272 |
+
print(f"[DEBUG] OpenRouter response preview (first 1000 chars): {content_str[:1000]}")
|
| 273 |
+
if len(content_str) > 1000:
|
| 274 |
+
print(f"[DEBUG] OpenRouter response preview (last 500 chars): {content_str[-500:]}")
|
| 275 |
+
print(f"[DEBUG] Total response length: {len(content_str)} characters")
|
| 276 |
|
| 277 |
# content may be a string or a list of content blocks
|
| 278 |
if isinstance(content, list):
|
|
|
|
| 292 |
return parsed
|
| 293 |
except json.JSONDecodeError as e:
|
| 294 |
print(f"[DEBUG] Direct JSON parse failed: {e}")
|
| 295 |
+
|
| 296 |
# Try to extract JSON from markdown code blocks
|
| 297 |
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
|
| 298 |
if json_match:
|
|
|
|
| 306 |
# Try to find JSON object in the text (look for {...})
|
| 307 |
json_match = re.search(r'\{.*\}', text, re.DOTALL)
|
| 308 |
if json_match:
|
| 309 |
+
json_str = json_match.group(0)
|
| 310 |
try:
|
| 311 |
+
parsed = json.loads(json_str)
|
| 312 |
print(f"[DEBUG] Successfully parsed JSON from regex match")
|
| 313 |
return parsed
|
| 314 |
except json.JSONDecodeError as e3:
|
| 315 |
print(f"[DEBUG] Regex match parse failed: {e3}")
|
| 316 |
+
# Try to fix truncated JSON by closing unclosed strings/objects
|
| 317 |
+
try:
|
| 318 |
+
fixed_json = _fix_truncated_json(json_str)
|
| 319 |
+
parsed = json.loads(fixed_json)
|
| 320 |
+
print(f"[DEBUG] Successfully parsed fixed truncated JSON")
|
| 321 |
+
return parsed
|
| 322 |
+
except Exception as e4:
|
| 323 |
+
print(f"[DEBUG] Failed to fix truncated JSON: {e4}")
|
| 324 |
+
|
| 325 |
+
# Last resort: try to extract what we can from the partial JSON
|
| 326 |
+
try:
|
| 327 |
+
partial_data = _extract_partial_json(text)
|
| 328 |
+
if partial_data:
|
| 329 |
+
print(f"[DEBUG] Extracted partial data from truncated JSON")
|
| 330 |
+
return partial_data
|
| 331 |
+
except Exception as e5:
|
| 332 |
+
print(f"[DEBUG] Failed to extract partial JSON: {e5}")
|
| 333 |
|
| 334 |
# If all parsing fails, return a default structure with the raw text
|
| 335 |
print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
|
| 336 |
+
# Try to extract at least the full_text if it's visible (even if truncated)
|
| 337 |
+
# Look for "full_text": "..." pattern, handling escaped characters and truncation
|
| 338 |
+
full_text_match = re.search(r'"full_text"\s*:\s*"(.*?)(?:"\s*[,}]|$)', text, re.DOTALL)
|
| 339 |
+
if full_text_match:
|
| 340 |
+
try:
|
| 341 |
+
# Get the matched text (may be truncated)
|
| 342 |
+
full_text_raw = full_text_match.group(1)
|
| 343 |
+
# Unescape common sequences
|
| 344 |
+
full_text = (full_text_raw
|
| 345 |
+
.replace('\\n', '\n')
|
| 346 |
+
.replace('\\"', '"')
|
| 347 |
+
.replace('\\\\', '\\')
|
| 348 |
+
.replace('\\t', '\t')
|
| 349 |
+
.replace('\\r', '\r'))
|
| 350 |
+
|
| 351 |
+
# Try to extract other fields too
|
| 352 |
+
doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
|
| 353 |
+
confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
|
| 354 |
+
|
| 355 |
+
result = {
|
| 356 |
+
"doc_type": doc_type_match.group(1) if doc_type_match else "other",
|
| 357 |
+
"confidence": float(confidence_match.group(1)) if confidence_match else 90.0,
|
| 358 |
+
"full_text": full_text,
|
| 359 |
+
"fields": {
|
| 360 |
+
"full_text": full_text,
|
| 361 |
+
"note": "Response may have been truncated, but full_text was extracted"
|
| 362 |
+
}
|
| 363 |
+
}
|
| 364 |
+
print(f"[INFO] Extracted full_text ({len(full_text)} chars) from truncated JSON")
|
| 365 |
+
return result
|
| 366 |
+
except Exception as e:
|
| 367 |
+
print(f"[DEBUG] Failed to extract full_text from truncated JSON: {e}")
|
| 368 |
+
pass
|
| 369 |
+
|
| 370 |
return {
|
| 371 |
"doc_type": "other",
|
| 372 |
"confidence": 50.0,
|
| 373 |
"fields": {
|
| 374 |
+
"raw_response": text[:2000], # First 2000 chars for debugging
|
| 375 |
+
"error": "Could not parse JSON from model response (may be truncated)",
|
| 376 |
"note": "Check server logs for full response"
|
| 377 |
}
|
| 378 |
}
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def _fix_truncated_json(json_str: str) -> str:
|
| 382 |
+
"""Attempt to fix truncated JSON by closing unclosed strings and objects."""
|
| 383 |
+
# Count open braces
|
| 384 |
+
open_braces = json_str.count('{') - json_str.count('}')
|
| 385 |
+
open_brackets = json_str.count('[') - json_str.count(']')
|
| 386 |
+
|
| 387 |
+
# Check if we're in the middle of a string
|
| 388 |
+
in_string = False
|
| 389 |
+
escape_next = False
|
| 390 |
+
for i, char in enumerate(json_str):
|
| 391 |
+
if escape_next:
|
| 392 |
+
escape_next = False
|
| 393 |
+
continue
|
| 394 |
+
if char == '\\':
|
| 395 |
+
escape_next = True
|
| 396 |
+
continue
|
| 397 |
+
if char == '"':
|
| 398 |
+
in_string = not in_string
|
| 399 |
+
|
| 400 |
+
# If we're in a string, close it
|
| 401 |
+
if in_string:
|
| 402 |
+
json_str = json_str.rstrip() + '"'
|
| 403 |
+
|
| 404 |
+
# Close any open brackets
|
| 405 |
+
json_str += ']' * open_brackets
|
| 406 |
+
|
| 407 |
+
# Close any open braces
|
| 408 |
+
json_str += '}' * open_braces
|
| 409 |
+
|
| 410 |
+
return json_str
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
def _extract_partial_json(text: str) -> Dict[str, Any]:
|
| 414 |
+
"""Extract what we can from a partial JSON response."""
|
| 415 |
+
result = {
|
| 416 |
+
"doc_type": "other",
|
| 417 |
+
"confidence": 0.0,
|
| 418 |
+
"fields": {}
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
# Try to extract doc_type
|
| 422 |
+
doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
|
| 423 |
+
if doc_type_match:
|
| 424 |
+
result["doc_type"] = doc_type_match.group(1)
|
| 425 |
+
|
| 426 |
+
# Try to extract confidence
|
| 427 |
+
confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
|
| 428 |
+
if confidence_match:
|
| 429 |
+
result["confidence"] = float(confidence_match.group(1))
|
| 430 |
+
|
| 431 |
+
# Try to extract full_text (even if truncated)
|
| 432 |
+
full_text_match = re.search(r'"full_text"\s*:\s*"([^"]*(?:\\.[^"]*)*)', text, re.DOTALL)
|
| 433 |
+
if full_text_match:
|
| 434 |
+
try:
|
| 435 |
+
full_text = full_text_match.group(1)
|
| 436 |
+
# Unescape common sequences
|
| 437 |
+
full_text = full_text.replace('\\n', '\n').replace('\\"', '"').replace('\\\\', '\\')
|
| 438 |
+
result["full_text"] = full_text
|
| 439 |
+
result["fields"]["full_text"] = full_text
|
| 440 |
+
except Exception:
|
| 441 |
+
pass
|
| 442 |
+
|
| 443 |
+
return result
|