Spaces:
Sleeping
Sleeping
Update backend/app/openrouter_client.py
Browse files
- backend/app/openrouter_client.py +0 -260
backend/app/openrouter_client.py
CHANGED
|
@@ -432,266 +432,6 @@ def _parse_model_response(text: str, page_num: int = None) -> Dict[str, Any]:
|
|
| 432 |
"fields": {"raw_text": text[:2000]}
|
| 433 |
}
|
| 434 |
|
| 435 |
-
system_prompt = (
|
| 436 |
-
"You are a document extraction engine with vision capabilities. "
|
| 437 |
-
"You read and extract text from documents in any language, preserving structure, formatting, and all content. "
|
| 438 |
-
"You output structured JSON with both the full extracted text and key-value pairs."
|
| 439 |
-
)
|
| 440 |
-
|
| 441 |
-
# Update prompt for multi-page documents - ask for full text extraction first
|
| 442 |
-
if len(image_blocks) > 1:
|
| 443 |
-
user_prompt = (
|
| 444 |
-
f"Read this {len(image_blocks)}-page document using your vision capability and extract ALL text content. "
|
| 445 |
-
"I want the complete end-to-end text from all pages, preserving structure, headings, formatting, and content in all languages.\n\n"
|
| 446 |
-
"Analyze ALL pages thoroughly, including any non-English text (Punjabi, Hindi, or other languages). "
|
| 447 |
-
"Extract every word, number, and piece of information from every page.\n\n"
|
| 448 |
-
"Respond with JSON in this format:\n"
|
| 449 |
-
"{\n"
|
| 450 |
-
' \"doc_type\": \"invoice | receipt | contract | report | notice | other\",\n'
|
| 451 |
-
' \"confidence\": number between 0 and 100,\n'
|
| 452 |
-
' \"full_text\": \"Complete extracted text from all pages, preserving structure and formatting. Include all languages.\",\n'
|
| 453 |
-
' \"fields\": {\n'
|
| 454 |
-
' \"invoice_number\": \"...\",\n'
|
| 455 |
-
' \"date\": \"...\",\n'
|
| 456 |
-
' \"due_date\": \"...\",\n'
|
| 457 |
-
' \"total_amount\": \"...\",\n'
|
| 458 |
-
' \"currency\": \"...\",\n'
|
| 459 |
-
' \"vendor_name\": \"...\",\n'
|
| 460 |
-
' \"company_name\": \"...\",\n'
|
| 461 |
-
' \"address\": \"...\",\n'
|
| 462 |
-
' \"line_items\": [\n'
|
| 463 |
-
' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
|
| 464 |
-
' ],\n'
|
| 465 |
-
' \"other_field\": \"...\"\n'
|
| 466 |
-
" },\n"
|
| 467 |
-
' \"pages\": [\n'
|
| 468 |
-
' {\"page_number\": 1, \"text\": \"Full text from page 1\"},\n'
|
| 469 |
-
' {\"page_number\": 2, \"text\": \"Full text from page 2\"}\n'
|
| 470 |
-
' ]\n'
|
| 471 |
-
"}\n\n"
|
| 472 |
-
"IMPORTANT:\n"
|
| 473 |
-
"- Extract ALL text from ALL pages, including non-English languages\n"
|
| 474 |
-
"- Preserve structure, headings, and formatting in the full_text field\n"
|
| 475 |
-
"- Fill in fields with relevant extracted information\n"
|
| 476 |
-
"- If a field is not found, use empty string or omit it\n"
|
| 477 |
-
"- The full_text should contain everything readable from the document"
|
| 478 |
-
)
|
| 479 |
-
else:
|
| 480 |
-
user_prompt = (
|
| 481 |
-
"Read this document using your vision capability and extract ALL text content. "
|
| 482 |
-
"I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
|
| 483 |
-
"Extract every word, number, and piece of information, including any non-English text.\n\n"
|
| 484 |
-
"Respond with JSON in this format:\n"
|
| 485 |
-
"{\n"
|
| 486 |
-
' \"doc_type\": \"invoice | receipt | contract | report | notice | other\",\n'
|
| 487 |
-
' \"confidence\": number between 0 and 100,\n'
|
| 488 |
-
' \"full_text\": \"Complete extracted text, preserving structure and formatting. Include all languages.\",\n'
|
| 489 |
-
' \"fields\": {\n'
|
| 490 |
-
' \"invoice_number\": \"...\",\n'
|
| 491 |
-
' \"date\": \"...\",\n'
|
| 492 |
-
' \"due_date\": \"...\",\n'
|
| 493 |
-
' \"total_amount\": \"...\",\n'
|
| 494 |
-
' \"currency\": \"...\",\n'
|
| 495 |
-
' \"vendor_name\": \"...\",\n'
|
| 496 |
-
' \"company_name\": \"...\",\n'
|
| 497 |
-
' \"address\": \"...\",\n'
|
| 498 |
-
' \"line_items\": [\n'
|
| 499 |
-
' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
|
| 500 |
-
' ],\n'
|
| 501 |
-
' \"other_field\": \"...\"\n'
|
| 502 |
-
" }\n"
|
| 503 |
-
"}\n\n"
|
| 504 |
-
"IMPORTANT:\n"
|
| 505 |
-
"- Extract ALL text, including non-English languages\n"
|
| 506 |
-
"- Preserve structure, headings, and formatting in the full_text field\n"
|
| 507 |
-
"- Fill in fields with relevant extracted information\n"
|
| 508 |
-
"- If a field is not found, use empty string or omit it"
|
| 509 |
-
)
|
| 510 |
-
|
| 511 |
-
# Build content array with text prompt and all image blocks
|
| 512 |
-
user_content = [{"type": "text", "text": user_prompt}]
|
| 513 |
-
user_content.extend(image_blocks)
|
| 514 |
-
|
| 515 |
-
payload: Dict[str, Any] = {
|
| 516 |
-
"model": MODEL_NAME,
|
| 517 |
-
"messages": [
|
| 518 |
-
{
|
| 519 |
-
"role": "system",
|
| 520 |
-
"content": [{"type": "text", "text": system_prompt}],
|
| 521 |
-
},
|
| 522 |
-
{
|
| 523 |
-
"role": "user",
|
| 524 |
-
"content": user_content,
|
| 525 |
-
},
|
| 526 |
-
],
|
| 527 |
-
"max_tokens": 8192, # Increased for full text extraction from multi-page documents
|
| 528 |
-
}
|
| 529 |
-
|
| 530 |
-
headers = {
|
| 531 |
-
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
| 532 |
-
"Content-Type": "application/json",
|
| 533 |
-
# Optional attribution headers
|
| 534 |
-
"HTTP-Referer": os.environ.get(
|
| 535 |
-
"APP_URL",
|
| 536 |
-
"https://huggingface.co/spaces/your-space",
|
| 537 |
-
),
|
| 538 |
-
"X-Title": "Document Capture Demo",
|
| 539 |
-
}
|
| 540 |
-
|
| 541 |
-
# Calculate payload size
|
| 542 |
-
import sys
|
| 543 |
-
payload_str = json.dumps(payload)
|
| 544 |
-
payload_size_mb = len(payload_str.encode('utf-8')) / 1024 / 1024
|
| 545 |
-
|
| 546 |
-
print(f"[INFO] Sending request to OpenRouter API...")
|
| 547 |
-
print(f"[INFO] Payload size: {payload_size_mb:.2f} MB, Images: {len(image_blocks)} blocks")
|
| 548 |
-
print(f"[INFO] Model: {MODEL_NAME}")
|
| 549 |
-
|
| 550 |
-
if payload_size_mb > 10:
|
| 551 |
-
print(f"[WARNING] Payload is very large ({payload_size_mb:.2f} MB). This may cause slow responses or timeouts.")
|
| 552 |
-
|
| 553 |
-
try:
|
| 554 |
-
# Use a longer timeout for large documents - 10 minutes
|
| 555 |
-
timeout = httpx.Timeout(600.0, connect=30.0) # 10 min total, 30s connect
|
| 556 |
-
async with httpx.AsyncClient(timeout=timeout) as client:
|
| 557 |
-
print(f"[INFO] Making POST request to {OPENROUTER_BASE_URL}...")
|
| 558 |
-
print(f"[INFO] Timeout set to 10 minutes for large document processing...")
|
| 559 |
-
resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
|
| 560 |
-
print(f"[INFO] Received response: Status {resp.status_code}")
|
| 561 |
-
resp.raise_for_status()
|
| 562 |
-
data = resp.json()
|
| 563 |
-
print(f"[INFO] Response parsed successfully")
|
| 564 |
-
except httpx.TimeoutException:
|
| 565 |
-
print(f"[ERROR] Request to OpenRouter timed out after 5 minutes")
|
| 566 |
-
raise RuntimeError("Request to OpenRouter API timed out. The document may be too large or the API is slow. Please try again or use a smaller document.")
|
| 567 |
-
except httpx.HTTPStatusError as e:
|
| 568 |
-
print(f"[ERROR] HTTP error from OpenRouter: {e.response.status_code} - {e.response.text[:500]}")
|
| 569 |
-
raise RuntimeError(f"OpenRouter API error: {e.response.status_code} - {str(e)}")
|
| 570 |
-
except Exception as e:
|
| 571 |
-
print(f"[ERROR] Unexpected error calling OpenRouter: {type(e).__name__}: {str(e)}")
|
| 572 |
-
raise RuntimeError(f"Failed to call OpenRouter API: {str(e)}")
|
| 573 |
-
|
| 574 |
-
# OpenRouter returns choices[0].message.content
|
| 575 |
-
if "choices" not in data or len(data["choices"]) == 0:
|
| 576 |
-
raise ValueError("No choices in OpenRouter response")
|
| 577 |
-
|
| 578 |
-
content = data["choices"][0]["message"]["content"]
|
| 579 |
-
|
| 580 |
-
# Check if response was truncated
|
| 581 |
-
finish_reason = data["choices"][0].get("finish_reason", "")
|
| 582 |
-
if finish_reason == "length":
|
| 583 |
-
print(f"[WARNING] Response was truncated due to token limit (finish_reason: {finish_reason})")
|
| 584 |
-
|
| 585 |
-
# Log the raw response for debugging (first 1000 chars and last 500 chars)
|
| 586 |
-
content_str = str(content)
|
| 587 |
-
print(f"[DEBUG] OpenRouter response preview (first 1000 chars): {content_str[:1000]}")
|
| 588 |
-
if len(content_str) > 1000:
|
| 589 |
-
print(f"[DEBUG] OpenRouter response preview (last 500 chars): {content_str[-500:]}")
|
| 590 |
-
print(f"[DEBUG] Total response length: {len(content_str)} characters")
|
| 591 |
-
|
| 592 |
-
# content may be a string or a list of content blocks
|
| 593 |
-
if isinstance(content, list):
|
| 594 |
-
text = "".join(part.get("text", "") for part in content if part.get("type") == "text")
|
| 595 |
-
else:
|
| 596 |
-
text = content
|
| 597 |
-
|
| 598 |
-
if not text or not text.strip():
|
| 599 |
-
raise ValueError("Empty response from OpenRouter API")
|
| 600 |
-
|
| 601 |
-
# Try to parse JSON from the model output
|
| 602 |
-
# The model might return JSON wrapped in markdown code blocks or with extra text
|
| 603 |
-
try:
|
| 604 |
-
# First, try direct JSON parsing
|
| 605 |
-
parsed = json.loads(text)
|
| 606 |
-
print(f"[DEBUG] Successfully parsed JSON directly")
|
| 607 |
-
return parsed
|
| 608 |
-
except json.JSONDecodeError as e:
|
| 609 |
-
print(f"[DEBUG] Direct JSON parse failed: {e}")
|
| 610 |
-
|
| 611 |
-
# Try to extract JSON from markdown code blocks
|
| 612 |
-
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
|
| 613 |
-
if json_match:
|
| 614 |
-
try:
|
| 615 |
-
parsed = json.loads(json_match.group(1))
|
| 616 |
-
print(f"[DEBUG] Successfully parsed JSON from markdown code block")
|
| 617 |
-
return parsed
|
| 618 |
-
except json.JSONDecodeError as e2:
|
| 619 |
-
print(f"[DEBUG] Markdown code block parse failed: {e2}")
|
| 620 |
-
|
| 621 |
-
# Try to find JSON object in the text (look for {...})
|
| 622 |
-
json_match = re.search(r'\{.*\}', text, re.DOTALL)
|
| 623 |
-
if json_match:
|
| 624 |
-
json_str = json_match.group(0)
|
| 625 |
-
try:
|
| 626 |
-
parsed = json.loads(json_str)
|
| 627 |
-
print(f"[DEBUG] Successfully parsed JSON from regex match")
|
| 628 |
-
return parsed
|
| 629 |
-
except json.JSONDecodeError as e3:
|
| 630 |
-
print(f"[DEBUG] Regex match parse failed: {e3}")
|
| 631 |
-
# Try to fix truncated JSON by closing unclosed strings/objects
|
| 632 |
-
try:
|
| 633 |
-
fixed_json = _fix_truncated_json(json_str)
|
| 634 |
-
parsed = json.loads(fixed_json)
|
| 635 |
-
print(f"[DEBUG] Successfully parsed fixed truncated JSON")
|
| 636 |
-
return parsed
|
| 637 |
-
except Exception as e4:
|
| 638 |
-
print(f"[DEBUG] Failed to fix truncated JSON: {e4}")
|
| 639 |
-
|
| 640 |
-
# Last resort: try to extract what we can from the partial JSON
|
| 641 |
-
try:
|
| 642 |
-
partial_data = _extract_partial_json(text)
|
| 643 |
-
if partial_data:
|
| 644 |
-
print(f"[DEBUG] Extracted partial data from truncated JSON")
|
| 645 |
-
return partial_data
|
| 646 |
-
except Exception as e5:
|
| 647 |
-
print(f"[DEBUG] Failed to extract partial JSON: {e5}")
|
| 648 |
-
|
| 649 |
-
# If all parsing fails, return a default structure with the raw text
|
| 650 |
-
print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
|
| 651 |
-
# Try to extract at least the full_text if it's visible (even if truncated)
|
| 652 |
-
# Look for "full_text": "..." pattern, handling escaped characters and truncation
|
| 653 |
-
full_text_match = re.search(r'"full_text"\s*:\s*"(.*?)(?:"\s*[,}]|$)', text, re.DOTALL)
|
| 654 |
-
if full_text_match:
|
| 655 |
-
try:
|
| 656 |
-
# Get the matched text (may be truncated)
|
| 657 |
-
full_text_raw = full_text_match.group(1)
|
| 658 |
-
# Unescape common sequences
|
| 659 |
-
full_text = (full_text_raw
|
| 660 |
-
.replace('\\n', '\n')
|
| 661 |
-
.replace('\\"', '"')
|
| 662 |
-
.replace('\\\\', '\\')
|
| 663 |
-
.replace('\\t', '\t')
|
| 664 |
-
.replace('\\r', '\r'))
|
| 665 |
-
|
| 666 |
-
# Try to extract other fields too
|
| 667 |
-
doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
|
| 668 |
-
confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
|
| 669 |
-
|
| 670 |
-
result = {
|
| 671 |
-
"doc_type": doc_type_match.group(1) if doc_type_match else "other",
|
| 672 |
-
"confidence": float(confidence_match.group(1)) if confidence_match else 90.0,
|
| 673 |
-
"full_text": full_text,
|
| 674 |
-
"fields": {
|
| 675 |
-
"full_text": full_text,
|
| 676 |
-
"note": "Response may have been truncated, but full_text was extracted"
|
| 677 |
-
}
|
| 678 |
-
}
|
| 679 |
-
print(f"[INFO] Extracted full_text ({len(full_text)} chars) from truncated JSON")
|
| 680 |
-
return result
|
| 681 |
-
except Exception as e:
|
| 682 |
-
print(f"[DEBUG] Failed to extract full_text from truncated JSON: {e}")
|
| 683 |
-
pass
|
| 684 |
-
|
| 685 |
-
return {
|
| 686 |
-
"doc_type": "other",
|
| 687 |
-
"confidence": 50.0,
|
| 688 |
-
"fields": {
|
| 689 |
-
"raw_response": text[:2000], # First 2000 chars for debugging
|
| 690 |
-
"error": "Could not parse JSON from model response (may be truncated)",
|
| 691 |
-
"note": "Check server logs for full response"
|
| 692 |
-
}
|
| 693 |
-
}
|
| 694 |
-
|
| 695 |
|
| 696 |
def _fix_truncated_json(json_str: str) -> str:
|
| 697 |
"""Attempt to fix truncated JSON by closing unclosed strings and objects."""
|
|
|
|
| 432 |
"fields": {"raw_text": text[:2000]}
|
| 433 |
}
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
|
| 436 |
def _fix_truncated_json(json_str: str) -> str:
|
| 437 |
"""Attempt to fix truncated JSON by closing unclosed strings and objects."""
|