Seth0330 committed on
Commit
fa5ecef
·
verified ·
1 Parent(s): 94de22a

Update backend/app/openrouter_client.py

Browse files
Files changed (1) hide show
  1. backend/app/openrouter_client.py +0 -260
backend/app/openrouter_client.py CHANGED
@@ -432,266 +432,6 @@ def _parse_model_response(text: str, page_num: int = None) -> Dict[str, Any]:
432
  "fields": {"raw_text": text[:2000]}
433
  }
434
 
435
- system_prompt = (
436
- "You are a document extraction engine with vision capabilities. "
437
- "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
438
- "You output structured JSON with both the full extracted text and key-value pairs."
439
- )
440
-
441
- # Update prompt for multi-page documents - ask for full text extraction first
442
- if len(image_blocks) > 1:
443
- user_prompt = (
444
- f"Read this {len(image_blocks)}-page document using your vision capability and extract ALL text content. "
445
- "I want the complete end-to-end text from all pages, preserving structure, headings, formatting, and content in all languages.\n\n"
446
- "Analyze ALL pages thoroughly, including any non-English text (Punjabi, Hindi, or other languages). "
447
- "Extract every word, number, and piece of information from every page.\n\n"
448
- "Respond with JSON in this format:\n"
449
- "{\n"
450
- ' \"doc_type\": \"invoice | receipt | contract | report | notice | other\",\n'
451
- ' \"confidence\": number between 0 and 100,\n'
452
- ' \"full_text\": \"Complete extracted text from all pages, preserving structure and formatting. Include all languages.\",\n'
453
- ' \"fields\": {\n'
454
- ' \"invoice_number\": \"...\",\n'
455
- ' \"date\": \"...\",\n'
456
- ' \"due_date\": \"...\",\n'
457
- ' \"total_amount\": \"...\",\n'
458
- ' \"currency\": \"...\",\n'
459
- ' \"vendor_name\": \"...\",\n'
460
- ' \"company_name\": \"...\",\n'
461
- ' \"address\": \"...\",\n'
462
- ' \"line_items\": [\n'
463
- ' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
464
- ' ],\n'
465
- ' \"other_field\": \"...\"\n'
466
- " },\n"
467
- ' \"pages\": [\n'
468
- ' {\"page_number\": 1, \"text\": \"Full text from page 1\"},\n'
469
- ' {\"page_number\": 2, \"text\": \"Full text from page 2\"}\n'
470
- ' ]\n'
471
- "}\n\n"
472
- "IMPORTANT:\n"
473
- "- Extract ALL text from ALL pages, including non-English languages\n"
474
- "- Preserve structure, headings, and formatting in the full_text field\n"
475
- "- Fill in fields with relevant extracted information\n"
476
- "- If a field is not found, use empty string or omit it\n"
477
- "- The full_text should contain everything readable from the document"
478
- )
479
- else:
480
- user_prompt = (
481
- "Read this document using your vision capability and extract ALL text content. "
482
- "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
483
- "Extract every word, number, and piece of information, including any non-English text.\n\n"
484
- "Respond with JSON in this format:\n"
485
- "{\n"
486
- ' \"doc_type\": \"invoice | receipt | contract | report | notice | other\",\n'
487
- ' \"confidence\": number between 0 and 100,\n'
488
- ' \"full_text\": \"Complete extracted text, preserving structure and formatting. Include all languages.\",\n'
489
- ' \"fields\": {\n'
490
- ' \"invoice_number\": \"...\",\n'
491
- ' \"date\": \"...\",\n'
492
- ' \"due_date\": \"...\",\n'
493
- ' \"total_amount\": \"...\",\n'
494
- ' \"currency\": \"...\",\n'
495
- ' \"vendor_name\": \"...\",\n'
496
- ' \"company_name\": \"...\",\n'
497
- ' \"address\": \"...\",\n'
498
- ' \"line_items\": [\n'
499
- ' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
500
- ' ],\n'
501
- ' \"other_field\": \"...\"\n'
502
- " }\n"
503
- "}\n\n"
504
- "IMPORTANT:\n"
505
- "- Extract ALL text, including non-English languages\n"
506
- "- Preserve structure, headings, and formatting in the full_text field\n"
507
- "- Fill in fields with relevant extracted information\n"
508
- "- If a field is not found, use empty string or omit it"
509
- )
510
-
511
- # Build content array with text prompt and all image blocks
512
- user_content = [{"type": "text", "text": user_prompt}]
513
- user_content.extend(image_blocks)
514
-
515
- payload: Dict[str, Any] = {
516
- "model": MODEL_NAME,
517
- "messages": [
518
- {
519
- "role": "system",
520
- "content": [{"type": "text", "text": system_prompt}],
521
- },
522
- {
523
- "role": "user",
524
- "content": user_content,
525
- },
526
- ],
527
- "max_tokens": 8192, # Increased for full text extraction from multi-page documents
528
- }
529
-
530
- headers = {
531
- "Authorization": f"Bearer {OPENROUTER_API_KEY}",
532
- "Content-Type": "application/json",
533
- # Optional attribution headers
534
- "HTTP-Referer": os.environ.get(
535
- "APP_URL",
536
- "https://huggingface.co/spaces/your-space",
537
- ),
538
- "X-Title": "Document Capture Demo",
539
- }
540
-
541
- # Calculate payload size
542
- import sys
543
- payload_str = json.dumps(payload)
544
- payload_size_mb = len(payload_str.encode('utf-8')) / 1024 / 1024
545
-
546
- print(f"[INFO] Sending request to OpenRouter API...")
547
- print(f"[INFO] Payload size: {payload_size_mb:.2f} MB, Images: {len(image_blocks)} blocks")
548
- print(f"[INFO] Model: {MODEL_NAME}")
549
-
550
- if payload_size_mb > 10:
551
- print(f"[WARNING] Payload is very large ({payload_size_mb:.2f} MB). This may cause slow responses or timeouts.")
552
-
553
- try:
554
- # Use a longer timeout for large documents - 10 minutes
555
- timeout = httpx.Timeout(600.0, connect=30.0) # 10 min total, 30s connect
556
- async with httpx.AsyncClient(timeout=timeout) as client:
557
- print(f"[INFO] Making POST request to {OPENROUTER_BASE_URL}...")
558
- print(f"[INFO] Timeout set to 10 minutes for large document processing...")
559
- resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
560
- print(f"[INFO] Received response: Status {resp.status_code}")
561
- resp.raise_for_status()
562
- data = resp.json()
563
- print(f"[INFO] Response parsed successfully")
564
- except httpx.TimeoutException:
565
- print(f"[ERROR] Request to OpenRouter timed out after 5 minutes")
566
- raise RuntimeError("Request to OpenRouter API timed out. The document may be too large or the API is slow. Please try again or use a smaller document.")
567
- except httpx.HTTPStatusError as e:
568
- print(f"[ERROR] HTTP error from OpenRouter: {e.response.status_code} - {e.response.text[:500]}")
569
- raise RuntimeError(f"OpenRouter API error: {e.response.status_code} - {str(e)}")
570
- except Exception as e:
571
- print(f"[ERROR] Unexpected error calling OpenRouter: {type(e).__name__}: {str(e)}")
572
- raise RuntimeError(f"Failed to call OpenRouter API: {str(e)}")
573
-
574
- # OpenRouter returns choices[0].message.content
575
- if "choices" not in data or len(data["choices"]) == 0:
576
- raise ValueError("No choices in OpenRouter response")
577
-
578
- content = data["choices"][0]["message"]["content"]
579
-
580
- # Check if response was truncated
581
- finish_reason = data["choices"][0].get("finish_reason", "")
582
- if finish_reason == "length":
583
- print(f"[WARNING] Response was truncated due to token limit (finish_reason: {finish_reason})")
584
-
585
- # Log the raw response for debugging (first 1000 chars and last 500 chars)
586
- content_str = str(content)
587
- print(f"[DEBUG] OpenRouter response preview (first 1000 chars): {content_str[:1000]}")
588
- if len(content_str) > 1000:
589
- print(f"[DEBUG] OpenRouter response preview (last 500 chars): {content_str[-500:]}")
590
- print(f"[DEBUG] Total response length: {len(content_str)} characters")
591
-
592
- # content may be a string or a list of content blocks
593
- if isinstance(content, list):
594
- text = "".join(part.get("text", "") for part in content if part.get("type") == "text")
595
- else:
596
- text = content
597
-
598
- if not text or not text.strip():
599
- raise ValueError("Empty response from OpenRouter API")
600
-
601
- # Try to parse JSON from the model output
602
- # The model might return JSON wrapped in markdown code blocks or with extra text
603
- try:
604
- # First, try direct JSON parsing
605
- parsed = json.loads(text)
606
- print(f"[DEBUG] Successfully parsed JSON directly")
607
- return parsed
608
- except json.JSONDecodeError as e:
609
- print(f"[DEBUG] Direct JSON parse failed: {e}")
610
-
611
- # Try to extract JSON from markdown code blocks
612
- json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
613
- if json_match:
614
- try:
615
- parsed = json.loads(json_match.group(1))
616
- print(f"[DEBUG] Successfully parsed JSON from markdown code block")
617
- return parsed
618
- except json.JSONDecodeError as e2:
619
- print(f"[DEBUG] Markdown code block parse failed: {e2}")
620
-
621
- # Try to find JSON object in the text (look for {...})
622
- json_match = re.search(r'\{.*\}', text, re.DOTALL)
623
- if json_match:
624
- json_str = json_match.group(0)
625
- try:
626
- parsed = json.loads(json_str)
627
- print(f"[DEBUG] Successfully parsed JSON from regex match")
628
- return parsed
629
- except json.JSONDecodeError as e3:
630
- print(f"[DEBUG] Regex match parse failed: {e3}")
631
- # Try to fix truncated JSON by closing unclosed strings/objects
632
- try:
633
- fixed_json = _fix_truncated_json(json_str)
634
- parsed = json.loads(fixed_json)
635
- print(f"[DEBUG] Successfully parsed fixed truncated JSON")
636
- return parsed
637
- except Exception as e4:
638
- print(f"[DEBUG] Failed to fix truncated JSON: {e4}")
639
-
640
- # Last resort: try to extract what we can from the partial JSON
641
- try:
642
- partial_data = _extract_partial_json(text)
643
- if partial_data:
644
- print(f"[DEBUG] Extracted partial data from truncated JSON")
645
- return partial_data
646
- except Exception as e5:
647
- print(f"[DEBUG] Failed to extract partial JSON: {e5}")
648
-
649
- # If all parsing fails, return a default structure with the raw text
650
- print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
651
- # Try to extract at least the full_text if it's visible (even if truncated)
652
- # Look for "full_text": "..." pattern, handling escaped characters and truncation
653
- full_text_match = re.search(r'"full_text"\s*:\s*"(.*?)(?:"\s*[,}]|$)', text, re.DOTALL)
654
- if full_text_match:
655
- try:
656
- # Get the matched text (may be truncated)
657
- full_text_raw = full_text_match.group(1)
658
- # Unescape common sequences
659
- full_text = (full_text_raw
660
- .replace('\\n', '\n')
661
- .replace('\\"', '"')
662
- .replace('\\\\', '\\')
663
- .replace('\\t', '\t')
664
- .replace('\\r', '\r'))
665
-
666
- # Try to extract other fields too
667
- doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
668
- confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
669
-
670
- result = {
671
- "doc_type": doc_type_match.group(1) if doc_type_match else "other",
672
- "confidence": float(confidence_match.group(1)) if confidence_match else 90.0,
673
- "full_text": full_text,
674
- "fields": {
675
- "full_text": full_text,
676
- "note": "Response may have been truncated, but full_text was extracted"
677
- }
678
- }
679
- print(f"[INFO] Extracted full_text ({len(full_text)} chars) from truncated JSON")
680
- return result
681
- except Exception as e:
682
- print(f"[DEBUG] Failed to extract full_text from truncated JSON: {e}")
683
- pass
684
-
685
- return {
686
- "doc_type": "other",
687
- "confidence": 50.0,
688
- "fields": {
689
- "raw_response": text[:2000], # First 2000 chars for debugging
690
- "error": "Could not parse JSON from model response (may be truncated)",
691
- "note": "Check server logs for full response"
692
- }
693
- }
694
-
695
 
696
  def _fix_truncated_json(json_str: str) -> str:
697
  """Attempt to fix truncated JSON by closing unclosed strings and objects."""
 
432
  "fields": {"raw_text": text[:2000]}
433
  }
434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
 
436
  def _fix_truncated_json(json_str: str) -> str:
437
  """Attempt to fix truncated JSON by closing unclosed strings and objects."""