Spaces:

Seth0330
/

AIEXTRACT1

Sleeping

App Files Community

Seth0330 committed 17 days ago

Commit

fa5ecef

verified ·

1 Parent(s): 94de22a

Update backend/app/openrouter_client.py

Browse files

Files changed (1) hide show

backend/app/openrouter_client.py +0 -260

backend/app/openrouter_client.py CHANGED Viewed

@@ -432,266 +432,6 @@ def _parse_model_response(text: str, page_num: int = None) -> Dict[str, Any]:
             "fields": {"raw_text": text[:2000]}
         }
-    system_prompt = (
-        "You are a document extraction engine with vision capabilities. "
-        "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
-        "You output structured JSON with both the full extracted text and key-value pairs."
-    )
-    # Update prompt for multi-page documents - ask for full text extraction first
-    if len(image_blocks) > 1:
-        user_prompt = (
-            f"Read this {len(image_blocks)}-page document using your vision capability and extract ALL text content. "
-            "I want the complete end-to-end text from all pages, preserving structure, headings, formatting, and content in all languages.\n\n"
-            "Analyze ALL pages thoroughly, including any non-English text (Punjabi, Hindi, or other languages). "
-            "Extract every word, number, and piece of information from every page.\n\n"
-            "Respond with JSON in this format:\n"
-            "{\n"
-            '  \"doc_type\": \"invoice | receipt | contract | report | notice | other\",\n'
-            '  \"confidence\": number between 0 and 100,\n'
-            '  \"full_text\": \"Complete extracted text from all pages, preserving structure and formatting. Include all languages.\",\n'
-            '  \"fields\": {\n'
-            '    \"invoice_number\": \"...\",\n'
-            '    \"date\": \"...\",\n'
-            '    \"due_date\": \"...\",\n'
-            '    \"total_amount\": \"...\",\n'
-            '    \"currency\": \"...\",\n'
-            '    \"vendor_name\": \"...\",\n'
-            '    \"company_name\": \"...\",\n'
-            '    \"address\": \"...\",\n'
-            '    \"line_items\": [\n'
-            '       {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
-            '    ],\n'
-            '    \"other_field\": \"...\"\n'
-            "  },\n"
-            '  \"pages\": [\n'
-            '    {\"page_number\": 1, \"text\": \"Full text from page 1\"},\n'
-            '    {\"page_number\": 2, \"text\": \"Full text from page 2\"}\n'
-            '  ]\n'
-            "}\n\n"
-            "IMPORTANT:\n"
-            "- Extract ALL text from ALL pages, including non-English languages\n"
-            "- Preserve structure, headings, and formatting in the full_text field\n"
-            "- Fill in fields with relevant extracted information\n"
-            "- If a field is not found, use empty string or omit it\n"
-            "- The full_text should contain everything readable from the document"
-        )
-    else:
-        user_prompt = (
-            "Read this document using your vision capability and extract ALL text content. "
-            "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
-            "Extract every word, number, and piece of information, including any non-English text.\n\n"
-            "Respond with JSON in this format:\n"
-            "{\n"
-            '  \"doc_type\": \"invoice | receipt | contract | report | notice | other\",\n'
-            '  \"confidence\": number between 0 and 100,\n'
-            '  \"full_text\": \"Complete extracted text, preserving structure and formatting. Include all languages.\",\n'
-            '  \"fields\": {\n'
-            '    \"invoice_number\": \"...\",\n'
-            '    \"date\": \"...\",\n'
-            '    \"due_date\": \"...\",\n'
-            '    \"total_amount\": \"...\",\n'
-            '    \"currency\": \"...\",\n'
-            '    \"vendor_name\": \"...\",\n'
-            '    \"company_name\": \"...\",\n'
-            '    \"address\": \"...\",\n'
-            '    \"line_items\": [\n'
-            '       {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
-            '    ],\n'
-            '    \"other_field\": \"...\"\n'
-            "  }\n"
-            "}\n\n"
-            "IMPORTANT:\n"
-            "- Extract ALL text, including non-English languages\n"
-            "- Preserve structure, headings, and formatting in the full_text field\n"
-            "- Fill in fields with relevant extracted information\n"
-            "- If a field is not found, use empty string or omit it"
-        )
-    # Build content array with text prompt and all image blocks
-    user_content = [{"type": "text", "text": user_prompt}]
-    user_content.extend(image_blocks)
-    payload: Dict[str, Any] = {
-        "model": MODEL_NAME,
-        "messages": [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": system_prompt}],
-            },
-            {
-                "role": "user",
-                "content": user_content,
-            },
-        ],
-        "max_tokens": 8192,  # Increased for full text extraction from multi-page documents
-    }
-    headers = {
-        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-        "Content-Type": "application/json",
-        # Optional attribution headers
-        "HTTP-Referer": os.environ.get(
-            "APP_URL",
-            "https://huggingface.co/spaces/your-space",
-        ),
-        "X-Title": "Document Capture Demo",
-    }
-    # Calculate payload size
-    import sys
-    payload_str = json.dumps(payload)
-    payload_size_mb = len(payload_str.encode('utf-8')) / 1024 / 1024
-    print(f"[INFO] Sending request to OpenRouter API...")
-    print(f"[INFO] Payload size: {payload_size_mb:.2f} MB, Images: {len(image_blocks)} blocks")
-    print(f"[INFO] Model: {MODEL_NAME}")
-    if payload_size_mb > 10:
-        print(f"[WARNING] Payload is very large ({payload_size_mb:.2f} MB). This may cause slow responses or timeouts.")
-    try:
-        # Use a longer timeout for large documents - 10 minutes
-        timeout = httpx.Timeout(600.0, connect=30.0)  # 10 min total, 30s connect
-        async with httpx.AsyncClient(timeout=timeout) as client:
-            print(f"[INFO] Making POST request to {OPENROUTER_BASE_URL}...")
-            print(f"[INFO] Timeout set to 10 minutes for large document processing...")
-            resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
-            print(f"[INFO] Received response: Status {resp.status_code}")
-            resp.raise_for_status()
-            data = resp.json()
-            print(f"[INFO] Response parsed successfully")
-    except httpx.TimeoutException:
-        print(f"[ERROR] Request to OpenRouter timed out after 5 minutes")
-        raise RuntimeError("Request to OpenRouter API timed out. The document may be too large or the API is slow. Please try again or use a smaller document.")
-    except httpx.HTTPStatusError as e:
-        print(f"[ERROR] HTTP error from OpenRouter: {e.response.status_code} - {e.response.text[:500]}")
-        raise RuntimeError(f"OpenRouter API error: {e.response.status_code} - {str(e)}")
-    except Exception as e:
-        print(f"[ERROR] Unexpected error calling OpenRouter: {type(e).__name__}: {str(e)}")
-        raise RuntimeError(f"Failed to call OpenRouter API: {str(e)}")
-    # OpenRouter returns choices[0].message.content
-    if "choices" not in data or len(data["choices"]) == 0:
-        raise ValueError("No choices in OpenRouter response")
-    content = data["choices"][0]["message"]["content"]
-    # Check if response was truncated
-    finish_reason = data["choices"][0].get("finish_reason", "")
-    if finish_reason == "length":
-        print(f"[WARNING] Response was truncated due to token limit (finish_reason: {finish_reason})")
-    # Log the raw response for debugging (first 1000 chars and last 500 chars)
-    content_str = str(content)
-    print(f"[DEBUG] OpenRouter response preview (first 1000 chars): {content_str[:1000]}")
-    if len(content_str) > 1000:
-        print(f"[DEBUG] OpenRouter response preview (last 500 chars): {content_str[-500:]}")
-    print(f"[DEBUG] Total response length: {len(content_str)} characters")
-    # content may be a string or a list of content blocks
-    if isinstance(content, list):
-        text = "".join(part.get("text", "") for part in content if part.get("type") == "text")
-    else:
-        text = content
-    if not text or not text.strip():
-        raise ValueError("Empty response from OpenRouter API")
-    # Try to parse JSON from the model output
-    # The model might return JSON wrapped in markdown code blocks or with extra text
-    try:
-        # First, try direct JSON parsing
-        parsed = json.loads(text)
-        print(f"[DEBUG] Successfully parsed JSON directly")
-        return parsed
-    except json.JSONDecodeError as e:
-        print(f"[DEBUG] Direct JSON parse failed: {e}")
-        # Try to extract JSON from markdown code blocks
-        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
-        if json_match:
-            try:
-                parsed = json.loads(json_match.group(1))
-                print(f"[DEBUG] Successfully parsed JSON from markdown code block")
-                return parsed
-            except json.JSONDecodeError as e2:
-                print(f"[DEBUG] Markdown code block parse failed: {e2}")
-        # Try to find JSON object in the text (look for {...})
-        json_match = re.search(r'\{.*\}', text, re.DOTALL)
-        if json_match:
-            json_str = json_match.group(0)
-            try:
-                parsed = json.loads(json_str)
-                print(f"[DEBUG] Successfully parsed JSON from regex match")
-                return parsed
-            except json.JSONDecodeError as e3:
-                print(f"[DEBUG] Regex match parse failed: {e3}")
-                # Try to fix truncated JSON by closing unclosed strings/objects
-                try:
-                    fixed_json = _fix_truncated_json(json_str)
-                    parsed = json.loads(fixed_json)
-                    print(f"[DEBUG] Successfully parsed fixed truncated JSON")
-                    return parsed
-                except Exception as e4:
-                    print(f"[DEBUG] Failed to fix truncated JSON: {e4}")
-        # Last resort: try to extract what we can from the partial JSON
-        try:
-            partial_data = _extract_partial_json(text)
-            if partial_data:
-                print(f"[DEBUG] Extracted partial data from truncated JSON")
-                return partial_data
-        except Exception as e5:
-            print(f"[DEBUG] Failed to extract partial JSON: {e5}")
-        # If all parsing fails, return a default structure with the raw text
-        print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
-        # Try to extract at least the full_text if it's visible (even if truncated)
-        # Look for "full_text": "..." pattern, handling escaped characters and truncation
-        full_text_match = re.search(r'"full_text"\s*:\s*"(.*?)(?:"\s*[,}]|$)', text, re.DOTALL)
-        if full_text_match:
-            try:
-                # Get the matched text (may be truncated)
-                full_text_raw = full_text_match.group(1)
-                # Unescape common sequences
-                full_text = (full_text_raw
-                           .replace('\\n', '\n')
-                           .replace('\\"', '"')
-                           .replace('\\\\', '\\')
-                           .replace('\\t', '\t')
-                           .replace('\\r', '\r'))
-                # Try to extract other fields too
-                doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
-                confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
-                result = {
-                    "doc_type": doc_type_match.group(1) if doc_type_match else "other",
-                    "confidence": float(confidence_match.group(1)) if confidence_match else 90.0,
-                    "full_text": full_text,
-                    "fields": {
-                        "full_text": full_text,
-                        "note": "Response may have been truncated, but full_text was extracted"
-                    }
-                }
-                print(f"[INFO] Extracted full_text ({len(full_text)} chars) from truncated JSON")
-                return result
-            except Exception as e:
-                print(f"[DEBUG] Failed to extract full_text from truncated JSON: {e}")
-                pass
-        return {
-            "doc_type": "other",
-            "confidence": 50.0,
-            "fields": {
-                "raw_response": text[:2000],  # First 2000 chars for debugging
-                "error": "Could not parse JSON from model response (may be truncated)",
-                "note": "Check server logs for full response"
-            }
-        }
 def _fix_truncated_json(json_str: str) -> str:
     """Attempt to fix truncated JSON by closing unclosed strings and objects."""

             "fields": {"raw_text": text[:2000]}
         }
 def _fix_truncated_json(json_str: str) -> str:
     """Attempt to fix truncated JSON by closing unclosed strings and objects."""