Spaces:

Seth0330
/

AIEXTRACT1

Sleeping

App Files Files Community

Seth0330 committed 15 days ago

Commit

4a0b9bf

verified ·

1 Parent(s): 7154f00

Update backend/app/openrouter_client.py

Browse files

Files changed (1) hide show

backend/app/openrouter_client.py +133 -6

backend/app/openrouter_client.py CHANGED Viewed

@@ -251,7 +251,7 @@ async def extract_fields_from_document(
         "X-Title": "Document Capture Demo",
     }
-    async with httpx.AsyncClient(timeout=120) as client:
         resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
         resp.raise_for_status()
         data = resp.json()
@@ -262,8 +262,17 @@ async def extract_fields_from_document(
     content = data["choices"][0]["message"]["content"]
-    # Log the raw response for debugging (first 500 chars)
-    print(f"[DEBUG] OpenRouter response preview: {str(content)[:500]}")
     # content may be a string or a list of content blocks
     if isinstance(content, list):
@@ -283,6 +292,7 @@ async def extract_fields_from_document(
         return parsed
     except json.JSONDecodeError as e:
         print(f"[DEBUG] Direct JSON parse failed: {e}")
         # Try to extract JSON from markdown code blocks
         json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
         if json_match:
@@ -296,21 +306,138 @@ async def extract_fields_from_document(
         # Try to find JSON object in the text (look for {...})
         json_match = re.search(r'\{.*\}', text, re.DOTALL)
         if json_match:
             try:
-                parsed = json.loads(json_match.group(0))
                 print(f"[DEBUG] Successfully parsed JSON from regex match")
                 return parsed
             except json.JSONDecodeError as e3:
                 print(f"[DEBUG] Regex match parse failed: {e3}")
         # If all parsing fails, return a default structure with the raw text
         print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
         return {
             "doc_type": "other",
             "confidence": 50.0,
             "fields": {
-                "raw_response": text[:1000],  # First 1000 chars for debugging
-                "error": "Could not parse JSON from model response",
                 "note": "Check server logs for full response"
             }
         }

         "X-Title": "Document Capture Demo",
     }
+    async with httpx.AsyncClient(timeout=180) as client:  # Increased timeout for long responses
         resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
         resp.raise_for_status()
         data = resp.json()
     content = data["choices"][0]["message"]["content"]
+    # Check if response was truncated
+    finish_reason = data["choices"][0].get("finish_reason", "")
+    if finish_reason == "length":
+        print(f"[WARNING] Response was truncated due to token limit (finish_reason: {finish_reason})")
+    # Log the raw response for debugging (first 1000 chars and last 500 chars)
+    content_str = str(content)
+    print(f"[DEBUG] OpenRouter response preview (first 1000 chars): {content_str[:1000]}")
+    if len(content_str) > 1000:
+        print(f"[DEBUG] OpenRouter response preview (last 500 chars): {content_str[-500:]}")
+    print(f"[DEBUG] Total response length: {len(content_str)} characters")
     # content may be a string or a list of content blocks
     if isinstance(content, list):
         return parsed
     except json.JSONDecodeError as e:
         print(f"[DEBUG] Direct JSON parse failed: {e}")
         # Try to extract JSON from markdown code blocks
         json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
         if json_match:
         # Try to find JSON object in the text (look for {...})
         json_match = re.search(r'\{.*\}', text, re.DOTALL)
         if json_match:
+            json_str = json_match.group(0)
             try:
+                parsed = json.loads(json_str)
                 print(f"[DEBUG] Successfully parsed JSON from regex match")
                 return parsed
             except json.JSONDecodeError as e3:
                 print(f"[DEBUG] Regex match parse failed: {e3}")
+                # Try to fix truncated JSON by closing unclosed strings/objects
+                try:
+                    fixed_json = _fix_truncated_json(json_str)
+                    parsed = json.loads(fixed_json)
+                    print(f"[DEBUG] Successfully parsed fixed truncated JSON")
+                    return parsed
+                except Exception as e4:
+                    print(f"[DEBUG] Failed to fix truncated JSON: {e4}")
+        # Last resort: try to extract what we can from the partial JSON
+        try:
+            partial_data = _extract_partial_json(text)
+            if partial_data:
+                print(f"[DEBUG] Extracted partial data from truncated JSON")
+                return partial_data
+        except Exception as e5:
+            print(f"[DEBUG] Failed to extract partial JSON: {e5}")
         # If all parsing fails, return a default structure with the raw text
         print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
+        # Try to extract at least the full_text if it's visible (even if truncated)
+        # Look for "full_text": "..." pattern, handling escaped characters and truncation
+        full_text_match = re.search(r'"full_text"\s*:\s*"(.*?)(?:"\s*[,}]|$)', text, re.DOTALL)
+        if full_text_match:
+            try:
+                # Get the matched text (may be truncated)
+                full_text_raw = full_text_match.group(1)
+                # Unescape common sequences
+                full_text = (full_text_raw
+                           .replace('\\n', '\n')
+                           .replace('\\"', '"')
+                           .replace('\\\\', '\\')
+                           .replace('\\t', '\t')
+                           .replace('\\r', '\r'))
+                # Try to extract other fields too
+                doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
+                confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
+                result = {
+                    "doc_type": doc_type_match.group(1) if doc_type_match else "other",
+                    "confidence": float(confidence_match.group(1)) if confidence_match else 90.0,
+                    "full_text": full_text,
+                    "fields": {
+                        "full_text": full_text,
+                        "note": "Response may have been truncated, but full_text was extracted"
+                    }
+                }
+                print(f"[INFO] Extracted full_text ({len(full_text)} chars) from truncated JSON")
+                return result
+            except Exception as e:
+                print(f"[DEBUG] Failed to extract full_text from truncated JSON: {e}")
+                pass
         return {
             "doc_type": "other",
             "confidence": 50.0,
             "fields": {
+                "raw_response": text[:2000],  # First 2000 chars for debugging
+                "error": "Could not parse JSON from model response (may be truncated)",
                 "note": "Check server logs for full response"
             }
         }
+def _fix_truncated_json(json_str: str) -> str:
+    """Attempt to fix truncated JSON by closing unclosed strings and objects."""
+    # Count open braces
+    open_braces = json_str.count('{') - json_str.count('}')
+    open_brackets = json_str.count('[') - json_str.count(']')
+    # Check if we're in the middle of a string
+    in_string = False
+    escape_next = False
+    for i, char in enumerate(json_str):
+        if escape_next:
+            escape_next = False
+            continue
+        if char == '\\':
+            escape_next = True
+            continue
+        if char == '"':
+            in_string = not in_string
+    # If we're in a string, close it
+    if in_string:
+        json_str = json_str.rstrip() + '"'
+    # Close any open brackets
+    json_str += ']' * open_brackets
+    # Close any open braces
+    json_str += '}' * open_braces
+    return json_str
+def _extract_partial_json(text: str) -> Dict[str, Any]:
+    """Extract what we can from a partial JSON response."""
+    result = {
+        "doc_type": "other",
+        "confidence": 0.0,
+        "fields": {}
+    }
+    # Try to extract doc_type
+    doc_type_match = re.search(r'"doc_type"\s*:\s*"([^"]+)"', text)
+    if doc_type_match:
+        result["doc_type"] = doc_type_match.group(1)
+    # Try to extract confidence
+    confidence_match = re.search(r'"confidence"\s*:\s*(\d+(?:\.\d+)?)', text)
+    if confidence_match:
+        result["confidence"] = float(confidence_match.group(1))
+    # Try to extract full_text (even if truncated)
+    full_text_match = re.search(r'"full_text"\s*:\s*"([^"]*(?:\\.[^"]*)*)', text, re.DOTALL)
+    if full_text_match:
+        try:
+            full_text = full_text_match.group(1)
+            # Unescape common sequences
+            full_text = full_text.replace('\\n', '\n').replace('\\"', '"').replace('\\\\', '\\')
+            result["full_text"] = full_text
+            result["fields"]["full_text"] = full_text
+        except Exception:
+            pass
+    return result