AIEXTRACT1 / backend /app /openrouter_client.py
Seth0330's picture
Update backend/app/openrouter_client.py
9c61ac4 verified
raw
history blame
9.95 kB
import os
import base64
import json
import re
from io import BytesIO
from typing import Any, Dict, List
import httpx
try:
import fitz # PyMuPDF
from PIL import Image
PDF_SUPPORT = True
except ImportError as e:
PDF_SUPPORT = False
print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")
# Get your OpenRouter API key from env (you'll set this in Hugging Face later)
# os.environ.get returns None when the variable is unset, so this may be None.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
# Chat-completions endpoint that the extraction request is POSTed to.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model identifier sent as "model" in the request payload.
MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
    """
    Convert PDF pages to PNG images.

    Every page is rendered at 2x zoom and re-encoded as PNG through Pillow.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list of PNG image bytes, one per page, in page order.

    Raises:
        RuntimeError: If the PDF support libraries could not be imported.
    """
    if not PDF_SUPPORT:
        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images: List[bytes] = []
        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")

        # The zoom matrix is identical for every page, so build it once
        # (zoom factor 2 for better quality).
        mat = fitz.Matrix(2.0, 2.0)

        for page_number, page in enumerate(pdf_doc, start=1):
            # alpha=False keeps the pixmap at 3 bytes per pixel, which is what
            # the "RGB" mode passed to Image.frombytes below expects.
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # Convert to PIL Image then to PNG bytes
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            png_buffer = BytesIO()
            img.save(png_buffer, format="PNG")
            images.append(png_buffer.getvalue())
            print(f"[INFO] Converted page {page_number} to image ({pix.width}x{pix.height})")
    finally:
        # Release the document even when rendering or encoding a page fails.
        pdf_doc.close()

    return images
def _image_bytes_to_base64(image_bytes: bytes) -> str:
"""Convert image bytes to base64 data URL."""
b64 = base64.b64encode(image_bytes).decode("utf-8")
return f"data:image/png;base64,{b64}"
def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
"""
Convert file to image blocks for the vision model.
- For images: Returns single image block
- For PDFs: Converts each page to an image and returns multiple blocks
"""
# Handle PDF files
if content_type == "application/pdf" or content_type.endswith("/pdf"):
if not PDF_SUPPORT:
raise RuntimeError("PDF support requires PyMuPDF. Please install it.")
print(f"[INFO] Converting PDF to images...")
pdf_images = _pdf_to_images(file_bytes)
# Create image blocks for each page
image_blocks = []
for i, img_bytes in enumerate(pdf_images):
image_url = _image_bytes_to_base64(img_bytes)
image_blocks.append({
"type": "input_image",
"image_url": image_url,
})
print(f"[INFO] Created image block for page {i + 1} ({len(img_bytes)} bytes)")
return image_blocks
# Handle regular image files
else:
b64 = base64.b64encode(file_bytes).decode("utf-8")
print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")
return [{
"type": "input_image",
"image_url": f"data:{content_type};base64,{b64}",
}]
# JSON skeleton and omission rule shared by the single- and multi-page prompts.
_JSON_SHAPE_PROMPT = (
    "Use this shape:\n"
    "{\n"
    ' "doc_type": "invoice | receipt | contract | report | other",\n'
    ' "confidence": number between 0 and 100,\n'
    ' "fields": {\n'
    ' "invoice_number": "...",\n'
    ' "date": "...",\n'
    ' "due_date": "...",\n'
    ' "total_amount": "...",\n'
    ' "currency": "...",\n'
    ' "vendor_name": "...",\n'
    ' "line_items": [\n'
    ' {"description": "...", "quantity": "...", "unit_price": "...", "line_total": "..."}\n'
    ' ],\n'
    ' "other_field": "..."\n'
    " }\n"
    "}\n"
    "If fields are missing or not applicable, simply omit them."
)

# JSON object wrapped in a markdown code fence (optionally tagged "json").
_FENCED_JSON_RE = re.compile(r'```(?:json)?\s*(\{.*?\})\s*```', re.DOTALL)
# Widest {...} span anywhere in the text.
_BARE_JSON_RE = re.compile(r'\{.*\}', re.DOTALL)


def _build_user_prompt(page_count: int) -> str:
    """Return the extraction prompt, adding multi-page wording when page_count > 1."""
    if page_count > 1:
        return (
            f"Extract important key-value pairs from this {page_count}-page document. "
            "Analyze all pages and combine the information into a single JSON response.\n"
            + _JSON_SHAPE_PROMPT
            + " Combine information from all pages into a single response."
        )
    return (
        "Extract important key-value pairs from the document and respond with JSON only.\n"
        + _JSON_SHAPE_PROMPT
    )


def _extract_text(content: Any) -> Any:
    """Join the text parts when content is a list of blocks; otherwise return it unchanged."""
    if isinstance(content, list):
        return "".join(
            part.get("text") or ""
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        )
    return content


def _parse_model_json(text: str) -> Dict[str, Any]:
    """Parse a JSON object out of the model output, or return the fallback structure."""
    # Candidates in priority order: the whole text, a fenced code block,
    # then the widest {...} span (the model may add prose around the JSON).
    attempts = [("Direct JSON parse", "directly", text)]
    fenced = _FENCED_JSON_RE.search(text)
    if fenced:
        attempts.append(("Markdown code block parse", "from markdown code block", fenced.group(1)))
    bare = _BARE_JSON_RE.search(text)
    if bare:
        attempts.append(("Regex match parse", "from regex match", bare.group(0)))

    for failure_label, success_label, candidate in attempts:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as exc:
            print(f"[DEBUG] {failure_label} failed: {exc}")
            continue
        # Callers are promised a dict; a bare list/string/number is not usable.
        if isinstance(parsed, dict):
            print(f"[DEBUG] Successfully parsed JSON {success_label}")
            return parsed
        print(f"[DEBUG] {failure_label} produced {type(parsed).__name__}, expected a JSON object")

    # If all parsing fails, return a default structure with the raw text
    print("[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
    return {
        "doc_type": "other",
        "confidence": 50.0,
        "fields": {
            "raw_response": text[:1000],  # First 1000 chars for debugging
            "error": "Could not parse JSON from model response",
            "note": "Check server logs for full response",
        },
    }


async def extract_fields_from_document(
    file_bytes: bytes,
    content_type: str,
    filename: str,
) -> Dict[str, Any]:
    """
    Call OpenRouter with Qwen3-VL and return parsed JSON with fields.

    We instruct the model to return JSON only, but tolerate JSON wrapped in a
    markdown code fence or surrounded by extra text.

    Args:
        file_bytes: Raw bytes of the uploaded document (image or PDF).
        content_type: MIME type of the document.
        filename: Original file name; accepted for the caller's convenience
            but not used by this function.

    Returns:
        The JSON object produced by the model, or a fallback dict with
        ``doc_type`` "other" and the raw response under ``fields`` when no
        JSON object could be parsed.

    Raises:
        RuntimeError: If OPENROUTER_API_KEY is not set.
        ValueError: If no images were generated, the response has no choices,
            or the model returned empty content.
        httpx.HTTPStatusError: If OpenRouter answers with an error status.
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

    # Convert file to image blocks (handles PDF conversion)
    image_blocks = _file_to_image_blocks(file_bytes, content_type)
    if not image_blocks:
        raise ValueError("No images generated from file")
    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")

    system_prompt = (
        "You are a document extraction engine. "
        "You analyze invoices, receipts, contracts, reports and similar documents, "
        "and output structured JSON only (no explanations or comments)."
    )

    # Build content array with text prompt and all image blocks
    user_content = [{"type": "text", "text": _build_user_prompt(len(image_blocks))}]
    user_content.extend(image_blocks)

    payload: Dict[str, Any] = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": user_content,
            },
        ],
        "max_tokens": 4096,  # Increased for multi-page documents
    }
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # Optional attribution headers
        "HTTP-Referer": os.environ.get(
            "APP_URL",
            "https://huggingface.co/spaces/your-space",
        ),
        "X-Title": "Document Capture Demo",
    }

    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
        resp.raise_for_status()
        data = resp.json()

    # OpenRouter returns choices[0].message.content
    choices = data.get("choices") if isinstance(data, dict) else None
    if not choices:
        # Include any error body from the response so the failure is diagnosable.
        error = data.get("error") if isinstance(data, dict) else None
        detail = f": {error}" if error else ""
        raise ValueError(f"No choices in OpenRouter response{detail}")

    message = choices[0].get("message") or {}
    content = message.get("content")

    # Log the raw response for debugging (first 500 chars)
    print(f"[DEBUG] OpenRouter response preview: {str(content)[:500]}")

    # content may be a string or a list of content blocks
    text = _extract_text(content)
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Empty response from OpenRouter API")

    return _parse_model_json(text)