Spaces:
Sleeping
Sleeping
Update backend/app/openrouter_client.py
Browse files- backend/app/openrouter_client.py +138 -40
backend/app/openrouter_client.py
CHANGED
|
@@ -2,32 +2,97 @@ import os
|
|
| 2 |
import base64
|
| 3 |
import json
|
| 4 |
import re
|
| 5 |
-
from
|
|
|
|
| 6 |
|
| 7 |
import httpx
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
# Get your OpenRouter API key from env (you'll set this in Hugging Face later)
|
| 10 |
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
|
| 11 |
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
|
| 12 |
MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
|
| 13 |
|
| 14 |
|
| 15 |
-
def
|
| 16 |
"""
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
For images (PNG, JPG, etc.), this works fine.
|
| 20 |
-
For PDFs, the model might not be able to process them.
|
| 21 |
"""
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
async def extract_fields_from_document(
|
|
@@ -42,7 +107,13 @@ async def extract_fields_from_document(
|
|
| 42 |
if not OPENROUTER_API_KEY:
|
| 43 |
raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
|
| 44 |
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
system_prompt = (
|
| 48 |
"You are a document extraction engine. "
|
|
@@ -50,27 +121,57 @@ async def extract_fields_from_document(
|
|
| 50 |
"and output structured JSON only (no explanations or comments)."
|
| 51 |
)
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
payload: Dict[str, Any] = {
|
| 76 |
"model": MODEL_NAME,
|
|
@@ -81,13 +182,10 @@ async def extract_fields_from_document(
|
|
| 81 |
},
|
| 82 |
{
|
| 83 |
"role": "user",
|
| 84 |
-
"content":
|
| 85 |
-
{"type": "text", "text": user_prompt},
|
| 86 |
-
image_block,
|
| 87 |
-
],
|
| 88 |
},
|
| 89 |
],
|
| 90 |
-
"max_tokens":
|
| 91 |
}
|
| 92 |
|
| 93 |
headers = {
|
|
|
|
| 2 |
import base64
|
| 3 |
import json
|
| 4 |
import re
|
| 5 |
+
from io import BytesIO
|
| 6 |
+
from typing import Any, Dict, List
|
| 7 |
|
| 8 |
import httpx
|
| 9 |
|
| 10 |
+
try:
|
| 11 |
+
import fitz # PyMuPDF
|
| 12 |
+
from PIL import Image
|
| 13 |
+
PDF_SUPPORT = True
|
| 14 |
+
except ImportError as e:
|
| 15 |
+
PDF_SUPPORT = False
|
| 16 |
+
print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")
|
| 17 |
+
|
| 18 |
# Get your OpenRouter API key from env (you'll set this in Hugging Face later)
|
| 19 |
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
|
| 20 |
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
|
| 21 |
MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
|
| 22 |
|
| 23 |
|
| 24 |
+
def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
    """Render every page of a PDF to a PNG image.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        A list of PNG-encoded images, one per page, in page order.

    Raises:
        RuntimeError: If the optional PDF libraries failed to import
            (``PDF_SUPPORT`` is false).
    """
    if not PDF_SUPPORT:
        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

    # 2x zoom for better quality; the matrix is the same for every page,
    # so build it once instead of once per iteration.
    zoom = fitz.Matrix(2.0, 2.0)
    images: List[bytes] = []

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")

        for page_num, page in enumerate(pdf_doc, start=1):
            # alpha=False keeps the sample buffer at 3 bytes per pixel,
            # which is what the "RGB" decode below requires.
            pix = page.get_pixmap(matrix=zoom, alpha=False)

            # Convert to a PIL Image, then serialize it as PNG bytes.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            with BytesIO() as buffer:
                img.save(buffer, format="PNG")
                images.append(buffer.getvalue())

            print(f"[INFO] Converted page {page_num} to image ({pix.width}x{pix.height})")
    finally:
        # Release the document even if rendering or encoding a page fails.
        pdf_doc.close()

    return images
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _image_bytes_to_base64(image_bytes: bytes, mime_type: str = "image/png") -> str:
    """Encode image bytes as a base64 ``data:`` URL.

    Args:
        image_bytes: Raw encoded image data.
        mime_type: Media type to declare in the URL. Defaults to
            ``"image/png"``, so existing one-argument calls are unaffected.

    Returns:
        A string of the form ``data:<mime_type>;base64,<payload>``.
    """
    # Base64 output is pure ASCII, so decoding as ASCII cannot fail.
    encoded = base64.b64encode(image_bytes).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
|
| 62 |
+
"""
|
| 63 |
+
Convert file to image blocks for the vision model.
|
| 64 |
+
- For images: Returns single image block
|
| 65 |
+
- For PDFs: Converts each page to an image and returns multiple blocks
|
| 66 |
+
"""
|
| 67 |
+
# Handle PDF files
|
| 68 |
+
if content_type == "application/pdf" or content_type.endswith("/pdf"):
|
| 69 |
+
if not PDF_SUPPORT:
|
| 70 |
+
raise RuntimeError("PDF support requires PyMuPDF. Please install it.")
|
| 71 |
+
|
| 72 |
+
print(f"[INFO] Converting PDF to images...")
|
| 73 |
+
pdf_images = _pdf_to_images(file_bytes)
|
| 74 |
+
|
| 75 |
+
# Create image blocks for each page
|
| 76 |
+
image_blocks = []
|
| 77 |
+
for i, img_bytes in enumerate(pdf_images):
|
| 78 |
+
image_url = _image_bytes_to_base64(img_bytes)
|
| 79 |
+
image_blocks.append({
|
| 80 |
+
"type": "input_image",
|
| 81 |
+
"image_url": image_url,
|
| 82 |
+
})
|
| 83 |
+
print(f"[INFO] Created image block for page {i + 1} ({len(img_bytes)} bytes)")
|
| 84 |
+
|
| 85 |
+
return image_blocks
|
| 86 |
+
|
| 87 |
+
# Handle regular image files
|
| 88 |
+
else:
|
| 89 |
+
b64 = base64.b64encode(file_bytes).decode("utf-8")
|
| 90 |
+
print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")
|
| 91 |
+
|
| 92 |
+
return [{
|
| 93 |
+
"type": "input_image",
|
| 94 |
+
"image_url": f"data:{content_type};base64,{b64}",
|
| 95 |
+
}]
|
| 96 |
|
| 97 |
|
| 98 |
async def extract_fields_from_document(
|
|
|
|
| 107 |
if not OPENROUTER_API_KEY:
|
| 108 |
raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
|
| 109 |
|
| 110 |
+
# Convert file to image blocks (handles PDF conversion)
|
| 111 |
+
image_blocks = _file_to_image_blocks(file_bytes, content_type)
|
| 112 |
+
|
| 113 |
+
if not image_blocks:
|
| 114 |
+
raise ValueError("No images generated from file")
|
| 115 |
+
|
| 116 |
+
print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")
|
| 117 |
|
| 118 |
system_prompt = (
|
| 119 |
"You are a document extraction engine. "
|
|
|
|
| 121 |
"and output structured JSON only (no explanations or comments)."
|
| 122 |
)
|
| 123 |
|
| 124 |
+
# Update prompt for multi-page documents
|
| 125 |
+
if len(image_blocks) > 1:
|
| 126 |
+
user_prompt = (
|
| 127 |
+
f"Extract important key-value pairs from this {len(image_blocks)}-page document. "
|
| 128 |
+
"Analyze all pages and combine the information into a single JSON response.\n"
|
| 129 |
+
"Use this shape:\n"
|
| 130 |
+
"{\n"
|
| 131 |
+
' \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
|
| 132 |
+
' \"confidence\": number between 0 and 100,\n'
|
| 133 |
+
' \"fields\": {\n'
|
| 134 |
+
' \"invoice_number\": \"...\",\n'
|
| 135 |
+
' \"date\": \"...\",\n'
|
| 136 |
+
' \"due_date\": \"...\",\n'
|
| 137 |
+
' \"total_amount\": \"...\",\n'
|
| 138 |
+
' \"currency\": \"...\",\n'
|
| 139 |
+
' \"vendor_name\": \"...\",\n'
|
| 140 |
+
' \"line_items\": [\n'
|
| 141 |
+
' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
|
| 142 |
+
' ],\n'
|
| 143 |
+
' \"other_field\": \"...\"\n'
|
| 144 |
+
" }\n"
|
| 145 |
+
"}\n"
|
| 146 |
+
"If fields are missing or not applicable, simply omit them. "
|
| 147 |
+
"Combine information from all pages into a single response."
|
| 148 |
+
)
|
| 149 |
+
else:
|
| 150 |
+
user_prompt = (
|
| 151 |
+
"Extract important key-value pairs from the document and respond with JSON only.\n"
|
| 152 |
+
"Use this shape:\n"
|
| 153 |
+
"{\n"
|
| 154 |
+
' \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
|
| 155 |
+
' \"confidence\": number between 0 and 100,\n'
|
| 156 |
+
' \"fields\": {\n'
|
| 157 |
+
' \"invoice_number\": \"...\",\n'
|
| 158 |
+
' \"date\": \"...\",\n'
|
| 159 |
+
' \"due_date\": \"...\",\n'
|
| 160 |
+
' \"total_amount\": \"...\",\n'
|
| 161 |
+
' \"currency\": \"...\",\n'
|
| 162 |
+
' \"vendor_name\": \"...\",\n'
|
| 163 |
+
' \"line_items\": [\n'
|
| 164 |
+
' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
|
| 165 |
+
' ],\n'
|
| 166 |
+
' \"other_field\": \"...\"\n'
|
| 167 |
+
" }\n"
|
| 168 |
+
"}\n"
|
| 169 |
+
"If fields are missing or not applicable, simply omit them."
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
# Build content array with text prompt and all image blocks
|
| 173 |
+
user_content = [{"type": "text", "text": user_prompt}]
|
| 174 |
+
user_content.extend(image_blocks)
|
| 175 |
|
| 176 |
payload: Dict[str, Any] = {
|
| 177 |
"model": MODEL_NAME,
|
|
|
|
| 182 |
},
|
| 183 |
{
|
| 184 |
"role": "user",
|
| 185 |
+
"content": user_content,
|
|
|
|
|
|
|
|
|
|
| 186 |
},
|
| 187 |
],
|
| 188 |
+
"max_tokens": 4096, # Increased for multi-page documents
|
| 189 |
}
|
| 190 |
|
| 191 |
headers = {
|