Spaces:

Seth0330
/

AIEXTRACT1

Running

App Files Files Community

AIEXTRACT1 / backend /app /openrouter_client.py

Seth0330

Update backend/app/openrouter_client.py

9c61ac4 verified 18 days ago

raw

history blame

9.95 kB

	import os
	import base64
	import json
	import re
	from io import BytesIO
	from typing import Any, Dict, List

	import httpx

	try:
	import fitz # PyMuPDF
	from PIL import Image
	PDF_SUPPORT = True
	except ImportError as e:
	PDF_SUPPORT = False
	print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")

	# OpenRouter API key, read once from the environment at import time.
	# It is None when the variable is unset; extract_fields_from_document
	# refuses to run in that case.
	OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
	# Chat-completions endpoint that extraction requests are POSTed to.
	OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
	# Model identifier sent in the "model" field of the request payload.
	MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"


	def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
	    """Render every page of a PDF to a PNG image.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.

	    Returns:
	        PNG-encoded image bytes, one entry per page, in page order.

	    Raises:
	        RuntimeError: If the optional PDF libraries failed to import
	            (``PDF_SUPPORT`` is false).
	    """
	    if not PDF_SUPPORT:
	        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

	    # 2x zoom for better quality; identical for every page, so build it once.
	    zoom = fitz.Matrix(2.0, 2.0)
	    images: List[bytes] = []

	    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	    try:
	        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")

	        for page_num, page in enumerate(pdf_doc, start=1):
	            # alpha=False keeps the samples at 3 bytes per pixel, which is
	            # what the "RGB" mode passed to Image.frombytes expects.
	            pix = page.get_pixmap(matrix=zoom, alpha=False)

	            # Convert to a PIL image, then serialize it as PNG bytes.
	            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	            buffer = BytesIO()
	            img.save(buffer, format="PNG")
	            images.append(buffer.getvalue())

	            print(f"[INFO] Converted page {page_num} to image ({pix.width}x{pix.height})")
	    finally:
	        # Release the document even when rendering a page fails.
	        pdf_doc.close()

	    return images


	def _image_bytes_to_base64(image_bytes: bytes) -> str:
	"""Convert image bytes to base64 data URL."""
	b64 = base64.b64encode(image_bytes).decode("utf-8")
	return f"data:image/png;base64,{b64}"


	def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
	"""
	Convert file to image blocks for the vision model.
	- For images: Returns single image block
	- For PDFs: Converts each page to an image and returns multiple blocks
	"""
	# Handle PDF files
	if content_type == "application/pdf" or content_type.endswith("/pdf"):
	if not PDF_SUPPORT:
	raise RuntimeError("PDF support requires PyMuPDF. Please install it.")

	print(f"[INFO] Converting PDF to images...")
	pdf_images = _pdf_to_images(file_bytes)

	# Create image blocks for each page
	image_blocks = []
	for i, img_bytes in enumerate(pdf_images):
	image_url = _image_bytes_to_base64(img_bytes)
	image_blocks.append({
	"type": "input_image",
	"image_url": image_url,
	})
	print(f"[INFO] Created image block for page {i + 1} ({len(img_bytes)} bytes)")

	return image_blocks

	# Handle regular image files
	else:
	b64 = base64.b64encode(file_bytes).decode("utf-8")
	print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")

	return [{
	"type": "input_image",
	"image_url": f"data:{content_type};base64,{b64}",
	}]


	async def extract_fields_from_document(
	    file_bytes: bytes,
	    content_type: str,
	    filename: str,
	) -> Dict[str, Any]:
	    """Send a document to the OpenRouter vision model and return extracted fields.

	    The file is converted to image blocks (PDFs become one image per page),
	    posted to the chat-completions endpoint together with an extraction
	    prompt, and the model's reply is parsed as JSON.

	    Args:
	        file_bytes: Raw bytes of the uploaded file.
	        content_type: MIME type of the file; decides PDF vs. image handling.
	        filename: Original file name. Not used here; kept so the signature
	            stays stable for callers.

	    Returns:
	        The JSON object produced by the model. If no JSON object can be
	        recovered from the reply, a fallback dict with ``doc_type`` "other"
	        and the first 1000 characters of the raw reply under ``fields``.

	    Raises:
	        RuntimeError: If ``OPENROUTER_API_KEY`` is not set.
	        ValueError: If no image blocks were produced, or the API response
	            has no choices or an empty message.
	        httpx.HTTPStatusError: If the API answers with an error status.
	    """
	    if not OPENROUTER_API_KEY:
	        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

	    # Convert file to image blocks (handles PDF conversion).
	    # NOTE(review): PDF rendering is synchronous and runs on the event loop.
	    image_blocks = _file_to_image_blocks(file_bytes, content_type)

	    if not image_blocks:
	        raise ValueError("No images generated from file")

	    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")

	    system_prompt = (
	        "You are a document extraction engine. "
	        "You analyze invoices, receipts, contracts, reports and similar documents, "
	        "and output structured JSON only (no explanations or comments)."
	    )
	    user_prompt = _build_user_prompt(len(image_blocks))

	    # The user message is the text prompt followed by every page image.
	    user_content = [{"type": "text", "text": user_prompt}, *image_blocks]

	    payload: Dict[str, Any] = {
	        "model": MODEL_NAME,
	        "messages": [
	            {
	                "role": "system",
	                "content": [{"type": "text", "text": system_prompt}],
	            },
	            {
	                "role": "user",
	                "content": user_content,
	            },
	        ],
	        "max_tokens": 4096,  # Increased for multi-page documents
	    }

	    headers = {
	        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
	        "Content-Type": "application/json",
	        # Optional attribution headers
	        "HTTP-Referer": os.environ.get(
	            "APP_URL",
	            "https://huggingface.co/spaces/your-space",
	        ),
	        "X-Title": "Document Capture Demo",
	    }

	    async with httpx.AsyncClient(timeout=120) as client:
	        resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
	        resp.raise_for_status()
	        data = resp.json()

	    text = _response_text(data)

	    # Log the raw response for debugging (first 500 chars)
	    print(f"[DEBUG] OpenRouter response preview: {text[:500]}")

	    if not text.strip():
	        raise ValueError("Empty response from OpenRouter API")

	    return _parse_model_json(text)


	def _build_user_prompt(page_count: int) -> str:
	    """Build the extraction prompt, adding multi-page wording when needed."""
	    json_shape = (
	        "Use this shape:\n"
	        "{\n"
	        ' "doc_type": "invoice | receipt | contract | report | other",\n'
	        ' "confidence": number between 0 and 100,\n'
	        ' "fields": {\n'
	        ' "invoice_number": "...",\n'
	        ' "date": "...",\n'
	        ' "due_date": "...",\n'
	        ' "total_amount": "...",\n'
	        ' "currency": "...",\n'
	        ' "vendor_name": "...",\n'
	        ' "line_items": [\n'
	        ' {"description": "...", "quantity": "...", "unit_price": "...", "line_total": "..."}\n'
	        ' ],\n'
	        ' "other_field": "..."\n'
	        " }\n"
	        "}\n"
	        "If fields are missing or not applicable, simply omit them."
	    )
	    if page_count > 1:
	        return (
	            f"Extract important key-value pairs from this {page_count}-page document. "
	            "Analyze all pages and combine the information into a single JSON response.\n"
	            + json_shape
	            + " Combine information from all pages into a single response."
	        )
	    return (
	        "Extract important key-value pairs from the document and respond with JSON only.\n"
	        + json_shape
	    )


	def _response_text(data: Any) -> str:
	    """Return the text of ``choices[0].message.content``, or "" if it has none.

	    Raises:
	        ValueError: If the response carries no choices (the API's ``error``
	            payload, when present, is included in the message).
	    """
	    choices = data.get("choices") if isinstance(data, dict) else None
	    if not choices:
	        error = data.get("error") if isinstance(data, dict) else None
	        detail = f" (error: {error})" if error else ""
	        raise ValueError(f"No choices in OpenRouter response{detail}")

	    message = choices[0].get("message") or {}
	    content = message.get("content")

	    # content may be a string or a list of content blocks
	    if isinstance(content, list):
	        return "".join(
	            str(part.get("text", ""))
	            for part in content
	            if isinstance(part, dict) and part.get("type") == "text"
	        )
	    return content if isinstance(content, str) else ""


	def _parse_model_json(text: str) -> Dict[str, Any]:
	    """Recover a JSON object from the model reply, or return a fallback dict.

	    Candidates are tried in order: the whole reply, the body of a markdown
	    code fence, then the outermost ``{...}`` span found anywhere in the text.
	    """
	    candidates = [("direct", text)]

	    # The model might wrap the JSON in a ```json ... ``` fence.
	    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
	    if fenced:
	        candidates.append(("markdown code block", fenced.group(1)))

	    # Or surround it with extra prose: take first "{" through last "}".
	    braced = re.search(r"\{.*\}", text, re.DOTALL)
	    if braced:
	        candidates.append(("regex match", braced.group(0)))

	    for label, candidate in candidates:
	        try:
	            parsed = json.loads(candidate)
	        except json.JSONDecodeError as exc:
	            print(f"[DEBUG] JSON parse failed ({label}): {exc}")
	            continue
	        # Only a JSON object satisfies the declared return type.
	        if isinstance(parsed, dict):
	            print(f"[DEBUG] Successfully parsed JSON ({label})")
	            return parsed
	        print(f"[DEBUG] JSON parse ({label}) produced {type(parsed).__name__}, expected an object")

	    # If all parsing fails, return a default structure with the raw text
	    print("[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
	    return {
	        "doc_type": "other",
	        "confidence": 50.0,
	        "fields": {
	            "raw_response": text[:1000],  # First 1000 chars for debugging
	            "error": "Could not parse JSON from model response",
	            "note": "Check server logs for full response",
	        },
	    }