Spaces:

Seth0330
/

AIEXTRACT1

Sleeping

App Files Files Community

Seth0330 committed 15 days ago

Commit

9c61ac4

verified ·

1 Parent(s): c019cc4

Update backend/app/openrouter_client.py

Browse files

Files changed (1) hide show

backend/app/openrouter_client.py +138 -40

backend/app/openrouter_client.py CHANGED Viewed

@@ -2,32 +2,97 @@ import os
 import base64
 import json
 import re
-from typing import Any, Dict
 import httpx
 # OpenRouter configuration.
 # The API key is read from the environment once, at import time (it is meant to
 # be configured in the Hugging Face deployment). A missing key is not an error
 # here: os.environ.get() simply yields None.
 OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
 # Chat-completions URL on openrouter.ai.
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
 # Model identifier placed in the "model" field of the request payload.
 MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
-def _file_to_image_block(file_bytes: bytes, content_type: str) -> Dict[str, Any]:
     """
-    Encode the file as a data URL to feed into the multimodal model.
-    Note: PDFs may not be directly supported by vision models.
-    For images (PNG, JPG, etc.), this works fine.
-    For PDFs, the model might not be able to process them.
     """
-    b64 = base64.b64encode(file_bytes).decode("utf-8")
-    # Log file type for debugging
-    print(f"[DEBUG] Encoding file as image block. Content type: {content_type}, Size: {len(file_bytes)} bytes")
-    return {
-        "type": "input_image",
-        "image_url": f"data:{content_type};base64,{b64}",
-    }
 async def extract_fields_from_document(
@@ -42,7 +107,13 @@ async def extract_fields_from_document(
     if not OPENROUTER_API_KEY:
         raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
-    image_block = _file_to_image_block(file_bytes, content_type)
     system_prompt = (
         "You are a document extraction engine. "
@@ -50,27 +121,57 @@ async def extract_fields_from_document(
         "and output structured JSON only (no explanations or comments)."
     )
-    user_prompt = (
-        "Extract important key-value pairs from the document and respond with JSON only.\n"
-        "Use this shape:\n"
-        "{\n"
-        '  \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
-        '  \"confidence\": number between 0 and 100,\n'
-        '  \"fields\": {\n'
-        '    \"invoice_number\": \"...\",\n'
-        '    \"date\": \"...\",\n'
-        '    \"due_date\": \"...\",\n'
-        '    \"total_amount\": \"...\",\n'
-        '    \"currency\": \"...\",\n'
-        '    \"vendor_name\": \"...\",\n'
-        '    \"line_items\": [\n'
-        '       {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
-        '    ],\n'
-        '    \"other_field\": \"...\"\n'
-        "  }\n"
-        "}\n"
-        "If fields are missing or not applicable, simply omit them."
-    )
     payload: Dict[str, Any] = {
         "model": MODEL_NAME,
@@ -81,13 +182,10 @@ async def extract_fields_from_document(
             },
             {
                 "role": "user",
-                "content": [
-                    {"type": "text", "text": user_prompt},
-                    image_block,
-                ],
             },
         ],
-        "max_tokens": 2048,
     }
     headers = {

 import base64
 import json
 import re
+from io import BytesIO
+from typing import Any, Dict, List
 import httpx
+try:
+    import fitz  # PyMuPDF
+    from PIL import Image
+    PDF_SUPPORT = True
+except ImportError as e:
+    PDF_SUPPORT = False
+    print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")
 # OpenRouter configuration.
 # The API key is read from the environment once, at import time (it is meant to
 # be configured in the Hugging Face deployment). A missing key is not an error
 # here: os.environ.get() simply yields None.
 OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
 # Chat-completions URL on openrouter.ai.
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
 # Model identifier placed in the "model" field of the request payload.
 MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
    """
    Convert PDF pages to PNG images.

    Every page is rendered at 2x zoom, passed through a PIL ``Image`` and
    serialised as PNG.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        A list of PNG image bytes, one per page, in page order.

    Raises:
        RuntimeError: If the optional PDF libraries failed to import
            (``PDF_SUPPORT`` is false).
    """
    if not PDF_SUPPORT:
        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

    # The zoom matrix is identical for every page, so build it once.
    zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
    images: List[bytes] = []

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")
        for page_number, page in enumerate(pdf_doc, start=1):
            pix = page.get_pixmap(matrix=zoom)
            # Convert to PIL Image then to PNG bytes
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            png_buffer = BytesIO()
            img.save(png_buffer, format="PNG")
            images.append(png_buffer.getvalue())
            print(f"[INFO] Converted page {page_number} to image ({pix.width}x{pix.height})")
    finally:
        # Release the document even when rendering or encoding a page fails;
        # previously close() was only reached on the success path.
        pdf_doc.close()
    return images
+def _image_bytes_to_base64(image_bytes: bytes) -> str:
+    """Convert image bytes to base64 data URL."""
+    b64 = base64.b64encode(image_bytes).decode("utf-8")
+    return f"data:image/png;base64,{b64}"
+def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
+    """
+    Convert file to image blocks for the vision model.
+    - For images: Returns single image block
+    - For PDFs: Converts each page to an image and returns multiple blocks
+    """
+    # Handle PDF files
+    if content_type == "application/pdf" or content_type.endswith("/pdf"):
+        if not PDF_SUPPORT:
+            raise RuntimeError("PDF support requires PyMuPDF. Please install it.")
+        print(f"[INFO] Converting PDF to images...")
+        pdf_images = _pdf_to_images(file_bytes)
+        # Create image blocks for each page
+        image_blocks = []
+        for i, img_bytes in enumerate(pdf_images):
+            image_url = _image_bytes_to_base64(img_bytes)
+            image_blocks.append({
+                "type": "input_image",
+                "image_url": image_url,
+            })
+            print(f"[INFO] Created image block for page {i + 1} ({len(img_bytes)} bytes)")
+        return image_blocks
+    # Handle regular image files
+    else:
+        b64 = base64.b64encode(file_bytes).decode("utf-8")
+        print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")
+        return [{
+            "type": "input_image",
+            "image_url": f"data:{content_type};base64,{b64}",
+        }]
 async def extract_fields_from_document(
     if not OPENROUTER_API_KEY:
         raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
+    # Convert file to image blocks (handles PDF conversion)
+    image_blocks = _file_to_image_blocks(file_bytes, content_type)
+    if not image_blocks:
+        raise ValueError("No images generated from file")
+    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")
     system_prompt = (
         "You are a document extraction engine. "
         "and output structured JSON only (no explanations or comments)."
     )
+    # Update prompt for multi-page documents
+    if len(image_blocks) > 1:
+        user_prompt = (
+            f"Extract important key-value pairs from this {len(image_blocks)}-page document. "
+            "Analyze all pages and combine the information into a single JSON response.\n"
+            "Use this shape:\n"
+            "{\n"
+            '  \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
+            '  \"confidence\": number between 0 and 100,\n'
+            '  \"fields\": {\n'
+            '    \"invoice_number\": \"...\",\n'
+            '    \"date\": \"...\",\n'
+            '    \"due_date\": \"...\",\n'
+            '    \"total_amount\": \"...\",\n'
+            '    \"currency\": \"...\",\n'
+            '    \"vendor_name\": \"...\",\n'
+            '    \"line_items\": [\n'
+            '       {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
+            '    ],\n'
+            '    \"other_field\": \"...\"\n'
+            "  }\n"
+            "}\n"
+            "If fields are missing or not applicable, simply omit them. "
+            "Combine information from all pages into a single response."
+        )
+    else:
+        user_prompt = (
+            "Extract important key-value pairs from the document and respond with JSON only.\n"
+            "Use this shape:\n"
+            "{\n"
+            '  \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
+            '  \"confidence\": number between 0 and 100,\n'
+            '  \"fields\": {\n'
+            '    \"invoice_number\": \"...\",\n'
+            '    \"date\": \"...\",\n'
+            '    \"due_date\": \"...\",\n'
+            '    \"total_amount\": \"...\",\n'
+            '    \"currency\": \"...\",\n'
+            '    \"vendor_name\": \"...\",\n'
+            '    \"line_items\": [\n'
+            '       {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
+            '    ],\n'
+            '    \"other_field\": \"...\"\n'
+            "  }\n"
+            "}\n"
+            "If fields are missing or not applicable, simply omit them."
+        )
+    # Build content array with text prompt and all image blocks
+    user_content = [{"type": "text", "text": user_prompt}]
+    user_content.extend(image_blocks)
     payload: Dict[str, Any] = {
         "model": MODEL_NAME,
             },
             {
                 "role": "user",
+                "content": user_content,
             },
         ],
+        "max_tokens": 4096,  # Increased for multi-page documents
     }
     headers = {