# Source: Hugging Face Space "Seth0330/AIEXTRACT1" (status: Running; file size: 9,951 bytes)

import os
import base64
import json
import re
from io import BytesIO
from typing import Any, Dict, List

import httpx

# PyMuPDF and Pillow are optional dependencies: when either import fails,
# PDF_SUPPORT is switched off and a warning is printed once at import time.
try:
    import fitz  # PyMuPDF
    from PIL import Image
except ImportError as import_error:
    PDF_SUPPORT = False
    print(f"[WARNING] PDF support libraries not available: {import_error}. PDF conversion will not work.")
else:
    PDF_SUPPORT = True

# OpenRouter settings. The API key is read from the environment
# (to be configured in the Hugging Face Space); it is None when unset.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"


def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
    """
    Render every page of a PDF to a PNG image.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        A list of PNG-encoded images, one per page, in page order.

    Raises:
        RuntimeError: If the optional PDF libraries could not be imported
            (PDF_SUPPORT is False).
    """
    if not PDF_SUPPORT:
        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

    # 2x zoom for better quality; the matrix is the same for every page,
    # so it is built once instead of once per iteration.
    zoom = fitz.Matrix(2.0, 2.0)
    images: List[bytes] = []

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")

        for page_number, page in enumerate(pdf_doc, start=1):
            # alpha=False (the default) keeps the samples as plain RGB
            # triples, which is what the "RGB" mode below relies on.
            pix = page.get_pixmap(matrix=zoom, alpha=False)

            # Convert to a PIL Image, then encode it as PNG bytes.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            with BytesIO() as png_buffer:
                img.save(png_buffer, format="PNG")
                images.append(png_buffer.getvalue())

            print(f"[INFO] Converted page {page_number} to image ({pix.width}x{pix.height})")
    finally:
        # Release the document even when rendering or encoding a page fails.
        pdf_doc.close()

    return images


def _image_bytes_to_base64(image_bytes: bytes) -> str:
    """Return a ``data:image/png;base64,...`` URL wrapping the given bytes."""
    # Base64 output is pure ASCII, so decoding it as ASCII is lossless.
    encoded_payload = str(base64.b64encode(image_bytes), "ascii")
    return "data:image/png;base64," + encoded_payload


def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
    """
    Convert an uploaded file into image content blocks for the vision model.

    - For images: returns a single block holding the file as a data URL.
    - For PDFs: renders each page to PNG and returns one block per page.

    Each block uses the chat-completions image part shape
    ``{"type": "image_url", "image_url": {"url": "data:...;base64,..."}}``.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        content_type: MIME type of the file. Parameters (``; charset=...``)
            and letter case are ignored when deciding how to handle it.

    Returns:
        A list of image content blocks, in page order for PDFs.

    Raises:
        RuntimeError: If the file is a PDF and PDF support is unavailable.
    """
    # Normalise the MIME type so "Application/PDF; name=x" is still treated
    # as a PDF and a missing content type does not crash on attribute access.
    mime_type = (content_type or "").split(";", 1)[0].strip().lower()

    # Handle PDF files
    if mime_type == "application/pdf" or mime_type.endswith("/pdf"):
        if not PDF_SUPPORT:
            raise RuntimeError("PDF support requires PyMuPDF. Please install it.")

        print(f"[INFO] Converting PDF to images...")
        pdf_images = _pdf_to_images(file_bytes)

        # Create one image block per rendered page
        image_blocks = []
        for page_number, img_bytes in enumerate(pdf_images, start=1):
            image_blocks.append({
                "type": "image_url",
                "image_url": {"url": _image_bytes_to_base64(img_bytes)},
            })
            print(f"[INFO] Created image block for page {page_number} ({len(img_bytes)} bytes)")

        return image_blocks

    # Handle regular image files: embed the bytes unchanged as a data URL
    b64 = base64.b64encode(file_bytes).decode("utf-8")
    print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")

    return [{
        "type": "image_url",
        "image_url": {"url": f"data:{mime_type};base64,{b64}"},
    }]


# JSON skeleton shared by the single-page and multi-page user prompts.
_RESPONSE_SHAPE_PROMPT = (
    "Use this shape:\n"
    "{\n"
    '  "doc_type": "invoice | receipt | contract | report | other",\n'
    '  "confidence": number between 0 and 100,\n'
    '  "fields": {\n'
    '    "invoice_number": "...",\n'
    '    "date": "...",\n'
    '    "due_date": "...",\n'
    '    "total_amount": "...",\n'
    '    "currency": "...",\n'
    '    "vendor_name": "...",\n'
    '    "line_items": [\n'
    '       {"description": "...", "quantity": "...", "unit_price": "...", "line_total": "..."}\n'
    '    ],\n'
    '    "other_field": "..."\n'
    "  }\n"
    "}\n"
    "If fields are missing or not applicable, simply omit them."
)


def _build_user_prompt(page_count: int) -> str:
    """Return the extraction prompt, with multi-page wording when page_count > 1."""
    if page_count > 1:
        return (
            f"Extract important key-value pairs from this {page_count}-page document. "
            "Analyze all pages and combine the information into a single JSON response.\n"
            + _RESPONSE_SHAPE_PROMPT
            + " Combine information from all pages into a single response."
        )
    return (
        "Extract important key-value pairs from the document and respond with JSON only.\n"
        + _RESPONSE_SHAPE_PROMPT
    )


def _load_json_object(candidate: str, source: str, failure_label: str) -> Any:
    """Parse candidate as JSON; return the dict, or None if it is not a JSON object."""
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as exc:
        print(f"[DEBUG] {failure_label} parse failed: {exc}")
        return None
    if not isinstance(parsed, dict):
        # Valid JSON but not an object (list, string, number, ...): callers
        # are promised a dict, so treat this as a failed attempt.
        print(f"[DEBUG] {failure_label} parse produced {type(parsed).__name__}, not a JSON object")
        return None
    print(f"[DEBUG] Successfully parsed JSON {source}")
    return parsed


def _parse_model_json(text: str) -> Dict[str, Any]:
    """Extract a JSON object from model output, or return a fallback structure."""
    # First, try direct JSON parsing
    parsed = _load_json_object(text, "directly", "Direct JSON")
    if parsed is not None:
        return parsed

    # The model might wrap the JSON in a markdown code block
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        parsed = _load_json_object(fenced.group(1), "from markdown code block", "Markdown code block")
        if parsed is not None:
            return parsed

    # ...or surround it with extra text: take the outermost {...} span
    braced = re.search(r'\{.*\}', text, re.DOTALL)
    if braced:
        parsed = _load_json_object(braced.group(0), "from regex match", "Regex match")
        if parsed is not None:
            return parsed

    # If all parsing fails, return a default structure with the raw text
    print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
    return {
        "doc_type": "other",
        "confidence": 50.0,
        "fields": {
            "raw_response": text[:1000],  # First 1000 chars for debugging
            "error": "Could not parse JSON from model response",
            "note": "Check server logs for full response"
        }
    }


async def extract_fields_from_document(
    file_bytes: bytes,
    content_type: str,
    filename: str,
) -> Dict[str, Any]:
    """
    Call OpenRouter with Qwen3-VL and return parsed JSON with fields.

    The file is converted to image blocks (one per PDF page, or a single
    block for an image), sent with a prompt that instructs the model to
    answer with JSON only, and the reply is parsed into a dict.

    Args:
        file_bytes: Raw bytes of the uploaded document.
        content_type: MIME type of the document.
        filename: Original file name; not used by this function.

    Returns:
        The JSON object produced by the model. If no JSON object can be
        recovered from the reply, a fallback dict with ``doc_type`` "other"
        and the first 1000 characters of the raw reply is returned instead.

    Raises:
        RuntimeError: If OPENROUTER_API_KEY is not set.
        ValueError: If no image blocks are generated, or the response has
            no choices or no text content.
        httpx.HTTPStatusError: If OpenRouter answers with an error status.
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

    # Convert file to image blocks (handles PDF conversion)
    image_blocks = _file_to_image_blocks(file_bytes, content_type)

    if not image_blocks:
        raise ValueError("No images generated from file")

    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")

    system_prompt = (
        "You are a document extraction engine. "
        "You analyze invoices, receipts, contracts, reports and similar documents, "
        "and output structured JSON only (no explanations or comments)."
    )

    # Build content array with the text prompt followed by all image blocks
    user_content = [{"type": "text", "text": _build_user_prompt(len(image_blocks))}]
    user_content.extend(image_blocks)

    payload: Dict[str, Any] = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": user_content,
            },
        ],
        "max_tokens": 4096,  # Increased for multi-page documents
    }

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # Optional attribution headers
        "HTTP-Referer": os.environ.get(
            "APP_URL",
            "https://huggingface.co/spaces/your-space",
        ),
        "X-Title": "Document Capture Demo",
    }

    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
        resp.raise_for_status()
        data = resp.json()

    # OpenRouter returns choices[0].message.content
    choices = data.get("choices") if isinstance(data, dict) else None
    if not choices:
        raise ValueError("No choices in OpenRouter response")

    # A choice without a message/content is reported as an empty response
    # below instead of surfacing as a bare KeyError.
    message = choices[0].get("message") or {}
    content = message.get("content")

    # Log the raw response for debugging (first 500 chars)
    print(f"[DEBUG] OpenRouter response preview: {str(content)[:500]}")

    # content may be a string or a list of content blocks
    if isinstance(content, list):
        text = "".join(
            part.get("text") or ""
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        )
    else:
        text = content

    if not isinstance(text, str) or not text.strip():
        raise ValueError("Empty response from OpenRouter API")

    # The model might return JSON wrapped in markdown code blocks or with extra text
    return _parse_model_json(text)