File size: 9,951 Bytes
d24a0cf
 
 
d03aadc
9c61ac4
 
d24a0cf
 
 
9c61ac4
 
 
 
 
 
 
 
d24a0cf
 
 
 
 
 
9c61ac4
d24a0cf
9c61ac4
 
d24a0cf
9c61ac4
 
c019cc4
9c61ac4
 
c019cc4
9c61ac4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d24a0cf
 
 
 
 
 
 
 
 
 
 
 
 
 
9c61ac4
 
 
 
 
 
 
d24a0cf
 
 
 
 
 
 
9c61ac4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d24a0cf
 
 
 
 
 
 
 
 
 
9c61ac4
d24a0cf
 
9c61ac4
d24a0cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c019cc4
 
 
d24a0cf
c019cc4
 
 
d24a0cf
 
 
 
 
 
 
c019cc4
 
 
d24a0cf
d03aadc
 
 
c019cc4
 
 
 
 
d03aadc
 
 
 
c019cc4
 
 
 
 
d03aadc
 
 
 
 
c019cc4
 
 
 
 
d03aadc
 
c019cc4
d03aadc
 
 
 
c019cc4
 
 
d03aadc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
import os
import base64
import json
import re
from io import BytesIO
from typing import Any, Dict, List

import httpx

# PDF handling is optional: it needs both PyMuPDF (imported as ``fitz``) and
# Pillow. If either import fails, PDF_SUPPORT is set to False and a warning is
# printed at import time; the PDF helpers below check this flag before use.
try:
    import fitz  # PyMuPDF
    from PIL import Image
    PDF_SUPPORT = True
except ImportError as e:
    PDF_SUPPORT = False
    print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")

# OpenRouter API key, read once from the environment at import time.
# It is None when the variable is unset; extract_fields_from_document() raises
# a RuntimeError in that case.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
# Chat-completions endpoint that the extraction request is POSTed to.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model identifier sent in the "model" field of the request payload.
MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"


def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
    """
    Render every page of a PDF to a PNG image.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list of PNG-encoded images, one per page, in page order.

    Raises:
        RuntimeError: If the optional PDF libraries could not be imported.
    """
    if not PDF_SUPPORT:
        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    # try/finally guarantees the document is closed even when rendering or
    # PNG encoding of some page raises.
    try:
        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")

        # 2x zoom on both axes for better quality; the matrix is the same for
        # every page, so build it once.
        zoom = fitz.Matrix(2.0, 2.0)
        images = []

        for page_number, page in enumerate(pdf_doc, start=1):
            # alpha=False keeps the samples free of an alpha channel so they
            # match the "RGB" mode handed to Pillow below.
            pix = page.get_pixmap(matrix=zoom, alpha=False)

            # Convert to a PIL Image, then to PNG bytes.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            png_buffer = BytesIO()
            img.save(png_buffer, format="PNG")
            images.append(png_buffer.getvalue())

            print(f"[INFO] Converted page {page_number} to image ({pix.width}x{pix.height})")

        return images
    finally:
        pdf_doc.close()


def _image_bytes_to_base64(image_bytes: bytes) -> str:
    """Convert image bytes to base64 data URL."""
    b64 = base64.b64encode(image_bytes).decode("utf-8")
    return f"data:image/png;base64,{b64}"


def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
    """
    Convert file to image blocks for the vision model.
    - For images: Returns single image block
    - For PDFs: Converts each page to an image and returns multiple blocks
    """
    # Handle PDF files
    if content_type == "application/pdf" or content_type.endswith("/pdf"):
        if not PDF_SUPPORT:
            raise RuntimeError("PDF support requires PyMuPDF. Please install it.")
        
        print(f"[INFO] Converting PDF to images...")
        pdf_images = _pdf_to_images(file_bytes)
        
        # Create image blocks for each page
        image_blocks = []
        for i, img_bytes in enumerate(pdf_images):
            image_url = _image_bytes_to_base64(img_bytes)
            image_blocks.append({
                "type": "input_image",
                "image_url": image_url,
            })
            print(f"[INFO] Created image block for page {i + 1} ({len(img_bytes)} bytes)")
        
        return image_blocks
    
    # Handle regular image files
    else:
        b64 = base64.b64encode(file_bytes).decode("utf-8")
        print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")
        
        return [{
            "type": "input_image",
            "image_url": f"data:{content_type};base64,{b64}",
        }]


def _build_user_prompt(page_count: int) -> str:
    """Return the extraction instructions for a document with *page_count* pages."""
    # The response schema is identical for single- and multi-page documents;
    # only the framing sentences around it differ.
    schema = (
        "Use this shape:\n"
        "{\n"
        '  "doc_type": "invoice | receipt | contract | report | other",\n'
        '  "confidence": number between 0 and 100,\n'
        '  "fields": {\n'
        '    "invoice_number": "...",\n'
        '    "date": "...",\n'
        '    "due_date": "...",\n'
        '    "total_amount": "...",\n'
        '    "currency": "...",\n'
        '    "vendor_name": "...",\n'
        '    "line_items": [\n'
        '       {"description": "...", "quantity": "...", "unit_price": "...", "line_total": "..."}\n'
        '    ],\n'
        '    "other_field": "..."\n'
        "  }\n"
        "}\n"
        "If fields are missing or not applicable, simply omit them."
    )

    if page_count > 1:
        return (
            f"Extract important key-value pairs from this {page_count}-page document. "
            "Analyze all pages and combine the information into a single JSON response.\n"
            + schema
            + " Combine information from all pages into a single response."
        )

    return (
        "Extract important key-value pairs from the document and respond with JSON only.\n"
        + schema
    )


def _json_candidates(text: str):
    """Lazily yield ``(candidate, success_message, failure_label)`` triples to try parsing."""
    # 1. The whole reply, for models that obey the "JSON only" instruction.
    yield text, "Successfully parsed JSON directly", "Direct JSON parse failed"

    # 2. The first object wrapped in a markdown code fence.
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        yield (
            fenced.group(1),
            "Successfully parsed JSON from markdown code block",
            "Markdown code block parse failed",
        )

    # 3. Everything from the first "{" to the last "}" in the reply.
    braced = re.search(r'\{.*\}', text, re.DOTALL)
    if braced:
        yield (
            braced.group(0),
            "Successfully parsed JSON from regex match",
            "Regex match parse failed",
        )


def _parse_model_json(text: str) -> Dict[str, Any]:
    """Parse the model reply into a dict, falling back to a placeholder structure."""
    for candidate, success_message, failure_label in _json_candidates(text):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as exc:
            print(f"[DEBUG] {failure_label}: {exc}")
            continue

        # Callers are promised a JSON object; a bare array or scalar is
        # treated like a failed attempt so the next strategy can run.
        if isinstance(parsed, dict):
            print(f"[DEBUG] {success_message}")
            return parsed
        print(f"[DEBUG] {failure_label}: expected a JSON object, got {type(parsed).__name__}")

    # If all parsing fails, return a default structure with the raw text.
    print("[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
    return {
        "doc_type": "other",
        "confidence": 50.0,
        "fields": {
            "raw_response": text[:1000],  # First 1000 chars for debugging
            "error": "Could not parse JSON from model response",
            "note": "Check server logs for full response"
        }
    }


async def extract_fields_from_document(
    file_bytes: bytes,
    content_type: str,
    filename: str,
) -> Dict[str, Any]:
    """
    Send a document to the vision model via OpenRouter and return the parsed JSON.

    The file is converted to one or more image parts, sent together with a
    prompt that asks for JSON only, and the reply is parsed leniently (raw
    JSON, fenced JSON, or the outermost ``{...}`` in the text).

    Args:
        file_bytes: Raw bytes of the uploaded image or PDF.
        content_type: MIME type of the upload; decides between the image and
            PDF conversion paths.
        filename: Original file name. Accepted for interface compatibility;
            it is not used by this function.

    Returns:
        The JSON object produced by the model. If no JSON object can be
        recovered from the reply, a fallback dict with ``doc_type`` "other"
        and the first 1000 characters of the raw reply is returned instead.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is not set.
        ValueError: If the file yields no images, or the API response carries
            an error, has no choices, or has empty content.
        httpx.HTTPStatusError: If OpenRouter answers with a 4xx/5xx status.
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

    # Convert file to image blocks (handles PDF conversion)
    image_blocks = _file_to_image_blocks(file_bytes, content_type)

    if not image_blocks:
        raise ValueError("No images generated from file")

    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")

    system_prompt = (
        "You are a document extraction engine. "
        "You analyze invoices, receipts, contracts, reports and similar documents, "
        "and output structured JSON only (no explanations or comments)."
    )

    # Text instructions first, followed by every page/image.
    user_content = [{"type": "text", "text": _build_user_prompt(len(image_blocks))}]
    user_content.extend(image_blocks)

    payload: Dict[str, Any] = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": user_content,
            },
        ],
        "max_tokens": 4096,  # Increased for multi-page documents
    }

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # Optional attribution headers
        "HTTP-Referer": os.environ.get(
            "APP_URL",
            "https://huggingface.co/spaces/your-space",
        ),
        "X-Title": "Document Capture Demo",
    }

    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
        resp.raise_for_status()
        data = resp.json()

    # OpenRouter returns choices[0].message.content
    choices = data.get("choices") if isinstance(data, dict) else None
    if not choices:
        # A 2xx response can still carry an error object instead of choices;
        # surface its message rather than a generic "no choices".
        error = data.get("error") if isinstance(data, dict) else None
        if error:
            detail = error.get("message", error) if isinstance(error, dict) else error
            raise ValueError(f"OpenRouter returned an error: {detail}")
        raise ValueError("No choices in OpenRouter response")

    content = choices[0]["message"]["content"]

    # Log the raw response for debugging (first 500 chars)
    print(f"[DEBUG] OpenRouter response preview: {str(content)[:500]}")

    # content may be a string or a list of content blocks
    if isinstance(content, list):
        text = "".join(
            part.get("text") or ""
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        )
    else:
        text = content

    if not isinstance(text, str) or not text.strip():
        raise ValueError("Empty response from OpenRouter API")

    # The model might return JSON wrapped in markdown code blocks or with
    # extra text around it, so parsing is attempted in several stages.
    return _parse_model_json(text)