Seth0330 committed on
Commit
9c61ac4
·
verified ·
1 Parent(s): c019cc4

Update backend/app/openrouter_client.py

Browse files
Files changed (1) hide show
  1. backend/app/openrouter_client.py +138 -40
backend/app/openrouter_client.py CHANGED
@@ -2,32 +2,97 @@ import os
2
  import base64
3
  import json
4
  import re
5
- from typing import Any, Dict
 
6
 
7
  import httpx
8
 
 
 
 
 
 
 
 
 
9
  # Get your OpenRouter API key from env (you'll set this in Hugging Face later)
10
  OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
11
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
12
  MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
13
 
14
 
15
- def _file_to_image_block(file_bytes: bytes, content_type: str) -> Dict[str, Any]:
16
  """
17
- Encode the file as a data URL to feed into the multimodal model.
18
- Note: PDFs may not be directly supported by vision models.
19
- For images (PNG, JPG, etc.), this works fine.
20
- For PDFs, the model might not be able to process them.
21
  """
22
- b64 = base64.b64encode(file_bytes).decode("utf-8")
 
23
 
24
- # Log file type for debugging
25
- print(f"[DEBUG] Encoding file as image block. Content type: {content_type}, Size: {len(file_bytes)} bytes")
26
 
27
- return {
28
- "type": "input_image",
29
- "image_url": f"data:{content_type};base64,{b64}",
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
 
33
  async def extract_fields_from_document(
@@ -42,7 +107,13 @@ async def extract_fields_from_document(
42
  if not OPENROUTER_API_KEY:
43
  raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
44
 
45
- image_block = _file_to_image_block(file_bytes, content_type)
 
 
 
 
 
 
46
 
47
  system_prompt = (
48
  "You are a document extraction engine. "
@@ -50,27 +121,57 @@ async def extract_fields_from_document(
50
  "and output structured JSON only (no explanations or comments)."
51
  )
52
 
53
- user_prompt = (
54
- "Extract important key-value pairs from the document and respond with JSON only.\n"
55
- "Use this shape:\n"
56
- "{\n"
57
- ' \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
58
- ' \"confidence\": number between 0 and 100,\n'
59
- ' \"fields\": {\n'
60
- ' \"invoice_number\": \"...\",\n'
61
- ' \"date\": \"...\",\n'
62
- ' \"due_date\": \"...\",\n'
63
- ' \"total_amount\": \"...\",\n'
64
- ' \"currency\": \"...\",\n'
65
- ' \"vendor_name\": \"...\",\n'
66
- ' \"line_items\": [\n'
67
- ' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
68
- ' ],\n'
69
- ' \"other_field\": \"...\"\n'
70
- " }\n"
71
- "}\n"
72
- "If fields are missing or not applicable, simply omit them."
73
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  payload: Dict[str, Any] = {
76
  "model": MODEL_NAME,
@@ -81,13 +182,10 @@ async def extract_fields_from_document(
81
  },
82
  {
83
  "role": "user",
84
- "content": [
85
- {"type": "text", "text": user_prompt},
86
- image_block,
87
- ],
88
  },
89
  ],
90
- "max_tokens": 2048,
91
  }
92
 
93
  headers = {
 
2
  import base64
3
  import json
4
  import re
5
+ from io import BytesIO
6
+ from typing import Any, Dict, List
7
 
8
  import httpx
9
 
10
+ try:
11
+ import fitz # PyMuPDF
12
+ from PIL import Image
13
+ PDF_SUPPORT = True
14
+ except ImportError as e:
15
+ PDF_SUPPORT = False
16
+ print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")
17
+
18
  # Get your OpenRouter API key from env (you'll set this in Hugging Face later)
19
  OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
20
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
21
  MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
22
 
23
 
24
def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
    """
    Render every page of a PDF to a PNG image.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        A list of PNG-encoded images, one per page, in page order.

    Raises:
        RuntimeError: If the optional PDF libraries (PyMuPDF / Pillow)
            could not be imported when this module was loaded.
    """
    if not PDF_SUPPORT:
        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

    # 2x zoom for better quality. The matrix is the same for every page,
    # so build it once instead of once per iteration.
    zoom = fitz.Matrix(2.0, 2.0)
    images: List[bytes] = []

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")

        for page_num, page in enumerate(pdf_doc, start=1):
            # alpha=False is requested explicitly because the samples are
            # decoded as "RGB" below; an alpha channel would not fit that mode.
            pix = page.get_pixmap(matrix=zoom, alpha=False)

            # Convert to a PIL Image, then encode it as PNG bytes in memory.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            buffer = BytesIO()
            img.save(buffer, format="PNG")
            images.append(buffer.getvalue())

            print(f"[INFO] Converted page {page_num} to image ({pix.width}x{pix.height})")
    finally:
        # Release the document even when rendering or encoding a page fails.
        pdf_doc.close()

    return images
53
+
54
+
55
+ def _image_bytes_to_base64(image_bytes: bytes) -> str:
56
+ """Convert image bytes to base64 data URL."""
57
+ b64 = base64.b64encode(image_bytes).decode("utf-8")
58
+ return f"data:image/png;base64,{b64}"
59
+
60
+
61
+ def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
62
+ """
63
+ Convert file to image blocks for the vision model.
64
+ - For images: Returns single image block
65
+ - For PDFs: Converts each page to an image and returns multiple blocks
66
+ """
67
+ # Handle PDF files
68
+ if content_type == "application/pdf" or content_type.endswith("/pdf"):
69
+ if not PDF_SUPPORT:
70
+ raise RuntimeError("PDF support requires PyMuPDF. Please install it.")
71
+
72
+ print(f"[INFO] Converting PDF to images...")
73
+ pdf_images = _pdf_to_images(file_bytes)
74
+
75
+ # Create image blocks for each page
76
+ image_blocks = []
77
+ for i, img_bytes in enumerate(pdf_images):
78
+ image_url = _image_bytes_to_base64(img_bytes)
79
+ image_blocks.append({
80
+ "type": "input_image",
81
+ "image_url": image_url,
82
+ })
83
+ print(f"[INFO] Created image block for page {i + 1} ({len(img_bytes)} bytes)")
84
+
85
+ return image_blocks
86
+
87
+ # Handle regular image files
88
+ else:
89
+ b64 = base64.b64encode(file_bytes).decode("utf-8")
90
+ print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")
91
+
92
+ return [{
93
+ "type": "input_image",
94
+ "image_url": f"data:{content_type};base64,{b64}",
95
+ }]
96
 
97
 
98
  async def extract_fields_from_document(
 
107
  if not OPENROUTER_API_KEY:
108
  raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
109
 
110
+ # Convert file to image blocks (handles PDF conversion)
111
+ image_blocks = _file_to_image_blocks(file_bytes, content_type)
112
+
113
+ if not image_blocks:
114
+ raise ValueError("No images generated from file")
115
+
116
+ print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")
117
 
118
  system_prompt = (
119
  "You are a document extraction engine. "
 
121
  "and output structured JSON only (no explanations or comments)."
122
  )
123
 
124
+ # Update prompt for multi-page documents
125
+ if len(image_blocks) > 1:
126
+ user_prompt = (
127
+ f"Extract important key-value pairs from this {len(image_blocks)}-page document. "
128
+ "Analyze all pages and combine the information into a single JSON response.\n"
129
+ "Use this shape:\n"
130
+ "{\n"
131
+ ' \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
132
+ ' \"confidence\": number between 0 and 100,\n'
133
+ ' \"fields\": {\n'
134
+ ' \"invoice_number\": \"...\",\n'
135
+ ' \"date\": \"...\",\n'
136
+ ' \"due_date\": \"...\",\n'
137
+ ' \"total_amount\": \"...\",\n'
138
+ ' \"currency\": \"...\",\n'
139
+ ' \"vendor_name\": \"...\",\n'
140
+ ' \"line_items\": [\n'
141
+ ' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
142
+ ' ],\n'
143
+ ' \"other_field\": \"...\"\n'
144
+ " }\n"
145
+ "}\n"
146
+ "If fields are missing or not applicable, simply omit them. "
147
+ "Combine information from all pages into a single response."
148
+ )
149
+ else:
150
+ user_prompt = (
151
+ "Extract important key-value pairs from the document and respond with JSON only.\n"
152
+ "Use this shape:\n"
153
+ "{\n"
154
+ ' \"doc_type\": \"invoice | receipt | contract | report | other\",\n'
155
+ ' \"confidence\": number between 0 and 100,\n'
156
+ ' \"fields\": {\n'
157
+ ' \"invoice_number\": \"...\",\n'
158
+ ' \"date\": \"...\",\n'
159
+ ' \"due_date\": \"...\",\n'
160
+ ' \"total_amount\": \"...\",\n'
161
+ ' \"currency\": \"...\",\n'
162
+ ' \"vendor_name\": \"...\",\n'
163
+ ' \"line_items\": [\n'
164
+ ' {\"description\": \"...\", \"quantity\": \"...\", \"unit_price\": \"...\", \"line_total\": \"...\"}\n'
165
+ ' ],\n'
166
+ ' \"other_field\": \"...\"\n'
167
+ " }\n"
168
+ "}\n"
169
+ "If fields are missing or not applicable, simply omit them."
170
+ )
171
+
172
+ # Build content array with text prompt and all image blocks
173
+ user_content = [{"type": "text", "text": user_prompt}]
174
+ user_content.extend(image_blocks)
175
 
176
  payload: Dict[str, Any] = {
177
  "model": MODEL_NAME,
 
182
  },
183
  {
184
  "role": "user",
185
+ "content": user_content,
 
 
 
186
  },
187
  ],
188
+ "max_tokens": 4096, # Increased for multi-page documents
189
  }
190
 
191
  headers = {