Seth0330 committed
Commit ae5855a · verified · 1 Parent(s): ffebd56

Update app.py

Files changed (1)
  1. app.py +105 -46
app.py CHANGED
@@ -36,15 +36,21 @@ st.set_page_config(
 # ---------------------------
 # Global UI / Render constants (NOT args to set_page_config)
 # ---------------------------
-IMAGE_PREVIEW_WIDTH = 1000  # 5x larger preview
-PDF_RENDER_SCALE = 3.0      # higher-res PDF rasterization
+IMAGE_PREVIEW_WIDTH = 1000
+PDF_RENDER_SCALE = 3.0

 # ---------------------------
 # Secrets / Tokens
 # ---------------------------
+# OpenRouter + HF API
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # For OpenRouter models
 HF_TOKEN = os.getenv("HF_TOKEN")  # For HF Inference API (LLaVA)

+# RunPod (secured, OpenAI-compatible)
+RUNPOD_SECURE_BASE_URL = os.getenv("RUNPOD_SECURE_BASE_URL", "").rstrip("/")  # e.g. http://194.68.245.201:22156/v1
+RUNPOD_SECURE_API_KEY = os.getenv("RUNPOD_SECURE_API_KEY")  # optional
+RUNPOD_SECURE_MODEL = os.getenv("RUNPOD_SECURE_MODEL", "qwen2.5:32b-instruct")  # set to your model id
+
 # ---------------------------
 # Helpers
 # ---------------------------
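Note: for anyone reproducing this setup, a minimal sketch of how the new
RUNPOD_SECURE_* variables would be set before launching the Space. The URL is the
placeholder from the diff's own comment and the key is invented, not a live endpoint:

    import os

    os.environ["RUNPOD_SECURE_BASE_URL"] = "http://194.68.245.201:22156/v1"  # placeholder from the comment above
    os.environ["RUNPOD_SECURE_API_KEY"] = "sk-placeholder"                   # optional; omit for open endpoints
    os.environ["RUNPOD_SECURE_MODEL"] = "qwen2.5-vl:32b-instruct"            # a VL model, so images get sent

    # The app strips any trailing slash, so both URL styles normalize the same way:
    assert "http://host/v1/".rstrip("/") == "http://host/v1"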
@@ -83,21 +89,21 @@ def extract_structured_data(content, fields):
             pass
     return structured_data

+def is_vision_model_name(name: str) -> bool:
+    """Heuristic: treat models containing 'vl', 'vision', 'mm', or 'multimodal' as vision-capable."""
+    n = (name or "").lower()
+    return any(k in n for k in ["vl", "vision", "mm", "multimodal"])
+
 # ---------------------------
 # OpenRouter client (multimodal chat)
 # ---------------------------
 def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
-    """
-    Calls OpenRouter's /api/v1/chat/completions with a text prompt + one image.
-    Requires OPENROUTER_API_KEY.
-    """
     if not OPENROUTER_API_KEY:
         raise RuntimeError("Missing OPENROUTER_API_KEY. Add it in your Space → Settings → Variables & secrets.")

     data_url = f"data:image/jpeg;base64,{image_base64}"
-
     payload = {
-        "model": model_id,  # e.g., "google/gemma-3-4b-it", "openai/gpt-4.1"
+        "model": model_id,
         "messages": [
             {
                 "role": "user",
@@ -109,20 +115,14 @@ def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
         ],
         "max_tokens": 800
     }
-
     headers = {
         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
         "Content-Type": "application/json",
         "HTTP-Referer": st.secrets.get("SPACE_URL", "https://hf.space"),
         "X-Title": "EZOFIS AI OCR"
     }
-
-    r = requests.post(
-        "https://openrouter.ai/api/v1/chat/completions",
-        headers=headers,
-        json=payload,
-        timeout=120
-    )
+    r = requests.post("https://openrouter.ai/api/v1/chat/completions",
+                      headers=headers, json=payload, timeout=120)
     r.raise_for_status()
     data = r.json()
     return data["choices"][0]["message"]["content"]
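For reference, a standalone sketch of the request this function sends. The content
layout (one text part plus one image_url part) matches what the new RunPod helper
below builds; the model id and the base64 string here are placeholders:

    import os, requests

    payload = {
        "model": "google/gemma-3-4b-it",  # placeholder; any OpenRouter vision model
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this document."},
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<b64>"}},
            ],
        }],
        "max_tokens": 800,
    }
    r = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
                 "Content-Type": "application/json"},
        json=payload,
        timeout=120,
    )
    r.raise_for_status()
    print(r.json()["choices"][0]["message"]["content"])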
@@ -139,53 +139,92 @@ def _hf_client(model_id: str):
     return InferenceClient(model=model_id, token=HF_TOKEN)

 def query_hf_llava_vqa(prompt: str, image_base64: str, model_id: str) -> str:
-    """
-    Calls Hugging Face Hosted Inference API for VQA without extra kwargs that
-    some client versions don’t support. Includes robust fallbacks for return types.
-    """
     client = _hf_client(model_id)
     image_bytes = base64.b64decode(image_base64)
-
-    # Primary: simple VQA call (most deployments support this signature)
     try:
-        result = client.visual_question_answering(
-            image=image_bytes,
-            question=prompt
-        )
+        result = client.visual_question_answering(image=image_bytes, question=prompt)
     except TypeError:
-        # Fallback for client variants that don’t expose the helper
         result = client.request(
             task="visual_question_answering",
             data={"inputs": {"question": prompt}},
             files={"image": image_bytes}
         )

-    # Normalize result into a string
     if isinstance(result, str):
         return result
     if isinstance(result, dict):
-        if "answer" in result:
-            return result["answer"]
-        if "generated_text" in result:
-            return result["generated_text"]
+        return result.get("answer") or result.get("generated_text") or json.dumps(result, ensure_ascii=False)
     if isinstance(result, list) and result:
         first = result[0]
         if isinstance(first, dict):
-            if "answer" in first:
-                return first["answer"]
-            if "generated_text" in first:
-                return first["generated_text"]
+            return first.get("answer") or first.get("generated_text") or json.dumps(first, ensure_ascii=False)
+        return str(first)
     return str(result)

+# ---------------------------
+# RunPod (secured, OpenAI-compatible)
+# ---------------------------
+def _secured_openai_compatible(prompt: str, image_base64: str) -> str:
+    """
+    Call your OpenAI-compatible server on RunPod/OpenWebUI/Ollama.
+    Works with base URLs that already include /v1 or not.
+    API key header is added only if provided.
+    """
+    if not RUNPOD_SECURE_BASE_URL:
+        raise RuntimeError("RUNPOD_SECURE_BASE_URL is missing.")
+
+    base = RUNPOD_SECURE_BASE_URL.rstrip("/")
+    if base.endswith("/v1"):
+        url = f"{base}/chat/completions"
+    else:
+        url = f"{base}/v1/chat/completions"
+
+    headers = {"Content-Type": "application/json"}
+    if RUNPOD_SECURE_API_KEY:
+        headers["Authorization"] = f"Bearer {RUNPOD_SECURE_API_KEY}"
+
+    # If the configured model isn't vision-capable, send text-only content.
+    model_name = RUNPOD_SECURE_MODEL
+    vision_ok = is_vision_model_name(model_name)
+
+    if vision_ok:
+        data_url = f"data:image/jpeg;base64,{image_base64}"
+        content = [
+            {"type": "text", "text": prompt},
+            {"type": "image_url", "image_url": {"url": data_url}}
+        ]
+    else:
+        # Text-only fallback: no image is sent.
+        content = [
+            {"type": "text", "text": f"{prompt}\n\n(Note: model configured as text-only; image not sent.)"}
+        ]
+
+    payload = {
+        "model": model_name,
+        "messages": [{"role": "user", "content": content}],
+        "max_tokens": 800
+    }
+
+    r = requests.post(url, headers=headers, json=payload, timeout=600)
+    r.raise_for_status()
+    js = r.json()
+    return js["choices"][0]["message"]["content"]
+
+def query_runpod_secured(prompt: str, image_base64: str) -> str:
+    return _secured_openai_compatible(prompt, image_base64)
+
 # ---------------------------
 # Router to pick the right backend by model selection
 # ---------------------------
 HF_LLaVA_LABEL = "llava-hf/llava-v1.6-mistral-7b-hf (HF API)"
 HF_LLaVA_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+RUNPOD_SECURE_LABEL = "RunPod (secured)"

 def run_vision_inference(prompt: str, img_b64: str, model_id: str) -> str:
     if model_id == HF_LLaVA_LABEL:
         return query_hf_llava_vqa(prompt, img_b64, HF_LLaVA_ID)
+    if model_id == RUNPOD_SECURE_LABEL:
+        return query_runpod_secured(prompt, img_b64)
     # All others go via OpenRouter
     return query_openrouter(prompt, img_b64, model_id)
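The /v1 handling in _secured_openai_compatible means both base-URL styles reach the
same endpoint. A small sketch of the same normalization logic (hosts are hypothetical):

    def chat_url(base: str) -> str:
        base = base.rstrip("/")
        return f"{base}/chat/completions" if base.endswith("/v1") else f"{base}/v1/chat/completions"

    print(chat_url("http://host:22156/v1"))  # http://host:22156/v1/chat/completions
    print(chat_url("http://host:22156/"))    # http://host:22156/v1/chat/completions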
@@ -306,12 +345,24 @@ with st.sidebar:
             "google/gemma-3-12b-it",
             "openai/gpt-4.1",
             "openai/gpt-4.1-mini",
-            "qwen/qwen2.5-vl-32b-instruct",
-            HF_LLaVA_LABEL  # LLaVA via HF API
+            "qwen/qwen2.5-vl-32b-instruct",  # OpenRouter vision option
+            HF_LLaVA_LABEL,                  # LLaVA via HF API
+            RUNPOD_SECURE_LABEL              # Your RunPod OpenAI-compatible server
         ],
-        help="OpenRouter models use OPENROUTER_API_KEY. LLaVA uses HF_TOKEN via HF Inference API."
+        help=("OpenRouter uses OPENROUTER_API_KEY. "
+              "LLaVA (HF API) uses HF_TOKEN. "
+              "RunPod (secured) uses RUNPOD_SECURE_* env vars. "
+              f"Current RunPod model: {RUNPOD_SECURE_MODEL}")
     )

+    # If RunPod model looks text-only, warn user
+    if selected_model == RUNPOD_SECURE_LABEL and not is_vision_model_name(RUNPOD_SECURE_MODEL):
+        st.warning(
+            f"RunPod model '{RUNPOD_SECURE_MODEL}' appears text-only. "
+            "Requests to this endpoint will NOT include images. "
+            "Use a VL model (e.g. 'qwen2.5-vl:32b-instruct') for vision."
+        )
+
     extraction_mode = "General description"
     pdf_process_mode = "Process each page separately"
     fields = None
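This hunk only shows the tail of the options list; a plausible sketch of the full
selectbox wiring inside app.py's namespace (the widget label and the elided first
entry are assumptions inferred from the help text and the how-to list later on):

    selected_model = st.selectbox(
        "Model",  # assumed label; not visible in this hunk
        [
            "google/gemma-3-4b-it",   # assumed from the "Gemma-3 4B/12B" how-to entry
            "google/gemma-3-12b-it",
            "openai/gpt-4.1",
            "openai/gpt-4.1-mini",
            "qwen/qwen2.5-vl-32b-instruct",
            HF_LLaVA_LABEL,
            RUNPOD_SECURE_LABEL,
        ],
    )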
@@ -345,22 +396,27 @@ with st.sidebar:

 # Processing loop
 if uploaded_files and process_button:
-    # Check tokens depending on model choice
+    # Token checks by route
+    can_run = False
     if selected_model == HF_LLaVA_LABEL:
         if not HF_CLIENT_AVAILABLE:
             st.error("huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt.")
         elif not HF_TOKEN:
-            st.error("HF_TOKEN is not set. Add it in your Space → Settings → Variables & secrets.")
+            st.error("HF_TOKEN is not set.")
+        else:
+            can_run = True
+    elif selected_model == RUNPOD_SECURE_LABEL:
+        if not RUNPOD_SECURE_BASE_URL:
+            st.error("RUNPOD_SECURE_BASE_URL is not set.")
         else:
             can_run = True
     else:
         if not OPENROUTER_API_KEY:
-            st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
-            can_run = False
+            st.error("OPENROUTER_API_KEY is not set.")
         else:
             can_run = True

-    if 'can_run' in locals() and can_run:
+    if can_run:
         st.header("Processing Results")
         progress_bar = st.progress(0)
         status_text = st.empty()
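Why this change matters: previously can_run was only bound on some branches, which is
why the gate had to probe locals(). Pre-initializing the flag makes every path
well-defined. A minimal sketch of the before/after pattern (names are illustrative):

    # Before: fragile -- NameError risk unless every branch assigned can_run:
    #     if 'can_run' in locals() and can_run: ...
    # After: the flag always exists.
    tokens_ok = True   # stand-in for the per-route secret checks above
    can_run = False
    if tokens_ok:
        can_run = True
    if can_run:
        print("run the pipeline")  # stand-in for the results UI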
@@ -472,7 +528,10 @@ if not uploaded_files:
     st.write("""
     How to use:
     1) Upload one or more images or PDFs
-    2) Choose a model
+    2) Choose a model:
+       - OpenRouter: Gemma-3 4B/12B, GPT-4.1/4.1-mini, Qwen2.5-VL-32B
+       - HF API: LLaVA v1.6 Mistral-7B
+       - RunPod (secured): OpenAI-compatible base URL (supports images only if the model is VL)
     3) Pick description or custom field extraction
     4) For PDFs, choose page-by-page or first page
     5) Click Process Files
@@ -487,4 +546,4 @@ st.markdown(
     </div>
     """,
     unsafe_allow_html=True
-    )
+)
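Putting the commit together, an end-to-end sketch of the new routing, run inside
app.py's context (the file name is hypothetical, and the RunPod path assumes the
RUNPOD_SECURE_* variables are set):

    import base64

    with open("invoice.jpg", "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode("utf-8")

    # Routed to the RunPod OpenAI-compatible server:
    print(run_vision_inference("List all line items.", img_b64, RUNPOD_SECURE_LABEL))

    # Any other model id falls through to OpenRouter:
    print(run_vision_inference("List all line items.", img_b64, "openai/gpt-4.1-mini"))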