Update app.py
app.py CHANGED
@@ -36,15 +36,21 @@ st.set_page_config(
 # ---------------------------
 # Global UI / Render constants (NOT args to set_page_config)
 # ---------------------------
-IMAGE_PREVIEW_WIDTH = 1000
-PDF_RENDER_SCALE = 3.0
+IMAGE_PREVIEW_WIDTH = 1000
+PDF_RENDER_SCALE = 3.0

 # ---------------------------
 # Secrets / Tokens
 # ---------------------------
+# OpenRouter + HF API
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # For OpenRouter models
 HF_TOKEN = os.getenv("HF_TOKEN")  # For HF Inference API (LLaVA)

+# RunPod (secured, OpenAI-compatible)
+RUNPOD_SECURE_BASE_URL = os.getenv("RUNPOD_SECURE_BASE_URL", "").rstrip("/")  # e.g. http://194.68.245.201:22156/v1
+RUNPOD_SECURE_API_KEY = os.getenv("RUNPOD_SECURE_API_KEY")  # optional
+RUNPOD_SECURE_MODEL = os.getenv("RUNPOD_SECURE_MODEL", "qwen2.5:32b-instruct")  # set to your model id
+
 # ---------------------------
 # Helpers
 # ---------------------------

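Aside on configuration: the three RUNPOD_SECURE_* settings are plain environment variables, so for local testing they can be stubbed before the Streamlit app imports this module. A minimal sketch with placeholder values (the URL, key, and model id below are hypothetical):

    import os

    # Hypothetical local-test values; real secrets belong in the Space's
    # Settings → Variables & secrets, never in source control.
    os.environ["RUNPOD_SECURE_BASE_URL"] = "http://localhost:8000/v1"  # with or without the /v1 suffix
    os.environ["RUNPOD_SECURE_API_KEY"] = "sk-local-test"              # optional; header is sent only if set
    os.environ["RUNPOD_SECURE_MODEL"] = "qwen2.5-vl:32b-instruct"      # a VL model so images are included
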
@@ -83,21 +89,21 @@ def extract_structured_data(content, fields):
             pass
     return structured_data

+def is_vision_model_name(name: str) -> bool:
+    """Heuristic: treat models containing 'vl', 'vision', 'mm', or 'multimodal' as vision-capable."""
+    n = (name or "").lower()
+    return any(k in n for k in ["vl", "vision", "mm", "multimodal"])
+
 # ---------------------------
 # OpenRouter client (multimodal chat)
 # ---------------------------
 def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
-    """
-    Calls OpenRouter's /api/v1/chat/completions with a text prompt + one image.
-    Requires OPENROUTER_API_KEY.
-    """
     if not OPENROUTER_API_KEY:
         raise RuntimeError("Missing OPENROUTER_API_KEY. Add it in your Space → Settings → Variables & secrets.")

     data_url = f"data:image/jpeg;base64,{image_base64}"
-
     payload = {
-        "model": model_id,
+        "model": model_id,
         "messages": [
             {
                 "role": "user",

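The new is_vision_model_name helper is a plain substring test, so its verdicts have edge cases worth knowing. The calls below follow directly from the keyword list:

    is_vision_model_name("qwen2.5-vl:32b-instruct")  # True  ("vl" substring)
    is_vision_model_name("qwen2.5:32b-instruct")     # False (no keyword: the image would be dropped)
    is_vision_model_name("llava-v1.6-mistral-7b")    # False ("llava" contains none of the keywords)
    is_vision_model_name("google/gemma-3-12b-it")    # True  ("gemma" happens to contain "mm")

The last two are a false negative and a false positive, so an explicit allow-list may be safer if the endpoint serves a fixed set of models.
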
@@ -109,20 +115,14 @@ def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
         ],
         "max_tokens": 800
     }
-
     headers = {
         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
         "Content-Type": "application/json",
         "HTTP-Referer": st.secrets.get("SPACE_URL", "https://hf.space"),
         "X-Title": "EZOFIS AI OCR"
     }
-
-    r = requests.post(
-        "https://openrouter.ai/api/v1/chat/completions",
-        headers=headers,
-        json=payload,
-        timeout=120
-    )
+    r = requests.post("https://openrouter.ai/api/v1/chat/completions",
+                      headers=headers, json=payload, timeout=120)
     r.raise_for_status()
     data = r.json()
     return data["choices"][0]["message"]["content"]

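The final indexing assumes the standard OpenAI-style chat-completions body; trimmed to the fields actually read, a successful response looks roughly like this (illustrative shape, not captured output):

    {
        "choices": [
            {"message": {"role": "assistant", "content": "...model output text..."}}
        ]
    }

A KeyError or IndexError past this point therefore indicates an unexpected body rather than an HTTP failure; non-2xx responses are already raised by r.raise_for_status().
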
@@ -139,53 +139,92 @@ def _hf_client(model_id: str):
     return InferenceClient(model=model_id, token=HF_TOKEN)

 def query_hf_llava_vqa(prompt: str, image_base64: str, model_id: str) -> str:
-    """
-    Calls Hugging Face Hosted Inference API for VQA without extra kwargs that
-    some client versions don’t support. Includes robust fallbacks for return types.
-    """
     client = _hf_client(model_id)
     image_bytes = base64.b64decode(image_base64)
-
-    # Primary: simple VQA call (most deployments support this signature)
     try:
-        result = client.visual_question_answering(
-            image=image_bytes,
-            question=prompt
-        )
+        result = client.visual_question_answering(image=image_bytes, question=prompt)
     except TypeError:
-        # Fallback for client variants that don’t expose the helper
         result = client.request(
             task="visual_question_answering",
             data={"inputs": {"question": prompt}},
             files={"image": image_bytes}
         )

-    # Normalize result into a string
     if isinstance(result, str):
         return result
     if isinstance(result, dict):
-        if "answer" in result:
-            return result["answer"]
-        if "generated_text" in result:
-            return result["generated_text"]
+        return result.get("answer") or result.get("generated_text") or json.dumps(result, ensure_ascii=False)
     if isinstance(result, list) and result:
         first = result[0]
         if isinstance(first, dict):
-            if "answer" in first:
-                return first["answer"]
-            if "generated_text" in first:
-                return first["generated_text"]
+            return first.get("answer") or first.get("generated_text") or json.dumps(first, ensure_ascii=False)
+        return str(first)
     return str(result)

+# ---------------------------
+# RunPod (secured, OpenAI-compatible)
+# ---------------------------
+def _secured_openai_compatible(prompt: str, image_base64: str) -> str:
+    """
+    Call your OpenAI-compatible server on RunPod/OpenWebUI/Ollama.
+    Works with base URLs that already include /v1 or not.
+    API key header is added only if provided.
+    """
+    if not RUNPOD_SECURE_BASE_URL:
+        raise RuntimeError("RUNPOD_SECURE_BASE_URL is missing.")
+
+    base = RUNPOD_SECURE_BASE_URL.rstrip("/")
+    if base.endswith("/v1"):
+        url = f"{base}/chat/completions"
+    else:
+        url = f"{base}/v1/chat/completions"
+
+    headers = {"Content-Type": "application/json"}
+    if RUNPOD_SECURE_API_KEY:
+        headers["Authorization"] = f"Bearer {RUNPOD_SECURE_API_KEY}"
+
+    # If the configured model isn't vision-capable, send text-only content.
+    model_name = RUNPOD_SECURE_MODEL
+    vision_ok = is_vision_model_name(model_name)
+
+    if vision_ok:
+        data_url = f"data:image/jpeg;base64,{image_base64}"
+        content = [
+            {"type": "text", "text": prompt},
+            {"type": "image_url", "image_url": {"url": data_url}}
+        ]
+    else:
+        # Text-only fallback: no image is sent.
+        content = [
+            {"type": "text", "text": f"{prompt}\n\n(Note: model configured as text-only; image not sent.)"}
+        ]
+
+    payload = {
+        "model": model_name,
+        "messages": [{"role": "user", "content": content}],
+        "max_tokens": 800
+    }
+
+    r = requests.post(url, headers=headers, json=payload, timeout=600)
+    r.raise_for_status()
+    js = r.json()
+    return js["choices"][0]["message"]["content"]
+
+def query_runpod_secured(prompt: str, image_base64: str) -> str:
+    return _secured_openai_compatible(prompt, image_base64)
+
 # ---------------------------
 # Router to pick the right backend by model selection
 # ---------------------------
 HF_LLaVA_LABEL = "llava-hf/llava-v1.6-mistral-7b-hf (HF API)"
 HF_LLaVA_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+RUNPOD_SECURE_LABEL = "RunPod (secured)"

 def run_vision_inference(prompt: str, img_b64: str, model_id: str) -> str:
     if model_id == HF_LLaVA_LABEL:
         return query_hf_llava_vqa(prompt, img_b64, HF_LLaVA_ID)
+    if model_id == RUNPOD_SECURE_LABEL:
+        return query_runpod_secured(prompt, img_b64)
     # All others go via OpenRouter
     return query_openrouter(prompt, img_b64, model_id)

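With the router in place the UI code stays backend-agnostic. A minimal usage sketch, assuming img_b64 already holds a base64-encoded JPEG page:

    prompt = "Describe this document."
    text = run_vision_inference(prompt, img_b64, "openai/gpt-4.1-mini")  # -> OpenRouter
    text = run_vision_inference(prompt, img_b64, HF_LLaVA_LABEL)         # -> HF Inference API
    text = run_vision_inference(prompt, img_b64, RUNPOD_SECURE_LABEL)    # -> RunPod server

Note that the RunPod route ignores the label and always uses RUNPOD_SECURE_MODEL, and that the base-URL normalization makes http://host:port and http://host:port/v1 both resolve to .../v1/chat/completions.
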
@@ -306,12 +345,24 @@ with st.sidebar:
             "google/gemma-3-12b-it",
             "openai/gpt-4.1",
             "openai/gpt-4.1-mini",
-            "qwen/qwen2.5-vl-32b-instruct",
-            HF_LLaVA_LABEL
+            "qwen/qwen2.5-vl-32b-instruct",  # OpenRouter vision option
+            HF_LLaVA_LABEL,                  # LLaVA via HF API
+            RUNPOD_SECURE_LABEL              # Your RunPod OpenAI-compatible server
         ],
-        help="OpenRouter
+        help=("OpenRouter uses OPENROUTER_API_KEY. "
+              "LLaVA (HF API) uses HF_TOKEN. "
+              "RunPod (secured) uses RUNPOD_SECURE_* env vars. "
+              f"Current RunPod model: {RUNPOD_SECURE_MODEL}")
     )

+    # If RunPod model looks text-only, warn user
+    if selected_model == RUNPOD_SECURE_LABEL and not is_vision_model_name(RUNPOD_SECURE_MODEL):
+        st.warning(
+            f"RunPod model '{RUNPOD_SECURE_MODEL}' appears text-only. "
+            "Requests to this endpoint will NOT include images. "
+            "Use a VL model (e.g. 'qwen2.5-vl:32b-instruct') for vision."
+        )
+
     extraction_mode = "General description"
     pdf_process_mode = "Process each page separately"
     fields = None

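The hunk shows only the tail of the options list plus the help text; in Streamlit these typically feed an st.selectbox call shaped roughly like the sketch below (the label, and any entries above the visible context, are assumptions):

    selected_model = st.selectbox(
        "Model",  # label assumed; the real call opens above this hunk's context
        [
            "google/gemma-3-12b-it",
            "openai/gpt-4.1",
            "openai/gpt-4.1-mini",
            "qwen/qwen2.5-vl-32b-instruct",
            HF_LLaVA_LABEL,
            RUNPOD_SECURE_LABEL,
        ],
        help="...",  # the multi-part help string from the diff
    )
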
@@ -345,22 +396,27 @@ with st.sidebar:

 # Processing loop
 if uploaded_files and process_button:
-    #
+    # Token checks by route
+    can_run = False
     if selected_model == HF_LLaVA_LABEL:
         if not HF_CLIENT_AVAILABLE:
             st.error("huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt.")
         elif not HF_TOKEN:
-            st.error("HF_TOKEN is not set.
+            st.error("HF_TOKEN is not set.")
+        else:
+            can_run = True
+    elif selected_model == RUNPOD_SECURE_LABEL:
+        if not RUNPOD_SECURE_BASE_URL:
+            st.error("RUNPOD_SECURE_BASE_URL is not set.")
         else:
             can_run = True
     else:
         if not OPENROUTER_API_KEY:
-            st.error("OPENROUTER_API_KEY is not set.
-            can_run = False
+            st.error("OPENROUTER_API_KEY is not set.")
         else:
             can_run = True

-    if can_run:
+    if can_run:
         st.header("Processing Results")
         progress_bar = st.progress(0)
         status_text = st.empty()

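The gating reduces to one required credential per route. A compact restatement as a pure function (a hypothetical helper, not part of app.py) makes the decision table explicit:

    def backend_ready(selected_model: str) -> bool:
        """One credential check per backend; mirrors the inline if/elif/else."""
        if selected_model == HF_LLaVA_LABEL:
            return HF_CLIENT_AVAILABLE and bool(HF_TOKEN)
        if selected_model == RUNPOD_SECURE_LABEL:
            return bool(RUNPOD_SECURE_BASE_URL)
        return bool(OPENROUTER_API_KEY)

The inline version additionally emits an st.error naming the missing variable, which the pure form deliberately omits.
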
@@ -472,7 +528,10 @@ if not uploaded_files:
     st.write("""
 How to use:
 1) Upload one or more images or PDFs
-2) Choose a model
+2) Choose a model:
+   - OpenRouter: Gemma-3 4B/12B, GPT-4.1/4.1-mini, Qwen2.5-VL-32B
+   - HF API: LLaVA v1.6 Mistral-7B
+   - RunPod (secured): OpenAI-compatible base URL (supports images only if the model is VL)
 3) Pick description or custom field extraction
 4) For PDFs, choose page-by-page or first page
 5) Click Process Files

@@ -487,4 +546,4 @@ st.markdown(
     </div>
     """,
     unsafe_allow_html=True
-    )
+)
|