Update app.py

app.py CHANGED
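This change adds an optional Hugging Face Inference API backend (serverless LLaVA v1.6 Mistral-7B) alongside the existing OpenRouter models: a guarded huggingface_hub import, a dedicated HF_TOKEN secret, a small router that dispatches each request to the matching backend, and per-backend token checks before processing starts.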
@@ -16,6 +16,13 @@ try:
 except ImportError:
     PDF_SUPPORT = False
 
+# Optional HF Inference API client (for LLaVA serverless)
+try:
+    from huggingface_hub import InferenceClient
+    HF_CLIENT_AVAILABLE = True
+except ImportError:
+    HF_CLIENT_AVAILABLE = False
+
 # ---------------------------
 # Page config
 # ---------------------------
@@ -26,6 +33,12 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
 
+# ---------------------------
+# Secrets / Tokens
+# ---------------------------
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # For OpenRouter models
+HF_TOKEN = os.getenv("HF_TOKEN")  # For HF Inference API (LLaVA)
+
 # ---------------------------
 # Helpers
 # ---------------------------
@@ -47,6 +60,7 @@ def image_to_base64(image):
     return base64.b64encode(buf.getvalue()).decode('utf-8')
 
 def extract_structured_data(content, fields):
+    """Attempt to parse JSON object from model text."""
     structured_data = {}
     try:
         if "```json" in content and "```" in content.split("```json")[1]:
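The hunk ends inside the parser, so for orientation here is a minimal sketch of the usual fence-stripping step such a function performs before json.loads; the helper name _strip_json_fence is illustrative and the app's exact handling may differ:

    import json

    def _strip_json_fence(content: str) -> str:
        # Keep only the text between ```json and the next ``` fence, if present
        if "```json" in content:
            content = content.split("```json", 1)[1]
            content = content.split("```", 1)[0]
        return content.strip()

    # e.g., json.loads(_strip_json_fence(model_output)) -> dict keyed by the requested fields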
@@ -64,18 +78,20 @@ def extract_structured_data(content, fields):
     return structured_data
 
 # ---------------------------
-# OpenRouter client
+# OpenRouter client (multimodal chat)
 # ---------------------------
-OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # set this in Space Secrets
-
 def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
+    """
+    Calls OpenRouter's /api/v1/chat/completions with a text prompt + one image.
+    Requires OPENROUTER_API_KEY.
+    """
     if not OPENROUTER_API_KEY:
         raise RuntimeError("Missing OPENROUTER_API_KEY. Add it in your Space → Settings → Variables & secrets.")
 
     data_url = f"data:image/jpeg;base64,{image_base64}"
 
     payload = {
-        "model": model_id,
+        "model": model_id,  # e.g., "google/gemma-3-4b-it", "openai/gpt-4.1"
         "messages": [
             {
                 "role": "user",
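The hunk cuts off inside payload. For reference, a self-contained sketch of the kind of multimodal request query_openrouter builds against OpenRouter's OpenAI-style chat-completions endpoint; the function name openrouter_describe and the exact message fields are illustrative, not the app's verbatim code:

    import os
    import requests

    def openrouter_describe(prompt: str, data_url: str, model_id: str) -> str:
        # One user message carrying a text part plus an image_url part (data: URL)
        payload = {
            "model": model_id,
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }],
        }
        r = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={"Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}"},
            json=payload,
            timeout=120,
        )
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]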
@@ -105,6 +121,48 @@ def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
     data = r.json()
     return data["choices"][0]["message"]["content"]
 
+# ---------------------------
+# HF Inference API client for LLaVA (serverless VQA-style)
+# ---------------------------
+@st.cache_resource
+def _hf_client(model_id: str):
+    if not HF_CLIENT_AVAILABLE:
+        raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt.")
+    if not HF_TOKEN:
+        raise RuntimeError("Missing HF_TOKEN. Add it in your Space → Settings → Variables & secrets.")
+    return InferenceClient(model=model_id, token=HF_TOKEN)
+
+def query_hf_llava_vqa(prompt: str, image_base64: str, model_id: str) -> str:
+    """
+    Calls Hugging Face Hosted Inference API for a VLM (e.g., LLaVA v1.6 Mistral-7B).
+    Uses the Visual Question Answering interface: (image + question) -> text.
+    """
+    client = _hf_client(model_id)
+    image_bytes = base64.b64decode(image_base64)
+    # Some deployments return list[{'answer': '...'}]; others return str
+    result = client.visual_question_answering(
+        image=image_bytes,
+        # note: this task API accepts no generation kwargs (e.g., max_new_tokens)
+        question=prompt
+    )
+    if isinstance(result, list) and result and isinstance(result[0], dict) and "answer" in result[0]:
+        return result[0]["answer"]
+    if isinstance(result, str):
+        return result
+    return str(result)
+
+# ---------------------------
+# Router to pick the right backend by model selection
+# ---------------------------
+HF_LLaVA_LABEL = "llava-hf/llava-v1.6-mistral-7b-hf (HF API)"
+HF_LLaVA_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+
+def run_vision_inference(prompt: str, img_b64: str, model_id: str) -> str:
+    if model_id == HF_LLaVA_LABEL:
+        return query_hf_llava_vqa(prompt, img_b64, HF_LLaVA_ID)
+    # All others go via OpenRouter
+    return query_openrouter(prompt, img_b64, model_id)
+
 # ---------------------------
 # Core processing
 # ---------------------------
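One caveat on the return-shape handling above: depending on the huggingface_hub version, visual_question_answering may return dataclass-like elements exposing an .answer attribute rather than plain dicts. A version-tolerant extraction helper (illustrative sketch, assuming only the shapes named in the comment):

    def _extract_answer(result) -> str:
        # Handles a list of dicts with 'answer', a list of objects with .answer, or a bare string
        if isinstance(result, list) and result:
            first = result[0]
            if isinstance(first, dict) and "answer" in first:
                return first["answer"]
            answer = getattr(first, "answer", None)
            if answer is not None:
                return answer
        return result if isinstance(result, str) else str(result)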
@@ -113,7 +171,7 @@ def process_image(image, filename, fields=None, model=None):
 
     if fields is None:
         prompt = "Describe this image in detail."
-        content = query_openrouter(prompt, img_base64, model)
+        content = run_vision_inference(prompt, img_base64, model)
         return {'filename': filename, 'description': content}, content, None
     else:
         fields_str = ", ".join(fields)
@@ -121,7 +179,7 @@
             "Extract the following fields from this image and return JSON only "
             f"with these exact keys: {fields_str}. If a field is missing, use an empty string."
         )
-        content = query_openrouter(prompt, img_base64, model)
+        content = run_vision_inference(prompt, img_base64, model)
         structured_data = {'filename': filename}
         parsed = extract_structured_data(content, fields)
         if parsed:
@@ -217,9 +275,10 @@ with st.sidebar:
             "google/gemma-3-4b-it",
             "google/gemma-3-12b-it",
             "openai/gpt-4.1",
-            "openai/gpt-4.1-mini"
+            "openai/gpt-4.1-mini",
+            HF_LLaVA_LABEL  # LLaVA via HF API
         ],
-        help="OpenRouter
+        help="OpenRouter models use OPENROUTER_API_KEY. LLaVA uses HF_TOKEN via HF Inference API."
     )
 
     extraction_mode = "General description"
@@ -255,9 +314,22 @@
 
 # Processing loop
 if uploaded_files and process_button:
-    if not OPENROUTER_API_KEY:
-        st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
+    # Check tokens depending on model choice
+    if selected_model == HF_LLaVA_LABEL:
+        if not HF_CLIENT_AVAILABLE:
+            st.error("huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt.")
+        elif not HF_TOKEN:
+            st.error("HF_TOKEN is not set. Add it in your Space → Settings → Variables & secrets.")
+        else:
+            can_run = True
     else:
+        if not OPENROUTER_API_KEY:
+            st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
+            can_run = False
+        else:
+            can_run = True
+
+    if 'can_run' in locals() and can_run:
         st.header("Processing Results")
         progress_bar = st.progress(0)
         status_text = st.empty()
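A side note on the gate above: the 'can_run' in locals() check exists only because the LLaVA error branches never assign can_run. An equivalent, tighter form (an illustrative rewrite, not part of this commit) initializes the flag first:

    can_run = False
    if selected_model == HF_LLaVA_LABEL:
        if not HF_CLIENT_AVAILABLE:
            st.error("huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt.")
        elif not HF_TOKEN:
            st.error("HF_TOKEN is not set. Add it in your Space → Settings → Variables & secrets.")
        else:
            can_run = True
    elif not OPENROUTER_API_KEY:
        st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
    else:
        can_run = True

    if can_run:
        st.header("Processing Results")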
@@ -369,7 +441,9 @@ if not uploaded_files:
     st.write("""
     How to use:
     1) Upload one or more images or PDFs
-    2) Choose a model
+    2) Choose a model:
+       - OpenRouter: Gemma-3 4B IT, Gemma-3 12B IT, GPT-4.1, GPT-4.1-mini
+       - HF API: LLaVA v1.6 Mistral-7B
     3) Pick description or custom field extraction
     4) For PDFs, choose page-by-page or first page
     5) Click Process Files
@@ -380,7 +454,7 @@ st.markdown("---")
 st.markdown(
     """
     <div style="text-align: center; margin-top: 12px; opacity: 0.7;">
-
+        EZOFIS AI OCR
     </div>
     """,
     unsafe_allow_html=True
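For a quick end-to-end check of the new LLaVA path outside Streamlit, a minimal smoke test; sample.jpg is a placeholder path, HF_TOKEN must be exported, and the model must be available on the serverless Inference API:

    import os
    from huggingface_hub import InferenceClient

    client = InferenceClient(model="llava-hf/llava-v1.6-mistral-7b-hf", token=os.getenv("HF_TOKEN"))
    with open("sample.jpg", "rb") as f:  # placeholder image
        image_bytes = f.read()
    # (image + question) -> answer, mirroring query_hf_llava_vqa above
    print(client.visual_question_answering(image=image_bytes, question="Describe this image in detail."))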