Seth0330 committed on
Commit 4e6170c · verified · 1 Parent(s): d8b8032

Update app.py

Files changed (1): app.py (+86 -12)
app.py CHANGED
@@ -16,6 +16,13 @@ try:
 except ImportError:
     PDF_SUPPORT = False

+# Optional HF Inference API client (for LLaVA serverless)
+try:
+    from huggingface_hub import InferenceClient
+    HF_CLIENT_AVAILABLE = True
+except ImportError:
+    HF_CLIENT_AVAILABLE = False
+
 # ---------------------------
 # Page config
 # ---------------------------
@@ -26,6 +33,12 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )

+# ---------------------------
+# Secrets / Tokens
+# ---------------------------
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # For OpenRouter models
+HF_TOKEN = os.getenv("HF_TOKEN")                      # For HF Inference API (LLaVA)
+
 # ---------------------------
 # Helpers
 # ---------------------------
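Aside (not part of the commit): since both keys are read in one place, a start-up warning can surface missing secrets before the user ever clicks Process Files. A minimal Streamlit sketch, assuming the same env var names as above:

```python
import os
import streamlit as st

OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")

# Warn once at startup instead of failing only when processing begins.
if not (OPENROUTER_API_KEY or HF_TOKEN):
    st.sidebar.warning("No API credentials set; add OPENROUTER_API_KEY or HF_TOKEN in Space secrets.")
```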
@@ -47,6 +60,7 @@ def image_to_base64(image):
     return base64.b64encode(buf.getvalue()).decode('utf-8')

 def extract_structured_data(content, fields):
+    """Attempt to parse JSON object from model text."""
     structured_data = {}
     try:
         if "```json" in content and "```" in content.split("```json")[1]:
@@ -64,18 +78,20 @@ def extract_structured_data(content, fields):
     return structured_data

 # ---------------------------
-# OpenRouter client
+# OpenRouter client (multimodal chat)
 # ---------------------------
-OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # set this in Space Secrets
-
 def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
+    """
+    Calls OpenRouter's /api/v1/chat/completions with a text prompt + one image.
+    Requires OPENROUTER_API_KEY.
+    """
     if not OPENROUTER_API_KEY:
         raise RuntimeError("Missing OPENROUTER_API_KEY. Add it in your Space → Settings → Variables & secrets.")

     data_url = f"data:image/jpeg;base64,{image_base64}"

     payload = {
-        "model": model_id,
+        "model": model_id,  # e.g., "google/gemma-3-4b-it", "openai/gpt-4.1"
         "messages": [
             {
                 "role": "user",
@@ -105,6 +121,48 @@ def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
     data = r.json()
     return data["choices"][0]["message"]["content"]

+# ---------------------------
+# HF Inference API client for LLaVA (serverless, VQA-style)
+# ---------------------------
+@st.cache_resource
+def _hf_client(model_id: str):
+    if not HF_CLIENT_AVAILABLE:
+        raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt.")
+    if not HF_TOKEN:
+        raise RuntimeError("Missing HF_TOKEN. Add it in your Space → Settings → Variables & secrets.")
+    return InferenceClient(model=model_id, token=HF_TOKEN)
+
+def query_hf_llava_vqa(prompt: str, image_base64: str, model_id: str) -> str:
+    """
+    Calls the Hugging Face hosted Inference API for a VLM (e.g., LLaVA v1.6 Mistral-7B).
+    Uses the visual-question-answering interface: (image + question) -> text.
+    """
+    client = _hf_client(model_id)
+    image_bytes = base64.b64decode(image_base64)
+    # Some deployments return list[{'answer': '...'}]; others return str
+    result = client.visual_question_answering(
+        image=image_bytes,
+        question=prompt,
+        # NOTE: this endpoint accepts no generation kwargs (image, question, model only)
+    )
+    if isinstance(result, list) and result and isinstance(result[0], dict) and "answer" in result[0]:
+        return result[0]["answer"]
+    if isinstance(result, str):
+        return result
+    return str(result)
+
+# ---------------------------
+# Router to pick the right backend by model selection
+# ---------------------------
+HF_LLaVA_LABEL = "llava-hf/llava-v1.6-mistral-7b-hf (HF API)"
+HF_LLaVA_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+
+def run_vision_inference(prompt: str, img_b64: str, model_id: str) -> str:
+    if model_id == HF_LLaVA_LABEL:
+        return query_hf_llava_vqa(prompt, img_b64, HF_LLaVA_ID)
+    # All others go via OpenRouter
+    return query_openrouter(prompt, img_b64, model_id)
+
 # ---------------------------
 # Core processing
 # ---------------------------
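Usage of the new router, given the functions above (an illustrative driver, not part of the commit; `sample.jpg` is a placeholder file):

```python
import base64

# The selectbox label for LLaVA doubles as the routing sentinel;
# any other model id is passed to OpenRouter verbatim.
with open("sample.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

print(run_vision_inference("Describe this image in detail.", img_b64, "openai/gpt-4.1-mini"))
print(run_vision_inference("Describe this image in detail.", img_b64, HF_LLaVA_LABEL))
```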
@@ -113,7 +171,7 @@ def process_image(image, filename, fields=None, model=None):

     if fields is None:
         prompt = "Describe this image in detail."
-        content = query_openrouter(prompt, img_base64, model)
+        content = run_vision_inference(prompt, img_base64, model)
         return {'filename': filename, 'description': content}, content, None
     else:
         fields_str = ", ".join(fields)
@@ -121,7 +179,7 @@
         "Extract the following fields from this image and return JSON only "
         f"with these exact keys: {fields_str}. If a field is missing, use an empty string."
     )
-    content = query_openrouter(prompt, img_base64, model)
+    content = run_vision_inference(prompt, img_base64, model)
     structured_data = {'filename': filename}
     parsed = extract_structured_data(content, fields)
     if parsed:
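Worked example of the extraction contract (illustrative): the prompt pins exact keys, and `extract_structured_data` then expects a fenced JSON reply.

```python
fields = ["invoice_no", "date", "total"]
prompt = (
    "Extract the following fields from this image and return JSON only "
    f"with these exact keys: {', '.join(fields)}. If a field is missing, use an empty string."
)
# A compliant model reply would look like:
#   ```json
#   {"invoice_no": "INV-001", "date": "2024-05-01", "total": "$99.00"}
#   ```
```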
@@ -217,9 +275,10 @@ with st.sidebar:
         "google/gemma-3-4b-it",
         "google/gemma-3-12b-it",
         "openai/gpt-4.1",
-        "openai/gpt-4.1-mini"
+        "openai/gpt-4.1-mini",
+        HF_LLaVA_LABEL  # LLaVA via HF API
     ],
-    help="OpenRouter model id"
+    help="OpenRouter models use OPENROUTER_API_KEY. LLaVA uses HF_TOKEN via HF Inference API."
 )

 extraction_mode = "General description"
@@ -255,9 +314,22 @@ with st.sidebar:

 # Processing loop
 if uploaded_files and process_button:
-    if not OPENROUTER_API_KEY:
-        st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
+    # Check tokens depending on model choice
+    can_run = False
+    if selected_model == HF_LLaVA_LABEL:
+        if not HF_CLIENT_AVAILABLE:
+            st.error("huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt.")
+        elif not HF_TOKEN:
+            st.error("HF_TOKEN is not set. Add it in your Space → Settings → Variables & secrets.")
+        else:
+            can_run = True
     else:
+        if not OPENROUTER_API_KEY:
+            st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
+        else:
+            can_run = True
+
+    if can_run:
         st.header("Processing Results")
         progress_bar = st.progress(0)
         status_text = st.empty()
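Design note (not part of the commit): the token gate could also be expressed as a pure helper, keeping the Streamlit branch flat. A sketch reusing the module globals defined above (`check_backend_ready` is an illustrative name):

```python
from typing import Optional

def check_backend_ready(selected_model: str) -> Optional[str]:
    """Return an error message if the chosen backend cannot run, else None."""
    if selected_model == HF_LLaVA_LABEL:
        if not HF_CLIENT_AVAILABLE:
            return "huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt."
        if not HF_TOKEN:
            return "HF_TOKEN is not set. Add it in your Space → Settings → Variables & secrets."
        return None
    if not OPENROUTER_API_KEY:
        return "OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets."
    return None

# err = check_backend_ready(selected_model)
# if err: st.error(err)
# else:   ...process files...
```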
@@ -369,7 +441,9 @@ if not uploaded_files:
     st.write("""
     How to use:
     1) Upload one or more images or PDFs
-    2) Choose a model (Gemma-3, GPT-4.1, GPT-4.1-mini)
+    2) Choose a model:
+       - OpenRouter: Gemma-3 4B IT, Gemma-3 12B IT, GPT-4.1, GPT-4.1-mini
+       - HF API: LLaVA v1.6 Mistral-7B
     3) Pick description or custom field extraction
     4) For PDFs, choose page-by-page or first page
     5) Click Process Files
@@ -380,7 +454,7 @@ st.markdown("---")
 st.markdown(
     """
     <div style="text-align: center; margin-top: 12px; opacity: 0.7;">
-        Built for Hugging Face Spaces + OpenRouter (EZOFIS AI OCR)
+        EZOFIS AI OCR
     </div>
     """,
     unsafe_allow_html=True