Seth0330 committed on
Commit 4e6170c · verified · 1 Parent(s): d8b8032

Update app.py

Files changed (1): app.py (+86 -12)
app.py CHANGED
@@ -16,6 +16,13 @@ try:
 except ImportError:
     PDF_SUPPORT = False

+# Optional HF Inference API client (for LLaVA serverless)
+try:
+    from huggingface_hub import InferenceClient
+    HF_CLIENT_AVAILABLE = True
+except ImportError:
+    HF_CLIENT_AVAILABLE = False
+
 # ---------------------------
 # Page config
 # ---------------------------
@@ -26,6 +33,12 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )

+# ---------------------------
+# Secrets / Tokens
+# ---------------------------
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # For OpenRouter models
+HF_TOKEN = os.getenv("HF_TOKEN")                      # For HF Inference API (LLaVA)
+
 # ---------------------------
 # Helpers
 # ---------------------------
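Aside (not part of the commit): since both keys are read in one place, a start-up warning can surface missing secrets before the user ever clicks Process Files. A minimal Streamlit sketch, assuming the same env var names as above:

```python
import os
import streamlit as st

OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")

# Warn once at startup instead of failing only when processing begins.
if not (OPENROUTER_API_KEY or HF_TOKEN):
    st.sidebar.warning("No API credentials set; add OPENROUTER_API_KEY or HF_TOKEN in Space secrets.")
```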
@@ -47,6 +60,7 @@ def image_to_base64(image):
     return base64.b64encode(buf.getvalue()).decode('utf-8')

 def extract_structured_data(content, fields):
+    """Attempt to parse JSON object from model text."""
     structured_data = {}
     try:
         if "```json" in content and "```" in content.split("```json")[1]:
@@ -64,18 +78,20 @@ def extract_structured_data(content, fields):
     return structured_data

 # ---------------------------
-# OpenRouter client
+# OpenRouter client (multimodal chat)
 # ---------------------------
-OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # set this in Space Secrets
-
 def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
+    """
+    Calls OpenRouter's /api/v1/chat/completions with a text prompt + one image.
+    Requires OPENROUTER_API_KEY.
+    """
     if not OPENROUTER_API_KEY:
         raise RuntimeError("Missing OPENROUTER_API_KEY. Add it in your Space → Settings → Variables & secrets.")

     data_url = f"data:image/jpeg;base64,{image_base64}"

     payload = {
-        "model": model_id,
+        "model": model_id,  # e.g., "google/gemma-3-4b-it", "openai/gpt-4.1"
         "messages": [
             {
                 "role": "user",
@@ -105,6 +121,48 @@ def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
     data = r.json()
     return data["choices"][0]["message"]["content"]

+# ---------------------------
+# HF Inference API client for LLaVA (serverless, VQA-style)
+# ---------------------------
+@st.cache_resource
+def _hf_client(model_id: str):
+    if not HF_CLIENT_AVAILABLE:
+        raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt.")
+    if not HF_TOKEN:
+        raise RuntimeError("Missing HF_TOKEN. Add it in your Space → Settings → Variables & secrets.")
+    return InferenceClient(model=model_id, token=HF_TOKEN)
+
+def query_hf_llava_vqa(prompt: str, image_base64: str, model_id: str) -> str:
+    """
+    Calls the Hugging Face hosted Inference API for a VLM (e.g., LLaVA v1.6 Mistral-7B).
+    Uses the visual-question-answering interface: (image + question) -> text.
+    """
+    client = _hf_client(model_id)
+    image_bytes = base64.b64decode(image_base64)
+    # Some deployments return list[{'answer': '...'}]; others return str
+    result = client.visual_question_answering(
+        image=image_bytes,
+        question=prompt,
+        # NOTE: this endpoint accepts no generation kwargs (image, question, model only)
+    )
+    if isinstance(result, list) and result and isinstance(result[0], dict) and "answer" in result[0]:
+        return result[0]["answer"]
+    if isinstance(result, str):
+        return result
+    return str(result)
+
+# ---------------------------
+# Router to pick the right backend by model selection
+# ---------------------------
+HF_LLaVA_LABEL = "llava-hf/llava-v1.6-mistral-7b-hf (HF API)"
+HF_LLaVA_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+
+def run_vision_inference(prompt: str, img_b64: str, model_id: str) -> str:
+    if model_id == HF_LLaVA_LABEL:
+        return query_hf_llava_vqa(prompt, img_b64, HF_LLaVA_ID)
+    # All others go via OpenRouter
+    return query_openrouter(prompt, img_b64, model_id)
+
 # ---------------------------
 # Core processing
 # ---------------------------
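Usage of the new router, given the functions above (an illustrative driver, not part of the commit; `sample.jpg` is a placeholder file):

```python
import base64

# The selectbox label for LLaVA doubles as the routing sentinel;
# any other model id is passed to OpenRouter verbatim.
with open("sample.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

print(run_vision_inference("Describe this image in detail.", img_b64, "openai/gpt-4.1-mini"))
print(run_vision_inference("Describe this image in detail.", img_b64, HF_LLaVA_LABEL))
```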
@@ -113,7 +171,7 @@ def process_image(image, filename, fields=None, model=None):

     if fields is None:
         prompt = "Describe this image in detail."
-        content = query_openrouter(prompt, img_base64, model)
+        content = run_vision_inference(prompt, img_base64, model)
         return {'filename': filename, 'description': content}, content, None
     else:
         fields_str = ", ".join(fields)
@@ -121,7 +179,7 @@
         "Extract the following fields from this image and return JSON only "
         f"with these exact keys: {fields_str}. If a field is missing, use an empty string."
     )
-    content = query_openrouter(prompt, img_base64, model)
+    content = run_vision_inference(prompt, img_base64, model)
     structured_data = {'filename': filename}
     parsed = extract_structured_data(content, fields)
     if parsed:
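Worked example of the extraction contract (illustrative): the prompt pins exact keys, and `extract_structured_data` then expects a fenced JSON reply.

```python
fields = ["invoice_no", "date", "total"]
prompt = (
    "Extract the following fields from this image and return JSON only "
    f"with these exact keys: {', '.join(fields)}. If a field is missing, use an empty string."
)
# A compliant model reply would look like:
#   ```json
#   {"invoice_no": "INV-001", "date": "2024-05-01", "total": "$99.00"}
#   ```
```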
@@ -217,9 +275,10 @@ with st.sidebar:
         "google/gemma-3-4b-it",
         "google/gemma-3-12b-it",
         "openai/gpt-4.1",
-        "openai/gpt-4.1-mini"
+        "openai/gpt-4.1-mini",
+        HF_LLaVA_LABEL  # LLaVA via HF API
     ],
-    help="OpenRouter model id"
+    help="OpenRouter models use OPENROUTER_API_KEY. LLaVA uses HF_TOKEN via HF Inference API."
 )

 extraction_mode = "General description"
@@ -255,9 +314,22 @@ with st.sidebar:

 # Processing loop
 if uploaded_files and process_button:
-    if not OPENROUTER_API_KEY:
-        st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
+    # Check tokens depending on model choice
+    can_run = False
+    if selected_model == HF_LLaVA_LABEL:
+        if not HF_CLIENT_AVAILABLE:
+            st.error("huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt.")
+        elif not HF_TOKEN:
+            st.error("HF_TOKEN is not set. Add it in your Space → Settings → Variables & secrets.")
+        else:
+            can_run = True
     else:
+        if not OPENROUTER_API_KEY:
+            st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
+        else:
+            can_run = True
+
+    if can_run:
         st.header("Processing Results")
         progress_bar = st.progress(0)
         status_text = st.empty()
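Design note (not part of the commit): the token gate could also be expressed as a pure helper, keeping the Streamlit branch flat. A sketch reusing the module globals defined above (`check_backend_ready` is an illustrative name):

```python
from typing import Optional

def check_backend_ready(selected_model: str) -> Optional[str]:
    """Return an error message if the chosen backend cannot run, else None."""
    if selected_model == HF_LLaVA_LABEL:
        if not HF_CLIENT_AVAILABLE:
            return "huggingface_hub not installed. Add 'huggingface_hub' to requirements.txt."
        if not HF_TOKEN:
            return "HF_TOKEN is not set. Add it in your Space → Settings → Variables & secrets."
        return None
    if not OPENROUTER_API_KEY:
        return "OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets."
    return None

# err = check_backend_ready(selected_model)
# if err: st.error(err)
# else:   ...process files...
```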
@@ -369,7 +441,9 @@ if not uploaded_files:
     st.write("""
     How to use:
     1) Upload one or more images or PDFs
-    2) Choose a model (Gemma-3, GPT-4.1, GPT-4.1-mini)
+    2) Choose a model:
+       - OpenRouter: Gemma-3 4B IT, Gemma-3 12B IT, GPT-4.1, GPT-4.1-mini
+       - HF API: LLaVA v1.6 Mistral-7B
     3) Pick description or custom field extraction
     4) For PDFs, choose page-by-page or first page
     5) Click Process Files
@@ -380,7 +454,7 @@ st.markdown("---")
 st.markdown(
     """
     <div style="text-align: center; margin-top: 12px; opacity: 0.7;">
-        Built for Hugging Face Spaces + OpenRouter (EZOFIS AI OCR)
+        EZOFIS AI OCR
     </div>
     """,
     unsafe_allow_html=True