Update app.py
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py - Robust CPU-friendly SigLip -> (Llava local
+# app.py - Robust CPU-friendly SigLip -> (Llava local | trust_remote_code | HF router) pipeline
 import os
 # Force CPU before importing torch/transformers if you want CPU-only
 os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
@@ -6,37 +6,43 @@ os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
 
 import sys
 import traceback
-from typing import List, Tuple, Optional
 import json
-import
-import time
+from typing import List, Optional
 
+import requests
 import torch
 import torch.nn.functional as F
 from datasets import load_dataset
-from transformers import
+from transformers import (
+    AutoProcessor,
+    AutoModel,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+)
 from PIL import Image
 import gradio as gr
 from tqdm import tqdm
 
 # -------------------------
-# Config
+# Config - update these IDs as needed
 # -------------------------
 SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
 LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # change if needed
 DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
-NUM_DATASETS = 1
+NUM_DATASETS = 1  # set to 15 if you want all datasets (startup memory/time increases)
 BATCH_SIZE = 16
 TOP_K_DEFAULT = 3
-
+
+# Hugging Face router endpoint (new inference endpoint)
+HF_API_URL = "https://router.huggingface.co/hf-inference"
 HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", None)
 
-# Device
+# Device - CPU only
 device = torch.device("cpu")
-print("
+print("Running on device:", device)
 
 # -------------------------
-# Load
+# Load dataset and SigLip model & precompute text embeddings at startup
 # -------------------------
 print("Loading datasets and computing SigLip text embeddings (startup)...")
 texts_all: List[str] = []
@@ -48,9 +54,9 @@ siglip_processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
 siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
 siglip_model.eval()
 
-#
+# Precompute text embeddings (on CPU)
 text_embeds_parts = []
-for i in tqdm(range(0, len(texts_all), BATCH_SIZE), desc="Encoding texts"):
+for i in tqdm(range(0, len(texts_all), BATCH_SIZE), desc="Encoding texts (CPU)"):
     batch_texts = texts_all[i : i + BATCH_SIZE]
     inputs = siglip_processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt")
     with torch.no_grad():
@@ -69,7 +75,7 @@ print(f"Encoded {len(texts_all)} texts. Embeddings shape: {text_embeds_all.shape
 # -------------------------
 llava_tokenizer: Optional[AutoTokenizer] = None
 llava_model = None
-llava_mode = None  # 'local', 'trust_remote_code',
+llava_mode: Optional[str] = None  # 'local', 'trust_remote_code', 'hf_api', or None
 load_errors = []
 
 # Attempt 1: local llava package (preferred)
@@ -89,10 +95,10 @@ try:
     llava_model.eval()
     llava_mode = "local"
     print("✅ Llava loaded from installed package.")
-except Exception
+except Exception:
     tb_local = traceback.format_exc()
     load_errors.append(("local_llava_import", tb_local))
-    print("Local llava import failed - will try trust_remote_code fallback.
+    print("Local llava import failed - will try trust_remote_code fallback. See logs for details.")
 
 # Attempt 2: trust_remote_code fallback
 if llava_mode is None:
@@ -110,33 +116,30 @@ if llava_mode is None:
         llava_model.eval()
         llava_mode = "trust_remote_code"
         print("✅ Llava loaded via trust_remote_code fallback.")
-    except Exception
+    except Exception:
         tb_trust = traceback.format_exc()
         load_errors.append(("fallback_trust_remote_code", tb_trust))
-        print("trust_remote_code fallback failed.")
+        print("trust_remote_code fallback failed - will try HF router if token provided.")
 
-# Attempt 3: Hugging Face Inference API fallback (requires HUGGINGFACE_TOKEN)
+# Attempt 3: Hugging Face router Inference API fallback (requires HUGGINGFACE_TOKEN)
 if llava_mode is None and HUGGINGFACE_TOKEN:
-    # we won't load a model locally; will call inference API for generation
     llava_mode = "hf_api"
-    print("No local model
+    print("No usable local model found. Will use Hugging Face router Inference API for generation (HUGGINGFACE_TOKEN detected).")
 
-# If still no method available, keep llava_mode None and continue - UI will show actionable message
 if llava_mode is None:
-    print("WARNING: No Llava model available
-    print("App will start but generation will return an actionable error. See load_errors for tracebacks.")
+    print("WARNING: No Llava model available and no HUGGINGFACE_TOKEN supplied. Generation will return an actionable error.")
     for name, tb in load_errors:
-        print(f"--- {name} traceback
-        print(tb)
+        print(f"--- {name} traceback ---\n{tb}")
 
 # -------------------------
-# Helper: call Hugging Face
+# Helper: call Hugging Face router inference API
 # -------------------------
 def call_hf_inference_api(prompt: str, max_new_tokens: int = 256, temperature: float = 0.0):
     if not HUGGINGFACE_TOKEN:
         raise RuntimeError("HUGGINGFACE_TOKEN not set; cannot call Hugging Face Inference API.")
-    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
+    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}", "Content-Type": "application/json"}
     payload = {
+        "model": LLAVA_MODEL_ID,
         "inputs": prompt,
         "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature},
        "options": {"wait_for_model": True},
@@ -145,19 +148,17 @@ def call_hf_inference_api(prompt: str, max_new_tokens: int = 256, temperature: f
     if resp.status_code != 200:
         raise RuntimeError(f"HF Inference API error {resp.status_code}: {resp.text}")
     data = resp.json()
-    #
+    # handle common response shapes
     if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
         return data[0]["generated_text"]
     if isinstance(data, dict) and "generated_text" in data:
         return data["generated_text"]
-    # If the model returns a plain string or other structure:
     if isinstance(data, str):
         return data
-    # Fallback: try to stringify
     return json.dumps(data)
 
 # -------------------------
-# Retrieval & generation
+# Retrieval & generation
 # -------------------------
 def retrieve_top_k_texts(image: Image.Image, k: int = TOP_K_DEFAULT):
     inputs = siglip_processor(images=image, return_tensors="pt")
@@ -180,7 +181,6 @@ def llava_answer(image: Image.Image, retrieved_texts, question: str, max_tokens:
     )
 
     if llava_mode in ("local", "trust_remote_code"):
-        # use the tokenizer + local model
         inputs = llava_tokenizer(prompt, return_tensors="pt")
         inputs = {k: v.to(device) for k, v in inputs.items()}
         with torch.no_grad():
@@ -188,28 +188,24 @@ def llava_answer(image: Image.Image, retrieved_texts, question: str, max_tokens:
         resp = llava_tokenizer.decode(output_ids[0], skip_special_tokens=True)
         return resp
     elif llava_mode == "hf_api":
-        # Use HF Inference API
         return call_hf_inference_api(prompt, max_new_tokens=max_tokens)
     else:
-        # No model available - return actionable error for the UI
         err = (
             "No Llava model is available for generation.\n\n"
-            "
+            "Fix options:\n"
             "1) Install the LLaVA repo in requirements.txt and rebuild the Space:\n"
             "   git+https://github.com/haotian-liu/LLaVA.git@main\n"
-            "2) Or
-
-            "Debug info (tracebacks were printed to Space logs at startup).\n"
+            "2) Or add a valid Hugging Face API token as HUGGINGFACE_TOKEN in Space secrets to use the router.\n\n"
+            "Check Space logs for detailed tracebacks printed at startup."
         )
         return err
 
 # -------------------------
-# Gradio
+# Gradio app
 # -------------------------
 def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
     if image is None or not question:
         return None, "Please provide both an image and a question."
-
     retrieved = retrieve_top_k_texts(image, k=int(k))
     try:
         answer = llava_answer(image, retrieved, question)
@@ -221,8 +217,8 @@ def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
 with gr.Blocks(title="Agri Image + Question → Llava Response (robust)") as demo:
     gr.Markdown(
         "## Agri Image QA\n\nThis app preloads SigLip embeddings at startup. "
-        "Generation uses a local Llava model if available, otherwise the Hugging Face Inference API "
-        "(requires HUGGINGFACE_TOKEN
+        "Generation uses a local Llava model if available, otherwise the Hugging Face router Inference API "
+        "(requires HUGGINGFACE_TOKEN secret in Space settings)."
     )
     with gr.Row():
         img_in = gr.Image(type="pil")
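
Context on the router change: the hunks above never show the requests.post(...) call itself (it sits in unshown context between the payload dict and the status-code check), so how HF_API_URL is actually used is not visible in this commit. The sketch below is one plausible reading, assuming the helper posts the payload directly to HF_API_URL; the endpoint, headers, payload fields, and response handling are copied from the diff, while the request line and the timeout value are assumptions.

import json
import os
import requests

HF_API_URL = "https://router.huggingface.co/hf-inference"  # endpoint from the diff
LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"                # model id from the diff

def call_hf_inference_api(prompt: str, max_new_tokens: int = 256, temperature: float = 0.0) -> str:
    token = os.environ.get("HUGGINGFACE_TOKEN")
    if not token:
        raise RuntimeError("HUGGINGFACE_TOKEN not set; cannot call Hugging Face Inference API.")
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    payload = {
        "model": LLAVA_MODEL_ID,  # field added by this commit
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature},
        "options": {"wait_for_model": True},
    }
    # Assumption: the payload is posted straight to HF_API_URL; the actual
    # request line is not part of the shown hunks.
    resp = requests.post(HF_API_URL, headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        raise RuntimeError(f"HF Inference API error {resp.status_code}: {resp.text}")
    data = resp.json()
    # Response-shape handling as in the diff: list of dicts, dict, raw string, else stringify.
    if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
        return data[0]["generated_text"]
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"]
    if isinstance(data, str):
        return data
    return json.dumps(data)

Everything except the request line mirrors the committed helper, so the sketch should behave the same once the real URL construction is confirmed against the Space's logs.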