Update app.py
app.py
CHANGED
Old version (removed lines are marked with "-"; several removed lines are truncated in this capture):

@@ -1,12 +1,15 @@
-# app.py
 import os
-#
-os.environ
-os.environ

 import sys
 import traceback
-from typing import List, Tuple

 import torch
 import torch.nn.functional as F
@@ -17,23 +20,25 @@ import gradio as gr
 from tqdm import tqdm

 # -------------------------
-# Config
 # -------------------------
 SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
-LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf" #
 DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
-NUM_DATASETS = 1
 BATCH_SIZE = 16
 TOP_K_DEFAULT = 3

-# Device
 device = torch.device("cpu")
-print("

 # -------------------------
-# Load dataset
 # -------------------------
-print("Loading datasets and computing SigLip text embeddings (
 texts_all: List[str] = []
 for i in range(1, NUM_DATASETS + 1):
     ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
@@ -43,41 +48,36 @@ siglip_processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
 siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
 siglip_model.eval()

-#
-
-for i in tqdm(range(0, len(texts_all), BATCH_SIZE), desc="Encoding texts
     batch_texts = texts_all[i : i + BATCH_SIZE]
     inputs = siglip_processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt")
-    # inputs are on CPU
     with torch.no_grad():
         text_embeds = siglip_model.get_text_features(**inputs)
     text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
-
     del inputs, text_embeds
-if
-    text_embeds_all = torch.
 else:
-    text_embeds_all = torch.
-print(f"

 # -------------------------
-#
-# Strategy:
-# 1) Try to import LlavaForCausalLM from installed llava package (recommended).
-# 2) If not available, try AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True).
-# 3) If both fail, raise a clear error with instructions.
 # -------------------------
-llava_tokenizer = None
 llava_model = None
-
 load_errors = []

 # Attempt 1: local llava package (preferred)
 try:
-    #
     from llava.model import LlavaForCausalLM  # type: ignore

-    print("
     llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
     llava_model = LlavaForCausalLM.from_pretrained(
         LLAVA_MODEL_ID,
@@ -87,12 +87,15 @@ try:
     )
     llava_model.to(device)
     llava_model.eval()
-
 except Exception as e_local:
     tb_local = traceback.format_exc()
     load_errors.append(("local_llava_import", tb_local))
-    print("Local llava import
-
     try:
         print("Attempting AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True) (CPU)...")
         llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
@@ -105,28 +108,56 @@ except Exception as e_local:
         )
         llava_model.to(device)
         llava_model.eval()
-[old lines 108-126 were removed; their content is truncated away in this capture]

 # -------------------------
-#
 # -------------------------
 def retrieve_top_k_texts(image: Image.Image, k: int = TOP_K_DEFAULT):
     inputs = siglip_processor(images=image, return_tensors="pt")
@@ -139,48 +170,59 @@ def retrieve_top_k_texts(image: Image.Image, k: int = TOP_K_DEFAULT):
     results = [(texts_all[idx.item()], float(score)) for idx, score in zip(topk.indices, topk.values)]
     return results

-# -------------------------
-# Llava answer function
-# -------------------------
 def llava_answer(image: Image.Image, retrieved_texts, question: str, max_tokens: int = 256):
-    # Compose context: retrieved text + short instruction
     context_text = "\n".join([f"Retrieved Text: {t}" for t, _ in retrieved_texts])
     prompt = (
-        "You are an agricultural assistant. Use the provided retrieved texts
         f"Retrieved texts:\n{context_text}\n\n"
         f"User question: {question}\n\n"
-        "Provide a concise, actionable answer and crop suggestions
     )

-[old lines 155-161 were removed; their content is truncated away in this capture]

 # -------------------------
-# Gradio pipeline
 # -------------------------
 def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
     if image is None or not question:
         return None, "Please provide both an image and a question."
     retrieved = retrieve_top_k_texts(image, k=int(k))
     try:
         answer = llava_answer(image, retrieved, question)
     except Exception as e:
         tb = traceback.format_exc()
-        answer = f"Error
     return image, answer

-
-# Gradio app
-# -------------------------
-with gr.Blocks(title="Agri Image + Question → Llava Response (CPU)") as demo:
     gr.Markdown(
-        "
-        "
     )
     with gr.Row():
         img_in = gr.Image(type="pil")
New version (added lines are marked with "+"):

@@ -1,12 +1,15 @@
+# app.py — Robust CPU-friendly SigLip -> (Llava local OR HF-inference fallback) pipeline
 import os
+# Force CPU before importing torch/transformers if you want CPU-only
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
+os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")

 import sys
 import traceback
+from typing import List, Tuple, Optional
+import json
+import requests
+import time

 import torch
 import torch.nn.functional as F
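A note on the two setdefault calls above: clearing CUDA_VISIBLE_DEVICES before torch is first imported hides every GPU from the process, which is what makes the rest of the script reliably CPU-only. A minimal check of the effect:

    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = ""  # must run before `import torch`

    import torch
    print(torch.cuda.is_available())  # False: no GPU is visible to this process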
@@ -17,23 +20,25 @@ import gradio as gr
 from tqdm import tqdm

 # -------------------------
+# Config (update model ids & dataset count)
 # -------------------------
 SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
+LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # change if needed
 DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
+NUM_DATASETS = 1  # set to 15 if you want full data (startup time/memory increases)
 BATCH_SIZE = 16
 TOP_K_DEFAULT = 3
+HF_API_URL = f"https://api-inference.huggingface.co/models/{LLAVA_MODEL_ID}"
+HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", None)

+# Device
 device = torch.device("cpu")
+print("Device:", device)

 # -------------------------
+# Load SigLip dataset & model → precompute text embeddings at startup
 # -------------------------
+print("Loading datasets and computing SigLip text embeddings (startup)...")
 texts_all: List[str] = []
 for i in range(1, NUM_DATASETS + 1):
     ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
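The body of this dataset loop (new lines 45-47) sits outside the diff context, so the exact collection step is not visible here. Assuming each record exposes a "text" column (an assumption; the real column name may differ), it would look roughly like:

    for i in range(1, NUM_DATASETS + 1):
        ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
        # Hypothetical fill step: assumes a "text" column in each record
        texts_all.extend(str(t) for t in ds["text"])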
@@ -43,41 +48,36 @@ siglip_processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
 siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
 siglip_model.eval()

+# compute embeddings
+text_embeds_parts = []
+for i in tqdm(range(0, len(texts_all), BATCH_SIZE), desc="Encoding texts"):
     batch_texts = texts_all[i : i + BATCH_SIZE]
     inputs = siglip_processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt")
     with torch.no_grad():
         text_embeds = siglip_model.get_text_features(**inputs)
     text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+    text_embeds_parts.append(text_embeds.cpu())
     del inputs, text_embeds
+if text_embeds_parts:
+    text_embeds_all = torch.cat(text_embeds_parts, dim=0)
 else:
+    text_embeds_all = torch.empty((0, 0))
+print(f"Encoded {len(texts_all)} texts. Embeddings shape: {text_embeds_all.shape}")

 # -------------------------
+# Llava loading: try local package -> trust_remote_code -> HF Inference API (if token provided)
 # -------------------------
+llava_tokenizer: Optional[AutoTokenizer] = None
 llava_model = None
+llava_mode = None  # 'local', 'trust_remote_code', 'hf_api', or None
 load_errors = []

 # Attempt 1: local llava package (preferred)
 try:
+    # this import requires the LLaVA repo to be installed in the environment (requirements.txt)
     from llava.model import LlavaForCausalLM  # type: ignore

+    print("Loading LlavaForCausalLM from installed 'llava' package (CPU)...")
     llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
     llava_model = LlavaForCausalLM.from_pretrained(
         LLAVA_MODEL_ID,
@@ -87,12 +87,15 @@ try:
     )
     llava_model.to(device)
     llava_model.eval()
+    llava_mode = "local"
+    print("✅ Llava loaded from installed package.")
 except Exception as e_local:
     tb_local = traceback.format_exc()
     load_errors.append(("local_llava_import", tb_local))
+    print("Local llava import failed — will try trust_remote_code fallback. (see logs)")
+
+# Attempt 2: trust_remote_code fallback
+if llava_mode is None:
     try:
         print("Attempting AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True) (CPU)...")
         llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
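The argument list of this second from_pretrained call (new lines 102-107) falls outside the diff context. Judging from the log message, trust_remote_code=True is certainly passed; the other kwargs below are assumptions about a typical CPU-safe call, not the committed code, and this assumes AutoModelForCausalLM is imported from transformers alongside the other Auto classes:

    # Sketch of the elided call; only trust_remote_code is confirmed by the logs.
    llava_model = AutoModelForCausalLM.from_pretrained(
        LLAVA_MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.float32,   # assumed: full precision on CPU
        low_cpu_mem_usage=True,      # assumed: lower peak RAM while loading
    )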
@@ -105,28 +108,56 @@ except Exception as e_local:
         )
         llava_model.to(device)
         llava_model.eval()
+        llava_mode = "trust_remote_code"
+        print("✅ Llava loaded via trust_remote_code fallback.")
+    except Exception as e_trust:
+        tb_trust = traceback.format_exc()
+        load_errors.append(("fallback_trust_remote_code", tb_trust))
+        print("trust_remote_code fallback failed.")
+
+# Attempt 3: Hugging Face Inference API fallback (requires HUGGINGFACE_TOKEN)
+if llava_mode is None and HUGGINGFACE_TOKEN:
+    # we won't load a model locally; will call inference API for generation
+    llava_mode = "hf_api"
+    print("No local model available. Will use Hugging Face Inference API for generation (HUGGINGFACE_TOKEN detected).")
+
+# If still no method available, keep llava_mode None and continue — UI will show actionable message
+if llava_mode is None:
+    print("WARNING: No Llava model available locally or via trust_remote_code, and no HUGGINGFACE_TOKEN found.")
+    print("App will start but generation will return an actionable error. See load_errors for tracebacks.")
+    for name, tb in load_errors:
+        print(f"--- {name} traceback ---")
+        print(tb)
+
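On a Space, the Attempt-3 token arrives via the HUGGINGFACE_TOKEN secret. For a local run the equivalent is setting the variable before this module is imported, because HUGGINGFACE_TOKEN is read once at startup; a hypothetical local setup:

    import os
    os.environ["HUGGINGFACE_TOKEN"] = "hf_xxx"  # placeholder, not a real token

    import app  # module startup can now fall back to llava_mode = "hf_api"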
+# -------------------------
+# Helper: call Hugging Face Inference API for text generation
+# -------------------------
+def call_hf_inference_api(prompt: str, max_new_tokens: int = 256, temperature: float = 0.0):
+    if not HUGGINGFACE_TOKEN:
+        raise RuntimeError("HUGGINGFACE_TOKEN not set; cannot call Hugging Face Inference API.")
+    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
+    payload = {
+        "inputs": prompt,
+        "parameters": {"max_new_tokens": max_new_tokens, "temperature": temperature},
+        "options": {"wait_for_model": True},
+    }
+    resp = requests.post(HF_API_URL, headers=headers, json=payload, timeout=300)
+    if resp.status_code != 200:
+        raise RuntimeError(f"HF Inference API error {resp.status_code}: {resp.text}")
+    data = resp.json()
+    # API returns list or dict depending on model; handle common shapes
+    if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
+        return data[0]["generated_text"]
+    if isinstance(data, dict) and "generated_text" in data:
+        return data["generated_text"]
+    # If the model returns a plain string or other structure:
+    if isinstance(data, str):
+        return data
+    # Fallback: try to stringify
+    return json.dumps(data)
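A quick smoke test for the helper (hypothetical prompt; needs a valid token, and wait_for_model means the call may block while the hosted model spins up):

    # Hypothetical smoke test of the Inference API fallback path.
    if HUGGINGFACE_TOKEN:
        print(call_hf_inference_api("Suggest two drought-tolerant cereal crops.", max_new_tokens=48))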
 # -------------------------
+# Retrieval & generation functions
 # -------------------------
 def retrieve_top_k_texts(image: Image.Image, k: int = TOP_K_DEFAULT):
     inputs = siglip_processor(images=image, return_tensors="pt")

@@ -139,48 +170,59 @@ def retrieve_top_k_texts(image: Image.Image, k: int = TOP_K_DEFAULT):
     results = [(texts_all[idx.item()], float(score)) for idx, score in zip(topk.indices, topk.values)]
     return results
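The unchanged middle of retrieve_top_k_texts (new lines 164-169) is elided between the two fragments above. Since text_embeds_all was L2-normalized at startup, those lines presumably embed and normalize the query image, then score it against every text by dot product (equivalently cosine similarity); a sketch under that assumption:

    # Sketch of the elided body; the committed code may differ in detail.
    with torch.no_grad():
        image_embeds = siglip_model.get_image_features(**inputs)
    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    sims = (image_embeds @ text_embeds_all.T).squeeze(0)  # cosine similarities
    topk = torch.topk(sims, k=min(int(k), sims.numel()))  # feeds the `results` line above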
 def llava_answer(image: Image.Image, retrieved_texts, question: str, max_tokens: int = 256):
     context_text = "\n".join([f"Retrieved Text: {t}" for t, _ in retrieved_texts])
     prompt = (
+        "You are an agricultural assistant. Use the provided retrieved texts to answer concisely.\n\n"
         f"Retrieved texts:\n{context_text}\n\n"
         f"User question: {question}\n\n"
+        "Provide a concise, actionable answer and crop suggestions when applicable."
     )

+    if llava_mode in ("local", "trust_remote_code"):
+        # use the tokenizer + local model
+        inputs = llava_tokenizer(prompt, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            output_ids = llava_model.generate(**inputs, max_new_tokens=max_tokens)
+        resp = llava_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        return resp
+    elif llava_mode == "hf_api":
+        # Use HF Inference API
+        return call_hf_inference_api(prompt, max_new_tokens=max_tokens)
+    else:
+        # No model available — return actionable error for the UI
+        err = (
+            "No Llava model is available for generation.\n\n"
+            "Options to fix:\n"
+            "1) Install the LLaVA repo in requirements.txt and rebuild the Space:\n"
+            "   git+https://github.com/haotian-liu/LLaVA.git@main\n"
+            "2) Or provide a Hugging Face API token as the HUGGINGFACE_TOKEN secret in Space settings so the app can\n"
+            "   fall back to the Inference API. Expected token env var name: HUGGINGFACE_TOKEN\n\n"
+            "Debug info (tracebacks were printed to Space logs at startup).\n"
+        )
+        return err
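Note that llava_answer accepts the image but, in this revision, generation is conditioned on the retrieved texts only; both the local path and the API path send a text-only prompt. A minimal end-to-end call with hypothetical inputs:

    # Hypothetical usage; "leaf.jpg" is a placeholder path.
    from PIL import Image

    img = Image.open("leaf.jpg")
    hits = retrieve_top_k_texts(img, k=TOP_K_DEFAULT)  # [(text, score), ...]
    print(llava_answer(img, hits, "What disease is shown and how should I treat it?"))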
 # -------------------------
+# Gradio pipeline + UI
 # -------------------------
 def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
     if image is None or not question:
         return None, "Please provide both an image and a question."
+
     retrieved = retrieve_top_k_texts(image, k=int(k))
     try:
         answer = llava_answer(image, retrieved, question)
     except Exception as e:
         tb = traceback.format_exc()
+        answer = f"Error during generation: {e}\n\nTraceback:\n{tb}"
     return image, answer

+with gr.Blocks(title="Agri Image + Question → Llava Response (robust)") as demo:
     gr.Markdown(
+        "## Agri Image QA\n\nThis app preloads SigLip embeddings at startup. "
+        "Generation uses a local Llava model if available, otherwise the Hugging Face Inference API "
+        "(requires HUGGINGFACE_TOKEN set in Space secrets)."
     )
     with gr.Row():
         img_in = gr.Image(type="pil")
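The diff capture ends here, inside the Blocks layout. The remaining widgets and wiring are not shown; an app of this shape conventionally finishes by connecting gradio_pipeline to a button and launching, e.g. (assumed, not visible in this capture):

    # Assumed closing lines; the truncated diff does not show them.
    #   ...question textbox, top-k slider, submit button wired to gradio_pipeline...
    demo.launch()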