Update app.py
app.py
CHANGED
@@ -1,25 +1,28 @@
@@ -28,56 +31,104 @@ device = torch.device("cpu")
@@ -88,17 +139,26 @@ def retrieve_top_k_texts(image: Image.Image, k=TOP_K_DEFAULT):
@@ -107,17 +167,27 @@ def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):

[Old-version column of the diff omitted: extraction left only truncated fragments and bare +/- markers. The full updated app.py follows.]

# app.py (CPU-friendly, preloaded SigLip + Llava with robust loading)
import os
# FORCE CPU: must be set before importing torch/transformers
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

import sys
import traceback
from typing import List, Tuple

import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoProcessor, AutoModel, AutoTokenizer, AutoModelForCausalLM
from PIL import Image
import gradio as gr
from tqdm import tqdm

# -------------------------
# Config - update these IDs as needed
# -------------------------
SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # <-- replace with your HF repo ID if different
DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
NUM_DATASETS = 1  # set to 15 if you want all datasets loaded (startup memory/time increases)
BATCH_SIZE = 16
TOP_K_DEFAULT = 3

device = torch.device("cpu")
print("Running on device:", device)
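
# Optional sketch (not part of the original app): let a Space override the two
# model IDs via environment variables instead of code edits. Plain os.environ
# lookups; the variable names below are hypothetical.
SIGLIP_MODEL_ID = os.environ.get("SIGLIP_MODEL_ID", SIGLIP_MODEL_ID)
LLAVA_MODEL_ID = os.environ.get("LLAVA_MODEL_ID", LLAVA_MODEL_ID)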

# -------------------------
# Load dataset and SigLip
# -------------------------
print("Loading datasets and computing SigLip text embeddings (CPU)...")
texts_all: List[str] = []
for i in range(1, NUM_DATASETS + 1):
    ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
    texts_all.extend(ds["text"])

siglip_processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
siglip_model.eval()

# Precompute text embeddings (CPU)
text_embeds_list = []
for i in tqdm(range(0, len(texts_all), BATCH_SIZE), desc="Encoding texts (CPU)"):
    batch_texts = texts_all[i : i + BATCH_SIZE]
    inputs = siglip_processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt")
    # inputs are on CPU
    with torch.no_grad():
        text_embeds = siglip_model.get_text_features(**inputs)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds_list.append(text_embeds.cpu())
    del inputs, text_embeds

if len(text_embeds_list) == 0:
    text_embeds_all = torch.empty((0, 0))
else:
    text_embeds_all = torch.cat(text_embeds_list, dim=0)
print(f"Finished encoding {len(texts_all)} texts. Embeddings shape: {text_embeds_all.shape}")
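
# Optional caching sketch (an addition, not in the original app): persisting the
# precomputed tensor avoids re-encoding the corpus on every restart. Assumes a
# writable working directory; the file name "text_embeds_cache.pt" is hypothetical.
# A fuller version would load the cache before the encoding loop and skip it.
EMB_CACHE_PATH = "text_embeds_cache.pt"
if not os.path.exists(EMB_CACHE_PATH):
    torch.save(text_embeds_all, EMB_CACHE_PATH)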

# -------------------------
# Load Llava model & tokenizer (robust)
# Strategy:
# 1) Try to import LlavaForCausalLM from installed llava package (recommended).
# 2) If not available, try AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True).
# 3) If both fail, raise a clear error with instructions.
# -------------------------
llava_tokenizer = None
llava_model = None

load_errors = []

# Attempt 1: local llava package (preferred)
try:
    # Import here so we don't require the package unless we need it
    from llava.model import LlavaForCausalLM  # type: ignore

    print("Found installed 'llava' package — loading LlavaForCausalLM from it (CPU)...")
    llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
    llava_model = LlavaForCausalLM.from_pretrained(
        LLAVA_MODEL_ID,
        device_map={"": "cpu"},
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
    llava_model.to(device)
    llava_model.eval()
    print("✅ LlavaForCausalLM loaded via local llava package.")
except Exception as e_local:
    tb_local = traceback.format_exc()
    load_errors.append(("local_llava_import", tb_local))
    print("Local llava import/load failed — will attempt fallback (trust_remote_code=True).")
    # Attempt 2: trust_remote_code fallback
    try:
        print("Attempting AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True) (CPU)...")
        llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
        llava_model = AutoModelForCausalLM.from_pretrained(
            LLAVA_MODEL_ID,
            trust_remote_code=True,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
        llava_model.to(device)
        llava_model.eval()
        print("✅ Llava model loaded via trust_remote_code fallback.")
    except Exception as e_fallback:
        tb_fallback = traceback.format_exc()
        load_errors.append(("fallback_trust_remote_code", tb_fallback))
        # Both failed — raise a helpful error describing how to fix
        err_msg = (
            "Failed to load the Llava model using both strategies.\n\n"
            "Recommended fixes:\n"
            "1) Add the LLaVA repo to requirements.txt so the `llava` package and LlavaForCausalLM are installed:\n"
            "   git+https://github.com/haotian-liu/LLaVA.git@main\n"
            "   Then rebuild your Space.\n\n"
            "2) If you prefer trust_remote_code, ensure the HF model repo supports `trust_remote_code=True` and\n"
            "   that any repo-specific dependencies (listed in the repo README) are installed in requirements.txt.\n\n"
            "Debug details (tracebacks):\n\n"
        )
        for name, tb in load_errors:
            err_msg += f"--- {name} traceback ---\n{tb}\n"
        raise RuntimeError(err_msg)
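
# For reference (sketch): the remedy the error message above points to is a
# one-line addition to the Space's requirements.txt (assuming the Space builds
# from one), e.g.
#   git+https://github.com/haotian-liu/LLaVA.git@main
# which makes `from llava.model import LlavaForCausalLM` importable at startup.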

# -------------------------
# SigLip retrieval function
# -------------------------
def retrieve_top_k_texts(image: Image.Image, k: int = TOP_K_DEFAULT):
    inputs = siglip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        img_embed = siglip_model.get_image_features(**inputs)
        img_embed = img_embed / img_embed.norm(p=2, dim=-1, keepdim=True)
    # Cosine similarity against the precomputed, normalized text embeddings
    sims = (img_embed @ text_embeds_all.T).squeeze(0)
    topk = sims.topk(k)
    results = [(texts_all[idx.item()], float(score)) for idx, score in zip(topk.indices, topk.values)]
    return results
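
# Example usage (sketch; "sample_leaf.jpg" is a hypothetical local file, not an
# asset of this Space):
#   img = Image.open("sample_leaf.jpg").convert("RGB")
#   for text, score in retrieve_top_k_texts(img, k=3):
#       print(f"{score:.3f}  {text[:80]}")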

# -------------------------
# Llava answer function
# -------------------------
def llava_answer(image: Image.Image, retrieved_texts, question: str, max_tokens: int = 256):
    # Compose context: retrieved text + short instruction
    context_text = "\n".join([f"Retrieved Text: {t}" for t, _ in retrieved_texts])
    prompt = (
        "You are an agricultural assistant. Use the provided retrieved texts and the image context to answer the user's question.\n\n"
        f"Retrieved texts:\n{context_text}\n\n"
        f"User question: {question}\n\n"
        "Provide a concise, actionable answer and crop suggestions where appropriate."
    )

    inputs = llava_tokenizer(prompt, return_tensors="pt")
    # ensure tokens are on CPU
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output_ids = llava_model.generate(**inputs, max_new_tokens=max_tokens)
    response = llava_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response
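
# Note: llava_answer above builds a text-only prompt; the `image` argument never
# reaches the model. Below is a sketch of a genuinely multimodal call. It assumes
# the checkpoint is compatible with transformers' LlavaForConditionalGeneration
# and uses the llava-1.5 "USER: <image> ... ASSISTANT:" prompt convention. It is
# not wired into the UI, and calling it loads a second copy of the model.
def llava_answer_multimodal(image: Image.Image, question: str, max_tokens: int = 256) -> str:
    from transformers import LlavaForConditionalGeneration

    processor = AutoProcessor.from_pretrained(LLAVA_MODEL_ID)
    model = LlavaForConditionalGeneration.from_pretrained(LLAVA_MODEL_ID, torch_dtype=torch.float32)
    prompt = f"USER: <image>\n{question}\nASSISTANT:"
    mm_inputs = processor(images=image, text=prompt, return_tensors="pt")
    with torch.no_grad():
        out = model.generate(**mm_inputs, max_new_tokens=max_tokens)
    return processor.decode(out[0], skip_special_tokens=True)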

# -------------------------
# Gradio pipeline
# -------------------------
def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
    if image is None or not question:
        return None, "Please provide both an image and a question."
    retrieved = retrieve_top_k_texts(image, k=int(k))
    try:
        answer = llava_answer(image, retrieved, question)
    except Exception as e:
        tb = traceback.format_exc()
        answer = f"Error while generating answer: {e}\n\nTraceback:\n{tb}"
    return image, answer

# -------------------------
# Gradio app
# -------------------------
with gr.Blocks(title="Agri Image + Question → Llava Response (CPU)") as demo:
    gr.Markdown(
        "# Agri Image QA (CPU)\n\nUpload an agriculture image and ask a question. "
        "This Space preloads models and embeddings at startup for faster responses."
    )
    with gr.Row():
        img_in = gr.Image(type="pil")
        out_img = gr.Image(type="pil", label="Image")
    question_input = gr.Textbox(label="Question about the image", lines=2)
    k_slider = gr.Slider(minimum=1, maximum=10, step=1, value=TOP_K_DEFAULT, label="Top-k retrieval")
    txt_out = gr.Textbox(label="Llava Response", lines=12)
    run_btn = gr.Button("Generate Answer")
    run_btn.click(fn=gradio_pipeline, inputs=[img_in, question_input, k_slider], outputs=[out_img, txt_out])
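
# If no launch call exists further down in the file (the diff view ends here),
# a Gradio Space typically starts the UI explicitly:
if __name__ == "__main__":
    demo.launch()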