Update app.py
app.py
CHANGED
@@ -1,27 +1,21 @@
 """
-Gradio Space app (app.py) → SigLip
+Gradio Space app (app.py) → Preloaded SigLip + Llava pipeline for instant response

-Pipeline
-1.
-2.
-3.
-
-Improvements implemented to handle the Tokenizer/Model errors:
-- Lazy-load Llava model and tokenizer only when first required, reducing startup errors and memory usage.
-- Added exception handling for tokenizer/model loading failures (common with incompatible or custom Llava models).
-- Added clear error messages to guide installing correct dependencies or using compatible model versions.
+Pipeline:
+1. At startup: load SigLip processor & model, compute all text embeddings.
+2. At startup: load Llava tokenizer & model.
+3. User uploads an image and asks a question → pipeline uses preloaded resources for instant retrieval and response.
 """

 import os
-from functools import lru_cache
 from typing import List, Tuple

 import gradio as gr
 import torch
 import torch.nn.functional as F
-from datasets import load_dataset
+from datasets import load_dataset, concatenate_datasets
 from PIL import Image
-from transformers import AutoProcessor, AutoModel
+from transformers import AutoProcessor, AutoModel, AutoTokenizer, AutoModelForCausalLM
 from tqdm import tqdm

 SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
@@ -34,41 +28,49 @@ TOP_K_DEFAULT = 3
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 # -------------------------
-#
+# Startup: load all datasets and compute text embeddings
 # -------------------------
-...
+print("⏳ Loading datasets and computing SigLip text embeddings...")
+texts_all = []
+for i in range(1, NUM_DATASETS + 1):
+    ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
+    texts_all.extend(ds["text"])
+
+siglip_processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
+siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
+siglip_model.eval()
+
+text_embeds_all = []
+for i in tqdm(range(0, len(texts_all), BATCH_SIZE), desc="Encoding texts"):
+    batch_texts = texts_all[i:i+BATCH_SIZE]
+    inputs = siglip_processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
+    with torch.no_grad():
+        text_embeds = siglip_model.get_text_features(**inputs)
+    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+    text_embeds_all.append(text_embeds.cpu())
+    del inputs, text_embeds
+    torch.cuda.empty_cache()
+
+text_embeds_all = torch.cat(text_embeds_all, dim=0)
+print(f"✅ Finished encoding {len(texts_all)} texts. Shape: {text_embeds_all.shape}")
+
+# -------------------------
+# Startup: load Llava model & tokenizer
+# -------------------------
+print("⏳ Loading Llava model and tokenizer...")
+llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID)
+llava_model = AutoModelForCausalLM.from_pretrained(LLAVA_MODEL_ID).to(device)
+llava_model.eval()
+print("✅ Llava model loaded.")

 # -------------------------
 # SigLip retrieval
 # -------------------------
+
 def retrieve_top_k_texts(image: Image.Image, k=TOP_K_DEFAULT):
-
-    inputs = processor(images=image, return_tensors="pt").to(device)
+    inputs = siglip_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
-        img_embed =
+        img_embed = siglip_model.get_image_features(**inputs)
     img_embed = img_embed / img_embed.norm(p=2, dim=-1, keepdim=True)

     sims = F.cosine_similarity(img_embed.cpu(), text_embeds_all)
@@ -77,34 +79,17 @@ def retrieve_top_k_texts(image: Image.Image, k=TOP_K_DEFAULT):
     return results

 # -------------------------
-#
+# Llava answer
 # -------------------------
-llava_model_cache = {}
-
-def load_llava_model():
-    if 'model' in llava_model_cache and 'tokenizer' in llava_model_cache:
-        return llava_model_cache['tokenizer'], llava_model_cache['model']
-
-    try:
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID)
-        model = AutoModelForCausalLM.from_pretrained(LLAVA_MODEL_ID).to(device)
-        model.eval()
-        llava_model_cache['tokenizer'] = tokenizer
-        llava_model_cache['model'] = model
-        return tokenizer, model
-    except Exception as e:
-        raise RuntimeError(f"Failed to load Llava model/tokenizer: {e}. Ensure LLAVA_MODEL_ID is correct and compatible with transformers.")

 def llava_answer(image: Image.Image, retrieved_texts: List[str], question: str, max_tokens=256):
-    tokenizer, model = load_llava_model()
     context_text = "\n".join([f"Retrieved Text: {t}" for t, _ in retrieved_texts])
     prompt = f"Given the image and the following texts:\n{context_text}\nUser Question: {question}\nProvide a detailed answer and crop suggestions."

-    inputs =
+    inputs = llava_tokenizer(prompt, return_tensors="pt").to(device)
     with torch.no_grad():
-        output_ids =
-        response =
+        output_ids = llava_model.generate(**inputs, max_new_tokens=max_tokens)
+    response = llava_tokenizer.decode(output_ids[0], skip_special_tokens=True)
     return response

 # -------------------------
@@ -116,10 +101,7 @@ def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
         return None, "Please provide both image and question."

     retrieved_texts = retrieve_top_k_texts(image, k=int(k))
-    try:
-        response = llava_answer(image, retrieved_texts, question)
-    except RuntimeError as e:
-        response = str(e)
+    response = llava_answer(image, retrieved_texts, question)
     return image, response

 with gr.Blocks(title="Agri Image + Question → Llava Response") as demo:
@@ -135,4 +117,4 @@
 run_btn.click(fn=gradio_pipeline, inputs=[img_in, question_input, k_slider], outputs=[out_img, txt_out])

 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", share=False)
+    demo.launch(server_name="0.0.0.0", share=False)