EYEDOL committed
Commit 7679f34 · verified · 1 Parent(s): 43590b6

Update app.py

Files changed (1)
  1. app.py +31 -22
app.py CHANGED
@@ -1,14 +1,15 @@
 """
-Gradio Space app (app.py) — SigLip Image + Question → Llava Response
+Gradio Space app (app.py) — SigLip Image + Question → Llava Response (Improved)
 
-Pipeline:
-1. User uploads an agriculture image.
-2. User asks a question about the image.
-3. SigLip model retrieves top-k text captions relevant to the image.
-4. The retrieved text, original image, and user's question are sent to a Llava model.
-5. Llava generates a context-aware response with crop suggestions or explanations.
+Pipeline and improvements:
+1. User uploads an agriculture image and asks a question.
+2. SigLip model retrieves top-k relevant texts.
+3. Llava model generates a response using the retrieved texts, image, and question.
 
-This updated app handles both the image retrieval and multi-modal question answering.
+Improvements implemented to handle the tokenizer/model errors:
+- Lazy-load the Llava model and tokenizer only when first required, reducing startup errors and memory usage.
+- Added exception handling for tokenizer/model loading failures (common with incompatible or custom Llava models).
+- Added clear error messages that guide users to install the correct dependencies or use a compatible model version.
 """
 
 import os
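The docstring's retrieval step is the half of the pipeline this diff never shows: the body of `retrieve_top_k_texts` sits outside the changed hunks. A minimal sketch of what an embed-and-rank implementation typically looks like with the SigLip config below, assuming a precomputed, L2-normalized caption embedding matrix and a parallel caption list (all argument names here are illustrative, not code from the repository):

```python
# Illustrative sketch: the commit never shows retrieve_top_k_texts's body.
# All argument names are assumptions, not code from the repository.
import torch
from PIL import Image

def top_k_captions(image: Image.Image, model, processor,
                   text_embeds: torch.Tensor, captions: list, k: int = 3) -> list:
    """Rank precomputed caption embeddings against a single image embedding."""
    inputs = processor(images=image, return_tensors="pt").to(text_embeds.device)
    with torch.no_grad():
        img_embed = model.get_image_features(**inputs)  # SiglipModel method
    img_embed = img_embed / img_embed.norm(dim=-1, keepdim=True)
    # Cosine similarity, assuming text_embeds is already L2-normalized.
    sims = (img_embed @ text_embeds.T).squeeze(0)
    top = sims.topk(min(k, len(captions)))
    return [captions[i] for i in top.indices.tolist()]
```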
@@ -23,17 +24,13 @@ from PIL import Image
 from transformers import AutoProcessor, AutoModel
 from tqdm import tqdm
 
-# -------------------------
-# Config
-# -------------------------
 SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
-LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # replace with actual model
+LLAVA_MODEL_ID = "your-llava-model-hf-id"  # replace with actual model
 DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
 NUM_DATASETS = 1
 BATCH_SIZE = 16
 TOP_K_DEFAULT = 3
 
-# Device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 # -------------------------
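A note on `DATASET_TEMPLATE` and `NUM_DATASETS`: the `{}` placeholder suggests the caption corpus is split across numbered Hub datasets. A hedged sketch of how the shards might be gathered, assuming 1-based shard numbering and a `text` column; neither detail is confirmed by this diff:

```python
# Hedged sketch: collect caption texts from numbered dataset shards.
# The shard indexing scheme and the "text" column name are assumptions.
from datasets import load_dataset

def load_caption_corpus(template: str, num_datasets: int) -> list:
    captions = []
    for i in range(1, num_datasets + 1):
        shard = load_dataset(template.format(i), split="train")
        captions.extend(shard["text"])
    return captions

# e.g. load_caption_corpus(DATASET_TEMPLATE, NUM_DATASETS)
```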
@@ -80,15 +77,24 @@ def retrieve_top_k_texts(image: Image.Image, k=TOP_K_DEFAULT):
     return results
 
 # -------------------------
-# Llava response
+# Lazy-load Llava model with error handling
 # -------------------------
-@lru_cache(maxsize=1)
+llava_model_cache = {}
+
 def load_llava_model():
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(LLAVA_MODEL_ID).to(device)
-    model.eval()
-    return tokenizer, model
+    if 'model' in llava_model_cache and 'tokenizer' in llava_model_cache:
+        return llava_model_cache['tokenizer'], llava_model_cache['model']
+
+    try:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID)
+        model = AutoModelForCausalLM.from_pretrained(LLAVA_MODEL_ID).to(device)
+        model.eval()
+        llava_model_cache['tokenizer'] = tokenizer
+        llava_model_cache['model'] = model
+        return tokenizer, model
+    except Exception as e:
+        raise RuntimeError(f"Failed to load Llava model/tokenizer: {e}. Ensure LLAVA_MODEL_ID is correct and compatible with transformers.")
 
 def llava_answer(image: Image.Image, retrieved_texts: List[str], question: str, max_tokens=256):
     tokenizer, model = load_llava_model()
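Two observations on this hunk. First, the removed `@lru_cache(maxsize=1)` on a zero-argument function caches its single result exactly like the new module-level dict, so the rewrite's real gain is the explicit try/except with an actionable message. Second, the tokenizer/model errors the docstring mentions are a known pitfall here: llava-hf checkpoints are multimodal and generally do not load via `AutoModelForCausalLM`/`AutoTokenizer`. A hedged alternative using the multimodal classes, with illustrative dtype and device settings (not the commit's code):

```python
# Hedged alternative loader for llava-hf checkpoints; not the commit's code.
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

def load_llava_multimodal(model_id: str):
    processor = AutoProcessor.from_pretrained(model_id)  # bundles tokenizer + image processor
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # halves memory for a 7B model
        device_map="auto",          # requires the `accelerate` package
    )
    model.eval()
    return processor, model
```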
@@ -110,7 +116,10 @@ def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
         return None, "Please provide both image and question."
 
     retrieved_texts = retrieve_top_k_texts(image, k=int(k))
-    response = llava_answer(image, retrieved_texts, question)
+    try:
+        response = llava_answer(image, retrieved_texts, question)
+    except RuntimeError as e:
+        response = str(e)
     return image, response
 
 with gr.Blocks(title="Agri Image + Question → Llava Response") as demo:
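The diff shows only the first line of `llava_answer`, so the prompt assembly and generation step are out of frame. A sketch of what that step could look like with the multimodal loader above, assuming the llava-1.5 `USER: <image> ... ASSISTANT:` chat format (an assumption about the chosen checkpoint):

```python
# Illustrative generation step; prompt format and helper names are assumptions.
import torch

def generate_answer(processor, model, image, retrieved_texts, question,
                    max_tokens: int = 256) -> str:
    context = "\n".join(f"- {t}" for t in retrieved_texts)
    prompt = (f"USER: <image>\nContext from similar images:\n{context}\n"
              f"Question: {question}\nASSISTANT:")
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_tokens)
    # Decode only the tokens generated after the prompt.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    return processor.decode(new_tokens, skip_special_tokens=True).strip()
```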
@@ -126,4 +135,4 @@ with gr.Blocks(title="Agri Image + Question → Llava Response") as demo:
     run_btn.click(fn=gradio_pipeline, inputs=[img_in, question_input, k_slider], outputs=[out_img, txt_out])
 
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", share=False)
+    demo.launch(server_name="0.0.0.0", share=False)
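The re-added launch line is the right call for a Space: `server_name="0.0.0.0"` binds all interfaces so the platform's proxy can reach the app, and `share=False` is fine because Spaces provides the public URL. A quick local smoke test of the wired pipeline (the image path is a placeholder):

```python
# Local smoke test; "sample_leaf.jpg" is a hypothetical path.
from PIL import Image

img = Image.open("sample_leaf.jpg")
_, answer = gradio_pipeline(img, "What disease is affecting this crop?", k=3)
print(answer)
```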