Update app.py
app.py CHANGED

@@ -1,77 +1,83 @@
-"""
-Gradio Space app: Preloaded SigLip + Llava pipeline for instant user response.
-Pipeline:
-1. Startup: load SigLip processor, model, compute all text embeddings.
-2. Startup: load Llava tokenizer & LlavaForCausalLM model.
-3. User uploads image + asks question → instant retrieval + Llava response.
-"""
-
import os
-

-import gradio as gr
import torch
import torch.nn.functional as F
from datasets import load_dataset
from PIL import Image
-
-
-# Install llava repo if not already installed:
-# pip install git+https://github.com/haotian-liu/LLaVA.git
-from llava.model import LlavaForCausalLM
-from transformers import AutoTokenizer

SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
-LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
NUM_DATASETS = 1
BATCH_SIZE = 16
TOP_K_DEFAULT = 3

-

# -------------------------
-#
# -------------------------
-print("
texts_all = []
for i in range(1, NUM_DATASETS + 1):
    ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
    texts_all.extend(ds["text"])

siglip_processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
siglip_model.eval()

text_embeds_all = []
-for i in range(0, len(texts_all), BATCH_SIZE):
-    batch_texts = texts_all[i:i+BATCH_SIZE]
-    inputs = siglip_processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        text_embeds = siglip_model.get_text_features(**inputs)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds_all.append(text_embeds.cpu())
    del inputs, text_embeds
-    torch.cuda.empty_cache()
-
text_embeds_all = torch.cat(text_embeds_all, dim=0)
-print(f"

# -------------------------
-#
# -------------------------
-print("
llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
-
llava_model.eval()
-print("

# -------------------------
-#
# -------------------------
-
def retrieve_top_k_texts(image: Image.Image, k=TOP_K_DEFAULT):
-    inputs = siglip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        img_embed = siglip_model.get_image_features(**inputs)
    img_embed = img_embed / img_embed.norm(p=2, dim=-1, keepdim=True)
@@ -81,37 +87,30 @@ def retrieve_top_k_texts(image: Image.Image, k=TOP_K_DEFAULT):
    results = [(texts_all[idx.item()], float(score)) for idx, score in zip(topk.indices, topk.values)]
    return results

-
-# Llava answer function
-# -------------------------
-
-def llava_answer(image: Image.Image, retrieved_texts: List[str], question: str, max_tokens=256):
    context_text = "\n".join([f"Retrieved Text: {t}" for t, _ in retrieved_texts])
    prompt = f"Given the image and the following texts:\n{context_text}\nUser Question: {question}\nProvide a detailed answer and crop suggestions."

-    inputs = llava_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
-
-
-    return

# -------------------------
-# Gradio
# -------------------------
-
def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
    if image is None or not question:
-        return None, "Please provide both image and question."

-
-
-    return image, response
-
-# -------------------------
-# Gradio Blocks
-# -------------------------
-with gr.Blocks(title="Agri Image + Question → Llava Response") as demo:
-    gr.Markdown("# Agri Image Question Answering\nUpload an agriculture image, ask a question, and get context-aware crop suggestions.")
    with gr.Row():
        img_in = gr.Image(type="pil")
        out_img = gr.Image(type="pil", label="Image")
@@ -119,8 +118,7 @@ with gr.Blocks(title="Agri Image + Question → Llava Response") as demo:
    k_slider = gr.Slider(minimum=1, maximum=10, step=1, value=TOP_K_DEFAULT, label="Top-k retrieval")
    txt_out = gr.Textbox(label="Llava Response", lines=8)
    run_btn = gr.Button("Generate Answer")
-
    run_btn.click(fn=gradio_pipeline, inputs=[img_in, question_input, k_slider], outputs=[out_img, txt_out])

if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", share=False)
+# app.py (CPU-only version)
import os
+# FORCE CPU: disable CUDA visibility for this process before importing torch/transformers
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # important: must be set before torch import
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

import torch
import torch.nn.functional as F
from datasets import load_dataset
+from transformers import AutoProcessor, AutoTokenizer, AutoModel, AutoModelForCausalLM  # AutoModel is needed for the SigLip model below
from PIL import Image
+import gradio as gr
+from tqdm import tqdm

+# -------------------------
+# Config - set your model IDs here
+# -------------------------
SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
+LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # <-- replace this with the HF repo ID
DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
NUM_DATASETS = 1
BATCH_SIZE = 16
TOP_K_DEFAULT = 3

+# Device - CPU only
+device = torch.device("cpu")
+print("Running on device:", device)
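# NOTE: with CUDA hidden via the env var above, torch.cuda.is_available() returns
# False for this process, so every model and tensor below defaults to CPU.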

# -------------------------
+# Load dataset and SigLip (as before)
# -------------------------
+print("Loading datasets and computing SigLip text embeddings (CPU)...")
texts_all = []
for i in range(1, NUM_DATASETS + 1):
    ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
    texts_all.extend(ds["text"])

siglip_processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
+# Use AutoModel for Siglip (same as before)
siglip_model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
siglip_model.eval()

+# Precompute text embeddings (on CPU) -- this may take time
text_embeds_all = []
+for i in tqdm(range(0, len(texts_all), BATCH_SIZE), desc="Encoding texts (CPU)"):
+    batch_texts = texts_all[i : i + BATCH_SIZE]
+    inputs = siglip_processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt")
+    # ensure tensors are on CPU (they already are)
    with torch.no_grad():
        text_embeds = siglip_model.get_text_features(**inputs)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds_all.append(text_embeds.cpu())
    del inputs, text_embeds
text_embeds_all = torch.cat(text_embeds_all, dim=0)
+print(f"Finished encoding {len(texts_all)} texts. Embeddings shape: {text_embeds_all.shape}")
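# NOTE (suggestion, not part of this commit): this encoding pass reruns on every
# Space restart; persisting the result, e.g. torch.save(text_embeds_all, "text_embeds.pt")
# and reloading the file when it exists, would avoid the repeated CPU cost.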

# -------------------------
+# Load Llava tokenizer + model on CPU
# -------------------------
+print("Loading Llava tokenizer and model (CPU, trust_remote_code=True)...")
+# Use slow tokenizer if fast fails on Spaces
llava_tokenizer = AutoTokenizer.from_pretrained(LLAVA_MODEL_ID, use_fast=False)
+
+# Use trust_remote_code=True so the repo's custom model class is used.
+# Use device_map={"": "cpu"} to force all model weights to CPU; use torch_dtype=float32 for safety.
+llava_model = AutoModelForCausalLM.from_pretrained(
+    LLAVA_MODEL_ID,
+    trust_remote_code=True,
+    device_map={"": "cpu"},
+    torch_dtype=torch.float32,
+    low_cpu_mem_usage=True  # help reduce RAM usage when possible
+)
llava_model.eval()
+print("Llava model loaded onto CPU.")
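# NOTE (assumption, not part of this commit): the stock llava-hf/llava-1.5-7b-hf
# checkpoint is normally loaded in transformers with LlavaForConditionalGeneration
# plus AutoProcessor for its multimodal inputs; if AutoModelForCausalLM rejects the
# Llava config above, that class is the first alternative to try.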
|
| 76 |
# -------------------------
|
| 77 |
+
# Retrieval and answer functions
|
| 78 |
# -------------------------
|
|
|
|
| 79 |
def retrieve_top_k_texts(image: Image.Image, k=TOP_K_DEFAULT):
|
| 80 |
+
inputs = siglip_processor(images=image, return_tensors="pt")
|
| 81 |
with torch.no_grad():
|
| 82 |
img_embed = siglip_model.get_image_features(**inputs)
|
| 83 |
img_embed = img_embed / img_embed.norm(p=2, dim=-1, keepdim=True)
|
|
|
|
| 87 |
results = [(texts_all[idx.item()], float(score)) for idx, score in zip(topk.indices, topk.values)]
|
| 88 |
return results
|
| 89 |
|
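# A sketch of what those elided similarity/top-k lines typically compute (assumed,
# not the committed code): with both embedding sets L2-normalized, cosine similarity
# reduces to a matrix product, e.g.
#     sims = (img_embed @ text_embeds_all.T).squeeze(0)
#     topk = sims.topk(k)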
+def llava_answer(image: Image.Image, retrieved_texts, question: str, max_tokens=256):
    context_text = "\n".join([f"Retrieved Text: {t}" for t, _ in retrieved_texts])
    prompt = f"Given the image and the following texts:\n{context_text}\nUser Question: {question}\nProvide a detailed answer and crop suggestions."

+    inputs = llava_tokenizer(prompt, return_tensors="pt")
+    # ensure inputs are on CPU
+    inputs = {k: v.to("cpu") for k, v in inputs.items()}
    with torch.no_grad():
+        out = llava_model.generate(**inputs, max_new_tokens=max_tokens)
+    resp = llava_tokenizer.decode(out[0], skip_special_tokens=True)
+    return resp

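# NOTE: `image` is accepted but never passed to generate(), so the Llava call above
# answers from the prompt text alone; a truly multimodal answer would need the
# model's image processor to build pixel inputs (not shown in this commit).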
# -------------------------
+# Gradio pipeline
# -------------------------
def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
    if image is None or not question:
+        return None, "Please provide both an image and a question."
+    retrieved = retrieve_top_k_texts(image, k=int(k))
+    answer = llava_answer(image, retrieved, question)
+    return image, answer
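# int(k) above guards against the Gradio slider delivering its value as a float.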

+with gr.Blocks(title="Agri Image + Question → Llava Response (CPU)") as demo:
+    gr.Markdown("# Agri Image QA (CPU)\nUpload an agriculture image + question. This runs fully on CPU.")
    with gr.Row():
        img_in = gr.Image(type="pil")
        out_img = gr.Image(type="pil", label="Image")
    # (unchanged line 117, the question input Textbox, is not shown in the diff)
    k_slider = gr.Slider(minimum=1, maximum=10, step=1, value=TOP_K_DEFAULT, label="Top-k retrieval")
    txt_out = gr.Textbox(label="Llava Response", lines=8)
    run_btn = gr.Button("Generate Answer")
    run_btn.click(fn=gradio_pipeline, inputs=[img_in, question_input, k_slider], outputs=[out_img, txt_out])

if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", share=False)
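
A minimal smoke test for the functions above, bypassing the Gradio UI (a sketch only: it assumes a local image file test.jpg and runs inside the same module, none of which is part of this commit):

from PIL import Image

img = Image.open("test.jpg").convert("RGB")   # hypothetical local test image
retrieved = retrieve_top_k_texts(img, k=3)    # list of (text, score) pairs
for text, score in retrieved:
    print(f"{score:.3f}  {text[:60]}")
print(llava_answer(img, retrieved, "What crop disease is visible?"))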