Update app.py
app.py
CHANGED
@@ -1,203 +1,129 @@
 """
-Gradio Space app (app.py) — SigLip
-
-- On startup it loads your concatenated datasets and the fine-tuned model `EYEDOL/siglipFULL-agri-finetuned`.
-- It precomputes (and caches) normalized text embeddings on CPU to save GPU memory.
-- The Gradio UI allows users to upload an image, view it, and returns the top-k matched text captions.
-
-Notes for Spaces
-- If your model or datasets are private, add a `HUGGINGFACE_TOKEN` secret in the Space settings and set `USE_HF_TOKEN = True` below.
-- If you select a GPU runtime for the Space, the app will use it if available.
 """
 
 import os
-import tempfile
 from functools import lru_cache
 from typing import List, Tuple
 
 import gradio as gr
 import torch
 import torch.nn.functional as F
-from datasets import load_dataset
 from PIL import Image
-from transformers import AutoProcessor, AutoModel
 from tqdm import tqdm
 
 # -------------------------
 # Config
 # -------------------------
-MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
-DATASET_TEMPLATE = …
-NUM_DATASETS = …
-BATCH_SIZE = …
-USE_HF_TOKEN = False
 TOP_K_DEFAULT = 3
 
-# Look for HF token in environment (Spaces -> Settings -> Secrets set HUGGINGFACE_TOKEN)
-HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", None)
-if HF_TOKEN:
-    USE_HF_TOKEN = True
-
-# -------------------------
 # Device
-# -------------------------
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 # -------------------------
-# Load and merge datasets
 # -------------------------
 @lru_cache(maxsize=1)
-def load_and_merge_datasets():
     texts = []
-    for i in range(1, NUM_DATASETS + 1):
-        name = DATASET_TEMPLATE.format(i)
-        try:
-            ds = load_dataset(name, split="train")
-            # expect a field 'text'
-            texts.extend(list(ds["text"]))
-        except Exception as e:
-            print(f"Warning: failed to load {name}: {e}")
-    return texts
 
-
-# -------------------------
-@lru_cache(maxsize=1)
-def load_model_and_processor(model_id: str = MODEL_ID, use_token: bool = USE_HF_TOKEN):
-    kwargs = {}
-    if use_token and HF_TOKEN:
-        kwargs["use_auth_token"] = HF_TOKEN
-    processor = AutoProcessor.from_pretrained(model_id, **kwargs)
-    model = AutoModel.from_pretrained(model_id, **kwargs)
-    model.to(device)
     model.eval()
-    return processor, model
-
-# -------------------------
-# Precompute text embeddings (CPU) and return tensors + raw texts
-# -------------------------
-@lru_cache(maxsize=1)
-def precompute_text_embeddings(texts_tuple: Tuple[str, ...]):
-    # convert tuple back to list
-    texts = list(texts_tuple)
-    processor, model = load_model_and_processor()
 
     text_embeds_all = []
-    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Encoding texts"):
-        batch_texts = texts[i:i + BATCH_SIZE]
-
-        inputs = processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt")
-        # encode on device then move embeddings to CPU
-        inputs = {k: v.to(device) for k, v in inputs.items()}
         with torch.no_grad():
             text_embeds = model.get_text_features(**inputs)
             text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
             text_embeds_all.append(text_embeds.cpu())
             del inputs, text_embeds
-
-        torch.cuda.empty_cache()
 
-    if len(text_embeds_all) == 0:
-        return torch.empty((0, 0)), []
     text_embeds_all = torch.cat(text_embeds_all, dim=0)
-    return text_embeds_all, texts
 
 # -------------------------
-# Startup: load data, model and embeddings
-# -------------------------
-print("Starting app: loading data and model — this may take a minute...")
-raw_texts = load_and_merge_datasets()
-print(f"Loaded {len(raw_texts)} text captions from datasets (merged).")
-text_embeds_all, texts_all = precompute_text_embeddings(tuple(raw_texts))
-print(f"Precomputed text embeddings: {text_embeds_all.shape}")
-processor, model = load_model_and_processor()
-
 # -------------------------
-
-
-
-def retrieve_top_k_texts_from_image(image: Image.Image, k: int = TOP_K_DEFAULT) -> List[Tuple[str, float]]:
-    # prepare image
-    inputs = processor(images=image, return_tensors="pt")
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
     with torch.no_grad():
         img_embed = model.get_image_features(**inputs)
         img_embed = img_embed / img_embed.norm(p=2, dim=-1, keepdim=True)
 
-    # move to CPU and compute similarity with precomputed text embeddings
     sims = F.cosine_similarity(img_embed.cpu(), text_embeds_all)
     topk = torch.topk(sims, k)
-    results = []
-    for i in range(k):
-        idx = topk.indices[i].item()
-        score = topk.values[i].item()
-        results.append((texts_all[idx], float(score)))
     return results
 
 # -------------------------
 # Gradio interface
 # -------------------------
 
-def …(image, k):
-    if image is None:
-        return None, "…"
-
-    results = retrieve_top_k_texts_from_image(image, k=int(k))
-
-    # …
-    formatted = "\n\n".join([f"Rank {i+1}: {t}\n(score={s:.4f})" for i, (t, s) in enumerate(results)])
-    return image, formatted
-
-with gr.Blocks(title="SigLip Image -> Text Retriever") as demo:
-    gr.Markdown("# SigLip Image → Text retrieval demo\nUpload an image and get the top-k matching texts from the dataset.")
     with gr.Row():
         img_in = gr.Image(type="pil")
         out_img = gr.Image(type="pil", label="Image")
-
-    k_slider = gr.Slider(minimum=1, maximum=10, step=1, value=TOP_K_DEFAULT, label="Top-k")
-    …
 
-    run_btn.click(fn=…, inputs=[img_in, k_slider], outputs=[out_img, txt_out])
 
-# Expose the app
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", share=False)
-
-
-# -------------------------
-# requirements.txt (place in your Space as requirements.txt):
-# -------------------------
-# torch
-# torchvision
-# transformers==4.44.2
-# datasets
-# gradio
-# huggingface_hub
-# accelerate
-# pillow
-# tqdm
-
-# -------------------------
-# Quick setup checklist for HF Space
-# -------------------------
-# 1. Create a new Space (Gradio). In Settings -> Hardware, choose GPU if it is available and you expect faster inference.
-# 2. Add the requirements.txt (as above).
-# 3. If the model/datasets are private, go to Settings -> Secrets and add HUGGINGFACE_TOKEN with a token that has access.
-# 4. If you set the secret, the app will automatically pick it up from the HUGGINGFACE_TOKEN env var.
-# 5. Commit this app.py and requirements.txt to the Space and your app should start.
-
-# -------------------------
-# Tips & Troubleshooting
-# -------------------------
-# - Startup time may be long (model download, dataset download, text embedding encoding). Consider saving precomputed text embeddings
-#   to a file (np.save / torch.save) and loading them to speed startup. In Spaces persistent storage is /workspace or /root/.cache.
-# - If memory is tight, reduce NUM_DATASETS or BATCH_SIZE, or compute embeddings offline and upload a precomputed tensor.
-# - Avoid printing too many things in Spaces logs to reduce noise.
-# -------------------------
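The removed tips above suggest persisting the precomputed text embeddings instead of re-encoding them on every startup. A minimal sketch of that idea against the old file's `precompute_text_embeddings`; the cache path and helper name are illustrative, not from the commit:

import os
import torch

EMB_CACHE = "text_embeds_cache.pt"  # illustrative path, not from the commit

def load_or_compute_embeddings(raw_texts):
    # Reuse cached embeddings when present; otherwise encode once and persist.
    if os.path.exists(EMB_CACHE):
        payload = torch.load(EMB_CACHE)
        return payload["embeds"], payload["texts"]
    embeds, texts = precompute_text_embeddings(tuple(raw_texts))
    torch.save({"embeds": embeds, "texts": texts}, EMB_CACHE)
    return embeds, texts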
 """
+Gradio Space app (app.py) — SigLip Image + Question → Llava Response
 
+Pipeline:
+1. User uploads an agriculture image.
+2. User asks a question about the image.
+3. The SigLip model retrieves the top-k text captions relevant to the image.
+4. The retrieved texts, the original image, and the user's question are sent to a Llava model.
+5. Llava generates a context-aware response with crop suggestions or explanations.
 
+This updated app handles both the image retrieval and the multi-modal question answering.
 """
 
 import os
 from functools import lru_cache
 from typing import List, Tuple
 
 import gradio as gr
 import torch
 import torch.nn.functional as F
+from datasets import load_dataset
 from PIL import Image
+from transformers import AutoProcessor, AutoModel
 from tqdm import tqdm
 
 # -------------------------
 # Config
 # -------------------------
+SIGLIP_MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
+LLAVA_MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # replace with actual model
+DATASET_TEMPLATE = "EYEDOL/AGRILLAVA-image-text{}"
+NUM_DATASETS = 1
+BATCH_SIZE = 16
 TOP_K_DEFAULT = 3
 
 # Device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 # -------------------------
+# SigLip: load & precompute text embeddings
 # -------------------------
 @lru_cache(maxsize=1)
+def load_siglip_texts_and_embeddings():
     texts = []
+    for i in range(1, NUM_DATASETS + 1):
+        ds = load_dataset(DATASET_TEMPLATE.format(i), split="train")
+        texts.extend(ds["text"])
 
+    processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
+    model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(device)
     model.eval()
 
     text_embeds_all = []
+    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Encoding texts"):
+        batch_texts = texts[i:i + BATCH_SIZE]
+        inputs = processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
         with torch.no_grad():
             text_embeds = model.get_text_features(**inputs)
             text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
             text_embeds_all.append(text_embeds.cpu())
             del inputs, text_embeds
+        torch.cuda.empty_cache()
 
     text_embeds_all = torch.cat(text_embeds_all, dim=0)
+    return processor, model, texts, text_embeds_all
 
 # -------------------------
+# SigLip retrieval
 # -------------------------
+def retrieve_top_k_texts(image: Image.Image, k: int = TOP_K_DEFAULT) -> List[Tuple[str, float]]:
+    processor, model, texts_all, text_embeds_all = load_siglip_texts_and_embeddings()
+    inputs = processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         img_embed = model.get_image_features(**inputs)
         img_embed = img_embed / img_embed.norm(p=2, dim=-1, keepdim=True)
 
     sims = F.cosine_similarity(img_embed.cpu(), text_embeds_all)
     topk = torch.topk(sims, k)
+    results = [(texts_all[idx.item()], float(score)) for idx, score in zip(topk.indices, topk.values)]
     return results
 
+# -------------------------
+# Llava response
+# -------------------------
+@lru_cache(maxsize=1)
+def load_llava_model():
+    # Llava is a vision-language model: load its processor and the
+    # conditional-generation class rather than a text-only CausalLM head,
+    # so the image can actually be passed alongside the prompt.
+    from transformers import LlavaForConditionalGeneration
+    llava_processor = AutoProcessor.from_pretrained(LLAVA_MODEL_ID)
+    model = LlavaForConditionalGeneration.from_pretrained(LLAVA_MODEL_ID).to(device)
+    model.eval()
+    return llava_processor, model
+
+def llava_answer(image: Image.Image, retrieved_texts: List[Tuple[str, float]], question: str, max_tokens=256):
+    llava_processor, model = load_llava_model()
+    context_text = "\n".join([f"Retrieved Text: {t}" for t, _ in retrieved_texts])
+    # Llava-1.5 chat format; the <image> token marks where image features are inserted.
+    prompt = (
+        "USER: <image>\n"
+        f"Given the image and the following texts:\n{context_text}\n"
+        f"User Question: {question}\n"
+        "Provide a detailed answer and crop suggestions. ASSISTANT:"
+    )
+    inputs = llava_processor(images=image, text=prompt, return_tensors="pt").to(device)
+    with torch.no_grad():
+        output_ids = model.generate(**inputs, max_new_tokens=max_tokens)
+    response = llava_processor.decode(output_ids[0], skip_special_tokens=True)
+    return response
+
 # -------------------------
 # Gradio interface
 # -------------------------
 
+def gradio_pipeline(image: Image.Image, question: str, k: int = TOP_K_DEFAULT):
+    if image is None or not question:
+        return None, "Please provide both an image and a question."
+
+    retrieved_texts = retrieve_top_k_texts(image, k=int(k))
+    response = llava_answer(image, retrieved_texts, question)
+    return image, response
+
+with gr.Blocks(title="Agri Image + Question → Llava Response") as demo:
+    gr.Markdown("# Agri Image Question Answering\nUpload an agriculture image, ask a question, and get context-aware crop suggestions.")
     with gr.Row():
         img_in = gr.Image(type="pil")
         out_img = gr.Image(type="pil", label="Image")
+    question_input = gr.Textbox(label="Question about the image", lines=2)
+    k_slider = gr.Slider(minimum=1, maximum=10, step=1, value=TOP_K_DEFAULT, label="Top-k retrieval")
+    txt_out = gr.Textbox(label="Llava Response", lines=8)
+    run_btn = gr.Button("Generate Answer")
 
+    run_btn.click(fn=gradio_pipeline, inputs=[img_in, question_input, k_slider], outputs=[out_img, txt_out])
 
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", share=False)
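For a quick check of the new pipeline outside Gradio, the two stages can be called directly. A sketch, assuming the models and datasets download successfully; `sample.jpg` is a hypothetical local image and the question string is illustrative:

from PIL import Image

img = Image.open("sample.jpg").convert("RGB")
hits = retrieve_top_k_texts(img, k=3)  # SigLip stage: [(caption, score), ...]
for caption, score in hits:
    print(f"{score:.4f}  {caption}")
print(llava_answer(img, hits, "What crop is shown, and is it healthy?"))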