Monimoy committed on
Commit
89c355b
·
verified ·
1 Parent(s): b2cc644

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +134 -0
  2. image_encoder.pth +3 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import gradio as gr
4
+ import torch
5
+ from PIL import Image
6
+ from transformers import AutoTokenizer
7
+ import timm
8
+ from torchvision import transforms
9
+ from llama_cpp import Llama
10
+ from peft import PeftModel
11
+
12
# 1. Model Definitions (Same as in training script)
class SigLIPImageEncoder(torch.nn.Module):
    """Image encoder: a timm backbone pooled to a vector, then linearly
    projected to a fixed-size embedding.

    Args:
        model_name: timm model identifier for the backbone.
        embed_dim: dimensionality of the output embedding.
        pretrained_path: optional path to a saved state_dict for this module;
            when given, weights are loaded (onto CPU) at construction time.
    """

    def __init__(self, model_name='resnet50', embed_dim=512, pretrained_path=None):
        super().__init__()
        # Backbone is built without ImageNet weights (pretrained=False);
        # trained weights come from `pretrained_path` instead.
        self.model = timm.create_model(model_name, pretrained=False, num_classes=0, global_pool='avg')
        self.embed_dim = embed_dim
        self.projection = torch.nn.Linear(self.model.num_features, embed_dim)

        if pretrained_path:
            # Load onto CPU first so checkpoints trained on GPU work anywhere.
            state_dict = torch.load(pretrained_path, map_location=torch.device('cpu'))
            self.load_state_dict(state_dict)
            print(f"Loaded SigLIP image encoder from {pretrained_path}")
        else:
            print("Initialized SigLIP image encoder without pretrained weights.")

    def forward(self, image):
        """Map a batch of images to (batch, embed_dim) embeddings."""
        pooled = self.model(image)
        return self.projection(pooled)
30
+
31
# 2. Load Models and Tokenizer
phi3_model_path = "QuantFactory/Phi-3-mini-4k-instruct-GGUF" # Path to your quantized Phi-3 GGUF model
# NOTE(review): the value above looks like a Hugging Face repo id, but
# llama_cpp.Llama(model_path=...) expects a local .gguf file path — confirm
# this resolves to a file on disk (otherwise Llama.from_pretrained(repo_id=...,
# filename=...) is the intended API).
peft_model_path = "./qlora_phi3_model"
image_model_name = 'resnet50'
image_embed_dim = 512
siglip_pretrained_path = "image_encoder.pth" # Path to your pretrained SigLIP model

device = torch.device("cpu") # Force CPU
print(f"Using device: {device}")

# Load Tokenizer (using a compatible tokenizer)
text_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True) # Or a compatible tokenizer
text_tokenizer.pad_token = text_tokenizer.eos_token # Important for training

# Image Transformations
# Standard ImageNet-style preprocessing: 224x224 resize, tensor conversion,
# per-channel normalization (matches what the encoder was presumably trained
# with — confirm against the training script).
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load SigLIP Image Encoder
image_encoder = SigLIPImageEncoder(model_name=image_model_name, embed_dim=image_embed_dim, pretrained_path=siglip_pretrained_path).to(device)
image_encoder.eval() # Set to evaluation mode

# Load Phi-3 model using llama.cpp
base_model = Llama(
    model_path=phi3_model_path,
    n_gpu_layers=0, # Ensure no GPU usage
    n_ctx=2048, # Adjust context length as needed
    verbose=True,
)


# NOTE(review): PeftModel.from_pretrained expects a transformers PreTrrainedModel-
# style object; wrapping a llama_cpp.Llama instance will almost certainly raise,
# and merge_and_unload() is likewise a transformers/PEFT API. Verify whether the
# QLoRA adapter should instead be merged into the HF model and re-exported to
# GGUF before loading through llama.cpp.
model = PeftModel.from_pretrained(base_model, peft_model_path, offload_dir='./offload')
model = model.merge_and_unload()
print("phi-3 model loaded sucessfully")
68
# 3. Inference Function
def predict(image, question):
    """Answer a free-text question about an uploaded image.

    Args:
        image: numpy array from the Gradio image widget (H x W x C), or None.
        question: the user's question; None/empty is rejected.

    Returns:
        The model's answer as a string, or a human-readable error message
        (errors are returned rather than raised so the Gradio UI shows them).
    """
    # Guard clause: Gradio passes None for missing inputs; `not question`
    # also rejects the empty string.
    if image is None or not question:
        return "Please provide both an image and a question."

    try:
        # Keep the original numpy array untouched; work on a PIL copy.
        pil_image = Image.fromarray(image).convert("RGB")
        image_tensor = image_transform(pil_image).unsqueeze(0).to(device)

        # Encode the image; no gradients needed at inference time.
        with torch.no_grad():
            image_embeddings = image_encoder(image_tensor)
        # Flatten to a plain Python list so it can be interpolated into the
        # text prompt below.
        # NOTE(review): serializing raw embedding floats into the prompt only
        # works if the LLM was tuned on exactly this format — confirm against
        # the training script.
        image_embeddings = image_embeddings.flatten().tolist()

        prompt = f"Question: {question}\nImage Embeddings: {image_embeddings}\nAnswer:"

        # Generate the answer via the llama.cpp-style completion call; the
        # result is a dict with a "choices" list.
        output = model(
            prompt,
            max_tokens=128,
            stop=["Q:", "\n"],  # stop at a follow-up question or end of line
            echo=False,         # do not echo the prompt back in the output
        )

        return output["choices"][0]["text"].strip()

    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return f"An error occurred: {str(e)}"
103
+
104
# 4. Gradio Interface
# Example rows shown under the interface; each row is [image path, question].
example_rows = [
    ["example_image_1.jpg", "What color is the car?"],
    ["example_image_2.jpg", "How many people are in the image?"],
    ["example_image_3.jpg", "What is the person doing?"],
]

iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(label="Upload an Image"),
        gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image Question Answering with Phi-3 and SigLIP (CPU)",
    description="Ask questions about an image and get answers powered by Phi-3 (llama.cpp) and SigLIP.",
    examples=example_rows,
)
120
+
121
# 5. Launch the App
if __name__ == "__main__":
    # Create solid-color placeholder images so the Gradio example rows do not
    # point at missing files on a fresh checkout. Existing files are never
    # overwritten. (The original repeated this create-if-missing logic three
    # times; a data-driven loop keeps one copy of the logic.)
    example_specs = [
        ("example_image_1.jpg", "red"),
        ("example_image_2.jpg", "green"),
        ("example_image_3.jpg", "blue"),
    ]
    for filename, color in example_specs:
        if not os.path.exists(filename):
            Image.new("RGB", (224, 224), color=color).save(filename)

    iface.launch()
image_encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f070bb389421acfedc5a21e4c3fc2a0f5f9298da7e71611dd352333e037f0cd
3
+ size 98553802
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ torchvision
4
+ timm
5
+ Pillow
6
+ transformers
7
+ llama-cpp-python