Spaces:

Monimoy
/

image_question_answer

Running on Zero

App Files Files Community

Monimoy committed on Apr 13, 2025

Commit

a34f5dd

verified ·

1 Parent(s): a6fd536

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -1

app.py CHANGED Viewed

@@ -126,10 +126,56 @@ def predict(image, question):
         traceback.print_exc()
         #return f"An error occurred: {str(e)}"
         return f"An error occurred: {traceback.format_exc()}"
 # 4. Gradio Interface
 iface = gr.Interface(
-    fn=predict,
     inputs=[
         gr.Image(label="Upload an Image"),
         gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?")

         traceback.print_exc()
         #return f"An error occurred: {str(e)}"
         return f"An error occurred: {traceback.format_exc()}"
+# 3. Inference Function
+@spaces.GPU
+def predict1(image_input, question):
+    """
+    Answer a free-text question about an image.
+
+    Args:
+        image_input: Image supplied by the Gradio image input. It is handed
+            to Image.fromarray, so it is expected to be array-like
+            (TODO confirm the exact array type Gradio delivers here).
+        question: The question text typed by the user.
+
+    Returns:
+        A string, one of:
+        - the decoded model output with the prompt text removed;
+        - a fixed reminder message when the image is None or the question
+          is None or empty;
+        - "An error occurred: ..." followed by the formatted traceback when
+          anything inside the try block raises.
+    """
+    if image_input is None or question is None or question == "":
+        return "Please provide both an image and a question."
+    try:
+        image = Image.fromarray(image_input).convert("RGB")
+        image = image_transform(image).unsqueeze(0).to(device)
+        prompt = f"Question: {question}\nAnswer:"
+        encoded = text_tokenizer(prompt, return_tensors="pt").to(device)
+        # Inference only: everything below runs under torch.no_grad()
+        with torch.no_grad():
+            # Encode the image, then map the result through model.image_projection
+            image_embeddings = model.image_encoder(image)
+            projected_image_embeddings = model.image_projection(image_embeddings)
+            # Insert a new axis at dim 1; per the original note the result is (batch_size, 1, phi3_embed_dim) - TODO confirm
+            projected_image_embeddings = projected_image_embeddings.unsqueeze(1)
+            # Prepend ones (mask) and zeros (placeholder ids) of shape projected_image_embeddings.shape[:2] along dim=1. NOTE(review): only the shape is used; the embeddings are never passed to model.generate - confirm intended
+            extended_attention_mask = torch.cat([torch.ones(projected_image_embeddings.shape[:2], device=encoded["attention_mask"].device), encoded["attention_mask"]], dim=1)
+            extended_input_ids = torch.cat([torch.zeros(projected_image_embeddings.shape[:2], dtype=torch.long, device=encoded["input_ids"].device), encoded["input_ids"]], dim=1)
+            # Generate the answer tokens; the tokenizer's eos token id doubles as the pad token id
+            generated_tokens = model.generate(
+                input_ids=extended_input_ids,
+                attention_mask=extended_attention_mask,
+                max_length=200,
+                pad_token_id=text_tokenizer.eos_token_id
+            )
+        answer = text_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+        answer = answer.replace(prompt, "").strip() # Drop the prompt text wherever it appears in the decoded output
+        return answer
+    except Exception as e:
+        # Return the full formatted traceback to the caller; the caught `e` itself is not used
+        return f"An error occurred: {traceback.format_exc()}"
 # 4. Gradio Interface
 iface = gr.Interface(
+    fn=predict1,
     inputs=[
         gr.Image(label="Upload an Image"),
         gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?")