Monimoy committed on
Commit d3471f5 · verified · 1 Parent(s): 4e42ff8

Update app.py

Files changed (1): app.py (+9 -6)
app.py CHANGED
@@ -9,6 +9,7 @@ import timm
 from torchvision import transforms
 #from llama_cpp import Llama
 from peft import PeftModel
+import traceback
 
 # 1. Model Definitions (Same as in training script)
 class SigLIPImageEncoder(torch.nn.Module):
@@ -86,6 +87,7 @@ print("phi-3 model loaded sucessfully")
 # 3. Inference Function
 
 @spaces.GPU
+# 3. Inference Function
 def predict(image, question):
     """
     Takes an image and a question as input and returns an answer.
@@ -101,26 +103,27 @@ def predict(image, question):
         with torch.no_grad():
             image_embeddings = image_encoder(image)
             # Flatten the image embeddings for simplicity
-            image_embeddings = image_embeddings.flatten().tolist()
+            image_embeddings_list = image_embeddings.flatten().tolist()  # Convert to list of floats
+            image_embeddings_str = ' '.join(map(str, image_embeddings_list))  # Convert to space-separated string
 
         # Create the prompt with image embeddings
-        prompt = f"Question: {question}\nImage Embeddings: {image_embeddings}\nAnswer:"
+        prompt = f"Question: {question}\nImage Embeddings: {image_embeddings_str}\nAnswer:"
 
         # Generate answer using llama.cpp
-        output = model(
+        output = llm(
             prompt,
-            max_tokens=128,
+            max_tokens=200,
             stop=["Q:", "\n"],
             echo=False,
         )
-
         answer = output["choices"][0]["text"].strip()
 
         return answer
 
     except Exception as e:
+        traceback.print_exc()
         return f"An error occurred: {str(e)}"
-
+
 # 4. Gradio Interface
 iface = gr.Interface(
     fn=predict,
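
Note on the generation call: the diff switches the callable from model to llm while the from llama_cpp import Llama import stays commented out, so llm must be defined elsewhere in app.py. For reference, a minimal llama-cpp-python sketch with the same call shape as predict(); the model_path and n_ctx values here are placeholder assumptions, not taken from this commit:

# Minimal llama-cpp-python sketch matching the call shape in predict().
# model_path is a hypothetical placeholder; it is not part of this commit.
from llama_cpp import Llama

llm = Llama(model_path="phi-3-mini.Q4_K_M.gguf", n_ctx=4096)

output = llm(
    "Question: What color is the car?\nImage Embeddings: 0.12 -0.34 0.56\nAnswer:",
    max_tokens=200,        # same generation budget the commit sets
    stop=["Q:", "\n"],     # same stop sequences as predict()
    echo=False,            # do not echo the prompt back
)
print(output["choices"][0]["text"].strip())  # same output parsing as predict()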
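
The diff's last context line cuts off inside gr.Interface(...); a typical completion for an image-plus-question demo looks like the sketch below. The input and output components are assumptions, not shown in this commit, and the stub stands in for the real predict():

# Hedged sketch of the Gradio wiring; component choices are assumptions.
import gradio as gr

def predict(image, question):
    # Stand-in for the predict() defined in app.py.
    return f"(stub) question was: {question}"

iface = gr.Interface(
    fn=predict,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Answer"),
)

iface.launch()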