# app.py
import spaces  # ZeroGPU support on Hugging Face Spaces
import os
import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
import timm
from torchvision import transforms
from peft import PeftModel
import traceback
# 1. Model Definitions (same as in the training script)
class SigLIPImageEncoder(torch.nn.Module):
    """Image encoder: a timm backbone plus a linear projection, trained with a SigLIP-style objective."""

    def __init__(self, model_name='resnet50', embed_dim=512, pretrained_path=None):
        super().__init__()
        # pretrained=False: weights come from the checkpoint loaded below, not from timm
        self.model = timm.create_model(model_name, pretrained=False, num_classes=0, global_pool='avg')
        self.embed_dim = embed_dim
        self.projection = torch.nn.Linear(self.model.num_features, embed_dim)
        if pretrained_path:
            # Load to CPU first; the module is moved to the target device later
            self.load_state_dict(torch.load(pretrained_path, map_location=torch.device('cpu')))
            print(f"Loaded SigLIP image encoder from {pretrained_path}")
        else:
            print("Initialized SigLIP image encoder without pretrained weights.")

    def forward(self, image):
        features = self.model(image)           # (batch, num_features)
        embedding = self.projection(features)  # (batch, embed_dim)
        return embedding
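# Quick shape sanity check (illustrative sketch only; uses random weights and a
# dummy input, so the output values are meaningless):
#   enc = SigLIPImageEncoder(model_name='resnet50', embed_dim=512)
#   out = enc(torch.randn(1, 3, 224, 224))
#   assert out.shape == (1, 512)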
# 2. Load Models and Tokenizer
peft_model_path = "./qlora-phi3-model"
image_model_name = 'resnet50'
image_embed_dim = 512
siglip_pretrained_path = "image_encoder.pth"  # Path to the pretrained SigLIP image encoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the base Phi-3 tokenizer
text_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
text_tokenizer.pad_token = text_tokenizer.eos_token  # Phi-3 has no dedicated pad token
# Image transformations (ImageNet normalization, matching the training pipeline)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the SigLIP image encoder
image_encoder = SigLIPImageEncoder(model_name=image_model_name, embed_dim=image_embed_dim,
                                   pretrained_path=siglip_pretrained_path).to(device)
image_encoder.eval()  # Evaluation mode: disables dropout and batch-norm updates
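# Illustrative usage of the preprocessing + encoder pipeline (sketch; assumes a
# local file "example.jpg", which is not part of this repo):
#   img = Image.open("example.jpg").convert("RGB")
#   tensor = image_transform(img).unsqueeze(0).to(device)  # (1, 3, 224, 224)
#   with torch.no_grad():
#       emb = image_encoder(tensor)                        # (1, 512)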
# Load the Phi-3 base model and merge the QLoRA adapter into it.
# (An earlier llama.cpp/GGUF loading path was removed in favor of transformers.)
base_model_name = "microsoft/Phi-3-mini-4k-instruct"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32, device_map="auto")

# Load the adapter and merge it into the base weights for faster inference
model = PeftModel.from_pretrained(base_model, peft_model_path, offload_dir='./offload')
model = model.merge_and_unload()
print("Phi-3 model loaded successfully")
# 3. Inference Functions
def predict(image, question):
    """
    Baseline approach: serializes the image embedding into the text prompt.
    Takes an image and a question as input and returns an answer.
    """
    if image is None or not question:
        return "Please provide both an image and a question."
    try:
        image = Image.fromarray(image).convert("RGB")
        image = image_transform(image).unsqueeze(0).to(device)

        # Get image embeddings
        with torch.no_grad():
            image_embeddings = image_encoder(image)

        # Flatten the embedding into a space-separated string (crude, but model-agnostic)
        image_embeddings_str = ' '.join(map(str, image_embeddings.flatten().tolist()))

        # Create the prompt with the serialized image embedding
        prompt = f"Question: {question}\nImage Embeddings: {image_embeddings_str}\nAnswer:"

        # Generate an answer with the merged transformers model
        encoded = text_tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            generated = model.generate(**encoded, max_new_tokens=128,
                                       pad_token_id=text_tokenizer.eos_token_id)
        answer = text_tokenizer.decode(generated[0], skip_special_tokens=True)
        return answer.replace(prompt, "").strip()
    except Exception:
        traceback.print_exc()
        return f"An error occurred: {traceback.format_exc()}"
@spaces.GPU  # Request a GPU for the duration of this call on ZeroGPU hardware
def predict1(image_input, question):
    """
    Takes an image and a question as input and returns an answer.
    Prepends a single placeholder token position for the image; note that the
    512-dim image embedding itself is not injected into Phi-3 here (it would
    first need a projection to the model's hidden size).
    """
    if image_input is None or not question:
        return "Please provide both an image and a question."
    try:
        image = Image.fromarray(image_input).convert("RGB")
        image = image_transform(image).unsqueeze(0).to(device)
        prompt = f"Question: {question}\nAnswer:"
        encoded = text_tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            # Get image embeddings: (batch, embed_dim) -> (batch, 1, embed_dim)
            image_embeddings = image_encoder(image)
            projected_image_embeddings = image_embeddings.unsqueeze(1)
            # Extend the attention mask and input ids with one placeholder
            # position (token id 0) per image along the sequence dimension
            extended_attention_mask = torch.cat(
                [torch.ones(projected_image_embeddings.shape[:2], device=encoded["attention_mask"].device),
                 encoded["attention_mask"]], dim=1)
            extended_input_ids = torch.cat(
                [torch.zeros(projected_image_embeddings.shape[:2], dtype=torch.long, device=encoded["input_ids"].device),
                 encoded["input_ids"]], dim=1)
            # Generate the answer
            generated_tokens = model.generate(
                input_ids=extended_input_ids,
                attention_mask=extended_attention_mask,
                max_new_tokens=128,
                pad_token_id=text_tokenizer.eos_token_id
            )
        answer = text_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
        answer = answer.replace(prompt, "").strip()  # Strip the prompt from the decoded output
        return answer
    except Exception:
        return f"An error occurred: {traceback.format_exc()}"
# 4. Gradio Interface
iface = gr.Interface(
    fn=predict1,
    inputs=[
        gr.Image(label="Upload an Image"),
        gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?")
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image Question Answering with Phi-3 and SigLIP",
    description="Ask questions about an image and get answers powered by a QLoRA-finetuned Phi-3 and a SigLIP image encoder.",
    examples=[
        ["cat_0006.png", "Create an interesting story about this image."],
        ["bird_0004.png", "Can you describe this image?"],
        ["truck_0003.png", "Elaborate on the setting of the image."],
        ["ship_0007.png", "Explain the purpose of the image."]
    ]
)
# 5. Launch the App
if __name__ == "__main__":
    iface.launch()