import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base language model: Microsoft Phi-2, loaded on CPU for this Space.
model_name = "microsoft/phi-2"
phi2_model_pretrained = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map='cpu',
)
phi2_model_pretrained.config.use_cache = False

# Phi-2 has no dedicated pad/bos tokens, so reuse the eos token for both.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.bos_token = tokenizer.eos_token
def convert_text_input_embeds(text):
    # Tokenize the text and map the token ids into Phi-2's input embedding space.
    in_tokens = tokenizer(text, return_tensors="pt", return_attention_mask=False)
    in_embeds = phi2_model_pretrained.get_input_embeddings()(in_tokens.input_ids)
    return in_embeds
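
# Quick sanity sketch (safe to remove): a short string maps to a
# [1, seq_len, 2560] tensor, 2560 being Phi-2's embedding width.
_demo_embeds = convert_text_input_embeds("hello world")
assert _demo_embeds.shape[-1] == 2560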
import whisperx

# Speech front end: WhisperX 'small' on CPU. Audio is transcribed to text,
# which is then embedded through the same path as typed input.
whisper_model = whisperx.load_model('small', device='cpu', compute_type='float32')

def convert_audio_file_text_embeds(fname):
    # Despite the name, this returns the plain transcript; embedding happens
    # later via convert_text_input_embeds.
    result = whisper_model.transcribe(fname)
    full_text = ''
    for seg in result['segments']:
        full_text += seg['text']
    return full_text.strip()
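
# Hypothetical usage (the file path is illustrative only):
#   transcript = convert_audio_file_text_embeds('example.wav')
#   audio_embeds = convert_text_input_embeds(transcript)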
from transformers import CLIPVisionModel, CLIPImageProcessor

# Vision encoder: CLIP ViT-B/32. At 224x224 with 32x32 patches it yields
# 7 x 7 = 49 patch tokens of width 768.
vision_tower_name = 'openai/clip-vit-base-patch32'
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
vision_tower = CLIPVisionModel.from_pretrained(vision_tower_name)
vision_tower.requires_grad_(False)  # frozen, inference only

def feature_select(image_forward_outs):
    # Take the last hidden layer and drop the CLS token, keeping only the
    # 49 patch embeddings: [1, 49, 768].
    image_features = image_forward_outs.hidden_states[-1]
    image_features = image_features[:, 1:, :]
    return image_features

def image_CLIP_embed(image):
    # Preprocess the raw image and run it through the frozen CLIP tower.
    image = image_processor(images=image, return_tensors="pt")
    image_forward_out = vision_tower(image['pixel_values'].to(device=vision_tower.device),
                                     output_hidden_states=True)
    image_feature = feature_select(image_forward_out)
    return image_feature
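
# Shape sketch (safe to remove; runs the vision tower once at startup):
# a dummy RGB image should come out as 49 patch embeddings of width 768.
import numpy as np
_dummy_img = (np.random.rand(224, 224, 3) * 255).astype('uint8')
assert image_CLIP_embed(_dummy_img).shape == (1, 49, 768)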
import torch.nn as nn
import torch.nn.functional as F

class CustomGELU(nn.Module):
    # GELU applied to a cloned tensor (avoids in-place autograd issues);
    # defined but unused in this app.
    def forward(self, x):
        return F.gelu(x.clone())

class SimpleResBlock(nn.Module):
    # LayerNorm followed by a two-layer MLP with a residual connection.
    def __init__(self, input_size):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_size)
        self.proj = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.GELU(),
            nn.Linear(input_size, input_size),
        )

    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)

class CLIPembed_projection(nn.Module):
    # Projects 768-dim CLIP patch embeddings into Phi-2's 2560-dim input
    # embedding space, then refines them with a residual block.
    def __init__(self, input_dim_CLIP=768, input_dim_phi2=2560):
        super().__init__()
        self.input_dim_CLIP = input_dim_CLIP
        self.input_dim_phi2 = input_dim_phi2
        self.projection_img = nn.Linear(self.input_dim_CLIP, self.input_dim_phi2, bias=False)
        self.resblock = SimpleResBlock(self.input_dim_phi2)

    def forward(self, x):
        x = self.projection_img(x)
        x = self.resblock(x)
        return x
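
# Shape sketch: the projector lifts CLIP features [1, 49, 768] to Phi-2's
# width, [1, 49, 2560] (an untrained instance suffices for the check).
_proj_check = CLIPembed_projection()
assert _proj_check(torch.randn(1, 49, 768)).shape == (1, 49, 2560)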
# Load the image projector weights trained in stage 2.
Image_projection_layer = CLIPembed_projection()
location_projection_img_p1 = './weights/stage_2/run2_projection_img.pth'
location_projection_img_p2 = './weights/stage_2/run2_resblock.pth'
Image_projection_layer.projection_img.load_state_dict(torch.load(location_projection_img_p1, map_location='cpu'))
Image_projection_layer.resblock.load_state_dict(torch.load(location_projection_img_p2, map_location='cpu'))

def img_input_embed(image):
    # Full image path: CLIP patch features -> projection into Phi-2 space.
    clip_embed = image_CLIP_embed(image)
    post_projection = Image_projection_layer(clip_embed)
    return post_projection
device = 'cpu'

# Attach the QLoRA adapter (trained with PEFT) hosted on the Hub.
user = "LN1996"  # Hub user name
model_name = "peft-qlora-run2"
model_id = f"{user}/{model_name}"

import peft
phi2_model_pretrained_peft = peft.PeftModel.from_pretrained(phi2_model_pretrained, model_id)
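
# Optional check: the adapter wraps the frozen base model; uncomment to
# inspect the parameter split reported by PEFT.
# phi2_model_pretrained_peft.print_trainable_parameters()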
def input_multimodel(image=None, audio=None, text=None, query=None):
    # Empty textbox values arrive as '' from Gradio; treat them as missing
    # (this also guards against None, which len() would choke on).
    if not text:
        text = None
    if not query:
        query = None
    if query is None:
        print('Please ask a query')
        return None
    if image is None and audio is None and text is None:
        print('Please provide context in the form of an image, audio, or text')
        return None

    # Build the prompt directly in embedding space:
    # "Context: " [image] [audio transcript] [text] " Question: <query>" " Answer: "
    ctx = tokenizer("Context: ", return_tensors="pt", return_attention_mask=False)
    input_embeds_stage_2 = phi2_model_pretrained_peft.get_input_embeddings()(ctx.input_ids)

    if image is not None:
        image_embeds = img_input_embed(image)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, image_embeds), dim=1)
    if audio is not None:
        audio_transcribed = convert_audio_file_text_embeds(audio)
        audio_embeds = convert_text_input_embeds(audio_transcribed)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, audio_embeds), dim=1)
    if text is not None:
        text_embeds = convert_text_input_embeds(text)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, text_embeds), dim=1)

    qus = tokenizer(" Question: " + query, return_tensors="pt",
                    return_attention_mask=False)
    qus_embeds = phi2_model_pretrained_peft.get_input_embeddings()(qus.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, qus_embeds), dim=1)

    ans = tokenizer(" Answer: ", return_tensors="pt", return_attention_mask=False)
    ans_embeds = phi2_model_pretrained_peft.get_input_embeddings()(ans.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, ans_embeds), dim=1)

    result = phi2_model_pretrained_peft.generate(inputs_embeds=input_embeds_stage_2,
                                                 bos_token_id=tokenizer.bos_token_id)
    # Decode and keep the text before the first eos token.
    decoded = tokenizer.batch_decode(result)[0]
    parts = decoded.split(tokenizer.eos_token)
    if parts[0] == '':
        return parts[1]
    else:
        return parts[0]
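
# Hypothetical direct call, bypassing the UI (strings illustrative only):
#   answer = input_multimodel(text='The Eiffel Tower is in Paris.',
#                             query='Where is the Eiffel Tower?')
#   print(answer)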
import gradio as gr

title = "Multi-Modal Phi-2"
description = ("A simple Gradio interface for a custom multimodal "
               "(image, text, audio) version of Microsoft Phi-2")

demo = gr.Interface(
    input_multimodel,
    inputs=[
        gr.Image(label="Input context Image"),
        gr.Audio(label="Input context Audio", sources=["microphone", "upload"], type="filepath"),
        gr.Textbox(label="Input context Text"),
        gr.Textbox(label="Input Query"),
    ],
    outputs=[
        gr.Textbox(label='Answer'),
    ],
    title=title,
    description=description,
)
demo.launch()