Spaces:

Shivdutta
/

S30-MultiModalGPT

Runtime error

App Files Files Community

Shivdutta committed on Oct 3, 2024

Commit

60b820b

verified ·

1 Parent(s): c70b1b3

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -65

app.py CHANGED Viewed

@@ -6,30 +6,17 @@ import torch
 from peft import PeftModel
 import torch.nn as nn
 import whisperx
-# Determine the appropriate device
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Set compute_type based on device capabilities
-if device == "cuda" and torch.cuda.is_bf16_supported():
-    compute_type = "float16"
-elif device == "cuda":
-    compute_type = "float32"
-else:
-    compute_type = "int8"
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name  = "microsoft/phi-2"
 tokenizer  = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
 processor  = AutoProcessor.from_pretrained(clip_model_name)
 tokenizer.pad_token = tokenizer.eos_token
 IMAGE_TOKEN_ID = 23893 # token for word comment
-QA_TOKEN_ID = 50295 # token for qa
 device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_embed = 768
 phi_embed  = 2560
-compute_type = "float16"
 audio_batch_size = 16
 class SimpleResBlock(nn.Module):
@@ -44,50 +31,20 @@ class SimpleResBlock(nn.Module):
     def forward(self, x):
         x = self.pre_norm(x)
         return x + self.proj(x)
 # models
 clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
 projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
 resblock = SimpleResBlock(phi_embed).to(device)
 phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
-# Load the model with the appropriate compute_type
-# Load the audio model with appropriate compute_type
-audio_model_size = "tiny"
-compute_type = "float32"  # Ensure using a compatible compute type
-try:
-    audio_model = whisperx.load_model(
-        audio_model_size,
-        device,
-        compute_type=compute_type
-        # Removed unsupported parameters
-    )
-    print(f"Model loaded successfully with compute_type: {compute_type}")
-except ValueError as e:
-    print(f"Error loading model: {e}")
-    # Optionally, try loading with int8 if necessary
-    try:
-        audio_model = whisperx.load_model(
-            audio_model_size,
-            device,
-            compute_type="int8"
-            # Removed unsupported parameters
-        )
-        print("Fell back to int8 compute type successfully.")
-    except Exception as e:
-        print(f"Failed to load model with int8: {e}")
 # load weights
-model_to_merge = PeftModel.from_pretrained(phi_model,'./model_chkpt/lora_adaptor')
 merged_model   = model_to_merge.merge_and_unload()
-projection.load_state_dict(torch.load('./model_chkpt/finetunned_projection.pth',map_location=torch.device(device)))
-resblock.load_state_dict(torch.load('./model_chkpt/finetuned_resblock.pth',map_location=torch.device(device)))
 def model_generate_ans(img=None,img_audio=None,val_q=None):
@@ -126,20 +83,20 @@ def model_generate_ans(img=None,img_audio=None,val_q=None):
             val_q_embeds    = merged_model.model.embed_tokens(val_q_tokenised).unsqueeze(0)
             val_combined_embeds.append(val_q_embeds)
-        if img_audio is not None or len(val_q) != 0: # add QA Token
-            QA_token_tensor = torch.tensor(QA_TOKEN_ID).to(device)
-            QA_token_embeds = merged_model.model.embed_tokens(QA_token_tensor).unsqueeze(0).unsqueeze(0)
-            val_combined_embeds.append(QA_token_embeds)
         val_combined_embeds = torch.cat(val_combined_embeds,dim=1)
-        predicted_caption = merged_model.generate(inputs_embeds=val_combined_embeds,
-                                                  max_new_tokens=max_generate_length,
-                                                  return_dict_in_generate = True)
-        predicted_captions_decoded = tokenizer.batch_decode(predicted_caption.sequences[:, 1:])[0]
-        predicted_captions_decoded = predicted_captions_decoded.replace("<|endoftext|>", "")
     return predicted_captions_decoded
@@ -165,5 +122,4 @@ with gr.Blocks() as demo:
     section_btn = gr.Button("Submit")
     section_btn.click(model_generate_ans, inputs=[img_input,img_audio,img_question], outputs=[img_answer])
-if __name__ == "__main__":
-    demo.launch()

 from peft import PeftModel
 import torch.nn as nn
 import whisperx
+import os
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name  = "microsoft/phi-2"
 tokenizer  = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
 processor  = AutoProcessor.from_pretrained(clip_model_name)
 tokenizer.pad_token = tokenizer.eos_token
 IMAGE_TOKEN_ID = 23893 # token for word comment
 device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_embed = 768
 phi_embed  = 2560
+compute_type = "float32"
 audio_batch_size = 16
 class SimpleResBlock(nn.Module):
     def forward(self, x):
         x = self.pre_norm(x)
         return x + self.proj(x)
 # models
 clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
 projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
 resblock = SimpleResBlock(phi_embed).to(device)
 phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
+# Assuming you have defined 'device' and 'compute_type' elsewhere
+audio_model = whisperx.load_model("tiny", device, compute_type=compute_type, asr_options={'max_new_tokens': 2048, 'clip_timestamps': True, 'hallucination_silence_threshold': 0.25, 'hotwords': []})
 # load weights
+model_to_merge = PeftModel.from_pretrained(phi_model,os.path.join(os.getcwd(), 'model_chkpt/lora_adaptor'))
 merged_model   = model_to_merge.merge_and_unload()
+projection.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetunned_projection.pth'),map_location=torch.device(device)))
+resblock.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetuned_resblock.pth'),map_location=torch.device(device)))
 def model_generate_ans(img=None,img_audio=None,val_q=None):
             val_q_embeds    = merged_model.model.embed_tokens(val_q_tokenised).unsqueeze(0)
             val_combined_embeds.append(val_q_embeds)
         val_combined_embeds = torch.cat(val_combined_embeds,dim=1)
+        #val_combined_embeds = torch.cat([val_image_embeds, img_token_embeds, val_q_embeds], dim=1) # 4, 69, 2560
+        predicted_caption = torch.full((1,max_generate_length),50256).to(device)
+        for g in range(max_generate_length):
+            phi_output_logits = merged_model(inputs_embeds=val_combined_embeds)['logits'] # 4, 69, 51200
+            predicted_word_token_logits = phi_output_logits[:, -1, :].unsqueeze(1) # 4,1,51200
+            predicted_word_token = torch.argmax(predicted_word_token_logits, dim = -1) # 4,1
+            predicted_caption[:,g] = predicted_word_token.view(1,-1)
+            next_token_embeds = phi_model.model.embed_tokens(predicted_word_token) # 4,1,2560
+            val_combined_embeds   = torch.cat([val_combined_embeds, next_token_embeds], dim=1)
+        predicted_captions_decoded = tokenizer.batch_decode(predicted_caption,ignore_index = 50256)[0]
     return predicted_captions_decoded
     section_btn = gr.Button("Submit")
     section_btn.click(model_generate_ans, inputs=[img_input,img_audio,img_question], outputs=[img_answer])
+demo.launch()