Spaces:

ibrim
/

MultiModalPhi

Build error

App Files Files Community

Mohammad Ibrahim committed on Oct 23, 2024

Commit

b2db059

1 Parent(s): 9ad123d

initial commit

Browse files

Files changed (7) hide show

.gitattributes +3 -0
app.py +127 -0
model_chkpt/lora_adaptor/.ipynb_checkpoints/adapter_config-checkpoint.json +29 -0
model_chkpt/lora_adaptor/adapter_config.json +3 -0
model_chkpt/lora_adaptor/adapter_model.safetensors +3 -0
model_chkpt/step2_projection.pth +3 -0
model_chkpt/step2_resblock.pth +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+/model_chkpt/*.pth filter=lfs diff=lfs merge=lfs -text
+/model_chkpt/lora_adaptor/*.safetensors filter=lfs diff=lfs merge=lfs -text
+/model_chkpt/lora_adaptor/*.json filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import gradio as gui
+import peft
+from peft import LoraConfig
+from transformers import AutoTokenizer,BitsAndBytesConfig, AutoModelForCausalLM, CLIPVisionModel, AutoProcessor
+import torch
+from peft import PeftModel
+import torch.nn as nn
+import whisper
+import os
+os.environ['https_proxy'] = 'http://185.46.212.90:80'
+os.environ['http_proxy'] = 'http://185.46.212.90:80'
+clip_model_name = "openai/clip-vit-base-patch32"
+phi_model_name  = "microsoft/phi-2"
+tokenizer  = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
+processor  = AutoProcessor.from_pretrained(clip_model_name)
+tokenizer.pad_token = tokenizer.eos_token
+IMAGE_TOKEN_ID = 23893 # token for word comment
+QA_TOKEN_ID = 50295 # token for qa
+device = "cuda" if torch.cuda.is_available() else "cpu"
+clip_embed = 768
+phi_embed  = 2560
+audio_batch_size = 16
+current_dir = os.getcwd()
+class SimpleResBlock(nn.Module):
+    def __init__(self, phi_embed):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(phi_embed)
+        self.proj = nn.Sequential(
+            nn.Linear(phi_embed, phi_embed),
+            nn.GELU(),
+            nn.Linear(phi_embed, phi_embed)
+        )
+    def forward(self, x):
+        x = self.pre_norm(x)
+        return x + self.proj(x)
+# models
+clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
+projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
+resblock = SimpleResBlock(phi_embed).to(device)
+phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
+audio_model = whisper.load_model("tiny", device=device)
+lora_adaptor_path = os.path.join(current_dir, 'model_chkpt', 'lora_adaptor')
+projection_path = os.path.join(current_dir, 'model_chkpt', 'step2_projection.pth')
+resblock_path = os.path.join(current_dir, 'model_chkpt', 'step2_resblock.pth')
+# load weights
+model_to_merge = PeftModel.from_pretrained(phi_model,lora_adaptor_path, local_files_only=True, device_map={'': device})
+merged_model   = model_to_merge.merge_and_unload()
+projection.load_state_dict(torch.load(projection_path,map_location=torch.device(device)))
+resblock.load_state_dict(torch.load(resblock_path,map_location=torch.device(device)))
+def generate_response(img=None,img_audio=None,val_q=None):
+    max_generate_length = 100
+    val_combined_embeds = []
+    with torch.no_grad():
+        # image
+        if img is not None:
+            image_processed  = processor(images=img, return_tensors="pt").to(device)
+            clip_val_outputs = clip_model(**image_processed).last_hidden_state[:,1:,:]
+            val_image_embeds = projection(clip_val_outputs)
+            val_image_embeds = resblock(val_image_embeds).to(torch.float16)
+            img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)
+            img_token_embeds = merged_model.model.embed_tokens(img_token_tensor).unsqueeze(0).unsqueeze(0)
+            val_combined_embeds.append(val_image_embeds)
+            val_combined_embeds.append(img_token_embeds)
+        # audio
+        if img_audio is not None:
+            audio_result = audio_model.transcribe(img_audio)
+            audio_text = ''
+            for seg in audio_result['segments']:
+                audio_text += seg['text']
+            audio_text = audio_text.strip()
+            audio_tokens = tokenizer(audio_text, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0).to(device)
+            audio_embeds    = merged_model.model.embed_tokens(audio_tokens).unsqueeze(0)
+            val_combined_embeds.append(audio_embeds)
+        # text question
+        if len(val_q) != 0:
+            val_q_tokenised = tokenizer(val_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0).to(device)
+            val_q_embeds    = merged_model.model.embed_tokens(val_q_tokenised).unsqueeze(0)
+            val_combined_embeds.append(val_q_embeds)
+        if img_audio is not None or len(val_q) != 0: # add QA Token
+            QA_token_tensor = torch.tensor(QA_TOKEN_ID).to(device)
+            QA_token_embeds = merged_model.model.embed_tokens(QA_token_tensor).unsqueeze(0).unsqueeze(0)
+            val_combined_embeds.append(QA_token_embeds)
+        val_combined_embeds = torch.cat(val_combined_embeds,dim=1)
+        predicted_caption = merged_model.generate(inputs_embeds=val_combined_embeds,
+                                                  max_new_tokens=max_generate_length,
+                                                  return_dict_in_generate = True)
+        predicted_captions_decoded = tokenizer.batch_decode(predicted_caption.sequences[:, 1:])[0]
+        predicted_captions_decoded = predicted_captions_decoded.replace("<|endoftext|>", "")
+    return predicted_captions_decoded
+# Gradio interface setup with added styling
+with gui.Blocks() as app_interface:
+    with gui.Row():
+        with gui.Column():
+            image_input = gui.Image(label='Upload Image', type="pil")
+        with gui.Column():
+            audio_input = gui.Audio(label="Audio Input", sources=['microphone', 'upload'], type='filepath')
+            text_input = gui.Text(label='Enter Text', placeholder="Type your query here...")
+    with gui.Row():
+        output_response = gui.Textbox(label='Generated Response', placeholder="Response will appear here...", lines=5)
+    submit_button = gui.Button("Generate Response", variant="primary")
+    submit_button.click(generate_response, inputs=[image_input, audio_input, text_input], outputs=output_response)
+if __name__ == "__main__":
+    app_interface.launch(share=True)

model_chkpt/lora_adaptor/.ipynb_checkpoints/adapter_config-checkpoint.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "microsoft/phi-2",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "fc2",
+    "v_proj",
+    "fc1",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}

model_chkpt/lora_adaptor/adapter_config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23212cff0181dce73efde3004af2309e9ad4a13ace72aa36d3d236874d85b8e4
+size 603

model_chkpt/lora_adaptor/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ed4cfe545b28f8effe9ded98b472502a93a5e5e42edd6384591f0e1d71c3770
+size 335586800

model_chkpt/step2_projection.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ec5d298b71c4b50f2b626e6df9a73d02d012da0794c1b768610fe52f4a8f860
+size 7876174

model_chkpt/step2_resblock.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c161997824ad80b1af83639cd051a8beda84d4967be50982821249c509fab62c
+size 52472590