Update app.py
app.py
CHANGED
@@ -1,23 +1,29 @@
+from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, CLIPVisionModel, AutoProcessor
+import torch
+import torch.nn as nn
 import gradio as gr
 import peft
 from peft import LoraConfig
-from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, CLIPVisionModel, AutoProcessor
-import torch
 from peft import PeftModel
-import torch.nn as nn
 import whisperx
 import os
+
+
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name = "microsoft/phi-2"
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 # Tokenizers and Processors: The tokenizer tokenizes text, and the processor handles preprocessing for images.
-# Embedding sizes: clip_embed (768) is for the CLIP model, and phi_embed (2560) is for the Phi-2 model.
-# Device: Selects CUDA if a GPU is available; otherwise, it uses the CPU.
-# IMAGE_TOKEN_ID: Token ID reserved for images.
 tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(clip_model_name)
+
 tokenizer.pad_token = tokenizer.eos_token
+
+# IMAGE_TOKEN_ID: Token ID reserved for images.
 IMAGE_TOKEN_ID = 23893  # token id for the word "comment"
-
+
+# Embedding sizes: clip_embed (768) is for the CLIP model, and phi_embed (2560) is for the Phi-2 model.
 clip_embed = 768
 phi_embed = 2560
 compute_type = "float32"

@@ -39,14 +45,20 @@ class SimpleResBlock(nn.Module):
         return x + self.proj(x)
 
 # models
+
 # CLIP Vision Model: Pretrained on visual tasks, outputs image embeddings.
-# Projection Layer: Projects the clip_embed (768) dimensions to phi_embed (2560) to match the embedding sizes for downstream tasks.
-# Residual Block: Uses the custom SimpleResBlock to process the embeddings further.
-# Phi-2 Model: The language model handles text generation tasks.
 clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
+
+# Projection Layer: Projects the clip_embed (768) dimensions to phi_embed (2560) to match the embedding sizes for downstream tasks.
 projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
+
+# Residual Block: Uses the custom SimpleResBlock to process the embeddings further.
 resblock = SimpleResBlock(phi_embed).to(device)
+
+# Phi-2 Model: The language model handles text generation tasks.
 phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name, trust_remote_code=True).to(device)
+
+# WhisperX Model: Pretrained audio model for speech-to-text.
 audio_model = whisperx.load_model("tiny", device, compute_type=compute_type, asr_options={'max_new_tokens': 2048, 'clip_timestamps': True, 'hallucination_silence_threshold': 0.25})
 
 # load weights

@@ -207,8 +219,8 @@ with gr.Blocks() as demo:
 }
 </style>
 
-#
-
+# MultiModal GPT!
+Combining CLIP, Whisper and Phi-2 models.
 """
 )
 
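The body of SimpleResBlock sits outside these hunks; only the `class SimpleResBlock(nn.Module):` line in the hunk header and the `return x + self.proj(x)` context line are visible here. A minimal sketch of a definition consistent with those two lines, assuming a LLaVA-style norm-then-MLP residual block (the `pre_norm` layer and the MLP shape are guesses, not taken from app.py):

import torch
import torch.nn as nn

class SimpleResBlock(nn.Module):
    def __init__(self, embed_dim: int):
        super().__init__()
        # pre_norm is hypothetical; the diff does not show the constructor.
        self.pre_norm = nn.LayerNorm(embed_dim)
        self.proj = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.GELU(),
            nn.Linear(embed_dim, embed_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pre_norm(x)
        return x + self.proj(x)  # the one line visible as diff context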
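The comments in the second hunk describe how the image path fits together: the CLIP vision tower emits 768-dimensional patch embeddings, the linear projection lifts them to Phi-2's 2560-dimensional embedding space, and the residual block refines them. A minimal sketch of that flow using the globals defined in the diff (processor, clip_model, projection, resblock, device); dropping the CLS token and wrapping in torch.no_grad() are assumptions:

import torch
from PIL import Image

def embed_image(image: Image.Image) -> torch.Tensor:
    # Preprocess to CLIP's expected pixel_values: [1, 3, 224, 224].
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)
    with torch.no_grad():
        vision_out = clip_model(pixel_values=pixel_values)
    patch_embeds = vision_out.last_hidden_state[:, 1:, :]  # assumed: drop CLS -> [1, 49, 768]
    return resblock(projection(patch_embeds))              # [1, 49, 2560], Phi-2's embedding width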
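For the audio path, a hedged usage sketch of the WhisperX "tiny" model loaded above: whisperx.load_audio and the pipeline's .transcribe are standard whisperx calls, while the file name and batch size are illustrative only.

import whisperx

audio = whisperx.load_audio("question.wav")           # hypothetical input file
result = audio_model.transcribe(audio, batch_size=8)  # dict with "segments" and "language"
text = " ".join(seg["text"].strip() for seg in result["segments"])
print(text)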