Vvaann committed on
Commit
a1bffd6
·
verified ·
1 Parent(s): 73fcb8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -12
app.py CHANGED
@@ -1,23 +1,29 @@
 
 
 
1
  import gradio as gr
2
  import peft
3
  from peft import LoraConfig
4
- from transformers import AutoTokenizer,BitsAndBytesConfig, AutoModelForCausalLM, CLIPVisionModel, AutoProcessor
5
- import torch
6
  from peft import PeftModel
7
- import torch.nn as nn
8
  import whisperx
9
  import os
 
 
10
  clip_model_name = "openai/clip-vit-base-patch32"
11
  phi_model_name = "microsoft/phi-2"
 
 
 
12
  # Tokenizers and Processors: The tokenizer tokenizes text, and the processor handles preprocessing for images.
13
- # Embedding sizes: clip_embed (768) is for the CLIP model, and phi_embed (2560) is for the Phi-2 model.
14
- # Device: It selects CUDA if a GPU is available, otherwise, it uses the CPU.
15
- # IMAGE_TOKEN_ID: Token ID reserved for images.
16
  tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
17
  processor = AutoProcessor.from_pretrained(clip_model_name)
 
18
  tokenizer.pad_token = tokenizer.eos_token
 
 
19
  IMAGE_TOKEN_ID = 23893 # token for word comment
20
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
21
  clip_embed = 768
22
  phi_embed = 2560
23
  compute_type = "float32"
@@ -39,14 +45,20 @@ class SimpleResBlock(nn.Module):
39
  return x + self.proj(x)
40
 
41
  # models
 
42
  # CLIP Vision Model: Pretrained on visual tasks, outputs image embeddings.
43
- # Projection Layer: Projects the clip_embed (768) dimensions to phi_embed (2560) to match the embedding sizes for downstream tasks.
44
- # Residual Block: Uses the custom SimpleResBlock to process the embeddings further.
45
- # Phi-2 Model: The language model handles text generation tasks.
46
  clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
 
 
47
  projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
 
 
48
  resblock = SimpleResBlock(phi_embed).to(device)
 
 
49
  phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
 
 
50
  audio_model = whisperx.load_model("tiny", device, compute_type=compute_type, asr_options={'max_new_tokens': 2048, 'clip_timestamps': True, 'hallucination_silence_threshold': 0.25})
51
 
52
  # load weights
@@ -207,8 +219,8 @@ with gr.Blocks() as demo:
207
  }
208
  </style>
209
 
210
- # Engage with MultiModal GPT!
211
- A seamless AI experience combining CLIP and Phi-2 models.
212
  """
213
  )
214
 
 
1
+ from transformers import AutoTokenizer,BitsAndBytesConfig, AutoModelForCausalLM, CLIPVisionModel, AutoProcessor
2
+ import torch
3
+ import torch.nn as nn
4
  import gradio as gr
5
  import peft
6
  from peft import LoraConfig
 
 
7
  from peft import PeftModel
 
8
  import whisperx
9
  import os
10
+
11
+
12
  clip_model_name = "openai/clip-vit-base-patch32"
13
  phi_model_name = "microsoft/phi-2"
14
+
15
+ device = "cuda" if torch.cuda.is_available() else "cpu"
16
+
17
  # Tokenizers and Processors: The tokenizer tokenizes text, and the processor handles preprocessing for images.
 
 
 
18
  tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
19
  processor = AutoProcessor.from_pretrained(clip_model_name)
20
+
21
  tokenizer.pad_token = tokenizer.eos_token
22
+
23
+ # IMAGE_TOKEN_ID: Token ID reserved for images.
24
  IMAGE_TOKEN_ID = 23893 # token for word comment
25
+
26
+ # Embedding sizes: clip_embed (768) is for the CLIP model, and phi_embed (2560) is for the Phi-2 model.
27
  clip_embed = 768
28
  phi_embed = 2560
29
  compute_type = "float32"
 
45
  return x + self.proj(x)
46
 
47
  # models
48
+
49
  # CLIP Vision Model: Pretrained on visual tasks, outputs image embeddings.
 
 
 
50
  clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
51
+
52
+ # Projection Layer: Projects the clip_embed (768) dimensions to phi_embed (2560) to match the embedding sizes for downstream tasks.
53
  projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
54
+
55
+ # Residual Block: Uses the custom SimpleResBlock to process the embeddings further.
56
  resblock = SimpleResBlock(phi_embed).to(device)
57
+
58
+ # Phi-2 Model: The language model handles text generation tasks.
59
  phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
60
+
61
 + # WhisperX model: Pretrained audio model
62
  audio_model = whisperx.load_model("tiny", device, compute_type=compute_type, asr_options={'max_new_tokens': 2048, 'clip_timestamps': True, 'hallucination_silence_threshold': 0.25})
63
 
64
  # load weights
 
219
  }
220
  </style>
221
 
222
+ # MultiModal GPT!
223
 + Combining CLIP, Whisper and Phi-2 models.
224
  """
225
  )
226