Spaces:
Paused
Paused
Tonic committed on
add snapshot download
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
import torch
|
| 3 |
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
|
@@ -29,7 +28,6 @@ class GELU(nn.Module):
|
|
| 29 |
self.linear = nn.Linear(dim_in, dim_out, bias=bias)
|
| 30 |
self.approximate = approximate
|
| 31 |
|
| 32 |
-
@spaces.GPU
|
| 33 |
def forward(self, x):
|
| 34 |
if self.approximate == 'tanh':
|
| 35 |
return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
|
@@ -48,7 +46,6 @@ class Rope2D(nn.Module):
|
|
| 48 |
self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
|
| 49 |
self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
| 50 |
|
| 51 |
-
@spaces.GPU
|
| 52 |
def forward(self, x, seq_len=None):
|
| 53 |
if seq_len > self.max_seq_len_cached:
|
| 54 |
self.max_seq_len_cached = seq_len
|
|
@@ -72,7 +69,6 @@ class VisionEncoder(nn.Module):
|
|
| 72 |
self.norm = nn.LayerNorm(config['hidden_size'])
|
| 73 |
self.gelu = GELU(config['hidden_size'], config['hidden_size'])
|
| 74 |
|
| 75 |
-
@spaces.GPU
|
| 76 |
def forward(self, pixel_values):
|
| 77 |
x = self.embed(pixel_values)
|
| 78 |
b, c, h, w = x.shape
|
|
@@ -90,25 +86,26 @@ class PixtralModel(nn.Module):
|
|
| 90 |
self.vision_encoder = VisionEncoder(params['vision_encoder'])
|
| 91 |
# Add text generation components here
|
| 92 |
|
| 93 |
-
@spaces.GPU
|
| 94 |
def forward(self, image):
|
| 95 |
vision_output = self.vision_encoder(image)
|
| 96 |
# Add text generation logic here
|
| 97 |
return vision_output
|
| 98 |
|
| 99 |
-
@spaces.GPU
|
| 100 |
def load_model(params, model_path):
|
| 101 |
model = PixtralModel(params)
|
| 102 |
|
| 103 |
-
with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
|
| 104 |
for name, param in model.named_parameters():
|
| 105 |
if name in f.keys():
|
| 106 |
param.data = f.get_tensor(name)
|
| 107 |
|
| 108 |
model.eval()
|
| 109 |
-
return model
|
| 110 |
|
|
|
|
| 111 |
model = load_model(params, model_path)
|
|
|
|
|
|
|
| 112 |
tokenizer = MistralTokenizer.from_model("pixtral")
|
| 113 |
|
| 114 |
@spaces.GPU
|
|
@@ -137,7 +134,9 @@ def process_image_and_text(image, prompt):
|
|
| 137 |
|
| 138 |
# Process the image and generate text
|
| 139 |
with torch.no_grad():
|
|
|
|
| 140 |
vision_output = model(image_tensor)
|
|
|
|
| 141 |
# Add text generation logic here
|
| 142 |
generated_text = f"Generated text based on the image and prompt: {prompt}"
|
| 143 |
|
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
| 3 |
import torch.nn.functional as F
|
|
|
|
| 28 |
self.linear = nn.Linear(dim_in, dim_out, bias=bias)
|
| 29 |
self.approximate = approximate
|
| 30 |
|
|
|
|
| 31 |
def forward(self, x):
|
| 32 |
if self.approximate == 'tanh':
|
| 33 |
return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
|
|
|
| 46 |
self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
|
| 47 |
self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
| 48 |
|
|
|
|
| 49 |
def forward(self, x, seq_len=None):
|
| 50 |
if seq_len > self.max_seq_len_cached:
|
| 51 |
self.max_seq_len_cached = seq_len
|
|
|
|
| 69 |
self.norm = nn.LayerNorm(config['hidden_size'])
|
| 70 |
self.gelu = GELU(config['hidden_size'], config['hidden_size'])
|
| 71 |
|
|
|
|
| 72 |
def forward(self, pixel_values):
|
| 73 |
x = self.embed(pixel_values)
|
| 74 |
b, c, h, w = x.shape
|
|
|
|
| 86 |
self.vision_encoder = VisionEncoder(params['vision_encoder'])
|
| 87 |
# Add text generation components here
|
| 88 |
|
|
|
|
| 89 |
def forward(self, image):
|
| 90 |
vision_output = self.vision_encoder(image)
|
| 91 |
# Add text generation logic here
|
| 92 |
return vision_output
|
| 93 |
|
|
|
|
| 94 |
def load_model(params, model_path):
|
| 95 |
model = PixtralModel(params)
|
| 96 |
|
| 97 |
+
with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
|
| 98 |
for name, param in model.named_parameters():
|
| 99 |
if name in f.keys():
|
| 100 |
param.data = f.get_tensor(name)
|
| 101 |
|
| 102 |
model.eval()
|
| 103 |
+
return model
|
| 104 |
|
| 105 |
+
# Initialize the model
|
| 106 |
model = load_model(params, model_path)
|
| 107 |
+
|
| 108 |
+
# Initialize the tokenizer
|
| 109 |
tokenizer = MistralTokenizer.from_model("pixtral")
|
| 110 |
|
| 111 |
@spaces.GPU
|
|
|
|
| 134 |
|
| 135 |
# Process the image and generate text
|
| 136 |
with torch.no_grad():
|
| 137 |
+
model.cuda() # Move model to GPU only when processing
|
| 138 |
vision_output = model(image_tensor)
|
| 139 |
+
model.cpu() # Move model back to CPU after processing
|
| 140 |
# Add text generation logic here
|
| 141 |
generated_text = f"Generated text based on the image and prompt: {prompt}"
|
| 142 |
|