Update app.py

app.py CHANGED
@@ -6,6 +6,7 @@ from PIL import Image
 import gradio as gr
 import librosa
 import nltk
+import re
 
 from transformers import PreTrainedModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -18,7 +19,7 @@ model_name = "microsoft/Phi-3.5-mini-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 # Load the model and processor
-clipmodel = CLIPModel.
+clipmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 clipprocessor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 nltk.download('punkt')
@@ -235,11 +236,16 @@ def getInputs(image_path, question, answer=""):
     return start_input_ids, end_input_ids, image_features, attention_mask
 
 model_location = "./MM_FT_C1"
-print("Model location:", model_location)
+# print("Model location:", model_location)
 
 model = MultimodalPhiModel.from_pretrained(model_location).to(device)
 
-
+model_name = "microsoft/Phi-3.5-mini-instruct"
+base_phi_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+).to(device)
 
 def getStringAfter(output, start_str):
     if start_str in output:
@@ -260,7 +266,7 @@ def getStringAfterAnswer(output):
     answer = preprocess_text(answer)
     return answer
 
-def generateOutput(image_path, audio_path, context_text, question, max_length=5):
+def generateOutput(image_path, audio_path, context_text, question, max_length=2):
     answerPart = ""
     speech_text = ""
     if image_path is not None:
@@ -294,7 +300,7 @@ def generateOutput(image_path, audio_path, context_text, question, max_length=5)
     # base_phi_model.generate(start_tokens, max_length=2, do_sample=False, pad_token_id=tokenizer.pad_token_id)
 
     output_text = tokenizer.decode(
-
+        base_phi_model.generate(start_tokens, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
         skip_special_tokens=True
     )
 
@@ -326,14 +332,18 @@ def process_inputs(image, audio_source, audio_file, audio_mic, context_text, que
     return answer
 
 with demo:
+    with gr.Row():
+        with gr.Column(scale=1, min_width=300):
+            image_input = gr.Image(type="filepath", label="Upload Image")
+        with gr.Column(scale=2, min_width=300):
+            question = gr.Textbox(label="Question")
+            output_text = gr.Textbox(label="Output")
     with gr.Row():
         audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
         audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
         audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
-
+    with gr.Row():
         context_text = gr.Textbox(label="Context Text")
-        question = gr.Textbox(label="Question")
-        output_text = gr.Textbox(label="Output")
 
     def update_audio_input(source):
         if source == "Microphone":
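The final hunk cuts off inside update_audio_input. For context, a minimal sketch of the visibility toggle it appears to implement; the gr.update return values, the .change() wiring, and the gr.Blocks() construction of demo are assumptions, not shown in this commit:

import gradio as gr

with gr.Blocks() as demo:
    audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
    audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
    audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)

    # Show exactly one audio input, chosen by the radio button.
    def update_audio_input(source):
        if source == "Microphone":
            return gr.update(visible=False), gr.update(visible=True)
        return gr.update(visible=True), gr.update(visible=False)

    # Assumed wiring: updates are returned in (audio_file, audio_mic) order.
    audio_source.change(update_audio_input, inputs=audio_source, outputs=[audio_file, audio_mic])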
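The generate call this commit adds inside generateOutput is the standard transformers greedy-decode pattern. A self-contained sketch using the model and tokenizer names from this diff; the prompt and the construction of start_tokens are assumptions, since the diff never shows them:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
base_phi_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Hypothetical prompt; in the app the real start_tokens come from getInputs().
start_tokens = tokenizer("Describe the image.", return_tensors="pt").input_ids

# Greedy decoding capped at 1024 tokens, then strip special tokens,
# mirroring the pattern the commit adds to generateOutput().
output_text = tokenizer.decode(
    base_phi_model.generate(
        start_tokens,
        max_length=1024,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True,
)
print(output_text)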