Spaces:
Runtime error
Runtime error
dongyh20
committed on
Commit
·
0bef215
1
Parent(s):
1938217
update space
Browse files- app.py +17 -75
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -24,8 +24,9 @@ from typing import Dict, Optional, Sequence, List
|
|
| 24 |
import librosa
|
| 25 |
import whisper
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
import sys
|
| 31 |
sys.path.append('./ola/CosyVoice/')
|
|
@@ -35,21 +36,21 @@ from ola.utils import disable_torch_init
|
|
| 35 |
from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
|
| 36 |
from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
|
| 37 |
from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
|
| 38 |
-
|
| 39 |
|
| 40 |
-
model_path = "/
|
| 41 |
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
|
| 42 |
model = model.to('cuda').eval()
|
| 43 |
model = model.bfloat16()
|
| 44 |
|
| 45 |
-
# tts_model = CosyVoice('CosyVoice/pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
|
| 46 |
-
|
| 47 |
|
| 48 |
USE_SPEECH=False
|
| 49 |
|
| 50 |
title_markdown = """
|
| 51 |
-
<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(
|
| 52 |
-
<img src="https://ola-omni.github.io/static/images/icon.png" alt="
|
| 53 |
</a>
|
| 54 |
<div>
|
| 55 |
<h2 ><a href="https://github.com/Ola-Omni/Ola">Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment</a> </h2>
|
|
@@ -300,83 +301,24 @@ def ola_inference(multimodal, audio_path):
|
|
| 300 |
outputs = outputs[:-len(stop_str)]
|
| 301 |
outputs = outputs.strip()
|
| 302 |
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
# else:
|
| 311 |
return outputs, None
|
| 312 |
|
| 313 |
# Define input and output for the Gradio interface
|
| 314 |
demo = gr.Interface(
|
| 315 |
fn=ola_inference,
|
| 316 |
-
inputs=[gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload
|
| 317 |
outputs=["text", "audio"],
|
| 318 |
-
# examples=[
|
| 319 |
-
# {
|
| 320 |
-
# "files":[f"{cur_dir}/case/image2.png"],
|
| 321 |
-
# "text":"Describe what is happening in this video in detail.",
|
| 322 |
-
# },
|
| 323 |
-
# {
|
| 324 |
-
# "files":[f"{cur_dir}/case/image.png"],
|
| 325 |
-
# "text":"Describe this icon.",
|
| 326 |
-
# },
|
| 327 |
-
# ],
|
| 328 |
title="Ola Demo",
|
| 329 |
description=title_markdown,
|
| 330 |
article=bibtext,
|
| 331 |
)
|
| 332 |
-
|
| 333 |
-
# textbox = gr.Textbox(
|
| 334 |
-
# show_label=False, placeholder="Enter text and press ENTER", container=False, max_lines=100
|
| 335 |
-
# )
|
| 336 |
-
# with gr.Blocks(
|
| 337 |
-
# title="Oryx-7B",
|
| 338 |
-
# theme="finlaymacklon/smooth_slate",
|
| 339 |
-
# css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-width: 50px}",
|
| 340 |
-
# fill_height=True
|
| 341 |
-
# ) as demo:
|
| 342 |
-
# html_header = "https://oryx-mllm.github.io/"
|
| 343 |
-
# gr.HTML(html_header)
|
| 344 |
-
|
| 345 |
-
# with gr.Row(equal_height=True):
|
| 346 |
-
# with gr.Column(scale=3):
|
| 347 |
-
# with gr.Row():
|
| 348 |
-
# video = gr.Video(label="Input Video", height=400)
|
| 349 |
-
# cur_dir = os.path.dirname(os.path.abspath(__file__))
|
| 350 |
-
# with gr.Row():
|
| 351 |
-
# gr.Examples(
|
| 352 |
-
# examples=[
|
| 353 |
-
# [
|
| 354 |
-
# f"{cur_dir}/case/case1.mp4",
|
| 355 |
-
# "Describe what is happening in this video in detail.",
|
| 356 |
-
# ],
|
| 357 |
-
# ],
|
| 358 |
-
# inputs=[video, textbox],
|
| 359 |
-
# )
|
| 360 |
-
|
| 361 |
-
# with gr.Column(scale=7):
|
| 362 |
-
# chatbot = gr.Chatbot(label="Oryx", bubble_full_width=False, height=660)
|
| 363 |
-
# with gr.Row():
|
| 364 |
-
# with gr.Column(scale=8):
|
| 365 |
-
# textbox.render()
|
| 366 |
-
# with gr.Column(scale=1, min_width=50):
|
| 367 |
-
# submit_btn = gr.Button(
|
| 368 |
-
# value="Send", variant="primary", interactive=True
|
| 369 |
-
# )
|
| 370 |
-
# # with gr.Row(elem_id="buttons") as button_row:
|
| 371 |
-
# # upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
|
| 372 |
-
# # downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
|
| 373 |
-
# # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
|
| 374 |
-
# # clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
|
| 375 |
-
|
| 376 |
-
# submit_btn.click(
|
| 377 |
-
# oryx_inference,
|
| 378 |
-
# [video, textbox],
|
| 379 |
-
# [chatbot, textbox, video],
|
| 380 |
-
# )
|
| 381 |
# Launch the Gradio app
|
| 382 |
demo.launch(server_name="0.0.0.0",server_port=80)
|
|
|
|
| 24 |
import librosa
|
| 25 |
import whisper
|
| 26 |
|
| 27 |
+
import torchaudio
|
| 28 |
+
import subprocess
|
| 29 |
+
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
|
| 30 |
|
| 31 |
import sys
|
| 32 |
sys.path.append('./ola/CosyVoice/')
|
|
|
|
| 36 |
from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
|
| 37 |
from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
|
| 38 |
from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
|
| 39 |
+
from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
|
| 40 |
|
| 41 |
+
model_path = "THUdyh/Ola-7b"
|
| 42 |
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
|
| 43 |
model = model.to('cuda').eval()
|
| 44 |
model = model.bfloat16()
|
| 45 |
|
| 46 |
+
# tts_model = CosyVoice('./ola/CosyVoice/pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
|
| 47 |
+
OUTPUT_SPEECH = False
|
| 48 |
|
| 49 |
USE_SPEECH=False
|
| 50 |
|
| 51 |
title_markdown = """
|
| 52 |
+
<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(255,248,240, 0.8), rgba(255,135,36,0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://ola-omni.github.io/" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
|
| 53 |
+
<img src="https://ola-omni.github.io/static/images/ola-icon-2.png" alt="Ola" style="max-width: 80px; height: auto; border-radius: 10px;">
|
| 54 |
</a>
|
| 55 |
<div>
|
| 56 |
<h2 ><a href="https://github.com/Ola-Omni/Ola">Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment</a> </h2>
|
|
|
|
| 301 |
outputs = outputs[:-len(stop_str)]
|
| 302 |
outputs = outputs.strip()
|
| 303 |
|
| 304 |
+
if OUTPUT_SPEECH:
|
| 305 |
+
voice_all = []
|
| 306 |
+
for i, j in enumerate(tts_model.inference_sft(outputs, '英文女', stream=False)):
|
| 307 |
+
voice_all.append(j['tts_speech'])
|
| 308 |
+
voice_all = torch.cat(voice_all, dim=1)
|
| 309 |
+
torchaudio.save('sft.wav', voice_all, 22050)
|
| 310 |
+
return outputs, "sft.wav"
|
| 311 |
# else:
|
| 312 |
return outputs, None
|
| 313 |
|
| 314 |
# Define input and output for the Gradio interface
|
| 315 |
demo = gr.Interface(
|
| 316 |
fn=ola_inference,
|
| 317 |
+
inputs=[gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload files...(Image or Video is required)"), gr.Audio(type="filepath")],
|
| 318 |
outputs=["text", "audio"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
title="Ola Demo",
|
| 320 |
description=title_markdown,
|
| 321 |
article=bibtext,
|
| 322 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
# Launch the Gradio app
|
| 324 |
demo.launch(server_name="0.0.0.0",server_port=80)
|
requirements.txt
CHANGED
|
@@ -15,6 +15,8 @@ timm==0.9.16
|
|
| 15 |
openai-whisper
|
| 16 |
deepspeed==0.12.2
|
| 17 |
loguru
|
|
|
|
|
|
|
| 18 |
av
|
| 19 |
librosa
|
| 20 |
gradio
|
|
|
|
| 15 |
openai-whisper
|
| 16 |
deepspeed==0.12.2
|
| 17 |
loguru
|
| 18 |
+
decord
|
| 19 |
+
torchaudio
|
| 20 |
av
|
| 21 |
librosa
|
| 22 |
gradio
|