dongyh20 committed
Commit 0bef215 · Parent(s): 1938217

update space

Files changed (2):
1. app.py +17 -75
2. requirements.txt +2 -0
app.py CHANGED
@@ -24,8 +24,9 @@ from typing import Dict, Optional, Sequence, List
 import librosa
 import whisper
 
-# import subprocess
-# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+import torchaudio
+import subprocess
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 import sys
 sys.path.append('./ola/CosyVoice/')
@@ -35,21 +36,21 @@ from ola.utils import disable_torch_init
 from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
 from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
 from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
-# from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
+from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
 
-model_path = "/mnt/lzy/ola-model/Ola-7b"
+model_path = "THUdyh/Ola-7b"
 tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
 model = model.to('cuda').eval()
 model = model.bfloat16()
 
-# tts_model = CosyVoice('CosyVoice/pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
-# OUTPUT_SPEECH = False
+# tts_model = CosyVoice('./ola/CosyVoice/pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
+OUTPUT_SPEECH = False
 
 USE_SPEECH=False
 
 title_markdown = """
-<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(204,255,231, 0.8), rgba(204,255,231, 0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://llava-vl.github.io/blog/2024-04-30-llava-next-video/"" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
-<img src="https://ola-omni.github.io/static/images/icon.png" alt="Oryx" style="max-width: 80px; height: auto; border-radius: 10px;">
+<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(255,248,240, 0.8), rgba(255,135,36,0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://ola-omni.github.io/"" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
+<img src="https://ola-omni.github.io/static/images/ola-icon-2.png" alt="Ola" style="max-width: 80px; height: auto; border-radius: 10px;">
 </a>
 <div>
 <h2 ><a href="https://github.com/Ola-Omni/Ola">Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment</a> </h2>
@@ -300,83 +301,24 @@ def ola_inference(multimodal, audio_path):
     outputs = outputs[:-len(stop_str)]
     outputs = outputs.strip()
 
-    # if OUTPUT_SPEECH:
-    #     voice_all = []
-    #     for i, j in enumerate(cosyvoice.inference_sft('Visual data comes in various forms, ranging from small icons of just a few pixels to long videos spanning hours. Existing multi-modal LLMs usually standardize these diverse visual inputs to a fixed resolution for visual encoders and yield similar numbers of tokens for LLMs. This approach is non-optimal for multimodal understanding and inefficient for processing inputs with long and short visual contents. To solve the problem, we propose Oryx, a unified multimodal architecture for the spatial-temporal understanding of images, videos, and multi-view 3D scenes. Oryx offers an on-demand solution to seamlessly and efficiently process visual inputs with arbitrary spatial sizes and temporal lengths through two core innovations: 1) a pre-trained OryxViT model that can encode images at any resolution into LLM-friendly visual representations; 2) a dynamic compressor module that supports 1x to 16x compression on visual tokens by request. These design features enable Oryx to accommodate extremely long visual contexts, such as videos, with lower resolution and high compression while maintaining high recognition precision for tasks like document understanding with native resolution and no compression. Beyond the architectural improvements, enhanced data curation and specialized training on long-context retrieval and spatial-aware data help Oryx achieve strong capabilities in image, video, and 3D multimodal understanding simultaneously. ', 'θ‹±ζ–‡ε₯³', stream=False)):
-    #         voice_all.append(j['tts_speech'])
-    #     voice_all = torch.cat(voice_all, dim=1)
-    #     torchaudio.save('sft.wav', voice_all, 22050)
-    #     return outputs, "sft.wav"
+    if OUTPUT_SPEECH:
+        voice_all = []
+        for i, j in enumerate(tts_model.inference_sft(outputs, 'θ‹±ζ–‡ε₯³', stream=False)):
+            voice_all.append(j['tts_speech'])
+        voice_all = torch.cat(voice_all, dim=1)
+        torchaudio.save('sft.wav', voice_all, 22050)
+        return outputs, "sft.wav"
     # else:
     return outputs, None
 
 # Define input and output for the Gradio interface
 demo = gr.Interface(
     fn=ola_inference,
-    inputs=[gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload file..."), gr.Audio(type="filepath")],
+    inputs=[gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload files...(Image or Video is required)"), gr.Audio(type="filepath")],
     outputs=["text", "audio"],
-    # examples=[
-    #     {
-    #         "files":[f"{cur_dir}/case/image2.png"],
-    #         "text":"Describe what is happening in this video in detail.",
-    #     },
-    #     {
-    #         "files":[f"{cur_dir}/case/image.png"],
-    #         "text":"Describe this icon.",
-    #     },
-    # ],
     title="Ola Demo",
     description=title_markdown,
     article=bibtext,
 )
-
-# textbox = gr.Textbox(
-#     show_label=False, placeholder="Enter text and press ENTER", container=False, max_lines=100
-# )
-# with gr.Blocks(
-#     title="Oryx-7B",
-#     theme="finlaymacklon/smooth_slate",
-#     css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-width: 50px}",
-#     fill_height=True
-# ) as demo:
-#     html_header = "https://oryx-mllm.github.io/"
-#     gr.HTML(html_header)
-
-#     with gr.Row(equal_height=True):
-#         with gr.Column(scale=3):
-#             with gr.Row():
-#                 video = gr.Video(label="Input Video", height=400)
-#             cur_dir = os.path.dirname(os.path.abspath(__file__))
-#             with gr.Row():
-#                 gr.Examples(
-#                     examples=[
-#                         [
-#                             f"{cur_dir}/case/case1.mp4",
-#                             "Describe what is happening in this video in detail.",
-#                         ],
-#                     ],
-#                     inputs=[video, textbox],
-#                 )
-
-#         with gr.Column(scale=7):
-#             chatbot = gr.Chatbot(label="Oryx", bubble_full_width=False, height=660)
-#             with gr.Row():
-#                 with gr.Column(scale=8):
-#                     textbox.render()
-#                 with gr.Column(scale=1, min_width=50):
-#                     submit_btn = gr.Button(
-#                         value="Send", variant="primary", interactive=True
-#                     )
-#     # with gr.Row(elem_id="buttons") as button_row:
-#     #     upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
-#     #     downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
-#     #     flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
-#     #     clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
-
-#     submit_btn.click(
-#         oryx_inference,
-#         [video, textbox],
-#         [chatbot, textbox, video],
-#     )
 # Launch the Gradio app
 demo.launch(server_name="0.0.0.0",server_port=80)
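
For reference, the speech path this commit wires up works as follows: CosyVoice's inference_sft is a generator that yields dicts of waveform chunks under the 'tts_speech' key; the chunks are concatenated along the time axis and written out at 22.05 kHz. Below is a minimal standalone sketch of that path, assuming the commented-out tts_model constructor above is uncommented and the CosyVoice-300M-SFT checkpoint exists at that path ('θ‹±ζ–‡ε₯³' is the "English female" speaker preset; synthesize is an illustrative helper, not part of app.py):

import torch
import torchaudio
from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice

# Assumes the SFT checkpoint has been downloaded to this path (the same one
# referenced by the commented-out constructor in app.py).
tts_model = CosyVoice('./ola/CosyVoice/pretrained_models/CosyVoice-300M-SFT',
                      load_jit=True, load_onnx=False, fp16=True)

def synthesize(text, wav_path='sft.wav'):
    # inference_sft yields waveform chunks; join them along the time axis
    # (dim=1), mirroring the loop added to ola_inference above.
    chunks = [j['tts_speech'] for j in tts_model.inference_sft(text, 'θ‹±ζ–‡ε₯³', stream=False)]
    torchaudio.save(wav_path, torch.cat(chunks, dim=1), 22050)
    return wav_path

Note that as committed, OUTPUT_SPEECH is False and the tts_model constructor stays commented out, so flipping OUTPUT_SPEECH to True without also uncommenting that line would raise a NameError inside ola_inference.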
requirements.txt CHANGED
@@ -15,6 +15,8 @@ timm==0.9.16
 openai-whisper
 deepspeed==0.12.2
 loguru
+decord
+torchaudio
 av
 librosa
 gradio
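
The two added packages back the demo's media I/O: decord decodes frames from uploaded videos, and torchaudio writes the synthesized CosyVoice waveform. A hypothetical sketch of typical usage (file names are illustrative; the actual frame-sampling logic lives elsewhere in app.py):

import decord
import torchaudio

# Decode up to eight evenly spaced frames from an uploaded clip.
vr = decord.VideoReader('case/case1.mp4')
step = max(1, len(vr) // 8)
frames = vr.get_batch(list(range(0, len(vr), step))).asnumpy()  # (N, H, W, 3) uint8

# Read back the TTS output that app.py saves as sft.wav.
waveform, sample_rate = torchaudio.load('sft.wav')  # sample_rate == 22050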