dongyh20 committed
Commit 0bef215 · Parent(s): 1938217

update space

Files changed (2):
1. app.py +17 -75
2. requirements.txt +2 -0
app.py CHANGED
@@ -24,8 +24,9 @@ from typing import Dict, Optional, Sequence, List
 import librosa
 import whisper
 
-# import subprocess
-# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+import torchaudio
+import subprocess
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 import sys
 sys.path.append('./ola/CosyVoice/')
@@ -35,21 +36,21 @@ from ola.utils import disable_torch_init
 from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
 from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
 from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
-# from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
+from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
 
-model_path = "/mnt/lzy/ola-model/Ola-7b"
+model_path = "THUdyh/Ola-7b"
 tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
 model = model.to('cuda').eval()
 model = model.bfloat16()
 
-# tts_model = CosyVoice('CosyVoice/pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
-# OUTPUT_SPEECH = False
+# tts_model = CosyVoice('./ola/CosyVoice/pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
+OUTPUT_SPEECH = False
 
 USE_SPEECH=False
 
 title_markdown = """
-<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(204,255,231, 0.8), rgba(204,255,231, 0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://llava-vl.github.io/blog/2024-04-30-llava-next-video/"" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
-<img src="https://ola-omni.github.io/static/images/icon.png" alt="Oryx" style="max-width: 80px; height: auto; border-radius: 10px;">
+<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(255,248,240, 0.8), rgba(255,135,36,0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://ola-omni.github.io/"" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
+<img src="https://ola-omni.github.io/static/images/ola-icon-2.png" alt="Ola" style="max-width: 80px; height: auto; border-radius: 10px;">
 </a>
 <div>
 <h2 ><a href="https://github.com/Ola-Omni/Ola">Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment</a> </h2>
@@ -300,83 +301,24 @@ def ola_inference(multimodal, audio_path):
     outputs = outputs[:-len(stop_str)]
     outputs = outputs.strip()
 
-    # if OUTPUT_SPEECH:
-    #     voice_all = []
-    #     for i, j in enumerate(cosyvoice.inference_sft('Visual data comes in various forms, ranging from small icons of just a few pixels to long videos spanning hours. Existing multi-modal LLMs usually standardize these diverse visual inputs to a fixed resolution for visual encoders and yield similar numbers of tokens for LLMs. This approach is non-optimal for multimodal understanding and inefficient for processing inputs with long and short visual contents. To solve the problem, we propose Oryx, a unified multimodal architecture for the spatial-temporal understanding of images, videos, and multi-view 3D scenes. Oryx offers an on-demand solution to seamlessly and efficiently process visual inputs with arbitrary spatial sizes and temporal lengths through two core innovations: 1) a pre-trained OryxViT model that can encode images at any resolution into LLM-friendly visual representations; 2) a dynamic compressor module that supports 1x to 16x compression on visual tokens by request. These design features enable Oryx to accommodate extremely long visual contexts, such as videos, with lower resolution and high compression while maintaining high recognition precision for tasks like document understanding with native resolution and no compression. Beyond the architectural improvements, enhanced data curation and specialized training on long-context retrieval and spatial-aware data help Oryx achieve strong capabilities in image, video, and 3D multimodal understanding simultaneously. ', 'θ‹±ζ–‡ε₯³', stream=False)):
-    #         voice_all.append(j['tts_speech'])
-    #     voice_all = torch.cat(voice_all, dim=1)
-    #     torchaudio.save('sft.wav', voice_all, 22050)
-    #     return outputs, "sft.wav"
+    if OUTPUT_SPEECH:
+        voice_all = []
+        for i, j in enumerate(tts_model.inference_sft(outputs, 'θ‹±ζ–‡ε₯³', stream=False)):
+            voice_all.append(j['tts_speech'])
+        voice_all = torch.cat(voice_all, dim=1)
+        torchaudio.save('sft.wav', voice_all, 22050)
+        return outputs, "sft.wav"
     # else:
     return outputs, None
 
 # Define input and output for the Gradio interface
 demo = gr.Interface(
     fn=ola_inference,
-    inputs=[gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload file..."), gr.Audio(type="filepath")],
+    inputs=[gr.MultimodalTextbox(file_types=[".mp4", "image"],placeholder="Enter message or upload files...(Image or Video is required)"), gr.Audio(type="filepath")],
     outputs=["text", "audio"],
-    # examples=[
-    #     {
-    #         "files":[f"{cur_dir}/case/image2.png"],
-    #         "text":"Describe what is happening in this video in detail.",
-    #     },
-    #     {
-    #         "files":[f"{cur_dir}/case/image.png"],
-    #         "text":"Describe this icon.",
-    #     },
-    # ],
     title="Ola Demo",
     description=title_markdown,
     article=bibtext,
 )
-
-# textbox = gr.Textbox(
-#     show_label=False, placeholder="Enter text and press ENTER", container=False, max_lines=100
-# )
-# with gr.Blocks(
-#     title="Oryx-7B",
-#     theme="finlaymacklon/smooth_slate",
-#     css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-width: 50px}",
-#     fill_height=True
-# ) as demo:
-#     html_header = "https://oryx-mllm.github.io/"
-#     gr.HTML(html_header)
-
-#     with gr.Row(equal_height=True):
-#         with gr.Column(scale=3):
-#             with gr.Row():
-#                 video = gr.Video(label="Input Video", height=400)
-#             cur_dir = os.path.dirname(os.path.abspath(__file__))
-#             with gr.Row():
-#                 gr.Examples(
-#                     examples=[
-#                         [
-#                             f"{cur_dir}/case/case1.mp4",
-#                             "Describe what is happening in this video in detail.",
-#                         ],
-#                     ],
-#                     inputs=[video, textbox],
-#                 )
-
-#         with gr.Column(scale=7):
-#             chatbot = gr.Chatbot(label="Oryx", bubble_full_width=False, height=660)
-#             with gr.Row():
-#                 with gr.Column(scale=8):
-#                     textbox.render()
-#                 with gr.Column(scale=1, min_width=50):
-#                     submit_btn = gr.Button(
-#                         value="Send", variant="primary", interactive=True
-#                     )
-#     # with gr.Row(elem_id="buttons") as button_row:
-#     #     upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
-#     #     downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
-#     #     flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
-#     #     clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
-
-#     submit_btn.click(
-#         oryx_inference,
-#         [video, textbox],
-#         [chatbot, textbox, video],
-#     )
 # Launch the Gradio app
 demo.launch(server_name="0.0.0.0",server_port=80)
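
For reference, the speech path this commit wires up works as follows: CosyVoice's inference_sft is a generator that yields dicts of waveform chunks under the 'tts_speech' key; the chunks are concatenated along the time axis and written out at 22.05 kHz. Below is a minimal standalone sketch of that path, assuming the commented-out tts_model constructor above is uncommented and the CosyVoice-300M-SFT checkpoint exists at that path ('θ‹±ζ–‡ε₯³' is the "English female" speaker preset; synthesize is an illustrative helper, not part of app.py):

import torch
import torchaudio
from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice

# Assumes the SFT checkpoint has been downloaded to this path (the same one
# referenced by the commented-out constructor in app.py).
tts_model = CosyVoice('./ola/CosyVoice/pretrained_models/CosyVoice-300M-SFT',
                      load_jit=True, load_onnx=False, fp16=True)

def synthesize(text, wav_path='sft.wav'):
    # inference_sft yields waveform chunks; join them along the time axis
    # (dim=1), mirroring the loop added to ola_inference above.
    chunks = [j['tts_speech'] for j in tts_model.inference_sft(text, 'θ‹±ζ–‡ε₯³', stream=False)]
    torchaudio.save(wav_path, torch.cat(chunks, dim=1), 22050)
    return wav_path

Note that as committed, OUTPUT_SPEECH is False and the tts_model constructor stays commented out, so flipping OUTPUT_SPEECH to True without also uncommenting that line would raise a NameError inside ola_inference.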
requirements.txt CHANGED
@@ -15,6 +15,8 @@ timm==0.9.16
 openai-whisper
 deepspeed==0.12.2
 loguru
+decord
+torchaudio
 av
 librosa
 gradio
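
The two added packages back the demo's media I/O: decord decodes frames from uploaded videos, and torchaudio writes the synthesized CosyVoice waveform. A hypothetical sketch of typical usage (file names are illustrative; the actual frame-sampling logic lives elsewhere in app.py):

import decord
import torchaudio

# Decode up to eight evenly spaced frames from an uploaded clip.
vr = decord.VideoReader('case/case1.mp4')
step = max(1, len(vr) // 8)
frames = vr.get_batch(list(range(0, len(vr), step))).asnumpy()  # (N, H, W, 3) uint8

# Read back the TTS output that app.py saves as sft.wav.
waveform, sample_rate = torchaudio.load('sft.wav')  # sample_rate == 22050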