Improve language tag
#3
by lbourdois - opened

README.md CHANGED
---
pipeline_tag: video-text-to-text
license: apache-2.0
base_model:
- Qwen/Qwen2.5-7B-Instruct
language:
- zho
- eng
- fra
- spa
- por
- deu
- ita
- rus
- jpn
- kor
- vie
- tha
- ara
datasets:
- HuggingFaceFV/finevideo
---

# Ola-7B

## Model Summary

The Ola-7B model was developed by researchers from Tencent, Tsinghua University and Nanyang Technological University.
Built on the Qwen2.5 language model, it is trained on text, image, video and audio data with a context window of 32K tokens. It takes image/video, text and audio as input and outputs text.

Ola offers an on-demand solution to seamlessly and efficiently process visual inputs with arbitrary spatial sizes and temporal lengths.

- **Repository:** https://github.com/Ola-Omni/Ola
- **Languages:** English, Chinese
- **Paper:** https://huggingface.co/papers/2502.04328

## Use

1. Download the speech encoders from https://huggingface.co/THUdyh/Ola_speech_encoders.
2. Replace the paths in `config.json` with the local paths of the speech encoders (see the sketch below).

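A minimal sketch of these two steps is given below. It assumes `huggingface_hub` is installed, uses a placeholder path for the local checkpoint, and only prints candidate fields to edit; the actual field names that hold the speech-encoder paths depend on the released `config.json`, so inspect the file before changing anything.

```
import json
from huggingface_hub import snapshot_download

# 1. Download the speech encoders locally (assumes huggingface_hub is installed).
encoder_dir = snapshot_download(repo_id="THUdyh/Ola_speech_encoders")
print("Speech encoders downloaded to:", encoder_dir)

# 2. Inspect this model's config.json and update the encoder path entries.
#    "path/to/Ola-7B/config.json" is a placeholder for your local checkpoint copy,
#    and the key match below is illustrative; check the released config for the
#    actual field names that store the speech-encoder paths.
config_path = "path/to/Ola-7B/config.json"
with open(config_path) as f:
    config = json.load(f)
for key, value in config.items():
    if isinstance(value, str) and "encoder" in key.lower():
        print(f"{key}: {value}  ->  point this at a file under {encoder_dir}")
```
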
We provide a simple generation process for using our model. For more details, please refer to our [Github Repo](https://github.com/Ola-Omni/Ola).

```
import os
os.environ['LOWRES_RESIZE'] = '384x32'
os.environ['HIGHRES_BASE'] = '0x32'
os.environ['VIDEO_RESIZE'] = "0x64"
os.environ['VIDEO_MAXRES'] = "480"
os.environ['VIDEO_MINRES'] = "288"
os.environ['MAXRES'] = '1536'
os.environ['MINRES'] = '0'
os.environ['REGIONAL_POOL'] = '2x'
os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
os.environ['LOAD_VISION_EARLY'] = '1'
os.environ['SKIP_LOAD_VIT'] = '1'

import gradio as gr
import torch
import re
from decord import VideoReader, cpu
from PIL import Image
import numpy as np
import transformers
import moviepy.editor as mp
from typing import Dict, Optional, Sequence, List
import librosa
import whisper
from ola.conversation import conv_templates, SeparatorStyle
from ola.model.builder import load_pretrained_model
from ola.utils import disable_torch_init
from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN

model_path = ""  # set to the local path of the Ola-7B checkpoint
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
model = model.to('cuda').eval()
model = model.bfloat16()

USE_SPEECH = False
cur_dir = os.path.dirname(os.path.abspath(__file__))

def load_audio(audio_file_name):
    # Load 16 kHz mono audio, split it into 30-second chunks (480000 samples),
    # and return 128-bin log-mel spectrograms plus the raw waveforms.
    speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
    if len(speech_wav.shape) > 1:
        speech_wav = speech_wav[:, 0]
    speech_wav = speech_wav.astype(np.float32)
    CHUNK_LIM = 480000
    SAMPLE_RATE = 16000
    speechs = []
    speech_wavs = []

    if len(speech_wav) <= CHUNK_LIM:
        speech = whisper.pad_or_trim(speech_wav)
        speech_wav = whisper.pad_or_trim(speech_wav)
        speechs.append(speech)
        speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
    else:
        for i in range(0, len(speech_wav), CHUNK_LIM):
            chunk = speech_wav[i : i + CHUNK_LIM]
            if len(chunk) < CHUNK_LIM:
                chunk = whisper.pad_or_trim(chunk)
            speechs.append(chunk)
            speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
    mels = []
    for chunk in speechs:
        chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
        mels.append(chunk)

    mels = torch.cat(mels, dim=0)
    speech_wavs = torch.cat(speech_wavs, dim=0)
    if mels.shape[0] > 25:
        mels = mels[:25]
        speech_wavs = speech_wavs[:25]

    speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
    speech_chunks = torch.LongTensor([mels.shape[0]])
    return mels, speech_length, speech_chunks, speech_wavs

def extract_audio(videos_file_path):
    my_clip = mp.VideoFileClip(videos_file_path)
    return my_clip.audio

def ola_inference(multimodal, audio_path):
    visual, text = multimodal["files"][0], multimodal["text"]
    if visual.endswith("image2.png"):
        modality = "video"
        visual = f"{cur_dir}/case/case1.mp4"
    if visual.endswith(".mp4"):
        modality = "video"
    else:
        modality = "image"

    # If an audio file is given, use it directly; otherwise, for video input,
    # parse the audio track from the video itself.
    if audio_path:
        USE_SPEECH = True
    elif modality == "video":
        USE_SPEECH = True
    else:
        USE_SPEECH = False

    speechs = []
    speech_lengths = []
    speech_wavs = []
    speech_chunks = []
    if modality == "video":
        # Uniformly sample 64 frames from the video.
        vr = VideoReader(visual, ctx=cpu(0))
        total_frame_num = len(vr)
        fps = round(vr.get_avg_fps())
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        spare_frames = vr.get_batch(frame_idx).asnumpy()
        video = [Image.fromarray(frame) for frame in spare_frames]
    else:
        image = [Image.open(visual)]
        image_sizes = [image[0].size]

    if USE_SPEECH and audio_path:
        audio_path = audio_path
        speech, speech_length, speech_chunk, speech_wav = load_audio(audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
        print('load audio')
    elif USE_SPEECH and not audio_path:
        # Parse the audio track from the video.
        audio = extract_audio(visual)
        audio.write_audiofile("./video_audio.wav")
        video_audio_path = './video_audio.wav'
        speech, speech_length, speech_chunk, speech_wav = load_audio(video_audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
    else:
        # No audio available: feed zero-valued speech placeholders.
        speechs = [torch.zeros(1, 3000, 128).bfloat16().to('cuda')]
        speech_lengths = [torch.LongTensor([3000]).to('cuda')]
        speech_wavs = [torch.zeros([1, 480000]).to('cuda')]
        speech_chunks = [torch.LongTensor([1]).to('cuda')]

    conv_mode = "qwen_1_5"
    if text:
        qs = text
    else:
        qs = ''
    if USE_SPEECH and audio_path:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
    elif USE_SPEECH:
        qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    if USE_SPEECH and audio_path:
        input_ids = tokenizer_speech_question_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    elif USE_SPEECH:
        input_ids = tokenizer_speech_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    else:
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')

    if modality == "video":
        video_processed = []
        for idx, frame in enumerate(video):
            image_processor.do_resize = False
            image_processor.do_center_crop = False
            frame = process_anyres_video(frame, image_processor)

            if frame_idx is not None and idx in frame_idx:
                video_processed.append(frame.unsqueeze(0))
            elif frame_idx is None:
                video_processed.append(frame.unsqueeze(0))

        if frame_idx is None:
            frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()

        video_processed = torch.cat(video_processed, dim=0).bfloat16().to("cuda")
        video_processed = (video_processed, video_processed)

        video_data = (video_processed, (384, 384), "video")
    else:
        image_processor.do_resize = False
        image_processor.do_center_crop = False
        image_tensor, image_highres_tensor = [], []
        for visual in image:
            image_tensor_, image_highres_tensor_ = process_anyres_highres_image_genli(visual, image_processor)
            image_tensor.append(image_tensor_)
            image_highres_tensor.append(image_highres_tensor_)
        if all(x.shape == image_tensor[0].shape for x in image_tensor):
            image_tensor = torch.stack(image_tensor, dim=0)
        if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
            image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
        if type(image_tensor) is list:
            image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
        else:
            image_tensor = image_tensor.bfloat16().to("cuda")
        if type(image_highres_tensor) is list:
            image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
        else:
            image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")

    pad_token_ids = 151643

    attention_masks = input_ids.ne(pad_token_ids).long().to('cuda')
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    gen_kwargs = {}

    if "max_new_tokens" not in gen_kwargs:
        gen_kwargs["max_new_tokens"] = 1024
    if "temperature" not in gen_kwargs:
        gen_kwargs["temperature"] = 0.2
    if "top_p" not in gen_kwargs:
        gen_kwargs["top_p"] = None
    if "num_beams" not in gen_kwargs:
        gen_kwargs["num_beams"] = 1

    with torch.inference_mode():
        if modality == "video":
            output_ids = model.generate(
                inputs=input_ids,
                images=video_data[0][0],
                images_highres=video_data[0][1],
                modalities=video_data[2],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )
        else:
            output_ids = model.generate(
                inputs=input_ids,
                images=image_tensor,
                images_highres=image_highres_tensor,
                image_sizes=image_sizes,
                modalities=['image'],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)]
    outputs = outputs.strip()
    return outputs, None
```
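
As a usage reference, a hypothetical call to the `ola_inference` helper defined above could look like the following; the video path and prompt are placeholders:

```
# Hypothetical usage of ola_inference; "sample_video.mp4" is a placeholder path.
multimodal = {
    "files": ["sample_video.mp4"],   # image or video input
    "text": "Describe what happens in this video.",
}

# Pass an audio file path to ask the question in speech; with None, the text
# prompt is used and the audio track is extracted from the video itself.
response, _ = ola_inference(multimodal, audio_path=None)
print(response)
```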

### Model Architecture

- **Architecture:** Pre-trained [Oryx-ViT](https://huggingface.co/THUdyh/Oryx-ViT) + Qwen2.5-7B
- **Data:** a mixture of more than 5M image/video/audio samples, trained in 3 stages.
- **Precision:** BFloat16

#### Hardware & Software

- **Hardware:** 64 \* NVIDIA Tesla A100
- **Orchestration:** HuggingFace Trainer
- **Code:** PyTorch

## Citation
@article{liu2025ola,
  title={Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment},
  author={Liu, Zuyan and Dong, Yuhao and Wang, Jiahui and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
  journal={arXiv preprint arXiv:2502.04328},
  year={2025}
}