NMCxyz committed on
Commit
9942354
·
verified ·
1 Parent(s): b4f5cb4

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. omni_speech/model/speech_projector/__pycache__/speech_projector.cpython-312.pyc +0 -0
  2. omni_speech/model/speech_projector/__pycache__/speech_projector.cpython-38.pyc +0 -0
  3. omni_speech/model/speech_projector/__pycache__/speech_projector.cpython-39.pyc +0 -0
  4. omni_speech/serve/__init__.py +0 -0
  5. omni_speech/serve/controller.py +298 -0
  6. omni_speech/serve/gradio_web_server.py +348 -0
  7. omni_speech/serve/model_worker.py +292 -0
  8. omni_speech/train/__pycache__/omni_trainer.cpython-310.pyc +0 -0
  9. omni_speech/train/__pycache__/omni_trainer.cpython-312.pyc +0 -0
  10. omni_speech/train/__pycache__/run_train.cpython-310.pyc +0 -0
  11. omni_speech/train/__pycache__/run_train.cpython-312.pyc +0 -0
  12. omni_speech/train/__pycache__/run_train.cpython-38.pyc +0 -0
  13. omni_speech/train/__pycache__/train.cpython-312.pyc +0 -0
  14. omni_speech/train/__pycache__/train_mem.cpython-312.pyc +0 -0
  15. omni_speech/train/__pycache__/train_multiturn.cpython-312.pyc +0 -0
  16. omni_speech/train/__pycache__/train_raw.cpython-312.pyc +0 -0
  17. omni_speech/train/__pycache__/train_test.cpython-312.pyc +0 -0
  18. omni_speech/train/__pycache__/trainer.cpython-310.pyc +0 -0
  19. omni_speech/train/__pycache__/trainer.cpython-312.pyc +0 -0
  20. omni_speech/train/export.py +512 -0
  21. omni_speech/train/omni_trainer.py +345 -0
  22. omni_speech/train/train.py +420 -0
  23. omni_speech/train/train_mem.py +4 -0
  24. omni_speech/train/train_minicpmo.py +660 -0
  25. omni_speech/train/train_minicpmo_test.py +729 -0
  26. omni_speech/train/train_multiturn.py +515 -0
  27. omni_speech/train/trainer.py +249 -0
  28. scripts/continue.sh +65 -0
  29. scripts/ds_config_zero2.json +54 -0
  30. scripts/ds_config_zero3.json +59 -0
  31. scripts/export.sh +39 -0
  32. scripts/finetune.sh +42 -0
  33. scripts/finetune_llm_speech_decoder.sh +85 -0
  34. scripts/finetune_lora.sh +43 -0
  35. scripts/finetune_minicpmo.sh +65 -0
  36. scripts/finetune_minicpmo_asr.sh +63 -0
  37. scripts/finetune_speech_decoder.sh +42 -0
  38. scripts/minicpmp_config.json +163 -0
  39. scripts/pretrain_minicpmo_test.sh +89 -0
  40. scripts/pretrained.sh +44 -0
  41. scripts/pretrained_minicpmo.sh +74 -0
  42. scripts/test_llama.sh +41 -0
  43. scripts/test_qwen.sh +41 -0
  44. scripts/wandb/debug-internal.log +7 -0
  45. scripts/wandb/debug.log +25 -0
  46. scripts/wandb/latest-run/files/output.log +559 -0
  47. scripts/wandb/latest-run/files/requirements.txt +341 -0
  48. scripts/wandb/latest-run/files/wandb-metadata.json +171 -0
  49. scripts/wandb/latest-run/logs/debug-core.log +7 -0
  50. scripts/wandb/latest-run/logs/debug-internal.log +7 -0
omni_speech/model/speech_projector/__pycache__/speech_projector.cpython-312.pyc ADDED
Binary file (2.07 kB). View file
 
omni_speech/model/speech_projector/__pycache__/speech_projector.cpython-38.pyc ADDED
Binary file (1.19 kB). View file
 
omni_speech/model/speech_projector/__pycache__/speech_projector.cpython-39.pyc ADDED
Binary file (1.23 kB). View file
 
omni_speech/serve/__init__.py ADDED
File without changes
omni_speech/serve/controller.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A controller manages distributed workers.
3
+ It sends worker addresses to clients.
4
+ """
5
+ import argparse
6
+ import asyncio
7
+ import dataclasses
8
+ from enum import Enum, auto
9
+ import json
10
+ import logging
11
+ import time
12
+ from typing import List, Union
13
+ import threading
14
+
15
+ from fastapi import FastAPI, Request
16
+ from fastapi.responses import StreamingResponse
17
+ import numpy as np
18
+ import requests
19
+ import uvicorn
20
+
21
+ from omni_speech.constants import CONTROLLER_HEART_BEAT_EXPIRATION
22
+ from omni_speech.utils import build_logger, server_error_msg
23
+
24
+
25
+ logger = build_logger("controller", "controller.log")
26
+
27
+
28
class DispatchMethod(Enum):
    """Strategy for routing an incoming request to a registered worker."""
    LOTTERY = auto()
    SHORTEST_QUEUE = auto()

    @classmethod
    def from_str(cls, name):
        """Parse a CLI-style name ("lottery" / "shortest_queue") into a member.

        Raises:
            ValueError: if *name* is not a recognized dispatch method.
        """
        if name == "lottery":
            return cls.LOTTERY
        elif name == "shortest_queue":
            return cls.SHORTEST_QUEUE
        else:
            # FIX: the original f-string had no placeholder, so the message
            # never said which value was rejected.
            raise ValueError(f"Invalid dispatch method: {name}")
40
+
41
+
42
@dataclasses.dataclass
class WorkerInfo:
    """Controller-side record for one registered worker.

    Fields:
        model_names: models this worker can serve.
        speed: relative throughput weight used by lottery dispatch.
        queue_length: last reported number of queued requests.
        check_heart_beat: whether the worker expires on missed heartbeats.
        last_heart_beat: unix timestamp of the last heartbeat.
    """
    model_names: List[str]
    speed: int
    queue_length: int
    check_heart_beat: bool
    # FIX: annotated `str` in the original, but it is always assigned the
    # float returned by time.time().
    last_heart_beat: float
49
+
50
+
51
def heart_beat_controller(controller):
    """Daemon loop: wake up once per expiration window and drop workers
    whose last heartbeat is older than CONTROLLER_HEART_BEAT_EXPIRATION."""
    while True:
        time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION)
        controller.remove_stable_workers_by_expiration()
55
+
56
+
57
class Controller:
    """Registry of model workers plus the request-dispatch logic.

    Workers register themselves over HTTP, send periodic heartbeats, and are
    expired by a background daemon thread when heartbeats go stale.
    """

    def __init__(self, dispatch_method: str):
        # Dict[str -> WorkerInfo], keyed by worker base URL.
        self.worker_info = {}
        self.dispatch_method = DispatchMethod.from_str(dispatch_method)

        # Daemon thread that expires workers with stale heartbeats.
        self.heart_beat_thread = threading.Thread(
            target=heart_beat_controller, args=(self,), daemon=True)
        self.heart_beat_thread.start()

        logger.info("Init controller")

    def register_worker(self, worker_name: str, check_heart_beat: bool,
                        worker_status: dict):
        """Add (or refresh) a worker.

        Returns False when no status was supplied and the worker cannot be
        reached for one, True on success.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Register a new worker: {worker_name}")
        else:
            logger.info(f"Register an existing worker: {worker_name}")

        if not worker_status:
            worker_status = self.get_worker_status(worker_name)
            if not worker_status:
                return False

        self.worker_info[worker_name] = WorkerInfo(
            worker_status["model_names"], worker_status["speed"], worker_status["queue_length"],
            check_heart_beat, time.time())

        logger.info(f"Register done: {worker_name}, {worker_status}")
        return True

    def get_worker_status(self, worker_name: str):
        """POST /worker_get_status on the worker; returns its JSON status dict
        or None on any network/HTTP failure."""
        try:
            r = requests.post(worker_name + "/worker_get_status", timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error(f"Get status fails: {worker_name}, {e}")
            return None

        if r.status_code != 200:
            logger.error(f"Get status fails: {worker_name}, {r}")
            return None

        return r.json()

    def remove_worker(self, worker_name: str):
        """Forget a worker entirely."""
        del self.worker_info[worker_name]

    def refresh_all_workers(self):
        """Re-register every known worker, dropping those that no longer respond."""
        old_info = dict(self.worker_info)
        self.worker_info = {}

        for w_name, w_info in old_info.items():
            if not self.register_worker(w_name, w_info.check_heart_beat, None):
                logger.info(f"Remove stale worker: {w_name}")

    def list_models(self):
        """Union of model names served by all registered workers."""
        model_names = set()
        for w_name, w_info in self.worker_info.items():
            model_names.update(w_info.model_names)
        return list(model_names)

    def get_worker_address(self, model_name: str):
        """Pick a worker serving *model_name*; returns "" when none qualifies."""
        if self.dispatch_method == DispatchMethod.LOTTERY:
            worker_names = []
            worker_speeds = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_speeds.append(w_info.speed)
            worker_speeds = np.array(worker_speeds, dtype=np.float32)
            norm = np.sum(worker_speeds)
            if norm < 1e-4:
                return ""
            worker_speeds = worker_speeds / norm
            # Speed-weighted random choice, returned without a liveness probe.
            # FIX: the original's probing retry loop sat after an
            # unconditional `if True: ... return` and was dead code; removed.
            pt = np.random.choice(np.arange(len(worker_names)), p=worker_speeds)
            return worker_names[pt]
        elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE:
            worker_names = []
            worker_qlen = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_qlen.append(w_info.queue_length / w_info.speed)
            if len(worker_names) == 0:
                return ""
            min_index = np.argmin(worker_qlen)
            w_name = worker_names[min_index]
            # Optimistically bump the queue; the next heartbeat corrects it.
            self.worker_info[w_name].queue_length += 1
            logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}")
            return w_name
        else:
            raise ValueError(f"Invalid dispatch method: {self.dispatch_method}")

    def receive_heart_beat(self, worker_name: str, queue_length: int):
        """Record a heartbeat; returns False when the worker is unknown
        (the worker should then re-register)."""
        if worker_name not in self.worker_info:
            logger.info(f"Receive unknown heart beat. {worker_name}")
            return False

        self.worker_info[worker_name].queue_length = queue_length
        self.worker_info[worker_name].last_heart_beat = time.time()
        logger.info(f"Receive heart beat. {worker_name}")
        return True

    def remove_stable_workers_by_expiration(self):
        # NOTE(review): "stable" is likely a typo for "stale"; the name is
        # kept because heart_beat_controller() calls it.
        expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION
        to_delete = []
        for worker_name, w_info in self.worker_info.items():
            if w_info.check_heart_beat and w_info.last_heart_beat < expire:
                to_delete.append(worker_name)

        for worker_name in to_delete:
            self.remove_worker(worker_name)

    def worker_api_generate_stream(self, params):
        """Proxy a streaming generate request to a worker.

        Yields b"\\0"-delimited JSON chunks; error_code 2 = no worker,
        error_code 3 = worker unreachable/timeout.
        """
        worker_addr = self.get_worker_address(params["model"])
        if not worker_addr:
            logger.info(f"no worker: {params['model']}")
            ret = {
                "text": server_error_msg,
                "error_code": 2,
            }
            yield json.dumps(ret).encode() + b"\0"
            # FIX: the original fell through after the error chunk and tried
            # to POST to an empty address.
            return

        try:
            response = requests.post(worker_addr + "/worker_generate_stream",
                                     json=params, stream=True, timeout=5)
            for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
                if chunk:
                    yield chunk + b"\0"
        except requests.exceptions.RequestException as e:
            logger.info(f"worker timeout: {worker_addr}")
            ret = {
                "text": server_error_msg,
                "error_code": 3,
            }
            yield json.dumps(ret).encode() + b"\0"

    # Let the controller act as a worker to achieve hierarchical
    # management. This can be used to connect isolated sub networks.
    def worker_api_get_status(self):
        model_names = set()
        speed = 0
        queue_length = 0

        for w_name in self.worker_info:
            worker_status = self.get_worker_status(w_name)
            if worker_status is not None:
                model_names.update(worker_status["model_names"])
                speed += worker_status["speed"]
                queue_length += worker_status["queue_length"]

        return {
            "model_names": list(model_names),
            "speed": speed,
            "queue_length": queue_length,
        }
237
+
238
+
239
app = FastAPI()


@app.post("/register_worker")
async def register_worker(request: Request):
    """Worker registration endpoint; body carries name, heartbeat flag, status."""
    payload = await request.json()
    controller.register_worker(
        payload["worker_name"], payload["check_heart_beat"],
        payload.get("worker_status", None))


@app.post("/refresh_all_workers")
async def refresh_all_workers():
    """Force the controller to re-probe every registered worker."""
    models = controller.refresh_all_workers()


@app.post("/list_models")
async def list_models():
    """All model names currently served by some worker."""
    models = controller.list_models()
    return {"models": models}


@app.post("/get_worker_address")
async def get_worker_address(request: Request):
    """Dispatch: resolve a model name to a worker address ("" when none)."""
    payload = await request.json()
    addr = controller.get_worker_address(payload["model"])
    return {"address": addr}


@app.post("/receive_heart_beat")
async def receive_heart_beat(request: Request):
    """Heartbeat sink; replies whether the worker is still known."""
    payload = await request.json()
    exist = controller.receive_heart_beat(
        payload["worker_name"], payload["queue_length"])
    return {"exist": exist}


@app.post("/worker_generate_stream")
async def worker_api_generate_stream(request: Request):
    """Stream generation through the controller (hierarchical mode)."""
    params = await request.json()
    generator = controller.worker_api_generate_stream(params)
    return StreamingResponse(generator)


@app.post("/worker_get_status")
async def worker_api_get_status(request: Request):
    """Aggregate status so the controller can pose as a worker."""
    return controller.worker_api_get_status()
286
+
287
+
288
if __name__ == "__main__":
    # CLI entry point: parse flags, build the controller, serve the API.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21001)
    parser.add_argument("--dispatch-method", type=str,
                        choices=["lottery", "shortest_queue"],
                        default="shortest_queue")
    args = parser.parse_args()
    logger.info(f"args: {args}")

    controller = Controller(args.dispatch_method)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
omni_speech/serve/gradio_web_server.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import datetime
3
+ import json
4
+ import os
5
+ import time
6
+ import torch
7
+ import torchaudio
8
+
9
+ import gradio as gr
10
+ import numpy as np
11
+ import requests
12
+ import soundfile as sf
13
+
14
+ from omni_speech.conversation import default_conversation, conv_templates
15
+ from omni_speech.constants import LOGDIR
16
+ from omni_speech.utils import build_logger, server_error_msg
17
+ from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder
18
+
19
+
20
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
21
+
22
+ vocoder = None
23
+
24
+ headers = {"User-Agent": "LLaMA-Omni Client"}
25
+
26
+ no_change_btn = gr.Button()
27
+ enable_btn = gr.Button(interactive=True)
28
+ disable_btn = gr.Button(interactive=False)
29
+
30
+
31
def get_conv_log_filename():
    """Path of today's conversation log file under LOGDIR (YYYY-MM-DD-conv.json)."""
    today = datetime.datetime.now()
    return os.path.join(
        LOGDIR, f"{today.year}-{today.month:02d}-{today.day:02d}-conv.json")
35
+
36
+
37
def get_model_list():
    """Ask the controller to refresh its worker pool, then fetch the model list."""
    refresh = requests.post(args.controller_url + "/refresh_all_workers")
    assert refresh.status_code == 200
    listing = requests.post(args.controller_url + "/list_models")
    models = listing.json()["models"]
    logger.info(f"Models: {models}")
    return models
44
+
45
+
46
+ get_window_url_params = """
47
+ function() {
48
+ const params = new URLSearchParams(window.location.search);
49
+ url_params = Object.fromEntries(params);
50
+ console.log(url_params);
51
+ return url_params;
52
+ }
53
+ """
54
+
55
+
56
def load_demo(url_params, request: gr.Request):
    """Initial page load: preselect the model named in the URL query, if any."""
    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")

    dropdown_update = gr.Dropdown(visible=True)
    if "model" in url_params:
        requested = url_params["model"]
        if requested in models:
            dropdown_update = gr.Dropdown(value=requested, visible=True)

    return default_conversation.copy(), dropdown_update
67
+
68
+
69
def load_demo_refresh_model_list(request: gr.Request):
    """Page load in "reload" mode: re-query the controller for live models."""
    logger.info(f"load_demo. ip: {request.client.host}")
    models = get_model_list()
    dropdown_update = gr.Dropdown(
        choices=models,
        value=models[0] if len(models) > 0 else ""
    )
    return default_conversation.copy(), dropdown_update
78
+
79
+
80
def clear_history(request: gr.Request):
    """Reset conversation state and blank every input/output widget."""
    logger.info(f"clear_history. ip: {request.client.host}")
    fresh_state = default_conversation.copy()
    return (fresh_state, None, "", "", None)
84
+
85
+
86
def add_speech(state, speech, request: gr.Request):
    """Start a fresh conversation whose first user turn is (prompt, audio)."""
    # The text side of the turn is a fixed instruction prefixed with the
    # <speech> placeholder token.
    instruction = "Please directly answer the questions in the user's speech."
    turn = ('<speech>\n' + instruction, speech)
    state = default_conversation.copy()
    state.append_message(state.roles[0], turn)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    return state
95
+
96
+
97
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, chunk_size, request: gr.Request):
    """Send the conversation to a worker and stream back (text, units, audio).

    Generator for the Gradio event: yields (state, text_output, unit_output,
    audio_output) tuples as chunks arrive; audio is vocoded from discrete
    units in batches of at least *chunk_size*.
    """
    logger.info(f"http_bot. ip: {request.client.host}")
    start_tstamp = time.time()
    model_name = model_selector

    if state.skip_next:
        # This generate call is skipped due to invalid inputs
        yield (state, "", "", None)
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation: re-home messages onto the llama_3 template.
        template_name = "llama_3"
        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        state = new_state

    # Query worker address
    controller_url = args.controller_url
    ret = requests.post(controller_url + "/get_worker_address",
                        json={"model": model_name})
    worker_addr = ret.json()["address"]
    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")

    # No available worker
    if worker_addr == "":
        state.messages[-1][-1] = server_error_msg
        yield (state, "", "", None)
        return

    # Construct prompt
    prompt = state.get_prompt()

    # Resample the recorded audio to 16 kHz and scale int16 range to [-1, 1].
    sr, audio = state.messages[0][1][1]
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
    audio = torch.tensor(audio.astype(np.float32)).unsqueeze(0)
    audio = resampler(audio).squeeze(0).numpy()
    audio /= 32768.0
    audio = audio.tolist()

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": min(int(max_new_tokens), 1500),
        "stop": state.sep2,
        "audio": audio,
    }

    yield (state, "", "", None)

    # FIX: initialize before the try block. The original left these unbound
    # when the stream produced no successful chunk, raising NameError in the
    # post-loop flush below. (Also removed an unused `cur_dir` local.)
    output = ""
    output_unit = []
    num_generated_units = 0
    wav_list = []

    try:
        # Stream output
        response = requests.post(worker_addr + "/worker_generate_stream",
                                 headers=headers, json=pload, stream=True, timeout=10)
        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode())
                if data["error_code"] == 0:
                    output = data["text"][len(prompt):].strip()
                    output_unit = list(map(int, data["unit"].strip().split()))
                    state.messages[-1][-1] = (output, data["unit"].strip())

                    # vocoder: only synthesize once enough new units accumulated
                    new_units = output_unit[num_generated_units:]
                    if len(new_units) >= chunk_size:
                        num_generated_units = len(output_unit)
                        x = {"code": torch.LongTensor(new_units).view(1, -1).cuda()}
                        wav = vocoder(x, True)
                        wav_list.append(wav.detach().cpu().numpy())

                    if len(wav_list) > 0:
                        wav_full = np.concatenate(wav_list)
                        return_value = (16000, wav_full)
                    else:
                        return_value = None

                    yield (state, state.messages[-1][-1][0], state.messages[-1][-1][1], return_value)
                else:
                    output = data["text"] + f" (error_code: {data['error_code']})"
                    state.messages[-1][-1] = output
                    yield (state, "", "", None)
                    return
                time.sleep(0.03)
    except requests.exceptions.RequestException as e:
        state.messages[-1][-1] = server_error_msg
        yield (state, "", "", None)
        return

    # Flush any trailing units (fewer than chunk_size) through the vocoder.
    if num_generated_units < len(output_unit):
        new_units = output_unit[num_generated_units:]
        num_generated_units = len(output_unit)
        x = {
            "code": torch.LongTensor(new_units).view(1, -1).cuda()
        }
        wav = vocoder(x, True)
        wav_list.append(wav.detach().cpu().numpy())

    if len(wav_list) > 0:
        wav_full = np.concatenate(wav_list)
        return_value = (16000, wav_full)
    else:
        return_value = None

    yield (state, state.messages[-1][-1][0], state.messages[-1][-1][1], return_value)

    finish_tstamp = time.time()
    logger.info(f"{output}")
    logger.info(f"{output_unit}")
212
+
213
+
214
+ title_markdown = ("""
215
+ # 🎧 LLaMA-Omni: Seamless Speech Interaction with Large Language Models
216
+ """)
217
+
218
+ block_css = """
219
+
220
+ #buttons button {
221
+ min-width: min(120px,100%);
222
+ }
223
+
224
+ """
225
+
226
def build_demo(embed_mode, vocoder, cur_dir=None, concurrency_count=10):
    """Assemble the Gradio Blocks UI and wire up its event handlers.

    Note: the `vocoder` parameter shadows the module-level global of the same
    name; the handlers themselves use the global.
    """
    with gr.Blocks(title="LLaMA-Omni Speech Chatbot", theme=gr.themes.Default(), css=block_css) as demo:
        state = gr.State()

        if not embed_mode:
            gr.Markdown(title_markdown)

        # Model picker row.
        with gr.Row(elem_id="model_selector_row"):
            model_selector = gr.Dropdown(
                choices=models,
                value=models[0] if len(models) > 0 else "",
                interactive=True,
                show_label=False,
                container=False)

        # Speech input plus sampling parameters.
        with gr.Row():
            audio_input_box = gr.Audio(sources=["upload", "microphone"], label="Speech Input")
            with gr.Accordion("Parameters", open=True) as parameter_row:
                temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True, label="Temperature",)
                top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
                max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max Output Tokens",)
                chunk_size = gr.Slider(minimum=10, maximum=500, value=40, step=10, interactive=True, label="Chunk Size",)

        if cur_dir is None:
            cur_dir = os.path.dirname(os.path.abspath(__file__))
        gr.Examples(examples=[
            [f"{cur_dir}/examples/vicuna_1.wav"],
            [f"{cur_dir}/examples/vicuna_2.wav"],
            [f"{cur_dir}/examples/vicuna_3.wav"],
            [f"{cur_dir}/examples/vicuna_4.wav"],
            [f"{cur_dir}/examples/vicuna_5.wav"],
            [f"{cur_dir}/examples/helpful_base_1.wav"],
            [f"{cur_dir}/examples/helpful_base_2.wav"],
            [f"{cur_dir}/examples/helpful_base_3.wav"],
            [f"{cur_dir}/examples/helpful_base_4.wav"],
            [f"{cur_dir}/examples/helpful_base_5.wav"],
        ], inputs=[audio_input_box])

        with gr.Row():
            submit_btn = gr.Button(value="Send", variant="primary")
            clear_btn = gr.Button(value="Clear")

        text_output_box = gr.Textbox(label="Text Output", type="text")
        unit_output_box = gr.Textbox(label="Unit Output", type="text")
        audio_output_box = gr.Audio(label="Speech Output")

        url_params = gr.JSON(visible=False)

        # Send: record the speech turn, then stream the model's reply.
        submit_btn.click(
            add_speech,
            [state, audio_input_box],
            [state]
        ).then(
            http_bot,
            [state, model_selector, temperature, top_p, max_output_tokens, chunk_size],
            [state, text_output_box, unit_output_box, audio_output_box],
            concurrency_limit=concurrency_count
        )

        clear_btn.click(
            clear_history,
            None,
            [state, audio_input_box, text_output_box, unit_output_box, audio_output_box],
            queue=False
        )

        # Populate the model dropdown either once (from URL params) or on
        # every page load, depending on --model-list-mode.
        if args.model_list_mode == "once":
            demo.load(
                load_demo,
                [url_params],
                [state, model_selector],
                js=get_window_url_params
            )
        elif args.model_list_mode == "reload":
            demo.load(
                load_demo_refresh_model_list,
                None,
                [state, model_selector],
                queue=False
            )
        else:
            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")

    return demo
310
+
311
+
312
def build_vocoder(args):
    """Load the CodeHiFiGAN vocoder onto the GPU; no-op when --vocoder is unset."""
    global vocoder
    if args.vocoder is None:
        return None
    with open(args.vocoder_cfg) as f:
        cfg = json.load(f)
    vocoder = CodeHiFiGANVocoder(args.vocoder, cfg).cuda()
319
+
320
+
321
if __name__ == "__main__":
    # CLI entry point: parse flags, fetch models, load vocoder, launch UI.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int)
    parser.add_argument("--controller-url", type=str, default="http://localhost:21001")
    parser.add_argument("--concurrency-count", type=int, default=16)
    parser.add_argument("--model-list-mode", type=str, default="once",
                        choices=["once", "reload"])
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--moderate", action="store_true")
    parser.add_argument("--embed", action="store_true")
    parser.add_argument("--vocoder", type=str)
    parser.add_argument("--vocoder-cfg", type=str)
    args = parser.parse_args()
    logger.info(f"args: {args}")

    models = get_model_list()
    build_vocoder(args)

    logger.info(args)
    demo = build_demo(args.embed, vocoder, concurrency_count=args.concurrency_count)
    demo.queue(api_open=False).launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
    )
omni_speech/serve/model_worker.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import argparse
5
+ import asyncio
6
+ import json
7
+ import time
8
+ import threading
9
+ import uuid
10
+
11
+ from fastapi import FastAPI, Request, BackgroundTasks
12
+ from fastapi.responses import StreamingResponse
13
+ import requests
14
+ import torch
15
+ import uvicorn
16
+ import whisper
17
+ import numpy as np
18
+ from functools import partial
19
+
20
+ from transformers import PreTrainedTokenizer
21
+
22
+ from omni_speech.constants import WORKER_HEART_BEAT_INTERVAL
23
+ from omni_speech.utils import (build_logger, server_error_msg,
24
+ pretty_print_semaphore)
25
+ from omni_speech.model.builder import load_pretrained_model
26
+ from omni_speech.constants import SPEECH_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
27
+ from omni_speech.datasets.preprocess import tokenizer_speech_token
28
+ from transformers import TextIteratorStreamer
29
+ from threading import Thread
30
+
31
+
32
+ GB = 1 << 30
33
+
34
+ worker_id = str(uuid.uuid4())[:6]
35
+ logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
36
+ global_counter = 0
37
+
38
+ model_semaphore = None
39
+
40
+
41
def heart_beat_worker(controller):
    """Daemon loop: report a heartbeat every WORKER_HEART_BEAT_INTERVAL seconds."""
    while True:
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
46
+
47
+
48
def load_speech(audio, input_type, mel_size, speech_normalize):
    """Convert a JSON-decoded float list into the model's speech input.

    "raw": float32 waveform tensor, optionally layer-normalized over its
    full shape; "mel": Whisper log-mel features transposed to (frames, mel).
    An unknown input_type returns the raw numpy array unchanged.
    """
    speech = np.array(audio, dtype=np.float32)
    if input_type == "mel":
        speech = whisper.pad_or_trim(speech)
        speech = whisper.log_mel_spectrogram(speech, n_mels=mel_size).permute(1, 0)
    elif input_type == "raw":
        speech = torch.from_numpy(speech)
        if speech_normalize:
            speech = torch.nn.functional.layer_norm(speech, speech.shape)
    return speech
58
+
59
+
60
def build_unit_tokenizer(vocab_size):
    """Build a BertTokenizer whose vocabulary is the unit ids 0..vocab_size.

    Writes a temporary vocab file, constructs the tokenizer, then deletes it.
    """
    import os
    from transformers import BertTokenizer
    vocab_path = "unit_vocab.txt"
    with open(vocab_path, "w") as f:
        f.write("\n".join(str(i) for i in range(vocab_size + 1)) + "\n")
    tokenizer = BertTokenizer(vocab_file=vocab_path)
    os.remove(vocab_path)
    return tokenizer
69
+
70
+
71
+ class ModelWorker:
72
+ def __init__(self, controller_addr, worker_addr,
73
+ worker_id, no_register,
74
+ model_path, model_base, model_name,
75
+ load_8bit, load_4bit, device, input_type, mel_size, s2s, is_lora, use_flash_attn=False):
76
+ self.controller_addr = controller_addr
77
+ self.worker_addr = worker_addr
78
+ self.worker_id = worker_id
79
+ self.device = device
80
+ self.model_name = model_name
81
+ self.input_type = input_type
82
+ self.mel_size = mel_size
83
+ self.tokenizer, self.model, self.context_len = load_pretrained_model(
84
+ model_path, model_base, is_lora=is_lora, s2s=s2s, load_8bit=load_8bit, load_4bit=load_4bit, device=self.device, use_flash_attn=use_flash_attn)
85
+ self.unit_tokenizer = build_unit_tokenizer(self.model.config.unit_vocab_size)
86
+
87
+ if not no_register:
88
+ self.register_to_controller()
89
+ self.heart_beat_thread = threading.Thread(
90
+ target=heart_beat_worker, args=(self,), daemon=True)
91
+ self.heart_beat_thread.start()
92
+
93
+ def register_to_controller(self):
94
+ logger.info("Register to controller")
95
+
96
+ url = self.controller_addr + "/register_worker"
97
+ data = {
98
+ "worker_name": self.worker_addr,
99
+ "check_heart_beat": True,
100
+ "worker_status": self.get_status()
101
+ }
102
+ r = requests.post(url, json=data)
103
+ assert r.status_code == 200
104
+
105
+ def send_heart_beat(self):
106
+ logger.info(f"Send heart beat. Models: {[self.model_name]}. "
107
+ f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
108
+ f"global_counter: {global_counter}")
109
+
110
+ url = self.controller_addr + "/receive_heart_beat"
111
+
112
+ while True:
113
+ try:
114
+ ret = requests.post(url, json={
115
+ "worker_name": self.worker_addr,
116
+ "queue_length": self.get_queue_length()}, timeout=5)
117
+ exist = ret.json()["exist"]
118
+ break
119
+ except requests.exceptions.RequestException as e:
120
+ logger.error(f"heart beat error: {e}")
121
+ time.sleep(5)
122
+
123
+ if not exist:
124
+ self.register_to_controller()
125
+
126
+ def get_queue_length(self):
127
+ if model_semaphore is None:
128
+ return 0
129
+ else:
130
+ return args.limit_model_concurrency - model_semaphore._value + (len(
131
+ model_semaphore._waiters) if model_semaphore._waiters is not None else 0)
132
+
133
+ def get_status(self):
134
+ return {
135
+ "model_names": [self.model_name],
136
+ "speed": 1,
137
+ "queue_length": self.get_queue_length(),
138
+ }
139
+
140
@torch.inference_mode()
def generate_stream(self, params):
    """Stream generation results for one request.

    params keys: "prompt" (str, required), and optionally "audio",
    "temperature", "top_p", "max_new_tokens", "stop".
    Yields null-terminated JSON chunks: {"text", "unit", "error_code"}.
    """
    tokenizer, model = self.tokenizer, self.model

    prompt = params["prompt"]
    ori_prompt = prompt
    audio = params.get("audio", None)
    if audio is not None and len(audio) > 0:
        # Featurize the audio and move it onto the model device.
        speech = load_speech(audio, self.input_type, self.mel_size, self.model.config.speech_normalize)
        speech_length = torch.LongTensor([speech.shape[0]]).unsqueeze(0).to(self.device)
        speech_tensor = speech.unsqueeze(0).to(self.device, dtype=torch.float16)
        speech_args = {"speech": speech_tensor, "speech_lengths": speech_length}
    else:
        speech_args = {}

    temperature = float(params.get("temperature", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
    max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
    stop_str = params.get("stop", None)
    # Near-zero temperature is treated as greedy decoding.
    do_sample = True if temperature > 0.001 else False

    input_ids = tokenizer_speech_token(prompt, tokenizer, return_tensors='pt').unsqueeze(0).to(self.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
    streamer_unit = TextIteratorStreamer(self.unit_tokenizer, skip_prompt=False, skip_special_tokens=True, timeout=15)

    # max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

    if max_new_tokens < 1:
        yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
        return

    # Run generation in a background thread; the streamers feed this generator.
    thread = Thread(target=model.generate, kwargs=dict(
        inputs=input_ids,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
        streamer_unit=streamer_unit,
        streaming_unit_gen=True,
        use_cache=True,
        **speech_args
    ))
    thread.start()

    generated_text = ori_prompt
    for new_text in streamer:
        generated_text += new_text
        generated_unit = " ".join(map(str, streamer_unit.token_cache))
        # BUG FIX: the original called endswith(stop_str) unconditionally, which
        # raises TypeError whenever the client omits "stop" (stop_str is None).
        if stop_str and generated_text.endswith(stop_str):
            generated_text = generated_text[:-len(stop_str)]
        yield json.dumps({"text": generated_text, "unit": generated_unit, "error_code": 0}).encode() + b"\0"
194
+
195
def generate_stream_gate(self, params):
    """Wrap generate_stream so any failure surfaces as a JSON error chunk."""

    def _error_chunk():
        return json.dumps({"text": server_error_msg, "error_code": 1}).encode() + b"\0"

    try:
        yield from self.generate_stream(params)
    except ValueError as e:
        print("Caught ValueError:", e)
        yield _error_chunk()
    except torch.cuda.CudaError as e:
        print("Caught torch.cuda.CudaError:", e)
        yield _error_chunk()
    except Exception as e:
        print("Caught Unknown Error", e)
        yield _error_chunk()
220
+
221
+
222
+ app = FastAPI()
223
+
224
+
225
def release_model_semaphore(fn=None):
    """Free one concurrency slot; then run the optional callback (heartbeat)."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
229
+
230
+
231
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """HTTP entry point: throttle with the global semaphore, then stream."""
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    # Lazily create the semaphore so it binds to the running event loop.
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    worker.send_heart_beat()

    background_tasks = BackgroundTasks()
    # Release the slot (and heartbeat again) after the response finishes streaming.
    background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat))
    return StreamingResponse(worker.generate_stream_gate(params), background=background_tasks)
245
+
246
+
247
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Expose the worker's model names / speed / queue length to the controller."""
    return worker.get_status()
250
+
251
+
252
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Network endpoints: where this worker serves, and the controller it registers with.
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str,
                        default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str,
                        default="http://localhost:21001")
    # Model loading options (path/base/name, device, quantization, attention impl).
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    parser.add_argument("--load-8bit", action="store_true")
    parser.add_argument("--load-4bit", action="store_true")
    parser.add_argument("--use-flash-attn", action="store_true")
    # Speech front-end: mel features of the given size, or raw waveform input.
    parser.add_argument("--input-type", type=str, default="mel")
    parser.add_argument("--mel-size", type=int, default=128)
    # s2s: speech-to-speech mode; is-lora: load LoRA adapter weights.
    parser.add_argument("--s2s", action="store_true", default=False)
    parser.add_argument("--is-lora", action="store_true", default=False)
    args = parser.parse_args()
    logger.info(f"args: {args}")

    # Build the worker (loads the model) and serve it with uvicorn.
    worker = ModelWorker(args.controller_address,
                         args.worker_address,
                         worker_id,
                         args.no_register,
                         args.model_path,
                         args.model_base,
                         args.model_name,
                         args.load_8bit,
                         args.load_4bit,
                         args.device,
                         args.input_type,
                         args.mel_size,
                         args.s2s,
                         args.is_lora,
                         use_flash_attn=args.use_flash_attn)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
omni_speech/train/__pycache__/omni_trainer.cpython-310.pyc ADDED
Binary file (10.1 kB). View file
 
omni_speech/train/__pycache__/omni_trainer.cpython-312.pyc ADDED
Binary file (13.2 kB). View file
 
omni_speech/train/__pycache__/run_train.cpython-310.pyc ADDED
Binary file (11.5 kB). View file
 
omni_speech/train/__pycache__/run_train.cpython-312.pyc ADDED
Binary file (22.3 kB). View file
 
omni_speech/train/__pycache__/run_train.cpython-38.pyc ADDED
Binary file (12.3 kB). View file
 
omni_speech/train/__pycache__/train.cpython-312.pyc ADDED
Binary file (18.9 kB). View file
 
omni_speech/train/__pycache__/train_mem.cpython-312.pyc ADDED
Binary file (348 Bytes). View file
 
omni_speech/train/__pycache__/train_multiturn.cpython-312.pyc ADDED
Binary file (25.4 kB). View file
 
omni_speech/train/__pycache__/train_raw.cpython-312.pyc ADDED
Binary file (19.9 kB). View file
 
omni_speech/train/__pycache__/train_test.cpython-312.pyc ADDED
Binary file (17.8 kB). View file
 
omni_speech/train/__pycache__/trainer.cpython-310.pyc ADDED
Binary file (7.29 kB). View file
 
omni_speech/train/__pycache__/trainer.cpython-312.pyc ADDED
Binary file (13.2 kB). View file
 
omni_speech/train/export.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
2
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
3
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import copy
19
+ from dataclasses import dataclass, field
20
+ import json
21
+ import logging
22
+ import pathlib
23
+ from typing import Dict, Optional, Sequence, List
24
+
25
+ import torch
26
+
27
+ import transformers
28
+ import tokenizers
29
+
30
+ from omni_speech.constants import IGNORE_INDEX, SPEECH_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
31
+ from torch.utils.data import Dataset
32
+ from omni_speech.train.omni_trainer import OmniTrainer
33
+ from audiomentations import AddBackgroundNoise, PolarityInversion
34
+
35
+ from omni_speech import conversation as conversation_lib
36
+ from omni_speech.model import *
37
+ from omni_speech.utils import *
38
+ from omni_speech.datasets.preprocess import *
39
+ import whisper
40
+ import time
41
+
42
@dataclass
class ModelArguments:
    """CLI options describing the model architecture and which parts to tune."""

    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    version: Optional[str] = field(default="llama_3")
    freeze_backbone: bool = field(default=False)
    tune_speech_projector: bool = field(default=False)
    tune_speech_encoder: bool = field(default=False)
    tune_speech_generator_only: bool = field(default=False)
    speech_encoder_type: Optional[str] = field(default=None)
    speech_encoder: Optional[str] = field(default=None)
    pretrain_speech_projector: Optional[str] = field(default=None)
    speech_projector_type: Optional[str] = field(default='linear')
    speech_generator_type: Optional[str] = field(default='ctc')
    # CTC speech-generator hyperparameters.
    ctc_decoder_config: str = field(default="(2,4096,32,11008)")
    ctc_upsample_factor: int = field(default=25)
    ctc_loss_weight: float = field(default=1.0)
    unit_vocab_size: int = field(default=1000)
    # Speech-encoder output geometry.
    speech_encoder_ds_rate: int = field(default=5)
    speech_encoder_hidden_size: int = field(default=1280)
61
+
62
+
63
@dataclass
class DataArguments:
    """CLI options for dataset paths and audio preprocessing."""

    data_path: str = field(default=None,
                           metadata={"help": "Path to the training data."})
    dev_path: str = field(default=None,
                          metadata={"help": "Path to the dev data."})
    is_multimodal: bool = False
    # "mel" -> log-mel spectrogram features; "raw" -> waveform tensor.
    input_type: str = field(default="mel")
    speech_normalize: bool = False
    mel_size: int = 128
    has_tgt_units: bool = False
    augment_prob: float = field(
        default=0.0,
        metadata={"help": "The probability of applying augmentation transform."}
    )
    augment_path: str = field(default=None,
                              metadata={"help": "Path to the augment data."})
80
+
81
+
82
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    # Extends HF TrainingArguments with OmniSpeech-specific knobs.
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    freeze_speech_projector: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # bitsandbytes quantization options (used when bits is 4 or 8).
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    # LoRA configuration (applied only when lora_enable is True).
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    # Optional separate learning rate for the speech projector.
    speech_projector_lr: Optional[float] = None
    group_by_modality_length: bool = field(default=False)
114
+
115
+
116
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Samples come from a JSON list; audio is loaded and featurized lazily on
    access, with optional background-noise augmentation. Failed reads are
    retried, then the next sample is tried as a fallback.
    """

    def __init__(self, data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        super(LazySupervisedDataset, self).__init__()
        list_data_dict = json.load(open(data_path, "r"))

        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args
        if self.data_args.augment_prob != 0.0:
            # Background-noise augmentation, applied with probability augment_prob.
            with open(self.data_args.augment_path, "r") as f:
                augment_path_list = f.read().splitlines()
            self.transform = AddBackgroundNoise(
                sounds_path=augment_path_list,
                min_snr_db=5.0,
                max_snr_db=30.0,
                noise_transform=PolarityInversion(),
                p=self.data_args.augment_prob
            )

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        # TODO: define number of retries somewhere else
        num_base_retries = 3

        # Try the requested sample first (transient cloud-disk errors).
        for attempt_idx in range(num_base_retries):
            try:
                return self._get_item(i)
            except Exception as e:
                # sleep 1s in case it is a cloud disk issue
                print(f"[Try #{attempt_idx}] Failed to fetch sample {i}. Exception:", e)
                time.sleep(1)

        # Fall back to the next sample, in case this file is corrupted.
        next_index = min(i + 1, len(self.list_data_dict) - 1)
        for attempt_idx in range(num_base_retries):
            try:
                return self._get_item(next_index)
            except Exception as e:
                print(f"[Try other #{attempt_idx}] Failed to fetch sample {next_index}. Exception:", e)

        # Final attempt on the original index; let the exception propagate.
        return self._get_item(i)

    def process_speech(self, speech_file):
        """Load one audio file and return (features, length) per data_args.input_type."""
        speech = whisper.load_audio(speech_file)
        if self.data_args.augment_prob != 0.0:
            speech = self.transform(speech, sample_rate=16000)
        if self.data_args.input_type == "raw":
            speech = torch.from_numpy(speech)
            # BUG FIX: was `self.model_config.data_args.speech_normalize` -- this
            # class has no `model_config` attribute, so the "raw" path always
            # raised AttributeError. The flag lives on self.data_args.
            if self.data_args.speech_normalize:
                speech = torch.nn.functional.layer_norm(speech, speech.shape)
        elif self.data_args.input_type == "mel":
            speech = whisper.pad_or_trim(speech)
            speech = whisper.log_mel_spectrogram(speech, n_mels=self.data_args.mel_size).permute(1, 0)
        speech_lengths = torch.LongTensor([speech.shape[0]])
        return speech, speech_lengths

    def _get_item(self, i) -> Dict[str, torch.Tensor]:
        """Build one training example: tokenized conversation plus optional speech/units."""
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
        for item in sources:
            # Inline any tool definitions as a leading pseudo-turn.
            if 'tools' in item:
                tools_dict = {
                    "from": "tools",
                    "value": item["tools"]
                }
                item["conversations"].insert(0, tools_dict)

        if self.data_args.has_tgt_units:
            tgt_units = [e["tgt_units"] for e in sources]
            tgt_units = torch.tensor(tgt_units, dtype=torch.long)
        else:
            tgt_units = None

        if 'speech' in sources[0]:
            speech_file = self.list_data_dict[i]['speech']
            # A sample may carry one audio file or a list of them.
            if type(speech_file) is list:
                speech = [self.process_speech(f) for f in speech_file]
            else:
                speech = [self.process_speech(speech_file)]

            sources = preprocess_multimodal(
                copy.deepcopy([e["conversations"] for e in sources]),
                self.data_args)
        else:
            sources = copy.deepcopy([e["conversations"] for e in sources])
        data_dict = preprocess(
            sources,
            self.tokenizer,
            has_speech=('speech' in self.list_data_dict[i]))
        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0],
                             labels=data_dict["labels"][0])

        # speech exist in the data
        if 'speech' in self.list_data_dict[i]:
            data_dict['speech'] = speech

        if tgt_units is not None:
            data_dict['tgt_units'] = tgt_units[0]

        data_dict["id"] = self.list_data_dict[i].get("id", i)

        return data_dict
239
+
240
+
241
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def pad_sequence(self, input_ids, batch_first, padding_value):
        # For left padding: reverse each sequence, right-pad, then reverse back.
        left_pad = self.tokenizer.padding_side == "left"
        if left_pad:
            input_ids = [torch.flip(seq, [0]) for seq in input_ids]
        padded = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=batch_first, padding_value=padding_value)
        if left_pad:
            padded = torch.flip(padded, [1])
        return padded

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        max_len = self.tokenizer.model_max_length
        input_ids = [inst["input_ids"][:max_len] for inst in instances]
        labels = [inst["labels"][:max_len] for inst in instances]

        if self.tokenizer.pad_token_id is None:
            # self.tokenizer.pad_token_id = self.tokenizer.eos_token_id  # FIXME: this could only be triggered for llama3 model.
            self.tokenizer.pad_token_id = 0  # This gets the best result. Don't know why.

        input_ids = self.pad_sequence(input_ids, batch_first=True,
                                      padding_value=self.tokenizer.pad_token_id)
        labels = self.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        if labels.dtype == torch.int32:
            labels = labels.long()
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        if 'speech' in instances[0]:
            # Each instance carries a list of (features, length) pairs; flatten
            # across the batch so the model sees one flat list of utterances.
            per_instance = [inst['speech'] for inst in instances]
            batch["speech"] = [pair[0] for sample in per_instance for pair in sample]
            batch['speech_lengths'] = [pair[1] for sample in per_instance for pair in sample]

        if 'tgt_units' in instances[0]:
            tgt_units = [inst['tgt_units'] for inst in instances]
            batch['tgt_units'] = self.pad_sequence(
                tgt_units, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        return batch
289
+
290
+
291
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    train_dataset = LazySupervisedDataset(tokenizer=tokenizer,
                                          data_path=data_args.data_path,
                                          data_args=data_args)
    # Dev set is optional; Trainer accepts eval_dataset=None.
    dev_dataset = None
    if data_args.dev_path is not None:
        dev_dataset = LazySupervisedDataset(tokenizer=tokenizer,
                                            data_path=data_args.dev_path,
                                            data_args=data_args)
    return dict(
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        data_collator=DataCollatorForSupervisedDataset(tokenizer=tokenizer),
    )
307
+
308
+
309
def train(attn_implementation="flash_attention_2"):
    """Assemble the OmniSpeech model, tokenizer and datasets, then save.

    NOTE(review): trainer.train() is commented out below, so as written this
    function only builds the model/trainer and saves weights -- presumably the
    intended "export" behavior of this file; confirm before re-enabling training.
    """
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # Optional 4/8-bit quantized loading via bitsandbytes.
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            device_map={"": training_args.device},
            load_in_4bit=training_args.bits == 4,
            load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["speech_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
            )
        ))

    # Pick the model class: 2S (speech-to-speech, with target units) vs text-only
    # output, crossed with the llama/qwen backbone family.
    if data_args.has_tgt_units:
        if model_args.version == "llama_3":
            model = OmniSpeech2SLlamaForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        elif model_args.version == "qwen":
            model = OmniSpeech2SQwen2ForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        else:
            raise ValueError("--currently only support llama or qwen model!")
    else:
        if model_args.version == "llama_3":
            model = OmniSpeechLlamaForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        elif model_args.version == "qwen":
            model = OmniSpeechQwen2ForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        else:
            raise ValueError("--currently only support llama or qwen model!")
    model.config.use_cache = False

    if model_args.freeze_backbone:
        model.model.requires_grad_(False)

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback for older transformers: force embedding outputs to require grad.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        model = get_peft_model(model, lora_config)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    model.resize_token_embeddings(len(tokenizer))
    model.config.max_length = training_args.model_max_length

    # Select the conversation template; fall back to llama_3.
    if model_args.version in conversation_lib.conv_templates:
        conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
    else:
        conversation_lib.default_conversation = conversation_lib.conv_templates["llama_3"]

    if model_args.speech_encoder is not None:
        model.get_model().initialize_speech_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )

        data_args.is_multimodal = True

        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_speech_projector = training_args.tune_speech_projector = model_args.tune_speech_projector

        model.config.speech_normalize = data_args.speech_normalize

        # Speech encoder is always frozen here.
        for p in model.get_speech_encoder().parameters():
            p.requires_grad = False

        # Projector-only tuning: freeze everything, then unfreeze the projector.
        if model_args.tune_speech_projector:
            model.requires_grad_(False)
            for p in model.get_speech_projector().parameters():
                p.requires_grad = True

        model.config.freeze_speech_projector = training_args.freeze_speech_projector
        if training_args.freeze_speech_projector:
            for p in model.get_speech_projector().parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().speech_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.speech_projector_lr = training_args.speech_projector_lr

    if data_args.has_tgt_units:
        model.initialize_speech_generator(model_args=model_args)

    # Dtype fix-ups for quantized training (LoRA layers, norms, embeddings).
    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    data_module = make_supervised_data_module(tokenizer=tokenizer,
                                              data_args=data_args)

    print("Training Layers:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            # NOTE(review): this prints param.grad, which is None before any
            # backward pass -- likely meant param.shape or just the name.
            print(name, param.grad)

    trainer = OmniTrainer(model=model,
                          tokenizer=tokenizer,
                          args=training_args,
                          **data_module)

    # if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
    #     trainer.train(resume_from_checkpoint=True)
    # else:
    #     trainer.train()
    # trainer.save_state()

    model.config.use_cache = True

    # Save: LoRA adapters + non-LoRA trainables separately, or full model.
    if training_args.lora_enable:
        state_dict = get_peft_state_maybe_zero_3(
            model.named_parameters(), training_args.lora_bias
        )
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
            model.named_parameters()
        )
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer,
                                       output_dir=training_args.output_dir)
508
+
509
+
510
# Script entry point.
if __name__ == "__main__":
    train()
512
+
omni_speech/train/omni_trainer.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from torch.utils.data import Sampler
6
+
7
+ from transformers import Trainer
8
+ from transformers.trainer import (
9
+ is_sagemaker_mp_enabled,
10
+ get_parameter_names,
11
+ has_length,
12
+ ALL_LAYERNORM_LAYERS,
13
+ logger,
14
+ )
15
+ from typing import List, Optional
16
+ from omni_speech.utils import *
17
+
18
+
19
def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `chunks` chunks of roughly equal lengths.
    """
    # When the indices don't divide evenly, fall back to simple striding.
    if len(indices) % num_chunks != 0:
        return [indices[i::num_chunks] for i in range(num_chunks)]

    per_chunk = len(indices) // num_chunks
    chunks = [[] for _ in range(num_chunks)]
    totals = [0] * num_chunks
    for idx in indices:
        # Greedy balancing: always feed the currently-lightest chunk.
        target = totals.index(min(totals))
        chunks[target].append(idx)
        totals[target] += lengths[idx]
        # A full chunk is removed from consideration.
        if len(chunks[target]) == per_chunk:
            totals[target] = float("inf")

    return chunks
39
+
40
+
41
def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    # Group sample indices by length while keeping the two modalities in
    # separate megabatches. Sign convention (from the mm_/lang_ split below):
    # positive lengths appear to mark multimodal samples, negative mark
    # language-only -- confirm against the dataset's modality_lengths.
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
        # all samples are in the same modality
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
    mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
    lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])

    # Length-group each modality independently, then cut into megabatches.
    mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
    lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
    megabatch_size = world_size * batch_size
    mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
    lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

    # The two (possibly partial) trailing megabatches are merged into one
    # leftover batch appended at the end.
    last_mm = mm_megabatches[-1]
    last_lang = lang_megabatches[-1]
    additional_batch = last_mm + last_lang
    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    # Shuffle megabatch order with the caller's generator for reproducibility.
    megabatch_indices = torch.randperm(len(megabatches), generator=generator)
    megabatches = [megabatches[i] for i in megabatch_indices]

    if len(additional_batch) > 0:
        megabatches.append(sorted(additional_batch))

    return [i for megabatch in megabatches for i in megabatch]
67
+
68
+
69
def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """Shuffle indices, then sort within each megabatch by descending length and
    balance the megabatch across `world_size` even chunks."""
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    shuffled = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size
    megabatches = [
        sorted(shuffled[start: start + megabatch_size].tolist(),
               key=lambda idx: lengths[idx], reverse=True)
        for start in range(0, len(lengths), megabatch_size)
    ]
    balanced = [split_to_even_chunks(mb, lengths, world_size) for mb in megabatches]
    return [i for mb in balanced for chunk in mb for i in chunk]
78
+
79
+
80
class LengthGroupedSampler(Sampler):
    r"""
    Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
    keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size = batch_size
        self.world_size = world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        return len(self.lengths)

    def __iter__(self):
        # Pick the grouping strategy, then yield the computed index order.
        grouping = (
            get_modality_length_grouped_indices
            if self.group_by_modality
            else get_length_grouped_indices
        )
        return iter(grouping(self.lengths, self.batch_size, self.world_size,
                             generator=self.generator))
112
+
113
+
114
class OmniTrainer(Trainer):
    """Trainer for OmniSpeech models.

    Extends the HF ``Trainer`` with:
      * modality/length-grouped batch sampling (``args.group_by_modality_length``),
      * an optional dedicated learning rate for the speech projector
        (``args.speech_projector_lr``), and
      * projector-only checkpointing when ``args.tune_speech_projector`` is set.
    """

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Return a modality-grouped sampler when requested, else the default one."""
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None

        if self.args.group_by_modality_length:
            lengths = self.train_dataset.modality_lengths
            return LengthGroupedSampler(
                self.args.train_batch_size,
                # Effective per-step sample count spans all ranks and accumulation steps.
                world_size=self.args.world_size * self.args.gradient_accumulation_steps,
                lengths=lengths,
                group_by_modality=True,
            )
        else:
            return super()._get_train_sampler()

    def create_optimizer(self):
        """
        Setup the optimizer.

        Mirrors ``Trainer.create_optimizer`` but, when ``args.speech_projector_lr``
        is set, places the speech-projector parameters in their own parameter
        groups that use that learning rate instead of the global one.
        """
        if is_sagemaker_mp_enabled():
            return super().create_optimizer()

        opt_model = self.model

        if self.optimizer is None:
            # Sets give O(1) membership tests below; the name lists can be large.
            decay_parameters = set(get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS))
            decay_parameters = {name for name in decay_parameters if "bias" not in name}
            if self.args.speech_projector_lr is not None:
                projector_parameters = {name for name, _ in opt_model.named_parameters() if "speech_projector" in name}
                # Four groups: (decay / no-decay) x (base LR / projector LR).
                optimizer_grouped_parameters = [
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                        "lr": self.args.speech_projector_lr,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                        "lr": self.args.speech_projector_lr,
                    },
                ]
            else:
                # Standard two-group split: weight-decayed vs. not.
                optimizer_grouped_parameters = [
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
                        ],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [
                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
                        ],
                        "weight_decay": 0.0,
                    },
                ]

            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                skipped = 0
                for module in opt_model.modules():
                    if isinstance(module, nn.Embedding):
                        # De-duplicate shared parameters by data pointer before counting.
                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                        logger.info(f"skipped {module}: {skipped/2**20}M params")
                        # Keep embedding weights in 32-bit optimizer state.
                        manager.register_module_override(module, "weight", {"optim_bits": 32})
                        logger.debug(f"bitsandbytes: will optimize {module} in fp32")
                logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer

    def _save_checkpoint(self, model, trial, metrics=None):
        """Checkpoint only the speech projector in projector-only training; else defer to Trainer."""
        if getattr(self.args, 'tune_speech_projector', False):
            from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

            run_dir = self._get_output_dir(trial=trial)
            output_dir = os.path.join(run_dir, checkpoint_folder)

            # Only save Adapter
            keys_to_match = ['speech_projector']

            weight_to_save = get_speech_projector_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)

            # Write from a single process (rank 0, or -1 when not distributed).
            if self.args.local_rank == 0 or self.args.local_rank == -1:
                self.model.config.save_pretrained(output_dir)
                torch.save(weight_to_save, os.path.join(output_dir, 'speech_projector.bin'))
        else:
            super(OmniTrainer, self)._save_checkpoint(model, trial, metrics)

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Skip full-model saves in projector-only mode (handled by _save_checkpoint)."""
        if getattr(self.args, 'tune_speech_projector', False):
            pass
        else:
            super(OmniTrainer, self)._save(output_dir, state_dict)
omni_speech/train/train.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
2
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
3
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import copy
19
+ from dataclasses import dataclass, field
20
+ import json
21
+ import logging
22
+ import pathlib
23
+ from typing import Dict, Optional, Sequence, List
24
+
25
+ import torch
26
+
27
+ import transformers
28
+ import tokenizers
29
+
30
+ from omni_speech.constants import IGNORE_INDEX, SPEECH_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
31
+ from torch.utils.data import Dataset
32
+ from omni_speech.train.omni_trainer import OmniTrainer
33
+
34
+ from omni_speech import conversation as conversation_lib
35
+ from omni_speech.model import *
36
+ from omni_speech.utils import *
37
+ from omni_speech.datasets.preprocess import *
38
+ import whisper
39
+
40
@dataclass
class ModelArguments:
    """Model checkpoint selection plus speech encoder/projector/generator configuration."""

    model_name_or_path: Optional[str] = "facebook/opt-125m"
    version: Optional[str] = "llama_3"          # model family / conversation template key
    freeze_backbone: bool = False
    tune_speech_projector: bool = False
    tune_speech_encoder: bool = False
    tune_speech_generator_only: bool = False
    speech_encoder_type: Optional[str] = None
    speech_encoder: Optional[str] = None        # path/name of the speech encoder weights
    pretrain_speech_projector: Optional[str] = None
    speech_projector_type: Optional[str] = 'linear'
    speech_generator_type: Optional[str] = 'ctc'
    # CTC speech-generator hyperparameters.
    ctc_decoder_config: str = "(2,4096,32,11008)"
    ctc_upsample_factor: int = 1
    ctc_loss_weight: float = 1.0
    unit_vocab_size: int = 1000
    # Encoder output downsampling rate and hidden width.
    speech_encoder_ds_rate: int = 5
    speech_encoder_hidden_size: int = 1280
59
+
60
+
61
@dataclass
class DataArguments:
    """Dataset paths and audio featurization options."""

    data_path: str = field(
        default=None,
        metadata={"help": "Path to the training data."})
    dev_path: str = field(
        default=None,
        metadata={"help": "Path to the dev data."})
    is_multimodal: bool = False       # set True by train() once speech modules are initialized
    input_type: str = "mel"           # "mel" (log-mel features) or "raw" (waveform)
    speech_normalize: bool = False    # layer-normalize raw waveforms
    mel_size: int = 128               # number of mel bins
    has_tgt_units: bool = False       # target discrete speech units present (speech-to-speech)
72
+
73
+
74
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    # Extends HF TrainingArguments with quantization, LoRA and speech-projector options.
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    freeze_speech_projector: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # bitsandbytes k-bit quantization settings (used when bits is 4 or 8).
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    # LoRA settings (only consulted when lora_enable is True).
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    # Optional dedicated LR for the speech projector (see OmniTrainer.create_optimizer).
    speech_projector_lr: Optional[float] = None
    # Use the modality/length-grouped batch sampler.
    group_by_modality_length: bool = field(default=False)
106
+
107
+
108
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Samples are stored as a JSON list; audio is loaded and featurized lazily
    in ``__getitem__`` so the corpus never has to fit in memory at once.
    """

    def __init__(self, data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        super(LazySupervisedDataset, self).__init__()
        list_data_dict = json.load(open(data_path, "r"))

        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
        if 'speech' in sources[0]:
            speech_file = self.list_data_dict[i]['speech']
            speech = whisper.load_audio(speech_file)

            if self.data_args.input_type == "raw":
                speech = torch.from_numpy(speech)
                # BUG FIX: was `self.model_config.data_args.speech_normalize`;
                # this class has no `model_config` attribute, which raised
                # AttributeError for every raw-waveform sample.
                if self.data_args.speech_normalize:
                    speech = torch.nn.functional.layer_norm(speech, speech.shape)
            elif self.data_args.input_type == "mel":
                speech = whisper.pad_or_trim(speech)
                speech = whisper.log_mel_spectrogram(speech, n_mels=self.data_args.mel_size).permute(1, 0)
            # Length in frames (mel) or samples (raw), recorded for the collator.
            speech_lengths = torch.LongTensor([speech.shape[0]])

            sources = preprocess_multimodal(
                copy.deepcopy([e["conversations"] for e in sources]),
                self.data_args)
        else:
            sources = copy.deepcopy([e["conversations"] for e in sources])
        data_dict = preprocess(
            sources,
            self.tokenizer,
            has_speech=('speech' in self.list_data_dict[i]))
        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0],
                             labels=data_dict["labels"][0])

        # speech exist in the data
        if 'speech' in self.list_data_dict[i]:
            data_dict['speech'] = speech
            data_dict['speech_lengths'] = speech_lengths
        return data_dict
162
+
163
+
164
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad and truncate text fields, attach speech features when present."""
        pad_id = self.tokenizer.pad_token_id
        max_len = self.tokenizer.model_max_length

        input_ids = torch.nn.utils.rnn.pad_sequence(
            [example["input_ids"] for example in instances],
            batch_first=True,
            padding_value=pad_id)
        labels = torch.nn.utils.rnn.pad_sequence(
            [example["labels"] for example in instances],
            batch_first=True,
            padding_value=IGNORE_INDEX)

        # Truncate to the tokenizer limit; the attention mask is derived from
        # the already-truncated ids.
        input_ids = input_ids[:, :max_len]
        labels = labels[:, :max_len]
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(pad_id),
        )

        if 'speech' in instances[0]:
            speech = [example['speech'] for example in instances]
            speech_lengths = [example['speech_lengths'] for example in instances]
            # Stack only when every clip shares one shape; otherwise pass lists through.
            uniform = all(s is not None and s.shape == speech[0].shape for s in speech)
            batch['speech'] = torch.stack(speech) if uniform else speech
            batch['speech_lengths'] = torch.stack(speech_lengths) if uniform else speech_lengths

        return batch
199
+
200
+
201
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    Returns a dict with ``train_dataset``, ``eval_dataset`` and
    ``data_collator`` keys, suitable for splatting into a ``Trainer``.

    BUG FIX: ``dev_dataset`` was only assigned inside the
    ``data_args.dev_path is not None`` branch but unconditionally referenced
    in the return, raising ``NameError`` whenever no dev set was configured.
    It now defaults to None (Trainer accepts a missing eval dataset).
    """
    train_dataset = LazySupervisedDataset(tokenizer=tokenizer,
                                          data_path=data_args.data_path,
                                          data_args=data_args)
    dev_dataset = None
    if data_args.dev_path is not None:
        dev_dataset = LazySupervisedDataset(tokenizer=tokenizer,
                                            data_path=data_args.dev_path,
                                            data_args=data_args)
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset,
                eval_dataset=dev_dataset,
                data_collator=data_collator)
215
+
216
+
217
def train(attn_implementation="flash_attention_2"):
    """Fine-tuning entry point: parse args, build model/tokenizer/data, train, save."""

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # Compute dtype for k-bit quantization: fp16 / bf16 / fp32 per CLI flags.
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # Extra from_pretrained kwargs when loading 4/8-bit via bitsandbytes.
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            device_map={"": training_args.device},
            load_in_4bit=training_args.bits == 4,
            load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["speech_projector"],  # keep the projector un-quantized
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
            )
        ))

    # Model class depends on (a) whether discrete speech target units exist
    # ("2S" = speech-to-speech variants) and (b) the LLM family.
    if data_args.has_tgt_units:
        if model_args.version == "llama_3":
            model = OmniSpeech2SLlamaForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        elif model_args.version == "qwen":
            model = OmniSpeech2SQwen2ForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        else:
            raise ValueError("--currently only support llama or qwen model!")
    else:
        if model_args.version == "llama_3":
            model = OmniSpeechLlamaForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        elif model_args.version == "qwen":
            model = OmniSpeechQwen2ForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        else:
            raise ValueError("--currently only support llama or qwen model!")
    # KV caching is useless during training and conflicts with checkpointing.
    model.config.use_cache = False

    if model_args.freeze_backbone:
        model.model.requires_grad_(False)

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback: force grads through the embedding output so checkpointing works.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        model = get_peft_model(model, lora_config)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    model.resize_token_embeddings(len(tokenizer))
    model.config.max_length = training_args.model_max_length

    # Select the conversation template; fall back to llama_3.
    if model_args.version in conversation_lib.conv_templates:
        conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
    else:
        conversation_lib.default_conversation = conversation_lib.conv_templates["llama_3"]

    if model_args.speech_encoder is not None:
        model.get_model().initialize_speech_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )

        data_args.is_multimodal = True

        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        # Mirror the projector-tuning flag onto both training args and model config.
        model.config.tune_speech_projector = training_args.tune_speech_projector = model_args.tune_speech_projector

        model.config.speech_normalize = data_args.speech_normalize

        # The speech encoder always stays frozen in this script.
        for p in model.get_speech_encoder().parameters():
            p.requires_grad = False

        if model_args.tune_speech_projector:
            # Projector-only training: freeze everything, re-enable the projector.
            model.requires_grad_(False)
            for p in model.get_speech_projector().parameters():
                p.requires_grad = True

        model.config.freeze_speech_projector = training_args.freeze_speech_projector
        if training_args.freeze_speech_projector:
            for p in model.get_speech_projector().parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().speech_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.speech_projector_lr = training_args.speech_projector_lr

    if data_args.has_tgt_units:
        model.initialize_speech_generator(model_args=model_args)

    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        # Dtype fixups under k-bit: LoRA layers in bf16, norms in fp32,
        # lm_head / embeddings back to bf16 if they drifted to fp32.
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    data_module = make_supervised_data_module(tokenizer=tokenizer,
                                              data_args=data_args)

    # NOTE(review): `param.grad` is always None before the first backward pass,
    # so this prints `None` for every trainable parameter — presumably only the
    # parameter names are of interest here; confirm intent.
    print("Training Layers:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.grad)

    trainer = OmniTrainer(model=model,
                          tokenizer=tokenizer,
                          args=training_args,
                          **data_module)

    # Resume automatically when a checkpoint directory already exists.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    # Re-enable KV caching for inference after training.
    model.config.use_cache = True

    if training_args.lora_enable:
        # Save LoRA weights and remaining trainable (non-LoRA) weights separately.
        state_dict = get_peft_state_maybe_zero_3(
            model.named_parameters(), training_args.lora_bias
        )
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
            model.named_parameters()
        )
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer,
                                       output_dir=training_args.output_dir)
416
+
417
+
418
if __name__ == "__main__":
    # Script entry point: run fine-tuning with the default attention backend.
    train()
420
+
omni_speech/train/train_mem.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from omni_speech.train.train_multiturn import train
2
+
3
if __name__ == "__main__":
    # Thin launcher: delegates to the multiturn trainer with FlashAttention 2.
    train(attn_implementation="flash_attention_2")
omni_speech/train/train_minicpmo.py ADDED
@@ -0,0 +1,660 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ import logging
4
+ import os
5
+ from dataclasses import dataclass, field
6
+ from functools import partial
7
+ from typing import Dict, List, Optional, Union, Literal, Tuple
8
+ from types import MethodType
9
+ from torchvision import transforms
10
+ from copy import deepcopy
11
+
12
+ import torch
13
+ import transformers
14
+ from accelerate.utils import DistributedType
15
+ from deepspeed import zero
16
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
17
+ import pathlib
18
+
19
+ from transformers import AutoModel, AutoTokenizer, AutoProcessor
20
+ from transformers.integrations import deepspeed
21
+
22
+ from omni_speech.datasets.dataset import SupervisedDataset, data_collator
23
+ from omni_speech.model import *
24
+ from trainer import CPMTrainer
25
+ from transformers import Trainer
26
+ import librosa
27
+ from datasets import load_dataset
28
+ import numpy as np
29
+ from PIL import Image
30
+ from functools import partial
31
+ from audiomentations import AddBackgroundNoise, PolarityInversion
32
+
33
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
34
+
35
@dataclass
class ModelArguments:
    """Checkpoint paths for the MiniCPM-o model and its sub-components."""

    model_name_or_path: Optional[str] = "openbmb/MiniCPM-o-2_6"
    tokenizer_path: Optional[str] = None          # defaults to the model's own tokenizer
    audio_encoder_path: Optional[str] = None
    pretrained_llm_path: Optional[str] = None
41
+
42
+
43
@dataclass
class DataArguments:
    """Dataset paths, optional subsampling caps, and audio-augmentation settings."""

    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_data_path: str = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    # Probability of applying the audiomentations transform to a sample.
    augment_prob: float = field(
        default=0.0,
        metadata={"help": "The probability of applying augmentation transform."}
    )
    # Presumably a background-noise corpus directory for AddBackgroundNoise — confirm.
    augment_path: str = field(default=None,
                              metadata={"help": "Path to the augment data."})
71
+
72
+
73
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    # Extends HF TrainingArguments with MiniCPM-o module-tuning switches.
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=2048,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Which sub-modules to train: vision tower, audio stack, language model.
    tune_vision: Optional[bool] = field(default=True)
    tune_speech: Optional[bool] = field(default=True)
    tune_llm: Optional[bool] = field(default=True)
    llm_type: str = field(default="qwen")
    use_lora: Optional[bool] = field(default=False)
    # NOTE(review): presumably the max number of image slices per sample — confirm.
    max_slice_nums: Optional[int] = field(default=9)
    config_path: Optional[str] = field(default=None)
    chunk_input: Optional[bool] = field(default=True)
    # Whether to (re)initialize the vision / speech modules at startup.
    init_vision: Optional[bool] = field(default=False)
    init_speech: Optional[bool] = field(default=True)
93
+
94
+
95
@dataclass
class LoraArguments:
    """LoRA / QLoRA adapter configuration (consumed when use_lora is set)."""

    lora_r: int = 64
    lora_alpha: int = 64
    lora_dropout: float = 0.05
    # Regex matched against module names: attention q/k/v projections of the LLM.
    lora_target_modules: str = r"llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj)"
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False                 # quantized-base LoRA (QLoRA)
    lora_modules_to_save: str = ""
    # Optional layer replication / selection knobs forwarded to peft's LoraConfig.
    lora_layer_replication: Optional[List[Tuple[int, int]]] = None
    lora_layers_to_transform: Optional[List[int]] = None
    lora_layers_pattern: Optional[str] = None
108
+
109
# Process rank; assigned from training_args inside train(). While it is
# still None, rank0_print stays silent.
local_rank = None


def rank0_print(*args):
    """Print *args* only on the rank-0 process; a no-op elsewhere."""
    if local_rank != 0:
        return
    print(*args)
113
+
114
def safe_save_model_for_hf_trainer(trainer, output_dir: str, bias="none"):
    """Persist the model via ``trainer.save_model`` on the main process only.

    ``bias`` is accepted for call-site compatibility but is not used here.
    """
    is_main_process = trainer.args.should_save and trainer.args.local_rank == 0
    if is_main_process:
        trainer.save_model(output_dir)
118
+
119
+ # class CollateFn:
120
+ # def __init__(self, processor, prompt="Please transcribe this audio into text.", system_prompt="You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language."):
121
+ # self.prompt = prompt
122
+ # self.system_prompt = system_prompt
123
+ # self.processor = processor
124
+
125
+ # def __call__(self, examples):
126
+ # prompts_lists = []
127
+ # input_images_list = []
128
+ # input_audios_list = []
129
+ # audio_parts_list = []
130
+
131
+ # for msgs in examples:
132
+ # msgs = msgs["conversations"]
133
+ # if isinstance(msgs, str):
134
+ # msgs = json.loads(msgs)
135
+ # copy_msgs = deepcopy(msgs)
136
+
137
+ # assert len(msgs) > 0, "msgs is empty"
138
+
139
+ # system_turn = {'role': 'system', 'content': self.system_prompt}
140
+ # if copy_msgs[0]["role"] != 'system':
141
+ # copy_msgs.insert(0, system_turn)
142
+
143
+ # images = []
144
+ # audios = []
145
+ # audio_parts = []
146
+ # for i, msg in enumerate(copy_msgs):
147
+ # role = msg["role"]
148
+ # content = msg["content"]
149
+ # assert role in ["system", "user", "assistant"]
150
+ # if i == 0:
151
+ # assert role in ["user", "system"], "The role of first msg should be user"
152
+ # content = [content, self.prompt]
153
+ # cur_msgs = []
154
+
155
+ # for c in content:
156
+ # if os.path.exists(c):
157
+ # c, _ = librosa.load(c, sr=16000, mono=True)
158
+
159
+ # if isinstance(c, Image.Image):
160
+ # images.append(c)
161
+ # cur_msgs.append("(<image>./</image>)")
162
+ # elif isinstance(c, np.ndarray): # audio
163
+ # audios.append(c)
164
+ # audio_parts.append(i)
165
+ # cur_msgs.append("(<audio>./</audio>)")
166
+ # elif isinstance(c, str):
167
+ # cur_msgs.append(c)
168
+ # else:
169
+ # msg["content"] = "\n".join(cur_msgs)
170
+
171
+ # prompts_lists.append(
172
+ # self.processor.tokenizer.apply_chat_template(
173
+ # copy_msgs,
174
+ # tokenize=False,
175
+ # add_generation_prompt=False,
176
+ # )
177
+ # )
178
+ # input_images_list.append(images)
179
+ # input_audios_list.append(audios)
180
+ # audio_parts_list.append(audio_parts)
181
+
182
+ # inputs = self.processor(
183
+ # prompts_lists,
184
+ # input_images_list,
185
+ # input_audios_list,
186
+ # audio_parts_list,
187
+ # return_tensors="pt",
188
+ # max_length=32768,
189
+ # return_labels=True,
190
+ # )
191
+
192
+ # return inputs
193
+
194
def collate_fn(examples, processor, chunk_input, max_len, prompt=None, system_prompt="You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language.", transform=None):
    """Collate raw conversation examples into model-ready processor inputs.

    Each example is a dict with a ``conversations`` list (optionally a JSON
    string) and an optional ``tools`` JSON string. Audio file paths inside
    message content are loaded at 16 kHz (and optionally augmented via
    ``transform``); images / waveforms are replaced by placeholder tags and
    collected into per-sample lists, then everything is rendered through the
    tokenizer chat template and fed to ``processor``.

    BUG FIX: the previous version only wrapped *user* content in a list, so
    string content of system/assistant/tool turns was iterated character by
    character, mangling those messages. Non-user string content is now
    wrapped as a single part, and the file-existence check is applied to
    strings only.
    """
    prompts_lists = []
    input_images_list = []
    input_audios_list = []
    audio_parts_list = []

    for msgs in examples:
        raw_msgs = deepcopy(msgs)
        msgs = msgs["conversations"]
        if isinstance(msgs, str):
            msgs = json.loads(msgs)
        copy_msgs = deepcopy(msgs)

        assert len(msgs) > 0, "msgs is empty"

        # Prepend the default system turn when the sample lacks one.
        system_turn = {'role': 'system', 'content': system_prompt}
        if copy_msgs[0]["role"] != 'system':
            copy_msgs.insert(0, system_turn)

        # Optional function-calling tool schema, JSON-encoded per sample.
        fc = None
        if "tools" in raw_msgs and raw_msgs["tools"] != "":
            fc = json.loads(raw_msgs["tools"])

        images = []
        audios = []
        audio_parts = []
        for i, msg in enumerate(copy_msgs):
            role = msg["role"]
            content = msg["content"]
            assert role in ["system", "user", "assistant", "tool"]
            if i == 0:
                assert role in ["user", "system"], "The role of first msg should be user or system"

            # Normalize content into a list of parts (see BUG FIX above).
            if role == "user":
                content = [content, prompt] if prompt is not None else [content]
            elif not isinstance(content, list):
                content = [content]

            cur_msgs = []
            for c in content:
                # Strings pointing at an existing file are treated as audio paths.
                if isinstance(c, str) and os.path.exists(c):
                    c, _ = librosa.load(c, sr=16000, mono=True)
                    if transform is not None:
                        c = transform(c, sample_rate=16000)

                if isinstance(c, str):
                    cur_msgs.append(c)
                elif isinstance(c, np.ndarray):  # raw audio waveform
                    audios.append(c)
                    audio_parts.append(i)
                    cur_msgs.append("(<audio>./</audio>)")
                elif isinstance(c, Image.Image):
                    images.append(c)
                    cur_msgs.append("(<image>./</image>)")

            msg["content"] = "\n".join(cur_msgs)

            # Tool calls arrive JSON-encoded; decode and normalize to a list.
            if "tool_calls" in msg and msg["tool_calls"] is not None:
                assert isinstance(msg["tool_calls"], str), f"Invalid type: {type(msg['tool_calls'])}"
                msg["tool_calls"] = json.loads(msg["tool_calls"])
                if not isinstance(msg["tool_calls"], list):
                    msg["tool_calls"] = [msg["tool_calls"]]

        qwen_template = processor.tokenizer.apply_chat_template(
            copy_msgs,
            tokenize=False,
            add_generation_prompt=False,
            tools=fc,
        )

        prompts_lists.append(qwen_template)
        input_images_list.append(images)
        input_audios_list.append(audios)
        audio_parts_list.append(audio_parts)

    inputs = processor(
        prompts_lists,
        input_images_list,
        input_audios_list,
        audio_parts_list,
        chunk_input=chunk_input,
        return_tensors="pt",
        return_labels=True,
    )

    return inputs
301
+
302
def make_supervised_data_module(
    tokenizer: transformers.PreTrainedTokenizer,
    processor: transformers.ProcessorMixin,
    data_args,
    transform,
    data_collator=None,
    llm_type="qwen",
    slice_config=None,
    patch_size=14,
    query_nums=64,
    batch_vision=False,
    max_length=2048,
) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    Builds a ``SupervisedDataset`` for the train split (and for the eval
    split when ``data_args.eval_data_path`` is set) and returns them with a
    collator bound to ``max_length``.
    """
    rank0_print("Loading data...")

    def _load_split(path):
        # One SupervisedDataset per JSON split, sharing the same configuration.
        with open(path, "r") as f:
            records = json.load(f)
        return SupervisedDataset(
            records,
            transform,
            tokenizer,
            processor,
            slice_config=slice_config,
            llm_type=llm_type,
            patch_size=patch_size,
            query_nums=query_nums,
            batch_vision=batch_vision,
            max_length=max_length,
        )

    train_dataset = _load_split(data_args.data_path)
    eval_dataset = _load_split(data_args.eval_data_path) if data_args.eval_data_path else None

    return dict(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=partial(data_collator, max_length=max_length),
    )
356
+
357
+
358
def build_transform():
    """Return the image preprocessing pipeline: tensor conversion followed
    by Inception-style normalization (mean/std 0.5 per channel)."""
    # Values mirror timm.data.IMAGENET_INCEPTION_MEAN / IMAGENET_INCEPTION_STD.
    inception_mean = (0.5, 0.5, 0.5)
    inception_std = (0.5, 0.5, 0.5)
    steps = [
        transforms.ToTensor(),
        transforms.Normalize(mean=inception_mean, std=inception_std),
    ]
    return transforms.Compose(steps)
369
+
370
def get_parameter_number(model):
    """Count total and trainable parameters of *model*.

    Under DeepSpeed ZeRO-3 a partitioned weight reports ``numel() == 0``;
    in that case the true element count is read from ``param.ds_numel``.

    Returns a dict: ``{'Total': int, 'Trainable': int}``.
    """
    totals = {'Total': 0, 'Trainable': 0}
    for param in model.parameters():
        count = param.numel()
        if count == 0 and hasattr(param, "ds_numel"):
            count = param.ds_numel
        totals['Total'] += count
        if param.requires_grad:
            totals['Trainable'] += count
    return totals
383
+
384
+
385
# Default rank for rank0_print; overwritten from training_args inside train().
local_rank = 0
386
+
387
+
388
def train(attn_implementation="flash_attention_2"):
    """Fine-tune MiniCPM-o end to end.

    Parses model / data / training / LoRA arguments from the command line,
    loads the model, tokenizer and processor, optionally freezes sub-modules
    or applies LoRA, builds the JSON train/eval datasets and an HF
    ``Trainer``, trains (resuming from the latest checkpoint in
    ``output_dir`` when one exists) and saves the final model.
    """
    global local_rank
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
    )

    (
        model_args,
        data_args,
        training_args,
        lora_args,
    ) = parser.parse_args_into_dataclasses()

    if getattr(training_args, "deepspeed", None):
        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

    # Pick the parameter dtype from the mixed-precision flags.
    compute_dtype = (
        torch.float16
        if training_args.fp16
        else (torch.bfloat16 if training_args.bf16 else torch.float32)
    )

    local_rank = training_args.local_rank
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    device_map = None
    if lora_args.q_lora:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None
        if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
            # FIX: the original message said "are not incompatible" (double negative).
            logging.warning(
                "FSDP or ZeRO3 are incompatible with QLoRA."
            )

    # Extra kwargs for MiniCPMO.from_pretrained, read from an optional JSON file.
    minipcmo_config = {}
    if training_args.config_path is not None:
        minipcmo_config = json.load(open(training_args.config_path, "r"))

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )

    if model_args.pretrained_llm_path is None:
        print("Finetuning model!!!")
        model = MiniCPMO.from_pretrained(
            model_args.model_name_or_path,
            torch_dtype=compute_dtype,
            device_map=device_map,
            attn_implementation=attn_implementation,
            init_vision=training_args.init_vision,
            init_audio=training_args.init_speech,
            init_tts=False,
            processor_path=model_args.tokenizer_path,
            **minipcmo_config
        )
    else:
        # Replace the LLM backbone with a separately pretrained checkpoint.
        print("Load pretrained LLM from scratch!!!")
        model = MiniCPMO.from_pretrained(
            model_args.model_name_or_path,
            pretrained_llm_path=model_args.pretrained_llm_path,
            init_vision=training_args.init_vision,
            init_audio=training_args.init_speech,
            pretrained_encoder_path=model_args.audio_encoder_path,
            processor_path=model_args.tokenizer_path,
            **minipcmo_config
        )

    model.config.chunk_input = training_args.chunk_input

    # use_cache must be off for training with gradient checkpointing.
    model.llm.config.use_cache = False
    model.config.max_length = training_args.model_max_length

    # Freeze sub-modules that are not being tuned.
    if not training_args.tune_vision and training_args.init_vision:
        model.vpm.requires_grad_(False)
    if not training_args.tune_speech and training_args.init_speech:
        model.apm.requires_grad_(False)
    if not training_args.tune_llm:
        model.llm.requires_grad_(False)

    if training_args.use_lora:
        if training_args.use_lora and training_args.tune_llm:
            raise ValueError("The model cannot simultaneously adjust LLM parameters and apply LoRA.")

        rank0_print("Currently using LoRA for fine-tuning the MiniCPM-V model.")
        for name, param in model.llm.named_parameters():
            param.requires_grad = False
        modules_to_save = ['embed_tokens','resampler']
        if training_args.tune_vision:
            modules_to_save.append('vpm')
        lora_config = LoraConfig(
            r=lora_args.lora_r,
            lora_alpha=lora_args.lora_alpha,
            target_modules=lora_args.lora_target_modules,
            lora_dropout=lora_args.lora_dropout,
            bias=lora_args.lora_bias,
            layers_to_transform=lora_args.lora_layers_to_transform,
            modules_to_save=modules_to_save,
        )
        # PEFT expects get_input_embeddings on the wrapped model; delegate to the LLM.
        if not hasattr(model, 'get_input_embeddings'):
            def get_input_embeddings(self):
                return self.llm.get_input_embeddings()
            model.get_input_embeddings = MethodType(get_input_embeddings, model)
        if lora_args.q_lora:
            model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=training_args.gradient_checkpointing
            )
        model = get_peft_model(model, lora_config)
        if training_args.gradient_checkpointing:
            model.enable_input_require_grads()

    rank0_print(get_parameter_number(model))

    llm_type = training_args.llm_type

    rank0_print(f'llm_type={llm_type}')

    # Propagate slice / vision settings from the training args into the config.
    if hasattr(model.config, "slice_config"):
        model.config.slice_config.max_slice_nums = training_args.max_slice_nums
        slice_config = model.config.slice_config.to_dict()
    else:
        model.config.max_slice_nums = training_args.max_slice_nums
        slice_config = model.config.to_dict()

    if hasattr(model.config, "batch_vision_input"):
        batch_vision = model.config.batch_vision_input
    else:
        batch_vision = False

    transform_func = build_transform()

    if model_args.tokenizer_path is not None:
        processor = AutoProcessor.from_pretrained(model_args.tokenizer_path, trust_remote_code=True)
    else:
        processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    processor.tokenizer = tokenizer

    raw_datasets = load_dataset(
        "json",
        data_files={
            "train": data_args.data_path,
            "validation": data_args.eval_data_path,
        },
        cache_dir=training_args.cache_dir,
    )

    train_ds = raw_datasets["train"]
    if data_args.max_train_samples is not None:
        train_ds = train_ds.select(range(data_args.max_train_samples))
    eval_ds = raw_datasets["validation"]
    if data_args.max_eval_samples is not None:
        eval_ds = eval_ds.select(range(data_args.max_eval_samples))

    # When only the speech encoder is tuned, default to an ASR instruction.
    init_prompt = None
    if not training_args.tune_llm and training_args.tune_speech:  # asr finetuning
        init_prompt = "Please transcribe this audio into text."

    # Optional background-noise augmentation applied to loaded audio.
    transform = None
    if data_args.augment_prob != 0.0 and data_args.augment_path is not None:
        with open(data_args.augment_path, "r") as f:
            augment_path_list = f.read().splitlines()
        transform = AddBackgroundNoise(
            sounds_path=augment_path_list,
            min_snr_db=5.0,
            max_snr_db=30.0,
            noise_transform=PolarityInversion(),
            p=data_args.augment_prob
        )

    custom_collate_fn = partial(collate_fn, processor=processor, chunk_input=training_args.chunk_input, max_len=training_args.model_max_length, prompt=init_prompt, transform=transform)

    training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}

    # FIX: list trainable parameter names; gradients are all None before training,
    # so the old `print(name, param.grad)` only produced noise.
    print("Training Layers:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=custom_collate_fn
    )

    # Resume automatically when a checkpoint already exists in output_dir.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()

    trainer.save_state()

    safe_save_model_for_hf_trainer(
        trainer=trainer,
        output_dir=training_args.output_dir,
        bias=lora_args.lora_bias)
657
+
658
+
659
+ if __name__ == "__main__":
660
+ train()
omni_speech/train/train_minicpmo_test.py ADDED
@@ -0,0 +1,729 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ import logging
4
+ import os
5
+ from dataclasses import dataclass, field
6
+ from functools import partial
7
+ from typing import Dict, List, Optional, Union, Literal, Tuple
8
+ from types import MethodType
9
+ from torchvision import transforms
10
+ from copy import deepcopy
11
+
12
+ import torch
13
+ import transformers
14
+ from accelerate.utils import DistributedType
15
+ from deepspeed import zero
16
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
17
+ import pathlib
18
+
19
+ from transformers import AutoModel, AutoTokenizer, AutoProcessor
20
+ from transformers.integrations import deepspeed
21
+
22
+ from omni_speech.datasets.dataset import SupervisedDataset, data_collator
23
+ from omni_speech.model import *
24
+ from trainer import CPMTrainer
25
+ from transformers import Trainer
26
+ import librosa
27
+ from datasets import load_dataset
28
+ import numpy as np
29
+ from PIL import Image
30
+ from functools import partial
31
+ from audiomentations import AddBackgroundNoise, PolarityInversion
32
+
33
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
34
+
35
@dataclass
class ModelArguments:
    """Paths identifying the base model and optional pretrained components."""

    # HF repo id or local path of the base MiniCPM-o checkpoint.
    model_name_or_path: Optional[str] = "openbmb/MiniCPM-o-2_6"
    # Optional separate tokenizer/processor path; falls back to the model path.
    tokenizer_path: Optional[str] = None
    # Optional standalone audio encoder checkpoint.
    audio_encoder_path: Optional[str] = None
    # When set, the LLM backbone is loaded from this separate checkpoint.
    pretrained_llm_path: Optional[str] = None
41
+
42
+
43
@dataclass
class DataArguments:
    """Dataset locations plus optional subsampling / augmentation settings."""

    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    eval_data_path: str = field(default=None, metadata={"help": "Path to the evaluation data."})
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    augment_prob: float = field(
        default=0.0, metadata={"help": "The probability of applying augmentation transform."}
    )
    augment_path: str = field(default=None, metadata={"help": "Path to the augment data."})
71
+
72
+
73
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """HF TrainingArguments extended with MiniCPM-o specific toggles."""

    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=2048,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # Which sub-modules receive gradients (vision tower, audio encoder, LLM backbone).
    tune_vision: Optional[bool] = field(default=True)
    tune_speech: Optional[bool] = field(default=True)
    tune_llm: Optional[bool] = field(default=True)
    llm_type: str = field(default="qwen")
    use_lora: Optional[bool] = field(default=False)
    # Maximum number of image slices fed to the vision encoder.
    max_slice_nums: Optional[int] = field(default=9)
    # Optional JSON file with extra kwargs forwarded to MiniCPMO.from_pretrained.
    config_path: Optional[str] = field(default=None)
    chunk_input: Optional[bool] = field(default=True)
    # Whether to initialize the vision / speech branches at load time.
    init_vision: Optional[bool] = field(default=False)
    init_speech: Optional[bool] = field(default=True)
93
+
94
+
95
@dataclass
class LoraArguments:
    """Hyper-parameters controlling LoRA / QLoRA fine-tuning."""

    lora_r: int = 64
    lora_alpha: int = 64
    lora_dropout: float = 0.05
    # Regex matching the q/k/v projection modules of every LLM decoder layer.
    lora_target_modules: str = r"llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj)"
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False
    lora_modules_to_save: str = ""
    lora_layer_replication: Optional[List[Tuple[int, int]]] = None
    lora_layers_to_transform: Optional[List[int]] = None
    lora_layers_pattern: Optional[str] = None
108
+
109
# Process rank; assigned inside train(). While still None, rank0_print is silent.
local_rank = None
def rank0_print(*args):
    """Print *args* only on the rank-0 (main) process."""
    if local_rank == 0:
        print(*args)
113
+
114
+
115
def print_trainable_parameters_by_module(model):
    """Print a per-module breakdown of the trainable parameters of *model*.

    Trainable parameters are grouped by the first two dotted components of
    their name (e.g. ``llm.model``). For each group the trainable count,
    its share of all trainable parameters, and up to three example
    parameter names are printed, followed by an overall summary line.

    FIX: guards the summary percentages against division by zero (the
    original raised ``ZeroDivisionError`` on a model with no parameters).
    Comments were also translated from Vietnamese to English.
    """
    print("\n" + "="*50)
    print("TRAINABLE PARAMETERS BY MODULE")
    print("="*50)

    # Trainable parameters bucketed by second-level module name.
    module_params = {}
    all_params = 0
    trainable_params = 0

    for name, param in model.named_parameters():
        all_params += param.numel()

        # Group key: first two dotted components of the parameter name.
        parts = name.split('.')
        module_name = '.'.join(parts[:2]) if len(parts) >= 2 else parts[0]

        if param.requires_grad:
            trainable_params += param.numel()
            bucket = module_params.setdefault(module_name, {'count': 0, 'names': []})
            bucket['count'] += param.numel()
            bucket['names'].append(name)

    # Largest modules first.
    sorted_modules = sorted(module_params.items(), key=lambda x: x[1]['count'], reverse=True)

    for module_name, info in sorted_modules:
        param_count = info['count']
        percentage = 100 * param_count / trainable_params if trainable_params else 0.0
        print(f"{module_name:<30} {param_count:,} params ({percentage:.2f}%)")

        # Show up to three example parameter names per module.
        for param_name in info['names'][:3]:
            print(f"  - {param_name}")

        if len(info['names']) > 3:
            print(f"  ... and {len(info['names']) - 3} more parameters")

    print("\n" + "-"*50)
    overall = 100 * trainable_params / all_params if all_params else 0.0
    print(f"Total trainable parameters: {trainable_params:,} / {all_params:,} ({overall:.2f}%)")
    print("="*50 + "\n")
168
+
169
+
170
def safe_save_model_for_hf_trainer(trainer, output_dir: str, bias="none"):
    """Collects the state dict and dump to disk.

    Saves only on the main process (``should_save`` and ``local_rank == 0``).
    ``bias`` is unused here; kept for call-site compatibility.
    """
    if trainer.args.should_save and trainer.args.local_rank == 0:
        trainer.save_model(output_dir,)
174
+
175
+ # class CollateFn:
176
+ # def __init__(self, processor, prompt="Please transcribe this audio into text.", system_prompt="You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language."):
177
+ # self.prompt = prompt
178
+ # self.system_prompt = system_prompt
179
+ # self.processor = processor
180
+
181
+ # def __call__(self, examples):
182
+ # prompts_lists = []
183
+ # input_images_list = []
184
+ # input_audios_list = []
185
+ # audio_parts_list = []
186
+
187
+ # for msgs in examples:
188
+ # msgs = msgs["conversations"]
189
+ # if isinstance(msgs, str):
190
+ # msgs = json.loads(msgs)
191
+ # copy_msgs = deepcopy(msgs)
192
+
193
+ # assert len(msgs) > 0, "msgs is empty"
194
+
195
+ # system_turn = {'role': 'system', 'content': self.system_prompt}
196
+ # if copy_msgs[0]["role"] != 'system':
197
+ # copy_msgs.insert(0, system_turn)
198
+
199
+ # images = []
200
+ # audios = []
201
+ # audio_parts = []
202
+ # for i, msg in enumerate(copy_msgs):
203
+ # role = msg["role"]
204
+ # content = msg["content"]
205
+ # assert role in ["system", "user", "assistant"]
206
+ # if i == 0:
207
+ # assert role in ["user", "system"], "The role of first msg should be user"
208
+ # content = [content, self.prompt]
209
+ # cur_msgs = []
210
+
211
+ # for c in content:
212
+ # if os.path.exists(c):
213
+ # c, _ = librosa.load(c, sr=16000, mono=True)
214
+
215
+ # if isinstance(c, Image.Image):
216
+ # images.append(c)
217
+ # cur_msgs.append("(<image>./</image>)")
218
+ # elif isinstance(c, np.ndarray): # audio
219
+ # audios.append(c)
220
+ # audio_parts.append(i)
221
+ # cur_msgs.append("(<audio>./</audio>)")
222
+ # elif isinstance(c, str):
223
+ # cur_msgs.append(c)
224
+ # else:
225
+ # msg["content"] = "\n".join(cur_msgs)
226
+
227
+ # prompts_lists.append(
228
+ # self.processor.tokenizer.apply_chat_template(
229
+ # copy_msgs,
230
+ # tokenize=False,
231
+ # add_generation_prompt=False,
232
+ # )
233
+ # )
234
+ # input_images_list.append(images)
235
+ # input_audios_list.append(audios)
236
+ # audio_parts_list.append(audio_parts)
237
+
238
+ # inputs = self.processor(
239
+ # prompts_lists,
240
+ # input_images_list,
241
+ # input_audios_list,
242
+ # audio_parts_list,
243
+ # return_tensors="pt",
244
+ # max_length=32768,
245
+ # return_labels=True,
246
+ # )
247
+
248
+ # return inputs
249
+
250
def collate_fn(examples, processor, chunk_input, max_len, prompt=None, system_prompt="You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language.", transform=None):
    """Collate raw conversation examples into model-ready processor inputs.

    Each example is a dict with a ``conversations`` list (optionally a JSON
    string) and an optional ``tools`` JSON string. Audio file paths inside
    message content are loaded at 16 kHz (and optionally augmented via
    ``transform``); images / waveforms are replaced by placeholder tags and
    collected into per-sample lists, then everything is rendered through the
    tokenizer chat template and fed to ``processor``.

    BUG FIX: the previous version only wrapped *user* content in a list, so
    string content of system/assistant/tool turns was iterated character by
    character, mangling those messages. Non-user string content is now
    wrapped as a single part, and the file-existence check is applied to
    strings only.
    """
    prompts_lists = []
    input_images_list = []
    input_audios_list = []
    audio_parts_list = []

    for msgs in examples:
        raw_msgs = deepcopy(msgs)
        msgs = msgs["conversations"]
        if isinstance(msgs, str):
            msgs = json.loads(msgs)
        copy_msgs = deepcopy(msgs)

        assert len(msgs) > 0, "msgs is empty"

        # Prepend the default system turn when the sample lacks one.
        system_turn = {'role': 'system', 'content': system_prompt}
        if copy_msgs[0]["role"] != 'system':
            copy_msgs.insert(0, system_turn)

        # Optional function-calling tool schema, JSON-encoded per sample.
        fc = None
        if "tools" in raw_msgs and raw_msgs["tools"] != "":
            fc = json.loads(raw_msgs["tools"])

        images = []
        audios = []
        audio_parts = []
        for i, msg in enumerate(copy_msgs):
            role = msg["role"]
            content = msg["content"]
            assert role in ["system", "user", "assistant", "tool"]
            if i == 0:
                assert role in ["user", "system"], "The role of first msg should be user or system"

            # Normalize content into a list of parts (see BUG FIX above).
            if role == "user":
                content = [content, prompt] if prompt is not None else [content]
            elif not isinstance(content, list):
                content = [content]

            cur_msgs = []
            for c in content:
                # Strings pointing at an existing file are treated as audio paths.
                if isinstance(c, str) and os.path.exists(c):
                    c, _ = librosa.load(c, sr=16000, mono=True)
                    if transform is not None:
                        c = transform(c, sample_rate=16000)

                if isinstance(c, str):
                    cur_msgs.append(c)
                elif isinstance(c, np.ndarray):  # raw audio waveform
                    audios.append(c)
                    audio_parts.append(i)
                    cur_msgs.append("(<audio>./</audio>)")
                elif isinstance(c, Image.Image):
                    images.append(c)
                    cur_msgs.append("(<image>./</image>)")

            msg["content"] = "\n".join(cur_msgs)

            # Tool calls arrive JSON-encoded; decode and normalize to a list.
            if "tool_calls" in msg and msg["tool_calls"] is not None:
                assert isinstance(msg["tool_calls"], str), f"Invalid type: {type(msg['tool_calls'])}"
                msg["tool_calls"] = json.loads(msg["tool_calls"])
                if not isinstance(msg["tool_calls"], list):
                    msg["tool_calls"] = [msg["tool_calls"]]

        qwen_template = processor.tokenizer.apply_chat_template(
            copy_msgs,
            tokenize=False,
            add_generation_prompt=False,
            tools=fc,
        )

        prompts_lists.append(qwen_template)
        input_images_list.append(images)
        input_audios_list.append(audios)
        audio_parts_list.append(audio_parts)

    inputs = processor(
        prompts_lists,
        input_images_list,
        input_audios_list,
        audio_parts_list,
        chunk_input=chunk_input,
        return_tensors="pt",
        return_labels=True,
    )

    return inputs
357
+
358
def make_supervised_data_module(
    tokenizer: transformers.PreTrainedTokenizer,
    processor: transformers.ProcessorMixin,
    data_args,
    transform,
    data_collator=None,
    llm_type="qwen",
    slice_config=None,
    patch_size=14,
    query_nums=64,
    batch_vision=False,
    max_length=2048,
) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    Builds a ``SupervisedDataset`` for training (and optionally for eval when
    ``data_args.eval_data_path`` is set) from JSON files, and binds
    ``max_length`` onto the provided ``data_collator``.

    Returns:
        dict with keys ``train_dataset``, ``eval_dataset`` (may be None) and
        ``data_collator`` suitable for splatting into a HF ``Trainer``.
    """
    dataset_cls = SupervisedDataset

    rank0_print("Loading data...")

    # Fix: the original used json.load(open(...)) which leaks the file
    # handle until GC; context managers close it deterministically.
    with open(data_args.data_path, "r") as f:
        train_json = json.load(f)
    train_dataset = dataset_cls(
        train_json,
        transform,
        tokenizer,
        processor,
        slice_config=slice_config,
        llm_type=llm_type,
        patch_size=patch_size,
        query_nums=query_nums,
        batch_vision=batch_vision,
        max_length=max_length,
    )

    if data_args.eval_data_path:
        with open(data_args.eval_data_path, "r") as f:
            eval_json = json.load(f)
        eval_dataset = dataset_cls(
            eval_json,
            transform,
            tokenizer,
            processor,
            slice_config=slice_config,
            llm_type=llm_type,
            patch_size=patch_size,
            query_nums=query_nums,
            batch_vision=batch_vision,
            max_length=max_length,
        )
    else:
        eval_dataset = None

    # NOTE(review): if data_collator is left as None, partial(None, ...) will
    # fail only when the collator is first invoked — confirm callers always
    # pass one.
    return dict(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=partial(data_collator, max_length=max_length),
    )
412
+
413
+
414
def build_transform():
    """Return the image preprocessing pipeline: tensor conversion followed
    by Inception-style normalization (mean/std 0.5 per channel)."""
    # Values mirror timm.data.IMAGENET_INCEPTION_MEAN / _STD.
    inception_mean = (0.5, 0.5, 0.5)
    inception_std = (0.5, 0.5, 0.5)
    steps = [
        transforms.ToTensor(),
        transforms.Normalize(mean=inception_mean, std=inception_std),
    ]
    return transforms.Compose(steps)
425
+
426
def get_parameter_number(model):
    """Count total and trainable parameters of *model*.

    Returns:
        dict: ``{'Total': <all params>, 'Trainable': <requires_grad params>}``.
    """
    total = 0
    trainable = 0
    for p in model.parameters():
        n = p.numel()
        # Under DeepSpeed ZeRO-3, partitioned weights report numel() == 0;
        # the real element count is exposed via ds_numel.
        if n == 0 and hasattr(p, "ds_numel"):
            n = p.ds_numel
        total += n
        if p.requires_grad:
            trainable += n
    return {'Total': total, 'Trainable': trainable}
439
+
440
+
441
# Process rank used for rank-0-only logging; overwritten in train() from
# training_args.local_rank.
local_rank = 0
442
+
443
+
444
def train(attn_implementation="flash_attention_2"):
    """Fine-tune MiniCPMO: parse CLI dataclasses, build the model, select
    trainable sub-modules (full LLM vs. LoRA), load JSON train/eval data and
    run the HF ``Trainer``.

    Args:
        attn_implementation: attention backend forwarded to
            ``MiniCPMO.from_pretrained`` (default flash-attention 2).
    """
    global local_rank
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
    )

    (
        model_args,
        data_args,
        training_args,
        lora_args,
    ) = parser.parse_args_into_dataclasses()

    # Force the accelerate distributed backend to DeepSpeed when a DS config
    # was supplied on the command line.
    if getattr(training_args, "deepspeed", None) :
        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

    compute_dtype = (
        torch.float16
        if training_args.fp16
        else (torch.bfloat16 if training_args.bf16 else torch.float32)
    )

    local_rank = training_args.local_rank
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    device_map = None
    if lora_args.q_lora:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None
        if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
            # NOTE(review): double negative in this message — presumably it
            # should read "are not compatible with QLoRA".
            logging.warning(
                "FSDP or ZeRO3 are not incompatible with QLoRA."
            )

    # Optional extra kwargs merged into MiniCPMO.from_pretrained below.
    minipcmo_config = {}
    if training_args.config_path is not None:
        minipcmo_config = json.load(open(training_args.config_path, "r"))

    # if model_args.tokenizer_path is not None:
    #     tokenizer = AutoTokenizer.from_pretrained(
    #         model_args.tokenizer_path, trust_remote_code=True
    #     )
    # else:
    #     tokenizer = AutoTokenizer.from_pretrained(
    #         model_args.model_name_or_path, trust_remote_code=True
    #     )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )

    # Two load paths: plain fine-tuning of an existing checkpoint, or
    # re-initializing the LLM weights from a separate pretrained LLM.
    if model_args.pretrained_llm_path is None:
        print("Finetuning model!!!")
        model = MiniCPMO.from_pretrained(
            model_args.model_name_or_path,
            torch_dtype=compute_dtype,
            device_map=device_map,
            attn_implementation=attn_implementation,
            init_vision=training_args.init_vision,
            init_audio=training_args.init_speech,
            init_tts=False,
            processor_path=model_args.tokenizer_path,
            **minipcmo_config
        )
    else:
        print("Load pretrained LLM from scratch!!!")
        # # Create the config object as needed
        # config = MiniCPMOConfig(
        #     model_name_or_path=model_args.model_name_or_path,
        #     pretrained_llm_path=model_args.pretrained_llm_path,
        #     init_vision=training_args.init_vision,
        #     init_audio=training_args.init_speech,
        #     pretrained_encoder_path=model_args.audio_encoder_path,
        #     processor_path=model_args.tokenizer_path,
        #     **minipcmo_config
        # )

        # # Initialize model
        # model = MiniCPMO(config)

        model = MiniCPMO.from_pretrained(
            model_args.model_name_or_path,
            pretrained_llm_path=model_args.pretrained_llm_path,
            init_vision=training_args.init_vision,
            init_audio=training_args.init_speech,
            init_tts=False,
            pretrained_encoder_path=model_args.audio_encoder_path,
            processor_path=model_args.tokenizer_path,
            **minipcmo_config
        )

    # tokenizer.audio_start_id = tokenizer.convert_tokens_to_ids("<|box_start|>")
    # tokenizer.audio_end_id = tokenizer.convert_tokens_to_ids("<|box_end|>")
    # tokenizer.audio_start = "<|box_start|>"
    # tokenizer.audio_end = "<|box_end|>"
    # tokenizer.im_start_id = tokenizer.convert_tokens_to_ids("<|vision_start|>")
    # tokenizer.im_end_id = tokenizer.convert_tokens_to_ids("<|vision_end|>")
    # tokenizer.im_start = "<|vision_start|>"
    # tokenizer.im_end = "<|vision_end|>"
    # tokenizer.slice_start_id = tokenizer.convert_tokens_to_ids("<|quad_start|>")
    # tokenizer.slice_end_id = tokenizer.convert_tokens_to_ids("<|quad_end|>")
    # tokenizer.slice_start = "<|quad_start|>"
    # tokenizer.slice_end = "<|quad_end|>"
    # tokenizer.unk_token = "<unk>"

    # print("Audio Start Token:", tokenizer.audio_start)
    # print("Audio End Token:", tokenizer.audio_end)
    # print(tokenizer.audio_start_id)
    # print(tokenizer.audio_end_id)
    # print("Start Token:", tokenizer.im_start)
    # print("End Token:", tokenizer.im_end)
    # print(tokenizer.im_start_id)
    # print(tokenizer.im_end_id)
    # print("Slice Start Token:", tokenizer.slice_start)
    # print("Slice End Token:", tokenizer.slice_end)
    # print(tokenizer.slice_start_id)
    # print(tokenizer.slice_end_id)

    model.config.chunk_input = training_args.chunk_input
    # model.llm.resize_token_embeddings(len(tokenizer))
    # model.resize_token_embeddings(len(tokenizer))

    # Caching is incompatible with gradient checkpointing during training.
    model.llm.config.use_cache = False
    model.config.max_length = training_args.model_max_length

    # if not training_args.tune_vision and training_args.init_vision:
    #     model.vpm.requires_grad_(False)
    # if not training_args.tune_speech and training_args.init_speech:
    #     model.apm.requires_grad_(False)
    # if not training_args.tune_llm:
    #     model.llm.requires_grad_(False)
    # Freeze everything first, then selectively re-enable below.
    model.requires_grad_(False)


    if training_args.tune_llm:
        model.llm.requires_grad_(True)
        print("Enabled training for LLM")
        model.audio_projection_layer.requires_grad_(True)
        print("Enabled training for audio_projection_layer")


    if training_args.use_lora:
        # LoRA and full-LLM tuning are mutually exclusive.
        if training_args.use_lora and training_args.tune_llm:
            raise ValueError("The model cannot simultaneously adjust LLM parameters and apply LoRA.")

        rank0_print("Currently using LoRA for fine-tuning the MiniCPM-V model.")
        for name, param in model.llm.named_parameters():
            param.requires_grad = False
        modules_to_save = ['embed_tokens','resampler']
        if training_args.tune_vision:
            modules_to_save.append('vpm')
        lora_config = LoraConfig(
            r=lora_args.lora_r,
            lora_alpha=lora_args.lora_alpha,
            target_modules=lora_args.lora_target_modules,
            lora_dropout=lora_args.lora_dropout,
            bias=lora_args.lora_bias,
            layers_to_transform=lora_args.lora_layers_to_transform,
            modules_to_save=modules_to_save,
        )
        # peft requires get_input_embeddings(); patch it through to the
        # wrapped LLM when the wrapper does not expose it.
        if not hasattr(model, 'get_input_embeddings'):
            def get_input_embeddings(self):
                return self.llm.get_input_embeddings()
            model.get_input_embeddings = MethodType(get_input_embeddings, model)
        if lora_args.q_lora:
            model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=training_args.gradient_checkpointing
            )
        model = get_peft_model(model, lora_config)
        if training_args.gradient_checkpointing:
            model.enable_input_require_grads()

    rank0_print(get_parameter_number(model))


    print_trainable_parameters_by_module(model)

    llm_type = training_args.llm_type

    rank0_print(f'llm_type={llm_type}')

    # Load data
    # NOTE(review): slice_config / batch_vision / transform_func are computed
    # here but only consumed by the commented-out make_supervised_data_module
    # call below — dead unless that path is restored.
    if hasattr(model.config, "slice_config"):
        model.config.slice_config.max_slice_nums = training_args.max_slice_nums
        slice_config = model.config.slice_config.to_dict()
    else:
        model.config.max_slice_nums = training_args.max_slice_nums
        slice_config = model.config.to_dict()

    if hasattr(model.config, "batch_vision_input"):
        batch_vision = model.config.batch_vision_input
    else:
        batch_vision = False

    transform_func = build_transform()

    if model_args.tokenizer_path is not None:
        processor = AutoProcessor.from_pretrained(model_args.tokenizer_path, trust_remote_code=True)
    else:
        processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    processor.tokenizer = tokenizer

    raw_datasets = load_dataset(
        "json",
        data_files={
            "train": data_args.data_path,
            "validation": data_args.eval_data_path,
        },
        cache_dir=training_args.cache_dir,
    )

    train_ds = raw_datasets["train"]
    if data_args.max_train_samples is not None:
        train_ds = train_ds.select(range(data_args.max_train_samples))
    eval_ds = raw_datasets["validation"]
    if data_args.max_eval_samples is not None:
        eval_ds = eval_ds.select(range(data_args.max_eval_samples))

    # data_module = make_supervised_data_module(
    #     tokenizer=tokenizer,
    #     processor=processor,
    #     data_args=data_args,
    #     transform=transform_func,
    #     data_collator=data_collator,
    #     slice_config=slice_config,
    #     llm_type=llm_type,
    #     patch_size=model.config.patch_size,
    #     query_nums=model.config.query_num,
    #     batch_vision=batch_vision,
    #     max_length=training_args.model_max_length,
    # )

    init_prompt = None
    if not training_args.tune_llm and training_args.tune_speech: # asr finetuning
        init_prompt = "Please transcribe this audio into text."

    # Optional waveform augmentation: mix in background noise with
    # probability augment_prob.
    transform = None
    if data_args.augment_prob != 0.0 and data_args.augment_path is not None:
        with open(data_args.augment_path, "r") as f:
            augment_path_list = f.read().splitlines()
        transform = AddBackgroundNoise(
            sounds_path=augment_path_list,
            min_snr_db=5.0,
            max_snr_db=30.0,
            noise_transform=PolarityInversion(),
            p=data_args.augment_prob
        )

    custom_collate_fn = partial(collate_fn, processor = processor, chunk_input=training_args.chunk_input, max_len=training_args.model_max_length, prompt=init_prompt, transform=transform)

    training_args.gradient_checkpointing_kwargs={"use_reentrant":False}

    # print("Training Layers:")
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.grad)

    # trainer = CPMTrainer(
    #     model=model,
    #     tokenizer=tokenizer,
    #     args=training_args,
    #     **data_module,
    # )
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=custom_collate_fn
    )

    # Resume automatically when checkpoints already exist in output_dir.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()

    trainer.save_state()

    safe_save_model_for_hf_trainer(
        trainer=trainer,
        output_dir=training_args.output_dir,
        bias=lora_args.lora_bias)
726
+
727
+
728
# Script entry point.
if __name__ == "__main__":
    train()
omni_speech/train/train_multiturn.py ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
2
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
3
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import copy
19
+ from dataclasses import dataclass, field
20
+ import json
21
+ import logging
22
+ import pathlib
23
+ from typing import Dict, Optional, Sequence, List
24
+
25
+ import torch
26
+
27
+ import transformers
28
+ import tokenizers
29
+
30
+ from omni_speech.constants import IGNORE_INDEX, SPEECH_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
31
+ from torch.utils.data import Dataset
32
+ from omni_speech.train.omni_trainer import OmniTrainer
33
+ from audiomentations import AddBackgroundNoise, PolarityInversion
34
+
35
+ from omni_speech import conversation as conversation_lib
36
+ from omni_speech.model import *
37
+ from omni_speech.utils import *
38
+ from omni_speech.datasets.preprocess import *
39
+ import whisper
40
+ import time
41
+
42
@dataclass
class ModelArguments:
    """CLI options describing the model architecture and which speech
    components to load or tune."""

    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    # Conversation template / model family; train() accepts "llama_3" or "qwen".
    version: Optional[str] = field(default="llama_3")
    freeze_backbone: bool = field(default=False)
    tune_speech_projector: bool = field(default=False)
    tune_speech_encoder: bool = field(default=False)
    tune_speech_generator_only: bool = field(default=False)
    speech_encoder_type: Optional[str] = field(default=None)
    speech_encoder: Optional[str] = field(default=None)
    pretrain_speech_projector: Optional[str] = field(default=None)
    speech_projector_type: Optional[str] = field(default='linear')
    speech_generator_type: Optional[str] = field(default='ctc')
    # ctc_decoder_config: str = "(2,4096,32,11008)" # num layers, hidden sizes, attn heads, ff dimensions of LLaMA
    ctc_decoder_config: str = "(2,4096,32,22016)"
    ctc_upsample_factor: int = 25
    ctc_loss_weight: float = 1.0
    unit_vocab_size: int = 1000
    speech_encoder_ds_rate: int = 5
    speech_encoder_hidden_size: int = 1280
62
+
63
+
64
@dataclass
class DataArguments:
    """CLI options for dataset paths, speech input format and augmentation."""

    data_path: Optional[str] = field(default=None,
                           metadata={"help": "Path to the training data."})
    dev_path: Optional[str] = field(default=None,
                          metadata={"help": "Path to the dev data."})
    is_multimodal: bool = False
    # "mel" -> log-mel spectrogram via whisper; "raw" -> raw waveform tensor.
    input_type: str = field(default="mel")
    speech_normalize: bool = False
    mel_size: int = 128
    # When True, samples carry target speech units ("tgt_units").
    has_tgt_units: bool = False
    augment_prob: float = field(
        default=0.0,
        metadata={"help": "The probability of applying augmentation transform."}
    )
    augment_path: Optional[str] = field(default=None,
                              metadata={"help": "Path to the augment data."})
81
+
82
+
83
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """HF TrainingArguments extended with quantization, LoRA and
    speech-projector options."""

    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    freeze_speech_projector: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    # 16 = no quantization; 4/8 enable bitsandbytes loading in train().
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    # Separate LR for the speech projector; None -> use the global LR.
    speech_projector_lr: Optional[float] = None
    group_by_modality_length: bool = field(default=False)
115
+
116
+
117
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Loads the sample list eagerly from a JSON file but decodes audio and
    tokenizes lazily, per item, in ``__getitem__``.
    """

    def __init__(self, data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        super(LazySupervisedDataset, self).__init__()
        list_data_dict = json.load(open(data_path, "r"))

        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args
        # Optional background-noise augmentation, applied in process_speech.
        if self.data_args.augment_prob != 0.0:
            with open(self.data_args.augment_path, "r") as f:
                augment_path_list = f.read().splitlines()
            self.transform = AddBackgroundNoise(
                sounds_path=augment_path_list,
                min_snr_db=5.0,
                max_snr_db=30.0,
                noise_transform=PolarityInversion(),
                p=self.data_args.augment_prob
            )

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Fetch sample *i* with retries: first retry the same index (cloud
        disk hiccups), then fall back to the next index (file corruption),
        and finally re-raise if nothing works."""
        # TODO: define number of retries somewhere else
        num_base_retries = 3
        num_final_retries = 300

        # try the current sample first
        for attempt_idx in range(num_base_retries):
            try:
                sample = self._get_item(i)
                return sample
            except Exception as e:
                # sleep 1s in case it is a cloud disk issue
                print(f"[Try #{attempt_idx}] Failed to fetch sample {i}. Exception:", e)
                time.sleep(1)

        # try other samples, in case it is file corruption issue
        for attempt_idx in range(num_base_retries):
            try:
                next_index = min(i + 1, len(self.list_data_dict) - 1)
                # sample_idx = random.choice(range(len(self)))
                sample = self._get_item(next_index)
                return sample
            except Exception as e:
                # no need to sleep
                print(f"[Try other #{attempt_idx}] Failed to fetch sample {next_index}. Exception:", e)
                pass

        try:
            sample = self._get_item(i)
            return sample
        except Exception as e:
            raise e

    def process_speech(self, speech_file):
        """Load one audio file and convert it to the configured input format.

        Returns (speech, speech_lengths) where speech is either a raw
        waveform tensor or a (frames, mel) log-mel spectrogram.
        """
        speech = whisper.load_audio(speech_file)
        if self.data_args.augment_prob != 0.0:
            speech = self.transform(speech, sample_rate=16000)
        if self.data_args.input_type == "raw":
            speech = torch.from_numpy(speech)
            # NOTE(review): self.model_config is never assigned on this class;
            # this line will raise AttributeError whenever input_type == "raw".
            # It presumably should read self.data_args.speech_normalize.
            if self.model_config.data_args.speech_normalize:
                speech = torch.nn.functional.layer_norm(speech, speech.shape)
        elif self.data_args.input_type == "mel":
            speech = whisper.pad_or_trim(speech)
            speech = whisper.log_mel_spectrogram(speech, n_mels=self.data_args.mel_size).permute(1, 0)
        speech_lengths = torch.LongTensor([speech.shape[0]])
        return speech, speech_lengths

    def _get_item(self, i) -> Dict[str, torch.Tensor]:
        """Build the training dict for sample *i*: tokenized conversation
        (input_ids/labels), optional speech features and target units."""
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME
        # Promote a sample-level "tools" entry to a pseudo conversation turn
        # so the preprocessor can see it.
        for item in sources:
            if 'tools' in item:
                tools_dict = {
                    "from": "tools",
                    "value": item["tools"]
                }
                item["conversations"].insert(0,tools_dict)

        if self.data_args.has_tgt_units:
            # pad_list = [0]
            # tgt_units = [e["tgt_units"] if "tgt_units" in e else pad_list for e in sources]
            tgt_units = [e["tgt_units"] for e in sources]
            tgt_units = torch.tensor(tgt_units, dtype=torch.long)
        else:
            tgt_units = None

        if 'speech' in sources[0]:
            # NOTE(review): this numpy import is unused in this scope.
            import numpy as np
            speech_file = self.list_data_dict[i]['speech']
            if type(speech_file) is list:
                speech = [self.process_speech(f) for f in speech_file]
            else:
                speech = [self.process_speech(speech_file)]

            sources = preprocess_multimodal(
                copy.deepcopy([e["conversations"] for e in sources]),
                self.data_args)
        else:
            sources = copy.deepcopy([e["conversations"] for e in sources])
        data_dict = preprocess(
            sources,
            self.tokenizer,
            has_speech=('speech' in self.list_data_dict[i]))
        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0],
                             labels=data_dict["labels"][0])

        # speech exist in the data
        if 'speech' in self.list_data_dict[i]:
            data_dict['speech'] = speech

        if tgt_units is not None:
            data_dict['tgt_units'] = tgt_units[0]

        data_dict["id"] = self.list_data_dict[i].get("id", i)

        return data_dict
242
+
243
+
244
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Truncates to ``tokenizer.model_max_length``, pads input_ids with the pad
    token and labels with IGNORE_INDEX, and flattens per-sample speech lists
    into batch-level lists.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def pad_sequence(self, input_ids, batch_first, padding_value):
        """Pad a list of 1-D tensors, honoring the tokenizer's padding side.

        torch's pad_sequence only right-pads, so left padding is emulated by
        flipping each sequence, right-padding, then flipping the batch back.
        """
        if self.tokenizer.padding_side == "left":
            input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids]
        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value)
        if self.tokenizer.padding_side == "left":
            input_ids = torch.flip(input_ids, [1])
        return input_ids

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        # input_ids, labels, ids = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels", "id"))
        input_ids = [_input_ids[: self.tokenizer.model_max_length] for _input_ids in input_ids]
        labels = [_labels[: self.tokenizer.model_max_length] for _labels in labels]
        if self.tokenizer.pad_token_id is None:
            # self.tokenizer.pad_token_id = self.tokenizer.eos_token_id # FIXME: this could only be triggered for llama3 model.
            self.tokenizer.pad_token_id = 0 # This gets the best result. Don't know why.
        input_ids = self.pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = self.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        batch = dict(input_ids=input_ids, labels=labels.long() if labels.dtype == torch.int32 else labels, attention_mask=input_ids.ne(self.tokenizer.pad_token_id))
        # batch = dict(input_ids=input_ids, labels=labels, attention_mask=input_ids.ne(self.tokenizer.pad_token_id), ids=ids)

        # Each instance carries a list of (features, lengths) pairs; flatten
        # all pairs across the batch into two parallel lists.
        if 'speech' in instances[0]:
            speechs = [instance['speech'] for instance in instances]

            speech = [sp[0] for sp_list in speechs for sp in sp_list]
            speech_lengths = [sp[1] for sp_list in speechs for sp in sp_list]

            batch["speech"] = speech
            # print(len(speech)) # sum(len(audio) for audio in each batch)
            # print(speech[0].shape) # seq_len, dim
            batch['speech_lengths'] = speech_lengths
            # print(speech_lengths[0].shape) # seq_len

        if 'tgt_units' in instances[0]:
            tgt_units = [instance['tgt_units'] for instance in instances]
            tgt_units = self.pad_sequence(tgt_units, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            batch['tgt_units'] = tgt_units
            # print(batch['tgt_units'])
            # print("---------------")
            # print(batch['input_ids'])

        return batch
292
+
293
+
294
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Build the train/eval datasets and their collator for SFT.

    Returns a dict that can be splatted directly into a Trainer:
    ``train_dataset``, ``eval_dataset`` (None without ``dev_path``) and
    ``data_collator``.
    """
    module = {
        "train_dataset": LazySupervisedDataset(
            tokenizer=tokenizer,
            data_path=data_args.data_path,
            data_args=data_args,
        ),
        "eval_dataset": None,
        "data_collator": DataCollatorForSupervisedDataset(tokenizer=tokenizer),
    }
    # Only build an eval split when a dev file was provided.
    if data_args.dev_path is not None:
        module["eval_dataset"] = LazySupervisedDataset(
            tokenizer=tokenizer,
            data_path=data_args.dev_path,
            data_args=data_args,
        )
    return module
310
+
311
+
312
def train(attn_implementation="flash_attention_2"):
    """Fine-tune an OmniSpeech model: parse CLI dataclasses, load the right
    model class (LLaMA/Qwen, with or without a speech generator), configure
    quantization/LoRA/speech modules, then train with ``OmniTrainer``.

    Args:
        attn_implementation: attention backend forwarded to
            ``from_pretrained`` (default flash-attention 2).
    """

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    # Extra kwargs for 4/8-bit bitsandbytes loading; the speech projector is
    # kept un-quantized.
    bnb_model_from_pretrained_args = {}
    if training_args.bits in [4, 8]:
        from transformers import BitsAndBytesConfig
        bnb_model_from_pretrained_args.update(dict(
            device_map={"": training_args.device},
            load_in_4bit=training_args.bits == 4,
            load_in_8bit=training_args.bits == 8,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=training_args.bits == 4,
                load_in_8bit=training_args.bits == 8,
                llm_int8_skip_modules=["speech_projector"],
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=training_args.double_quant,
                bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
            )
        ))

    # has_tgt_units selects the speech-to-speech ("2S") model variants;
    # otherwise the text-output variants are used.
    if data_args.has_tgt_units:
        if model_args.version == "llama_3":
            model = OmniSpeech2SLlamaForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        elif model_args.version == "qwen":
            model = OmniSpeech2SQwen2ForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        else:
            raise ValueError("--currently only support llama or qwen model!")
    else:
        if model_args.version == "llama_3":
            model = OmniSpeechLlamaForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        elif model_args.version == "qwen":
            model = OmniSpeechQwen2ForCausalLM.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=training_args.cache_dir,
                attn_implementation=attn_implementation,
                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
                **bnb_model_from_pretrained_args
            )
        else:
            raise ValueError("--currently only support llama or qwen model!")
    model.config.use_cache = False

    if model_args.freeze_backbone:
        model.model.requires_grad_(False)

    if training_args.bits in [4, 8]:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback: force grads on the embedding output so checkpointed
            # segments have a grad path.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        model = get_peft_model(model, lora_config)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    model.resize_token_embeddings(len(tokenizer))
    model.config.max_length = training_args.model_max_length

    # Select the conversation template; unknown versions fall back to llama_3.
    if model_args.version in conversation_lib.conv_templates:
        conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
    else:
        conversation_lib.default_conversation = conversation_lib.conv_templates["llama_3"]

    if model_args.speech_encoder is not None:
        model.get_model().initialize_speech_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )

        data_args.is_multimodal = True

        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_speech_projector = training_args.tune_speech_projector = model_args.tune_speech_projector

        model.config.speech_normalize = data_args.speech_normalize

        # Speech encoder is always frozen here.
        for p in model.get_speech_encoder().parameters():
            p.requires_grad = False

        # tune_speech_projector: freeze everything, then train only the
        # projector.
        if model_args.tune_speech_projector:
            model.requires_grad_(False)
            for p in model.get_speech_projector().parameters():
                p.requires_grad = True

        model.config.freeze_speech_projector = training_args.freeze_speech_projector
        if training_args.freeze_speech_projector:
            for p in model.get_speech_projector().parameters():
                p.requires_grad = False

        if training_args.bits in [4, 8]:
            model.get_model().speech_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.speech_projector_lr = training_args.speech_projector_lr

    if data_args.has_tgt_units:
        model.initialize_speech_generator(model_args=model_args)

    # Normalize dtypes of LoRA layers / norms / embeddings for k-bit training.
    if training_args.bits in [4, 8]:
        from peft.tuners.lora import LoraLayer
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    data_module = make_supervised_data_module(tokenizer=tokenizer,
                                              data_args=data_args)

    print("Training Layers:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.grad)

    trainer = OmniTrainer(model=model,
                          tokenizer=tokenizer,
                          args=training_args,
                          **data_module)

    # Resume automatically when checkpoints already exist in output_dir.
    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    model.config.use_cache = True

    # LoRA runs save adapter weights + non-LoRA trainables separately;
    # full-tune runs use the safe HF save helper.
    if training_args.lora_enable:
        state_dict = get_peft_state_maybe_zero_3(
            model.named_parameters(), training_args.lora_bias
        )
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
            model.named_parameters()
        )
        if training_args.local_rank == 0 or training_args.local_rank == -1:
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer,
                                       output_dir=training_args.output_dir)
511
+
512
+
513
# Script entry point.
if __name__ == "__main__":
    train()
515
+
omni_speech/train/trainer.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import deepspeed
4
+ from transformers import Trainer
5
+ from transformers.trainer_pt_utils import nested_detach
6
+ from transformers.utils import is_sagemaker_mp_enabled
7
+ from transformers.trainer import *
8
+ from transformers.integrations import is_deepspeed_zero3_enabled
9
+
10
+
11
class CPMTrainer(Trainer):
    """HF `Trainer` subclass for MiniCPM-style speech/vision-language models.

    Differences from the stock Trainer:
      * the model is called as ``model(data=inputs, ...)`` instead of
        ``model(**inputs)``, matching the MiniCPM forward signature;
      * LoRA runs go through ``_enable_peft_forward_hooks`` and the PEFT
        ``base_model`` (controlled by ``self.args.use_lora``);
      * the cross-entropy loss is computed here, outside the model, when
        ``labels`` are present in the batch.
    """

    def compute_loss(self, model, inputs, return_outputs=False):
        # Pull the labels out so they are NOT forwarded to the model; the loss
        # is computed locally below instead of inside model.forward.
        if "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None

        # NOTE: this deliberately calls self.model (the unwrapped model), not the
        # `model` argument (which may be the DDP/DeepSpeed wrapper).
        if not self.args.use_lora:
            outputs = self.model(data = inputs, use_cache=False)
        else:
            # PEFT forward hooks must be active so adapter layers participate;
            # base_model is the PEFT-wrapped underlying model.
            with self.model._enable_peft_forward_hooks(**inputs):
                outputs = self.model.base_model(data = inputs, use_cache=False)

        if labels is not None:
            # Flatten the tokens
            # NOTE(review): logits and labels are compared position-for-position
            # with no causal shift here — presumably the dataset/collator already
            # shifted the labels by one; confirm against the data pipeline.
            # nn.CrossEntropyLoss defaults to ignore_index=-100, the HF padding
            # convention for masked label positions.
            loss_fct = nn.CrossEntropyLoss()
            logits = outputs.logits.view(-1,
                                         self.model.config.vocab_size).contiguous()
            labels = labels.view(-1).long().contiguous()
            # Enable model parallelism
            labels = labels.to(logits.device)
            loss = loss_fct(logits, labels)
        else:
            # No labels in the batch: the model itself must have produced a loss.
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on `model` using `inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.

        Return:
            Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
            logits and labels (each being optional).
        """
        # A batch "has labels" only if every configured label column is present.
        has_labels = (
            False
            if len(self.label_names) == 0
            else all(inputs.get(k) is not None for k in self.label_names)
        )
        # For CLIP-like models capable of returning loss values.
        # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss`
        # is `True` in `model.forward`.
        return_loss = inputs.get("return_loss", None)
        if return_loss is None:
            return_loss = self.can_return_loss
        loss_without_labels = (
            True if len(self.label_names) == 0 and return_loss else False
        )

        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(
                    self.model.config, "keys_to_ignore_at_inference", []
                )
            else:
                ignore_keys = []

        # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
        if has_labels or loss_without_labels:
            labels = nested_detach(tuple(inputs.get(name)
                                         for name in self.label_names))
            if len(labels) == 1:
                labels = labels[0]
        else:
            labels = None

        with torch.no_grad():
            if is_sagemaker_mp_enabled():
                # SageMaker model-parallel path: forward runs via smp helpers and
                # per-microbatch results are reduced/concatenated afterwards.
                raw_outputs = smp_forward_only(model, inputs)
                if has_labels or loss_without_labels:
                    if isinstance(raw_outputs, dict):
                        loss_mb = raw_outputs["loss"]
                        logits_mb = tuple(
                            v
                            for k, v in raw_outputs.items()
                            if k not in ignore_keys + ["loss"]
                        )
                    else:
                        loss_mb = raw_outputs[0]
                        logits_mb = raw_outputs[1:]

                    loss = loss_mb.reduce_mean().detach().cpu()
                    logits = smp_nested_concat(logits_mb)
                else:
                    loss = None
                    if isinstance(raw_outputs, dict):
                        logits_mb = tuple(
                            v for k, v in raw_outputs.items() if k not in ignore_keys
                        )
                    else:
                        logits_mb = raw_outputs
                    logits = smp_nested_concat(logits_mb)
            else:
                if has_labels or loss_without_labels:
                    # Reuse the custom compute_loss above so eval loss matches
                    # the training-time loss definition.
                    with self.compute_loss_context_manager():
                        loss, outputs = self.compute_loss(
                            model, inputs, return_outputs=True
                        )
                    loss = loss.mean().detach()

                    if isinstance(outputs, dict):
                        logits = tuple(
                            v
                            for k, v in outputs.items()
                            if k not in ignore_keys + ["loss"]
                        )
                    else:
                        logits = outputs[1:]
                else:
                    loss = None
                    with self.compute_loss_context_manager():
                        outputs = model(**inputs)
                    if isinstance(outputs, dict):
                        logits = tuple(
                            v for k, v in outputs.items() if k not in ignore_keys
                        )
                    else:
                        logits = outputs
                    # TODO: this needs to be fixed and made cleaner later.
                    if self.args.past_index >= 0:
                        self._past = outputs[self.args.past_index - 1]

        if prediction_loss_only:
            return (loss, None, None)

        logits = nested_detach(logits)
        if len(logits) == 1:
            logits = logits[0]

        return (loss, logits, labels)

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.

        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        if is_sagemaker_mp_enabled():
            loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
            return loss_mb.reduce_mean().detach().to(self.args.device)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        # Free the batch and the CUDA caching-allocator blocks before backward.
        # NOTE(review): empty_cache() on every step trades throughput for lower
        # peak memory — presumably intentional for these large models.
        del inputs
        torch.cuda.empty_cache()

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            # accelerate handles mixed precision / DeepSpeed gradient scaling.
            self.accelerator.backward(loss)

        # Reported loss is normalized by the accumulation factor, matching the
        # stock Trainer's logging convention.
        return loss.detach() / self.args.gradient_accumulation_steps

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        # If we are executing this function, we are the process zero, so we don't check for that.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Saving model checkpoint to {output_dir}")

        # PeftModel is only a "supported class" when peft is importable.
        supported_classes = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel)
        # Save a trained model and configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        if not isinstance(self.model, supported_classes):
            if state_dict is None:
                state_dict = self.model.state_dict()

            if isinstance(unwrap_model(self.model), supported_classes):
                # The wrapper (DDP/compiled module) hides a supported model inside.
                unwrap_model(self.model).save_pretrained(
                    output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
                )
            else:
                logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                if self.args.save_safetensors:
                    safetensors.torch.save_file(
                        state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"}
                    )
                else:
                    torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
        else:

            self.model.save_pretrained(
                output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
            )

        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
scripts/continue.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# continue.sh — resume/continue ASR supervised fine-tuning of MiniCPM-o from an
# existing checkpoint (MODEL points at a previously fine-tuned checkpoint dir,
# TOKENIZER_PATH at the original base model's tokenizer).

# GPUS_PER_NODE=8
# NNODES=1
# NODE_RANK=0
# MASTER_ADDR=localhost
# MASTER_PORT=6001

MODEL="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_asr"
TOKENIZER_PATH="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
# or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2_6
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/train_asr_mixed_500k.jsonl"
EVAL_DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/dev_asr_mixed.jsonl"

# if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3",
# if use openbmb/MiniCPM-o-2_6 or openbmb/MiniCPM-V-2_6, please set LLM_TYPE=qwen
LLM_TYPE="qwen"
MODEL_MAX_Length=2048 # if conduct multi-images sft, please set MODEL_MAX_Length=4096


# DISTRIBUTED_ARGS="
#     --nproc_per_node $GPUS_PER_NODE \
#     --nnodes $NNODES \
#     --node_rank $NODE_RANK \
#     --master_addr $MASTER_ADDR \
#     --master_port $MASTER_PORT
# "

# NOTE(review): "--deepspeed zero2.json" is a path relative to the working
# directory; the repo ships scripts/ds_config_zero2.json — confirm zero2.json
# exists where this script is launched from.
# tune_speech=true / tune_llm=false: only the speech modules are trained here.
deepspeed ../omni_speech/train/train_minicpmo.py \
    --deepspeed zero2.json \
    --model_name_or_path $MODEL \
    --tokenizer_path $TOKENIZER_PATH \
    --llm_type $LLM_TYPE \
    --data_path $DATA \
    --eval_data_path $EVAL_DATA \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
    --bf16 true \
    --do_train \
    --do_eval \
    --tune_speech true \
    --tune_llm false \
    --model_max_length $MODEL_MAX_Length \
    --eval_steps 2000 \
    --output_dir ../checkpoints/minicpmo_sft_asr \
    --num_train_epochs 2 \
    --logging_strategy "steps" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 5000 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --max_grad_norm 20. \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --gradient_checkpointing true
scripts/ds_config_zero2.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+
11
+ "bf16": {
12
+ "enabled": "auto"
13
+ },
14
+
15
+ "optimizer": {
16
+ "type": "AdamW",
17
+ "params": {
18
+ "lr": "auto",
19
+ "betas": "auto",
20
+ "eps": "auto",
21
+ "weight_decay": "auto"
22
+ }
23
+ },
24
+
25
+ "scheduler": {
26
+ "type": "WarmupLR",
27
+ "params": {
28
+ "warmup_min_lr": "auto",
29
+ "warmup_max_lr": "auto",
30
+ "warmup_num_steps": "auto"
31
+ }
32
+ },
33
+
34
+ "zero_optimization": {
35
+ "stage": 2,
36
+ "offload_optimizer": {
37
+ "device": "none",
38
+ "pin_memory": true
39
+ },
40
+ "allgather_partitions": true,
41
+ "allgather_bucket_size": 2e8,
42
+ "overlap_comm": true,
43
+ "reduce_scatter": true,
44
+ "reduce_bucket_size": 2e8,
45
+ "contiguous_gradients": true
46
+ },
47
+
48
+ "gradient_accumulation_steps": "auto",
49
+ "gradient_clipping": "auto",
50
+ "steps_per_print": 100,
51
+ "train_batch_size": "auto",
52
+ "train_micro_batch_size_per_gpu": "auto",
53
+ "wall_clock_breakdown": false
54
+ }
scripts/ds_config_zero3.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+
23
+ "scheduler": {
24
+ "type": "WarmupLR",
25
+ "params": {
26
+ "warmup_min_lr": "auto",
27
+ "warmup_max_lr": "auto",
28
+ "warmup_num_steps": "auto"
29
+ }
30
+ },
31
+
32
+ "zero_optimization": {
33
+ "stage": 3,
34
+ "offload_optimizer": {
35
+ "device": "none",
36
+ "pin_memory": true
37
+ },
38
+ "offload_param": {
39
+ "device": "none",
40
+ "pin_memory": true
41
+ },
42
+ "overlap_comm": true,
43
+ "contiguous_gradients": true,
44
+ "sub_group_size": 1e9,
45
+ "reduce_bucket_size": "auto",
46
+ "stage3_prefetch_bucket_size": "auto",
47
+ "stage3_param_persistence_threshold": "auto",
48
+ "stage3_max_live_parameters": 1e9,
49
+ "stage3_max_reuse_distance": 1e9,
50
+ "stage3_gather_16bit_weights_on_model_save": true
51
+ },
52
+
53
+ "gradient_accumulation_steps": "auto",
54
+ "gradient_clipping": "auto",
55
+ "steps_per_print": 100,
56
+ "train_batch_size": "auto",
57
+ "train_micro_batch_size_per_gpu": "auto",
58
+ "wall_clock_breakdown": false
59
+ }
scripts/export.sh ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# export.sh — run omni_speech/train/export.py through the deepspeed launcher to
# export/process a trained speech-decoder checkpoint. Training-style flags are
# passed because export.py reuses the training argument parser; --output_dir
# points at a throwaway location (../checkpoints/tmp).

MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-fc_speech_decoder_fixed_all/checkpoint-4000
SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
PROMPT_VERSION=qwen
DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/moss/moss_100K_phase3_tgt_units_processed.jsonl
# DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/dev_20250103.jsonl
CACHE_DIR="../output/cached_sft_speech_decoder_20250103"

# NOTE(review): "--deepspeed zero2.json" is resolved relative to the working
# directory — confirm the config file exists there (repo ships scripts/ds_config_zero2.json).
# --master_port 29501 avoids clashing with a concurrently running default launcher.
deepspeed --master_port 29501 ../omni_speech/train/export.py \
    --deepspeed zero2.json \
    --model_name_or_path $MODEL_PATH \
    --version $PROMPT_VERSION \
    --data_path $DATA_PATH \
    --cache_dir $CACHE_DIR \
    --speech_encoder $SPEECH_ENCODER \
    --mel_size 80 \
    --speech_encoder_hidden_size 1024 \
    --speech_encoder_type whisper \
    --tune_speech_generator_only True \
    --bf16 True \
    --output_dir ../checkpoints/tmp \
    --num_train_epochs 8 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 2000 \
    --save_total_limit 1 \
    --learning_rate 1e-4 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --logging_steps 10 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 8 \
    --has_tgt_units True
scripts/finetune.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# finetune.sh — full SFT of the Whisper-medium + Qwen2.5-3B omni-speech model,
# starting from a base LLM checkpoint and a pretrained speech projector
# (SPEECH_ADAPTER produced by the ASR pretraining stage).

MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-3B-Instruct
SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
SPEECH_ADAPTER=/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-asr/speech_projector.bin
PROMPT_VERSION=qwen
DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/train_20250112_fc_mixed_vfva_text_fake_audios.jsonl
DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/dev_20250112_fc_mixed_vfva_text_fake_audios.jsonl
CACHE_DIR="../output/cached_sft_20250112"

# NOTE(review): "--deepspeed zero2.json" is a working-directory-relative path —
# confirm it exists where the script is launched (repo ships scripts/ds_config_zero2.json).
deepspeed ../omni_speech/train/train_mem.py \
    --deepspeed zero2.json \
    --model_name_or_path $MODEL_PATH \
    --version $PROMPT_VERSION \
    --data_path $DATA_PATH \
    --dev_path $DEV_PATH \
    --cache_dir $CACHE_DIR \
    --speech_encoder $SPEECH_ENCODER \
    --mel_size 80 \
    --speech_encoder_hidden_size 1024 \
    --speech_encoder_type whisper \
    --pretrain_speech_projector $SPEECH_ADAPTER \
    --bf16 True \
    --output_dir ../checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-fc-mixed-vfva-text \
    --num_train_epochs 2 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --eval_steps 2000 \
    --save_steps 6000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 8192 \
    --gradient_checkpointing True \
    --dataloader_num_workers 8
scripts/finetune_llm_speech_decoder.sh ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# finetune_llm_speech_decoder.sh — joint SFT of the LLM together with the speech
# decoder on target-unit data (has_tgt_units=True, CTC auxiliary loss weighted
# by ctc_loss_weight). The commented-out second invocation below is an
# alternate, speech-decoder-only configuration kept for reference.

# it currently supports for batch = 1 only.

MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-3B-Instruct
SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
SPEECH_ADAPTER=/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-asr/speech_projector.bin
PROMPT_VERSION=qwen
DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/train_20250106_fc_mixed_tgt_units.jsonl
DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/dev_20250106_fc_mixed_tgt_units.jsonl
CACHE_DIR="../output/cached_sft_speech_decoder_all_20250103"

# NOTE(review): "--deepspeed zero2.json" is a working-directory-relative path —
# confirm it exists where the script is launched (repo ships scripts/ds_config_zero2.json).
deepspeed ../omni_speech/train/train_mem.py \
    --deepspeed zero2.json \
    --model_name_or_path $MODEL_PATH \
    --version $PROMPT_VERSION \
    --data_path $DATA_PATH \
    --dev_path $DEV_PATH \
    --cache_dir $CACHE_DIR \
    --speech_encoder $SPEECH_ENCODER \
    --mel_size 80 \
    --speech_encoder_hidden_size 1024 \
    --speech_encoder_type whisper \
    --pretrain_speech_projector $SPEECH_ADAPTER \
    --bf16 True \
    --output_dir ../checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-fc_speech_decoder_fixed_all \
    --num_train_epochs 3 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --eval_steps 2000 \
    --save_steps 2000 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 1024 \
    --gradient_checkpointing True \
    --dataloader_num_workers 8 \
    --has_tgt_units True \
    --ctc_loss_weight 2.0


# MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-fc
# SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
# PROMPT_VERSION=qwen
# DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/moss/moss_100K_phase3_tgt_units_processed.jsonl
# # DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/dev_20250106_fc_mixed_tgt_units.jsonl
# CACHE_DIR="../output/cached_sft_speech_decoder_all_20250103"

# deepspeed ../omni_speech/train/train_mem.py \
#     --deepspeed zero2.json \
#     --model_name_or_path $MODEL_PATH \
#     --version $PROMPT_VERSION \
#     --data_path $DATA_PATH \
#     --cache_dir $CACHE_DIR \
#     --speech_encoder $SPEECH_ENCODER \
#     --mel_size 80 \
#     --speech_encoder_hidden_size 1024 \
#     --speech_encoder_type whisper \
#     --bf16 True \
#     --output_dir ../checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-fc_speech_decoder_all \
#     --num_train_epochs 5 \
#     --per_device_train_batch_size 1 \
#     --per_device_eval_batch_size 1 \
#     --gradient_accumulation_steps 4 \
#     --evaluation_strategy "no" \
#     --save_strategy "steps" \
#     --save_steps 10000 \
#     --save_total_limit 1 \
#     --learning_rate 1e-4 \
#     --weight_decay 0. \
#     --warmup_ratio 0.03 \
#     --logging_steps 1 \
#     --tf32 True \
#     --model_max_length 2048 \
#     --gradient_checkpointing True \
#     --dataloader_num_workers 8 \
#     --has_tgt_units True \
#     --ctc_loss_weight 10.0
scripts/finetune_lora.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# finetune_lora.sh — LoRA SFT variant (--lora_enable True) of the omni-speech
# model; otherwise mirrors finetune.sh, using the pretrained speech projector
# and a small train/dev subset (train_tmp/dev_tmp).

MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-3B-Instruct
SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
SPEECH_ADAPTER=/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-asr/speech_projector.bin
PROMPT_VERSION=qwen
DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/train_tmp.jsonl
DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/dev_tmp.jsonl
CACHE_DIR="../output/cached_sft"

# NOTE(review): "--deepspeed zero2.json" is a working-directory-relative path —
# confirm it exists where the script is launched (repo ships scripts/ds_config_zero2.json).
deepspeed ../omni_speech/train/train_mem.py \
    --deepspeed zero2.json \
    --lora_enable True \
    --model_name_or_path $MODEL_PATH \
    --version $PROMPT_VERSION \
    --data_path $DATA_PATH \
    --dev_path $DEV_PATH \
    --cache_dir $CACHE_DIR \
    --speech_encoder $SPEECH_ENCODER \
    --mel_size 80 \
    --speech_encoder_hidden_size 1024 \
    --speech_encoder_type whisper \
    --pretrain_speech_projector $SPEECH_ADAPTER \
    --bf16 True \
    --output_dir ../checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-lora \
    --num_train_epochs 18 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --eval_steps 1000 \
    --save_steps 1000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --optim adamw_torch \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 8
scripts/finetune_minicpmo.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# finetune_minicpmo.sh — SFT of MiniCPM-o on mixed function-calling/text data,
# starting from the ASR-fine-tuned checkpoint; trains both speech modules and
# the LLM (tune_speech=true, tune_llm=true).

# GPUS_PER_NODE=8
# NNODES=1
# NODE_RANK=0
# MASTER_ADDR=localhost
# MASTER_PORT=6001

MODEL="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/minicpmo_sft_asr"
TOKENIZER_PATH="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
# or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2_6
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/sft/train_20250219_fc_mixed_text_filter_a_um.jsonl"
EVAL_DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/sft/dev_20250219_fc_mixed_text_filter_a_um.jsonl"

# if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3",
# if use openbmb/MiniCPM-o-2_6 or openbmb/MiniCPM-V-2_6, please set LLM_TYPE=qwen
LLM_TYPE="qwen"
MODEL_MAX_Length=8192 # if conduct multi-images sft, please set MODEL_MAX_Length=4096


# DISTRIBUTED_ARGS="
#     --nproc_per_node $GPUS_PER_NODE \
#     --nnodes $NNODES \
#     --node_rank $NODE_RANK \
#     --master_addr $MASTER_ADDR \
#     --master_port $MASTER_PORT
# "

# NOTE(review): save_strategy is "no" while save_steps/save_total_limit are set,
# so no intermediate checkpoints will be written — confirm this is intended.
# "--deepspeed zero2.json" is a working-directory-relative path (repo ships
# scripts/ds_config_zero2.json).
deepspeed ../omni_speech/train/train_minicpmo.py \
    --deepspeed zero2.json \
    --model_name_or_path $MODEL \
    --tokenizer_path $TOKENIZER_PATH \
    --llm_type $LLM_TYPE \
    --data_path $DATA \
    --eval_data_path $EVAL_DATA \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
    --bf16 true \
    --do_train \
    --do_eval \
    --tune_speech true \
    --tune_llm true \
    --model_max_length $MODEL_MAX_Length \
    --eval_steps 1000 \
    --output_dir ../checkpoints/minicpmo_sft_vi_fc_fixed \
    --num_train_epochs 1 \
    --logging_strategy "steps" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "steps" \
    --save_strategy "no" \
    --save_steps 4000 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --max_grad_norm 20. \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --gradient_checkpointing true
scripts/finetune_minicpmo_asr.sh ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# finetune_minicpmo_asr.sh — ASR adaptation stage for MiniCPM-o: starts from the
# base MiniCPM-o-2_6 model and trains only the speech modules
# (tune_speech=true, tune_llm=false) on mixed ASR data.

# GPUS_PER_NODE=8
# NNODES=1
# NODE_RANK=0
# MASTER_ADDR=localhost
# MASTER_PORT=6001

MODEL="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
# or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2_6
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/train_asr_mixed_500k.jsonl"
EVAL_DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/dev_asr_mixed.jsonl"

# if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3",
# if use openbmb/MiniCPM-o-2_6 or openbmb/MiniCPM-V-2_6, please set LLM_TYPE=qwen
LLM_TYPE="qwen"
MODEL_MAX_Length=2048 # if conduct multi-images sft, please set MODEL_MAX_Length=4096


# DISTRIBUTED_ARGS="
#     --nproc_per_node $GPUS_PER_NODE \
#     --nnodes $NNODES \
#     --node_rank $NODE_RANK \
#     --master_addr $MASTER_ADDR \
#     --master_port $MASTER_PORT
# "

# NOTE(review): "--deepspeed zero2.json" is a working-directory-relative path —
# confirm it exists where the script is launched (repo ships scripts/ds_config_zero2.json).
deepspeed ../omni_speech/train/train_minicpmo.py \
    --deepspeed zero2.json \
    --model_name_or_path $MODEL \
    --llm_type $LLM_TYPE \
    --data_path $DATA \
    --eval_data_path $EVAL_DATA \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
    --bf16 true \
    --do_train \
    --do_eval \
    --tune_speech true \
    --tune_llm false \
    --model_max_length $MODEL_MAX_Length \
    --eval_steps 4000 \
    --output_dir ../checkpoints/minicpmo_sft_asr_new \
    --num_train_epochs 1 \
    --logging_strategy "steps" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 10000 \
    --save_total_limit 1 \
    --learning_rate 2e-4 \
    --max_grad_norm 20. \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --gradient_checkpointing true
scripts/finetune_speech_decoder.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# finetune_speech_decoder.sh — trains ONLY the speech generator/decoder
# (--tune_speech_generator_only True) on target-unit data, starting from an
# SFT'd omni-speech checkpoint.

# it currently supports for batch = 1 only.

MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-fc-mixed-vfva-text
SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
PROMPT_VERSION=qwen
DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/20250114_tgt_unit_preprocessed_combined_mix_text_filtered.jsonl
# DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/english/qna/dev_20250103.jsonl
CACHE_DIR="../output/cached_sft_speech_decoder_20250114"

# NOTE(review): save_strategy is "no" while save_steps/save_total_limit are set,
# so no intermediate checkpoints will be written — confirm this is intended.
# "--deepspeed zero2.json" is a working-directory-relative path (repo ships
# scripts/ds_config_zero2.json).
deepspeed ../omni_speech/train/train_mem.py \
    --deepspeed zero2.json \
    --model_name_or_path $MODEL_PATH \
    --version $PROMPT_VERSION \
    --data_path $DATA_PATH \
    --cache_dir $CACHE_DIR \
    --speech_encoder $SPEECH_ENCODER \
    --mel_size 80 \
    --speech_encoder_hidden_size 1024 \
    --speech_encoder_type whisper \
    --tune_speech_generator_only True \
    --bf16 True \
    --output_dir ../checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-sft-fc-mixed-vfva-text_speech-decoder \
    --num_train_epochs 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "no" \
    --save_strategy "no" \
    --save_steps 3000 \
    --save_total_limit 1 \
    --learning_rate 2e-4 \
    --max_grad_norm 200. \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 4096 \
    --gradient_checkpointing True \
    --dataloader_num_workers 8 \
    --has_tgt_units True
scripts/minicpmp_config.json ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "batch_vision_input": true,
3
+ "drop_vision_last_layer": false,
4
+ "image_size": 448,
5
+ "audio_chunk_length": 1.0,
6
+ "audio_config": {
7
+ "_name_or_path": "openai/whisper-medium",
8
+ "architectures": [
9
+ "MiniCPMWhisperEncoder"
10
+ ],
11
+ "begin_suppress_tokens": [
12
+ 220,
13
+ 50257
14
+ ],
15
+ "bos_token_id": 50257,
16
+ "d_model": 1024,
17
+ "decoder_attention_heads": 16,
18
+ "decoder_ffn_dim": 4096,
19
+ "decoder_layers": 24,
20
+ "decoder_start_token_id": 50258,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layers": 24,
24
+ "eos_token_id": 50257,
25
+ "forced_decoder_ids": [
26
+ [
27
+ 1,
28
+ 50259
29
+ ],
30
+ [
31
+ 2,
32
+ 50359
33
+ ],
34
+ [
35
+ 3,
36
+ 50363
37
+ ]
38
+ ],
39
+ "max_length": 448,
40
+ "model_type": "whisper",
41
+ "num_hidden_layers": 24,
42
+ "pad_token_id": 50257,
43
+ "suppress_tokens": [
44
+ 1,
45
+ 2,
46
+ 7,
47
+ 8,
48
+ 9,
49
+ 10,
50
+ 14,
51
+ 25,
52
+ 26,
53
+ 27,
54
+ 28,
55
+ 29,
56
+ 31,
57
+ 58,
58
+ 59,
59
+ 60,
60
+ 61,
61
+ 62,
62
+ 63,
63
+ 90,
64
+ 91,
65
+ 92,
66
+ 93,
67
+ 359,
68
+ 503,
69
+ 522,
70
+ 542,
71
+ 873,
72
+ 893,
73
+ 902,
74
+ 918,
75
+ 922,
76
+ 931,
77
+ 1350,
78
+ 1853,
79
+ 1982,
80
+ 2460,
81
+ 2627,
82
+ 3246,
83
+ 3253,
84
+ 3268,
85
+ 3536,
86
+ 3846,
87
+ 3961,
88
+ 4183,
89
+ 4667,
90
+ 6585,
91
+ 6647,
92
+ 7273,
93
+ 9061,
94
+ 9383,
95
+ 10428,
96
+ 10929,
97
+ 11938,
98
+ 12033,
99
+ 12331,
100
+ 12562,
101
+ 13793,
102
+ 14157,
103
+ 14635,
104
+ 15265,
105
+ 15618,
106
+ 16553,
107
+ 16604,
108
+ 18362,
109
+ 18956,
110
+ 20075,
111
+ 21675,
112
+ 22520,
113
+ 26130,
114
+ 26161,
115
+ 26435,
116
+ 28279,
117
+ 29464,
118
+ 31650,
119
+ 32302,
120
+ 32470,
121
+ 36865,
122
+ 42863,
123
+ 47425,
124
+ 49870,
125
+ 50254,
126
+ 50258,
127
+ 50358,
128
+ 50359,
129
+ 50360,
130
+ 50361,
131
+ 50362
132
+ ],
133
+ "torch_dtype": "float32"
134
+ },
135
+ "audio_pool_step": 2,
136
+ "chunk_input": true,
137
+ "model_type": "minicpmo",
138
+ "patch_size": 14,
139
+ "query_num": 64,
140
+ "slice_config": {
141
+ "max_slice_nums": 9,
142
+ "model_type": "minicpmv"
143
+ },
144
+ "slice_mode": true,
145
+ "torch_dtype": "bfloat16",
146
+ "transformers_version": "4.44.2",
147
+ "tts_config": {
148
+ "model_type": "conditional_chattts",
149
+ "llm_dim": 3584
150
+ },
151
+ "use_cache": false,
152
+ "use_image_id": true,
153
+ "vision_batch_size": 16,
154
+ "vision_config": {
155
+ "hidden_size": 1152,
156
+ "image_size": 980,
157
+ "intermediate_size": 4304,
158
+ "model_type": "siglip_vision_model",
159
+ "num_attention_heads": 16,
160
+ "num_hidden_layers": 27,
161
+ "patch_size": 14
162
+ }
163
+ }
scripts/pretrain_minicpmo_test.sh ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # GPUS_PER_NODE=8
4
+ # NNODES=1
5
+ # NODE_RANK=0
6
+ # MASTER_ADDR=localhost
7
+ # MASTER_PORT=6001
8
+
9
+ # MODEL="/data1/speech/anhnmt2/cuongnm/EOT/Qwen2.5-0.5B-Instruct"
10
+ PRETRAINED_LLM="/data1/speech/anhnmt2/cuongnm/EOT/Qwen2.5-0.5B-Instruct"
11
+ MODEL="/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-7B-Instruct"
12
+ # PRETRAINED_LLM="/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-7B-Instruct"
13
+ TOKENIZER_PATH="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
14
+ AUDIO_ENCODER_PATH="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
15
+ # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2_6
16
+ # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
17
+ # See the section for finetuning in README for more information.
18
+ # DATA="/data1/speech/anhnmt2/cuongnm/datasets/asr/train_asr_mixed_balanced_1M5_train.json "
19
+ # EVAL_DATA="/data1/speech/anhnmt2/cuongnm/datasets/asr/train_asr_mixed_balanced_1M5_dev.json "
20
+ # DATA="/data1/speech/anhnmt2/dataset/s2s/english/minicpmo/train_asr_eng_100000_new_dataloader.jsonl"
21
+ # EVAL_DATA="/data1/speech/anhnmt2/dataset/s2s/english/minicpmo/dev_asr_eng_1000_new_dataloader.jsonl"
22
+ DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/train_asr_mixed_500k.jsonl"
23
+ EVAL_DATA="/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/dev_asr_mixed.jsonl"
24
+ CONFIG_PATH="minicpmp_config.json"
25
+ AUGMENT_PATH="/data1/speech/anhnmt2/dataset/s2s/augment/noise_list_non_speech.txt"
26
+
27
+ # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3",
28
+ # if use openbmb/MiniCPM-o-2_6 or openbmb/MiniCPM-V-2_6, please set LLM_TYPE=qwen
29
+ LLM_TYPE="qwen"
30
+ MODEL_MAX_Length=2048 # if conduct multi-images sft, please set MODEL_MAX_Length=4096
31
+ CACHE_DIR="../output/cached_sft_20252502"
32
+
33
+
34
+ # DISTRIBUTED_ARGS="
35
+ # --nproc_per_node $GPUS_PER_NODE \
36
+ # --nnodes $NNODES \
37
+ # --node_rank $NODE_RANK \
38
+ # --master_addr $MASTER_ADDR \
39
+ # --master_port $MASTER_PORT
40
+ # "
41
+ DEEPSPEED_CMD="/home/anhnmt2/.local/bin/deepspeed"
42
+
43
+ # Kiểm tra file thực thi DeepSpeed
44
+ if [ ! -x "$DEEPSPEED_CMD" ]; then
45
+ echo "Error: DeepSpeed executable not found at $DEEPSPEED_CMD."
46
+ echo "Try reinstalling with: pip install deepspeed"
47
+ exit 1
48
+ fi
49
+
50
+
51
+ CUDA_LAUNCH_BLOCKING=1 "$DEEPSPEED_CMD" --master_port 29501 ../omni_speech/train/train_minicpmo_test.py \
52
+ --deepspeed zero2.json \
53
+ --model_name_or_path $MODEL \
54
+ --pretrained_llm_path $PRETRAINED_LLM \
55
+ --tokenizer_path $TOKENIZER_PATH \
56
+ --cache_dir $CACHE_DIR \
57
+ --audio_encoder_path $AUDIO_ENCODER_PATH \
58
+ --llm_type $LLM_TYPE \
59
+ --data_path $DATA \
60
+ --eval_data_path $EVAL_DATA \
61
+ --config_path $CONFIG_PATH \
62
+ --remove_unused_columns false \
63
+ --prediction_loss_only false \
64
+ --bf16 true \
65
+ --do_train \
66
+ --do_eval \
67
+ --tune_speech false \
68
+ --tune_llm false \
69
+ --model_max_length $MODEL_MAX_Length \
70
+ --eval_steps 3000 \
71
+ --output_dir ../checkpoints/minicpmo_whisper-medium_Qwen2.5-0.5B_pretrained-asr-projector \
72
+ --num_train_epochs 3 \
73
+ --logging_strategy "steps" \
74
+ --per_device_train_batch_size 8 \
75
+ --per_device_eval_batch_size 8 \
76
+ --gradient_accumulation_steps 4 \
77
+ --evaluation_strategy "steps" \
78
+ --save_strategy "steps" \
79
+ --save_steps 5000 \
80
+ --save_total_limit 1 \
81
+ --learning_rate 5e-5 \
82
+ --weight_decay 0. \
83
+ --warmup_ratio 0.03 \
84
+ --lr_scheduler_type "cosine" \
85
+ --logging_steps 1 \
86
+ --tf32 true \
87
+ --gradient_checkpointing true
88
+ # --augment_prob 0.2 \
89
+ # --augment_path $AUGMENT_PATH
scripts/pretrained.sh ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-3B-Instruct
4
+ SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
5
+ PROMPT_VERSION=qwen
6
+ DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/english/asr/dataset/train_asr_eng_5M.jsonl
7
+ DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/english/asr/dataset/dev_asr_libri_spgi.jsonl
8
+ CACHE_DIR="../output/cached_asr_full"
9
+ AUGMENT_PATH="/data1/speech/anhnmt2/dataset/s2s/augment/noise_list_non_speech.txt"
10
+
11
+ deepspeed ../omni_speech/train/train_mem.py \
12
+ --deepspeed zero2.json \
13
+ --model_name_or_path $MODEL_PATH \
14
+ --version $PROMPT_VERSION \
15
+ --data_path $DATA_PATH \
16
+ --dev_path $DEV_PATH \
17
+ --cache_dir $CACHE_DIR \
18
+ --speech_encoder $SPEECH_ENCODER \
19
+ --mel_size 80 \
20
+ --speech_encoder_hidden_size 1024 \
21
+ --speech_encoder_type whisper \
22
+ --bf16 True \
23
+ --output_dir ../checkpoints/omni_whisper-medium_Qwen2.5-3B_pretrained-asr-5M \
24
+ --num_train_epochs 4 \
25
+ --tune_speech_projector True \
26
+ --per_device_train_batch_size 16 \
27
+ --per_device_eval_batch_size 4 \
28
+ --gradient_accumulation_steps 2 \
29
+ --evaluation_strategy "steps" \
30
+ --save_strategy "steps" \
31
+ --eval_steps 2000 \
32
+ --save_steps 2000 \
33
+ --save_total_limit 1 \
34
+ --learning_rate 1e-3 \
35
+ --weight_decay 0. \
36
+ --warmup_ratio 0.03 \
37
+ --lr_scheduler_type "cosine" \
38
+ --logging_steps 1 \
39
+ --tf32 True \
40
+ --model_max_length 4096 \
41
+ --gradient_checkpointing True \
42
+ --dataloader_num_workers 8
43
+ # --augment_prob 0.2 \
44
+ # --augment_path $AUGMENT_PATH \
scripts/pretrained_minicpmo.sh ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # GPUS_PER_NODE=8
4
+ # NNODES=1
5
+ # NODE_RANK=0
6
+ # MASTER_ADDR=localhost
7
+ # MASTER_PORT=6001
8
+
9
+ MODEL="/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-7B-Instruct"
10
+ PRETRAINED_LLM="/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-7B-Instruct"
11
+ TOKENIZER_PATH="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
12
+ AUDIO_ENCODER_PATH="/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6"
13
+ # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5, openbmb/MiniCPM-V-2_6
14
+ # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
15
+ # See the section for finetuning in README for more information.
16
+ DATA="/data1/speech/anhnmt2/dataset/s2s/english/minicpmo/train_asr_eng_100000_new_dataloader.jsonl"
17
+ EVAL_DATA="/data1/speech/anhnmt2/dataset/s2s/english/minicpmo/dev_asr_eng_1000_new_dataloader.jsonl"
18
+ CONFIG_PATH="minicpmp_config.json"
19
+ AUGMENT_PATH="/data1/speech/anhnmt2/dataset/s2s/augment/noise_list_non_speech.txt"
20
+
21
+ # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm, if use openbmb/MiniCPM-Llama3-V-2_5, please set LLM_TYPE="llama3",
22
+ # if use openbmb/MiniCPM-o-2_6 or openbmb/MiniCPM-V-2_6, please set LLM_TYPE=qwen
23
+ LLM_TYPE="qwen"
24
+ MODEL_MAX_Length=4096 # if conduct multi-images sft, please set MODEL_MAX_Length=4096
25
+ CACHE_DIR="../output/cached_sft_20252502"
26
+
27
+
28
+ # DISTRIBUTED_ARGS="
29
+ # --nproc_per_node $GPUS_PER_NODE \
30
+ # --nnodes $NNODES \
31
+ # --node_rank $NODE_RANK \
32
+ # --master_addr $MASTER_ADDR \
33
+ # --master_port $MASTER_PORT
34
+ # "
35
+
36
+ deepspeed --master_port 29501 ../omni_speech/train/train_minicpmo.py \
37
+ --deepspeed zero2.json \
38
+ --model_name_or_path $MODEL \
39
+ --pretrained_llm_path $PRETRAINED_LLM \
40
+ --tokenizer_path $TOKENIZER_PATH \
41
+ --cache_dir $CACHE_DIR \
42
+ --audio_encoder_path $AUDIO_ENCODER_PATH \
43
+ --llm_type $LLM_TYPE \
44
+ --data_path $DATA \
45
+ --eval_data_path $EVAL_DATA \
46
+ --config_path $CONFIG_PATH \
47
+ --remove_unused_columns false \
48
+ --prediction_loss_only false \
49
+ --bf16 true \
50
+ --do_train \
51
+ --do_eval \
52
+ --tune_speech true \
53
+ --tune_llm false \
54
+ --model_max_length $MODEL_MAX_Length \
55
+ --eval_steps 1000 \
56
+ --output_dir ../checkpoints/minicpmo_whisper-medium_Qwen2.5-3B_pretrained-asr \
57
+ --num_train_epochs 1 \
58
+ --logging_strategy "steps" \
59
+ --per_device_train_batch_size 1 \
60
+ --per_device_eval_batch_size 1 \
61
+ --gradient_accumulation_steps 4 \
62
+ --evaluation_strategy "steps" \
63
+ --save_strategy "no" \
64
+ --save_steps 2000 \
65
+ --save_total_limit 1 \
66
+ --learning_rate 2e-4 \
67
+ --weight_decay 0. \
68
+ --warmup_ratio 0.03 \
69
+ --lr_scheduler_type "cosine" \
70
+ --logging_steps 1 \
71
+ --tf32 true \
72
+ --gradient_checkpointing true
73
+ # --augment_prob 0.2 \
74
+ # --augment_path $AUGMENT_PATH
scripts/test_llama.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Llama-3.1-8B-Instruct
4
+ SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
5
+ PROMPT_VERSION=llama_3
6
+ DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/new/train_asr_eng_50000.jsonl
7
+ DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/new/dev_asr_eng_5000.jsonl
8
+ CACHE_DIR="../output/cached_asr"
9
+
10
+ deepspeed ../omni_speech/train/train.py \
11
+ --deepspeed zero2.json \
12
+ --model_name_or_path $MODEL_PATH \
13
+ --version $PROMPT_VERSION \
14
+ --data_path $DATA_PATH \
15
+ --dev_path $DEV_PATH \
16
+ --cache_dir $CACHE_DIR \
17
+ --speech_encoder $SPEECH_ENCODER \
18
+ --mel_size 80 \
19
+ --speech_encoder_hidden_size 1024 \
20
+ --speech_encoder_type whisper \
21
+ --bf16 True \
22
+ --output_dir ../checkpoints/llama-omni-pretrained-asr-test \
23
+ --num_train_epochs 10 \
24
+ --tune_speech_projector True \
25
+ --per_device_train_batch_size 4 \
26
+ --per_device_eval_batch_size 2 \
27
+ --gradient_accumulation_steps 4 \
28
+ --evaluation_strategy "steps" \
29
+ --save_strategy "steps" \
30
+ --eval_steps 2000 \
31
+ --save_steps 2000 \
32
+ --save_total_limit 1 \
33
+ --learning_rate 1e-3 \
34
+ --optim adamw_torch \
35
+ --weight_decay 0. \
36
+ --warmup_ratio 0.03 \
37
+ --logging_steps 1 \
38
+ --tf32 True \
39
+ --model_max_length 2048 \
40
+ --gradient_checkpointing True \
41
+ --dataloader_num_workers 8
scripts/test_qwen.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ MODEL_PATH=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-1.5B-Instruct
4
+ SPEECH_ENCODER=/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/speech_encoder/whisper-medium
5
+ PROMPT_VERSION=qwen
6
+ DATA_PATH=/data1/speech/anhnmt2/dataset/s2s/new/dev_asr_eng_5000_multiturn.jsonl
7
+ DEV_PATH=/data1/speech/anhnmt2/dataset/s2s/new/dev_asr_eng_5000_multiturn.jsonl
8
+ CACHE_DIR="../output/cached_asr"
9
+
10
+ deepspeed ../omni_speech/train/train_multiturn.py \
11
+ --deepspeed zero2.json \
12
+ --model_name_or_path $MODEL_PATH \
13
+ --version $PROMPT_VERSION \
14
+ --data_path $DATA_PATH \
15
+ --dev_path $DEV_PATH \
16
+ --cache_dir $CACHE_DIR \
17
+ --speech_encoder $SPEECH_ENCODER \
18
+ --mel_size 80 \
19
+ --speech_encoder_hidden_size 1024 \
20
+ --speech_encoder_type whisper \
21
+ --bf16 True \
22
+ --output_dir ../checkpoints/llama-omni-pretrained-asr-qwen \
23
+ --num_train_epochs 10 \
24
+ --tune_speech_projector True \
25
+ --per_device_train_batch_size 4 \
26
+ --per_device_eval_batch_size 2 \
27
+ --gradient_accumulation_steps 4 \
28
+ --evaluation_strategy "steps" \
29
+ --save_strategy "steps" \
30
+ --eval_steps 2000 \
31
+ --save_steps 2000 \
32
+ --save_total_limit 1 \
33
+ --learning_rate 1e-3 \
34
+ --optim adamw_torch \
35
+ --weight_decay 0. \
36
+ --warmup_ratio 0.03 \
37
+ --logging_steps 1 \
38
+ --tf32 True \
39
+ --model_max_length 2048 \
40
+ --gradient_checkpointing True \
41
+ --dataloader_num_workers 8
scripts/wandb/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-10T17:19:28.842729448+07:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/wandb/run-20250410_171928-pfaibe0c/logs/debug-core.log"}
2
+ {"time":"2025-04-10T17:19:28.960322418+07:00","level":"INFO","msg":"created new stream","id":"pfaibe0c"}
3
+ {"time":"2025-04-10T17:19:28.960351593+07:00","level":"INFO","msg":"stream: started","id":"pfaibe0c"}
4
+ {"time":"2025-04-10T17:19:28.960375959+07:00","level":"INFO","msg":"writer: Do: started","stream_id":"pfaibe0c"}
5
+ {"time":"2025-04-10T17:19:28.960456552+07:00","level":"INFO","msg":"handler: started","stream_id":"pfaibe0c"}
6
+ {"time":"2025-04-10T17:19:28.961574927+07:00","level":"INFO","msg":"sender: started","stream_id":"pfaibe0c"}
7
+ {"time":"2025-04-10T17:19:29.497777718+07:00","level":"INFO","msg":"Starting system monitor"}
scripts/wandb/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_setup.py:_flush():67] Configure stats pid to 1734298
3
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_setup.py:_flush():67] Loading settings from /home/anhnmt2/.config/wandb/settings
4
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_setup.py:_flush():67] Loading settings from /data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/wandb/settings
5
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/wandb/run-20250410_171928-pfaibe0c/logs/debug.log
7
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/wandb/run-20250410_171928-pfaibe0c/logs/debug-internal.log
8
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_init.py:init():761] calling init triggers
9
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_init.py:init():784] starting backend
12
+ 2025-04-10 17:19:28,830 INFO MainThread:1734298 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-04-10 17:19:28,834 INFO MainThread:1734298 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-04-10 17:19:28,834 INFO MainThread:1734298 [wandb_init.py:init():798] backend started and connected
15
+ 2025-04-10 17:19:28,836 INFO MainThread:1734298 [wandb_init.py:init():891] updated telemetry
16
+ 2025-04-10 17:19:28,852 INFO MainThread:1734298 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-04-10 17:19:29,493 INFO MainThread:1734298 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-04-10 17:19:29,890 INFO MainThread:1734298 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-04-10 17:19:29,891 INFO MainThread:1734298 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-04-10 17:19:29,891 INFO MainThread:1734298 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-04-10 17:19:29,891 INFO MainThread:1734298 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-04-10 17:19:29,895 INFO MainThread:1734298 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-04-10 17:19:29,898 INFO MainThread:1734298 [wandb_run.py:_config_callback():1261] config_cb None None {'use_cache': False, 'query_num': 64, 'image_size': 448, 'drop_vision_last_layer': False, 'batch_vision_input': True, 'use_image_id': True, 'vision_batch_size': 16, 'audio_pool_step': 2, 'audio_chunk_length': 1.0, 'stream_input': False, 'init_vision': False, 'init_audio': True, 'init_tts': False, 'processor_path': '/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6', 'pretrained_encoder_path': '/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6', 'pretrained_llm_path': '/data1/speech/anhnmt2/cuongnm/EOT/Qwen2.5-0.5B-Instruct', 'chunk_input': True, 'slice_config': {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 
'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '', 'model_type': 'minicpmv', 'patch_size': 14, 'max_slice_nums': 9, 'scale_resolution': 448}, 'slice_mode': True, 'vision_config': {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '', 'model_type': 'siglip_vision_model', 'hidden_size': 1152, 'intermediate_size': 4304, 'num_hidden_layers': 27, 'num_attention_heads': 16, 'num_channels': 3, 'patch_size': 14, 'image_size': 980, 'attention_dropout': 0.0, 'layer_norm_eps': 1e-06, 'hidden_act': 'gelu_pytorch_tanh'}, 'audio_config': {'vocab_size': 51865, 
'num_mel_bins': 80, 'd_model': 1024, 'encoder_layers': 24, 'encoder_attention_heads': 16, 'decoder_layers': 24, 'decoder_attention_heads': 16, 'decoder_ffn_dim': 4096, 'encoder_ffn_dim': 4096, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'activation_function': 'gelu', 'init_std': 0.02, 'encoder_layerdrop': 0.0, 'decoder_layerdrop': 0.0, 'use_cache': True, 'num_hidden_layers': 24, 'scale_embedding': False, 'max_source_positions': 1500, 'max_target_positions': 448, 'classifier_proj_size': 256, 'use_weighted_layer_sum': False, 'apply_spec_augment': False, 'mask_time_prob': 0.05, 'mask_time_length': 10, 'mask_time_min_masks': 2, 'mask_feature_prob': 0.0, 'mask_feature_length': 10, 'mask_feature_min_masks': 0, 'median_filter_width': 7, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 448, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 
10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257], 'architectures': ['MiniCPMWhisperEncoder'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 50257, 'pad_token_id': 50257, 'eos_token_id': 50257, 'sep_token_id': None, 'decoder_start_token_id': 50258, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'openai/whisper-medium', 'forced_decoder_ids': [[1, 50259], [2, 50359], [3, 50363]], 'model_type': 'whisper'}, 'tts_config': {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': True, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 20, 'top_p': 0.7, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 
'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '', 'model_type': 'conditional_chattts', 'llm_dim': 3584, 'hidden_size': 768, 'intermediate_size': 3072, 'num_attention_heads': 12, 'num_hidden_layers': 20, 'max_position_embeddings': 4096, 'num_audio_tokens': 626, 'num_text_tokens': 21178, 'num_mel_bins': 100, 'num_vq': 4, 'use_speaker_embedding': True, 'use_llm_hidden_state': False, 'spk_emb_token_id': 21143, 'num_spk_embs': 1, 'audio_bos_token_id': 21132, 'text_eos_token_id': 21133, 'use_text': True, 'streaming': True, 'streaming_text_chunk_size': 10, 'streaming_text_reserved_len': 300, 'streaming_audio_chunk_size': 50, 'attn_implementation': 'sdpa', 'use_mlp': True, 'aug_loss_weight': True}, 'patch_size': 14, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 3584, 'intermediate_size': 18944, 'num_hidden_layers': 28, 'num_attention_heads': 28, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 4, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 2048, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 
'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151645, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-7B-Instruct', 'transformers_version': '4.45.0', 'model_type': 'minicpmo', 'output_dir': '../checkpoints/minicpmo_whisper-medium_Qwen2.5-0.5B_pretrained-asr-projector', 'overwrite_output_dir': False, 'do_train': True, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '../checkpoints/minicpmo_whisper-medium_Qwen2.5-0.5B_pretrained-asr-projector/runs/Apr10_17-18-52_dgx-a100-5', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 5000, 'save_total_limit': 1, 'save_safetensors': True, 
'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 3000, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '../checkpoints/minicpmo_whisper-medium_Qwen2.5-0.5B_pretrained-asr-projector', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'zero2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': 
{'use_reentrant': False}, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'cache_dir': '../output/cached_sft_20252502', 'model_max_length': 2048, 'tune_vision': True, 'tune_speech': False, 'tune_llm': False, 'llm_type': 'qwen', 'use_lora': False, 'max_slice_nums': 9, 'config_path': 'minicpmp_config.json', 'init_speech': True}
24
+ 2025-04-10 17:19:29,901 INFO MainThread:1734298 [wandb_config.py:__setitem__():154] config set model/num_parameters = 802971264 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x1553c5eda240>>
25
+ 2025-04-10 17:19:29,901 INFO MainThread:1734298 [wandb_run.py:_config_callback():1261] config_cb model/num_parameters 802971264 None
scripts/wandb/latest-run/files/output.log ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0%| | 0/43233 [00:00<?, ?it/s]DEBUG:numba.core.byteflow:bytecode dump:
2
+ > 0 NOP(arg=None, lineno=1141)
3
+ 2 RESUME(arg=0, lineno=1141)
4
+ 4 LOAD_FAST(arg=0, lineno=1144)
5
+ 6 LOAD_CONST(arg=1, lineno=1144)
6
+ 8 BINARY_SUBSCR(arg=None, lineno=1144)
7
+ 12 STORE_FAST(arg=3, lineno=1144)
8
+ 14 LOAD_FAST(arg=1, lineno=1145)
9
+ 16 UNARY_NEGATIVE(arg=None, lineno=1145)
10
+ 18 LOAD_FAST(arg=3, lineno=1145)
11
+ 20 SWAP(arg=2, lineno=1145)
12
+ 22 COPY(arg=2, lineno=1145)
13
+ 24 COMPARE_OP(arg=26, lineno=1145)
14
+ 28 POP_JUMP_IF_FALSE(arg=5, lineno=1145)
15
+ 30 LOAD_FAST(arg=1, lineno=1145)
16
+ 32 COMPARE_OP(arg=26, lineno=1145)
17
+ 36 POP_JUMP_IF_FALSE(arg=5, lineno=1145)
18
+ 38 JUMP_FORWARD(arg=2, lineno=1145)
19
+ > 40 POP_TOP(arg=None, lineno=1145)
20
+ 42 JUMP_FORWARD(arg=2, lineno=1145)
21
+ > 44 LOAD_CONST(arg=1, lineno=1146)
22
+ 46 STORE_FAST(arg=3, lineno=1146)
23
+ > 48 LOAD_FAST(arg=0, lineno=1148)
24
+ 50 LOAD_CONST(arg=2, lineno=1148)
25
+ 52 BINARY_SUBSCR(arg=None, lineno=1148)
26
+ 56 STORE_FAST(arg=4, lineno=1148)
27
+ 58 LOAD_FAST(arg=1, lineno=1149)
28
+ 60 UNARY_NEGATIVE(arg=None, lineno=1149)
29
+ 62 LOAD_FAST(arg=4, lineno=1149)
30
+ 64 SWAP(arg=2, lineno=1149)
31
+ 66 COPY(arg=2, lineno=1149)
32
+ 68 COMPARE_OP(arg=26, lineno=1149)
33
+ 72 POP_JUMP_IF_FALSE(arg=5, lineno=1149)
34
+ 74 LOAD_FAST(arg=1, lineno=1149)
35
+ 76 COMPARE_OP(arg=26, lineno=1149)
36
+ 80 POP_JUMP_IF_FALSE(arg=5, lineno=1149)
37
+ 82 JUMP_FORWARD(arg=2, lineno=1149)
38
+ > 84 POP_TOP(arg=None, lineno=1149)
39
+ 86 JUMP_FORWARD(arg=2, lineno=1149)
40
+ > 88 LOAD_CONST(arg=1, lineno=1150)
41
+ 90 STORE_FAST(arg=4, lineno=1150)
42
+ > 92 LOAD_FAST(arg=2, lineno=1152)
43
+ 94 POP_JUMP_IF_FALSE(arg=43, lineno=1152)
44
+ 96 LOAD_GLOBAL(arg=1, lineno=1153)
45
+ 106 LOAD_ATTR(arg=2, lineno=1153)
46
+ 126 LOAD_FAST(arg=3, lineno=1153)
47
+ 128 CALL(arg=1, lineno=1153)
48
+ 136 LOAD_GLOBAL(arg=1, lineno=1153)
49
+ 146 LOAD_ATTR(arg=2, lineno=1153)
50
+ 166 LOAD_FAST(arg=4, lineno=1153)
51
+ 168 CALL(arg=1, lineno=1153)
52
+ 176 COMPARE_OP(arg=55, lineno=1153)
53
+ 180 RETURN_VALUE(arg=None, lineno=1153)
54
+ > 182 LOAD_GLOBAL(arg=1, lineno=1155)
55
+ 192 LOAD_ATTR(arg=4, lineno=1155)
56
+ 212 LOAD_FAST(arg=3, lineno=1155)
57
+ 214 CALL(arg=1, lineno=1155)
58
+ 222 LOAD_GLOBAL(arg=1, lineno=1155)
59
+ 232 LOAD_ATTR(arg=4, lineno=1155)
60
+ 252 LOAD_FAST(arg=4, lineno=1155)
61
+ 254 CALL(arg=1, lineno=1155)
62
+ 262 COMPARE_OP(arg=55, lineno=1155)
63
+ 266 RETURN_VALUE(arg=None, lineno=1155)
64
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
65
+ DEBUG:numba.core.byteflow:stack: []
66
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=0 nstack_initial=0)
67
+ DEBUG:numba.core.byteflow:dispatch pc=0, inst=NOP(arg=None, lineno=1141)
68
+ DEBUG:numba.core.byteflow:stack []
69
+ DEBUG:numba.core.byteflow:dispatch pc=2, inst=RESUME(arg=0, lineno=1141)
70
+ DEBUG:numba.core.byteflow:stack []
71
+ DEBUG:numba.core.byteflow:dispatch pc=4, inst=LOAD_FAST(arg=0, lineno=1144)
72
+ DEBUG:numba.core.byteflow:stack []
73
+ DEBUG:numba.core.byteflow:dispatch pc=6, inst=LOAD_CONST(arg=1, lineno=1144)
74
+ DEBUG:numba.core.byteflow:stack ['$x4.0']
75
+ DEBUG:numba.core.byteflow:dispatch pc=8, inst=BINARY_SUBSCR(arg=None, lineno=1144)
76
+ DEBUG:numba.core.byteflow:stack ['$x4.0', '$const6.1']
77
+ DEBUG:numba.core.byteflow:dispatch pc=12, inst=STORE_FAST(arg=3, lineno=1144)
78
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2']
79
+ DEBUG:numba.core.byteflow:dispatch pc=14, inst=LOAD_FAST(arg=1, lineno=1145)
80
+ DEBUG:numba.core.byteflow:stack []
81
+ DEBUG:numba.core.byteflow:dispatch pc=16, inst=UNARY_NEGATIVE(arg=None, lineno=1145)
82
+ DEBUG:numba.core.byteflow:stack ['$threshold14.3']
83
+ DEBUG:numba.core.byteflow:dispatch pc=18, inst=LOAD_FAST(arg=3, lineno=1145)
84
+ DEBUG:numba.core.byteflow:stack ['$16unary_negative.4']
85
+ DEBUG:numba.core.byteflow:dispatch pc=20, inst=SWAP(arg=2, lineno=1145)
86
+ DEBUG:numba.core.byteflow:stack ['$16unary_negative.4', '$x018.5']
87
+ DEBUG:numba.core.byteflow:dispatch pc=22, inst=COPY(arg=2, lineno=1145)
88
+ DEBUG:numba.core.byteflow:stack ['$x018.5', '$16unary_negative.4']
89
+ DEBUG:numba.core.byteflow:dispatch pc=24, inst=COMPARE_OP(arg=26, lineno=1145)
90
+ DEBUG:numba.core.byteflow:stack ['$x018.5', '$16unary_negative.4', '$x018.5']
91
+ DEBUG:numba.core.byteflow:dispatch pc=28, inst=POP_JUMP_IF_FALSE(arg=5, lineno=1145)
92
+ DEBUG:numba.core.byteflow:stack ['$x018.5', '$24compare_op.6']
93
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=30, stack=('$x018.5',), blockstack=(), npush=0), Edge(pc=40, stack=('$x018.5',), blockstack=(), npush=0)]
94
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=30 nstack_initial=1), State(pc_initial=40 nstack_initial=1)])
95
+ DEBUG:numba.core.byteflow:stack: ['$phi30.0']
96
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=30 nstack_initial=1)
97
+ DEBUG:numba.core.byteflow:dispatch pc=30, inst=LOAD_FAST(arg=1, lineno=1145)
98
+ DEBUG:numba.core.byteflow:stack ['$phi30.0']
99
+ DEBUG:numba.core.byteflow:dispatch pc=32, inst=COMPARE_OP(arg=26, lineno=1145)
100
+ DEBUG:numba.core.byteflow:stack ['$phi30.0', '$threshold30.1']
101
+ DEBUG:numba.core.byteflow:dispatch pc=36, inst=POP_JUMP_IF_FALSE(arg=5, lineno=1145)
102
+ DEBUG:numba.core.byteflow:stack ['$32compare_op.2']
103
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=38, stack=(), blockstack=(), npush=0), Edge(pc=48, stack=(), blockstack=(), npush=0)]
104
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=40 nstack_initial=1), State(pc_initial=38 nstack_initial=0), State(pc_initial=48 nstack_initial=0)])
105
+ DEBUG:numba.core.byteflow:stack: ['$phi40.0']
106
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=40 nstack_initial=1)
107
+ DEBUG:numba.core.byteflow:dispatch pc=40, inst=POP_TOP(arg=None, lineno=1145)
108
+ DEBUG:numba.core.byteflow:stack ['$phi40.0']
109
+ DEBUG:numba.core.byteflow:dispatch pc=42, inst=JUMP_FORWARD(arg=2, lineno=1145)
110
+ DEBUG:numba.core.byteflow:stack []
111
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=48, stack=(), blockstack=(), npush=0)]
112
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=38 nstack_initial=0), State(pc_initial=48 nstack_initial=0), State(pc_initial=48 nstack_initial=0)])
113
+ DEBUG:numba.core.byteflow:stack: []
114
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=38 nstack_initial=0)
115
+ DEBUG:numba.core.byteflow:dispatch pc=38, inst=JUMP_FORWARD(arg=2, lineno=1145)
116
+ DEBUG:numba.core.byteflow:stack []
117
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=44, stack=(), blockstack=(), npush=0)]
118
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=48 nstack_initial=0), State(pc_initial=48 nstack_initial=0), State(pc_initial=44 nstack_initial=0)])
119
+ DEBUG:numba.core.byteflow:stack: []
120
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=48 nstack_initial=0)
121
+ DEBUG:numba.core.byteflow:dispatch pc=48, inst=LOAD_FAST(arg=0, lineno=1148)
122
+ DEBUG:numba.core.byteflow:stack []
123
+ DEBUG:numba.core.byteflow:dispatch pc=50, inst=LOAD_CONST(arg=2, lineno=1148)
124
+ DEBUG:numba.core.byteflow:stack ['$x48.0']
125
+ DEBUG:numba.core.byteflow:dispatch pc=52, inst=BINARY_SUBSCR(arg=None, lineno=1148)
126
+ DEBUG:numba.core.byteflow:stack ['$x48.0', '$const50.1']
127
+ DEBUG:numba.core.byteflow:dispatch pc=56, inst=STORE_FAST(arg=4, lineno=1148)
128
+ DEBUG:numba.core.byteflow:stack ['$52binary_subscr.2']
129
+ DEBUG:numba.core.byteflow:dispatch pc=58, inst=LOAD_FAST(arg=1, lineno=1149)
130
+ DEBUG:numba.core.byteflow:stack []
131
+ DEBUG:numba.core.byteflow:dispatch pc=60, inst=UNARY_NEGATIVE(arg=None, lineno=1149)
132
+ DEBUG:numba.core.byteflow:stack ['$threshold58.3']
133
+ DEBUG:numba.core.byteflow:dispatch pc=62, inst=LOAD_FAST(arg=4, lineno=1149)
134
+ DEBUG:numba.core.byteflow:stack ['$60unary_negative.4']
135
+ DEBUG:numba.core.byteflow:dispatch pc=64, inst=SWAP(arg=2, lineno=1149)
136
+ DEBUG:numba.core.byteflow:stack ['$60unary_negative.4', '$x162.5']
137
+ DEBUG:numba.core.byteflow:dispatch pc=66, inst=COPY(arg=2, lineno=1149)
138
+ DEBUG:numba.core.byteflow:stack ['$x162.5', '$60unary_negative.4']
139
+ DEBUG:numba.core.byteflow:dispatch pc=68, inst=COMPARE_OP(arg=26, lineno=1149)
140
+ DEBUG:numba.core.byteflow:stack ['$x162.5', '$60unary_negative.4', '$x162.5']
141
+ DEBUG:numba.core.byteflow:dispatch pc=72, inst=POP_JUMP_IF_FALSE(arg=5, lineno=1149)
142
+ DEBUG:numba.core.byteflow:stack ['$x162.5', '$68compare_op.6']
143
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=74, stack=('$x162.5',), blockstack=(), npush=0), Edge(pc=84, stack=('$x162.5',), blockstack=(), npush=0)]
144
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=48 nstack_initial=0), State(pc_initial=44 nstack_initial=0), State(pc_initial=74 nstack_initial=1), State(pc_initial=84 nstack_initial=1)])
145
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=44 nstack_initial=0), State(pc_initial=74 nstack_initial=1), State(pc_initial=84 nstack_initial=1)])
146
+ DEBUG:numba.core.byteflow:stack: []
147
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=44 nstack_initial=0)
148
+ DEBUG:numba.core.byteflow:dispatch pc=44, inst=LOAD_CONST(arg=1, lineno=1146)
149
+ DEBUG:numba.core.byteflow:stack []
150
+ DEBUG:numba.core.byteflow:dispatch pc=46, inst=STORE_FAST(arg=3, lineno=1146)
151
+ DEBUG:numba.core.byteflow:stack ['$const44.0']
152
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=48, stack=(), blockstack=(), npush=0)]
153
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=74 nstack_initial=1), State(pc_initial=84 nstack_initial=1), State(pc_initial=48 nstack_initial=0)])
154
+ DEBUG:numba.core.byteflow:stack: ['$phi74.0']
155
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=74 nstack_initial=1)
156
+ DEBUG:numba.core.byteflow:dispatch pc=74, inst=LOAD_FAST(arg=1, lineno=1149)
157
+ DEBUG:numba.core.byteflow:stack ['$phi74.0']
158
+ DEBUG:numba.core.byteflow:dispatch pc=76, inst=COMPARE_OP(arg=26, lineno=1149)
159
+ DEBUG:numba.core.byteflow:stack ['$phi74.0', '$threshold74.1']
160
+ DEBUG:numba.core.byteflow:dispatch pc=80, inst=POP_JUMP_IF_FALSE(arg=5, lineno=1149)
161
+ DEBUG:numba.core.byteflow:stack ['$76compare_op.2']
162
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=82, stack=(), blockstack=(), npush=0), Edge(pc=92, stack=(), blockstack=(), npush=0)]
163
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=84 nstack_initial=1), State(pc_initial=48 nstack_initial=0), State(pc_initial=82 nstack_initial=0), State(pc_initial=92 nstack_initial=0)])
164
+ DEBUG:numba.core.byteflow:stack: ['$phi84.0']
165
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=84 nstack_initial=1)
166
+ DEBUG:numba.core.byteflow:dispatch pc=84, inst=POP_TOP(arg=None, lineno=1149)
167
+ DEBUG:numba.core.byteflow:stack ['$phi84.0']
168
+ DEBUG:numba.core.byteflow:dispatch pc=86, inst=JUMP_FORWARD(arg=2, lineno=1149)
169
+ DEBUG:numba.core.byteflow:stack []
170
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=92, stack=(), blockstack=(), npush=0)]
171
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=48 nstack_initial=0), State(pc_initial=82 nstack_initial=0), State(pc_initial=92 nstack_initial=0), State(pc_initial=92 nstack_initial=0)])
172
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=82 nstack_initial=0), State(pc_initial=92 nstack_initial=0), State(pc_initial=92 nstack_initial=0)])
173
+ DEBUG:numba.core.byteflow:stack: []
174
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=82 nstack_initial=0)
175
+ DEBUG:numba.core.byteflow:dispatch pc=82, inst=JUMP_FORWARD(arg=2, lineno=1149)
176
+ DEBUG:numba.core.byteflow:stack []
177
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=88, stack=(), blockstack=(), npush=0)]
178
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=92 nstack_initial=0), State(pc_initial=92 nstack_initial=0), State(pc_initial=88 nstack_initial=0)])
179
+ DEBUG:numba.core.byteflow:stack: []
180
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=92 nstack_initial=0)
181
+ DEBUG:numba.core.byteflow:dispatch pc=92, inst=LOAD_FAST(arg=2, lineno=1152)
182
+ DEBUG:numba.core.byteflow:stack []
183
+ DEBUG:numba.core.byteflow:dispatch pc=94, inst=POP_JUMP_IF_FALSE(arg=43, lineno=1152)
184
+ DEBUG:numba.core.byteflow:stack ['$zero_pos92.0']
185
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=96, stack=(), blockstack=(), npush=0), Edge(pc=182, stack=(), blockstack=(), npush=0)]
186
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=92 nstack_initial=0), State(pc_initial=88 nstack_initial=0), State(pc_initial=96 nstack_initial=0), State(pc_initial=182 nstack_initial=0)])
187
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=88 nstack_initial=0), State(pc_initial=96 nstack_initial=0), State(pc_initial=182 nstack_initial=0)])
188
+ DEBUG:numba.core.byteflow:stack: []
189
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=88 nstack_initial=0)
190
+ DEBUG:numba.core.byteflow:dispatch pc=88, inst=LOAD_CONST(arg=1, lineno=1150)
191
+ DEBUG:numba.core.byteflow:stack []
192
+ DEBUG:numba.core.byteflow:dispatch pc=90, inst=STORE_FAST(arg=4, lineno=1150)
193
+ DEBUG:numba.core.byteflow:stack ['$const88.0']
194
+ DEBUG:numba.core.byteflow:end state. edges=[Edge(pc=92, stack=(), blockstack=(), npush=0)]
195
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=96 nstack_initial=0), State(pc_initial=182 nstack_initial=0), State(pc_initial=92 nstack_initial=0)])
196
+ DEBUG:numba.core.byteflow:stack: []
197
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=96 nstack_initial=0)
198
+ DEBUG:numba.core.byteflow:dispatch pc=96, inst=LOAD_GLOBAL(arg=1, lineno=1153)
199
+ DEBUG:numba.core.byteflow:stack []
200
+ DEBUG:numba.core.byteflow:dispatch pc=106, inst=LOAD_ATTR(arg=2, lineno=1153)
201
+ DEBUG:numba.core.byteflow:stack ['$null$96.1', '$96load_global.0']
202
+ DEBUG:numba.core.byteflow:dispatch pc=126, inst=LOAD_FAST(arg=3, lineno=1153)
203
+ DEBUG:numba.core.byteflow:stack ['$null$96.1', '$106load_attr.2']
204
+ DEBUG:numba.core.byteflow:dispatch pc=128, inst=CALL(arg=1, lineno=1153)
205
+ DEBUG:numba.core.byteflow:stack ['$null$96.1', '$106load_attr.2', '$x0126.3']
206
+ DEBUG:numba.core.byteflow:dispatch pc=136, inst=LOAD_GLOBAL(arg=1, lineno=1153)
207
+ DEBUG:numba.core.byteflow:stack ['$128call.4']
208
+ DEBUG:numba.core.byteflow:dispatch pc=146, inst=LOAD_ATTR(arg=2, lineno=1153)
209
+ DEBUG:numba.core.byteflow:stack ['$128call.4', '$null$136.6', '$136load_global.5']
210
+ DEBUG:numba.core.byteflow:dispatch pc=166, inst=LOAD_FAST(arg=4, lineno=1153)
211
+ DEBUG:numba.core.byteflow:stack ['$128call.4', '$null$136.6', '$146load_attr.7']
212
+ DEBUG:numba.core.byteflow:dispatch pc=168, inst=CALL(arg=1, lineno=1153)
213
+ DEBUG:numba.core.byteflow:stack ['$128call.4', '$null$136.6', '$146load_attr.7', '$x1166.8']
214
+ DEBUG:numba.core.byteflow:dispatch pc=176, inst=COMPARE_OP(arg=55, lineno=1153)
215
+ DEBUG:numba.core.byteflow:stack ['$128call.4', '$168call.9']
216
+ DEBUG:numba.core.byteflow:dispatch pc=180, inst=RETURN_VALUE(arg=None, lineno=1153)
217
+ DEBUG:numba.core.byteflow:stack ['$176compare_op.10']
218
+ DEBUG:numba.core.byteflow:end state. edges=[]
219
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=182 nstack_initial=0), State(pc_initial=92 nstack_initial=0)])
220
+ DEBUG:numba.core.byteflow:stack: []
221
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=182 nstack_initial=0)
222
+ DEBUG:numba.core.byteflow:dispatch pc=182, inst=LOAD_GLOBAL(arg=1, lineno=1155)
223
+ DEBUG:numba.core.byteflow:stack []
224
+ DEBUG:numba.core.byteflow:dispatch pc=192, inst=LOAD_ATTR(arg=4, lineno=1155)
225
+ DEBUG:numba.core.byteflow:stack ['$null$182.1', '$182load_global.0']
226
+ DEBUG:numba.core.byteflow:dispatch pc=212, inst=LOAD_FAST(arg=3, lineno=1155)
227
+ DEBUG:numba.core.byteflow:stack ['$null$182.1', '$192load_attr.2']
228
+ DEBUG:numba.core.byteflow:dispatch pc=214, inst=CALL(arg=1, lineno=1155)
229
+ DEBUG:numba.core.byteflow:stack ['$null$182.1', '$192load_attr.2', '$x0212.3']
230
+ DEBUG:numba.core.byteflow:dispatch pc=222, inst=LOAD_GLOBAL(arg=1, lineno=1155)
231
+ DEBUG:numba.core.byteflow:stack ['$214call.4']
232
+ DEBUG:numba.core.byteflow:dispatch pc=232, inst=LOAD_ATTR(arg=4, lineno=1155)
233
+ DEBUG:numba.core.byteflow:stack ['$214call.4', '$null$222.6', '$222load_global.5']
234
+ DEBUG:numba.core.byteflow:dispatch pc=252, inst=LOAD_FAST(arg=4, lineno=1155)
235
+ DEBUG:numba.core.byteflow:stack ['$214call.4', '$null$222.6', '$232load_attr.7']
236
+ DEBUG:numba.core.byteflow:dispatch pc=254, inst=CALL(arg=1, lineno=1155)
237
+ DEBUG:numba.core.byteflow:stack ['$214call.4', '$null$222.6', '$232load_attr.7', '$x1252.8']
238
+ DEBUG:numba.core.byteflow:dispatch pc=262, inst=COMPARE_OP(arg=55, lineno=1155)
239
+ DEBUG:numba.core.byteflow:stack ['$214call.4', '$254call.9']
240
+ DEBUG:numba.core.byteflow:dispatch pc=266, inst=RETURN_VALUE(arg=None, lineno=1155)
241
+ DEBUG:numba.core.byteflow:stack ['$262compare_op.10']
242
+ DEBUG:numba.core.byteflow:end state. edges=[]
243
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=92 nstack_initial=0)])
244
+ DEBUG:numba.core.byteflow:-------------------------Prune PHIs-------------------------
245
+ DEBUG:numba.core.byteflow:Used_phis: defaultdict(<class 'set'>,
246
+ {State(pc_initial=0 nstack_initial=0): set(),
247
+ State(pc_initial=30 nstack_initial=1): {'$phi30.0'},
248
+ State(pc_initial=38 nstack_initial=0): set(),
249
+ State(pc_initial=40 nstack_initial=1): set(),
250
+ State(pc_initial=44 nstack_initial=0): set(),
251
+ State(pc_initial=48 nstack_initial=0): set(),
252
+ State(pc_initial=74 nstack_initial=1): {'$phi74.0'},
253
+ State(pc_initial=82 nstack_initial=0): set(),
254
+ State(pc_initial=84 nstack_initial=1): set(),
255
+ State(pc_initial=88 nstack_initial=0): set(),
256
+ State(pc_initial=92 nstack_initial=0): set(),
257
+ State(pc_initial=96 nstack_initial=0): set(),
258
+ State(pc_initial=182 nstack_initial=0): set()})
259
+ DEBUG:numba.core.byteflow:defmap: {'$phi30.0': State(pc_initial=0 nstack_initial=0),
260
+ '$phi40.0': State(pc_initial=0 nstack_initial=0),
261
+ '$phi74.0': State(pc_initial=48 nstack_initial=0),
262
+ '$phi84.0': State(pc_initial=48 nstack_initial=0)}
263
+ DEBUG:numba.core.byteflow:phismap: defaultdict(<class 'set'>,
264
+ {'$phi30.0': {('$x018.5', State(pc_initial=0 nstack_initial=0))},
265
+ '$phi40.0': {('$x018.5', State(pc_initial=0 nstack_initial=0))},
266
+ '$phi74.0': {('$x162.5', State(pc_initial=48 nstack_initial=0))},
267
+ '$phi84.0': {('$x162.5', State(pc_initial=48 nstack_initial=0))}})
268
+ DEBUG:numba.core.byteflow:changing phismap: defaultdict(<class 'set'>,
269
+ {'$phi30.0': {('$x018.5', State(pc_initial=0 nstack_initial=0))},
270
+ '$phi40.0': {('$x018.5', State(pc_initial=0 nstack_initial=0))},
271
+ '$phi74.0': {('$x162.5', State(pc_initial=48 nstack_initial=0))},
272
+ '$phi84.0': {('$x162.5', State(pc_initial=48 nstack_initial=0))}})
273
+ DEBUG:numba.core.byteflow:keep phismap: {'$phi30.0': {('$x018.5', State(pc_initial=0 nstack_initial=0))},
274
+ '$phi74.0': {('$x162.5', State(pc_initial=48 nstack_initial=0))}}
275
+ DEBUG:numba.core.byteflow:new_out: defaultdict(<class 'dict'>,
276
+ {State(pc_initial=0 nstack_initial=0): {'$phi30.0': '$x018.5'},
277
+ State(pc_initial=48 nstack_initial=0): {'$phi74.0': '$x162.5'}})
278
+ DEBUG:numba.core.byteflow:----------------------DONE Prune PHIs-----------------------
279
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=0 nstack_initial=0):
280
+ AdaptBlockInfo(insts=((0, {}), (2, {}), (4, {'res': '$x4.0'}), (6, {'res': '$const6.1'}), (8, {'index': '$const6.1', 'target': '$x4.0', 'res': '$8binary_subscr.2'}), (12, {'value': '$8binary_subscr.2'}), (14, {'res': '$threshold14.3'}), (16, {'value': '$threshold14.3', 'res': '$16unary_negative.4'}), (18, {'res': '$x018.5'}), (24, {'lhs': '$16unary_negative.4', 'rhs': '$x018.5', 'res': '$24compare_op.6'}), (28, {'pred': '$24compare_op.6'})), outgoing_phis={'$phi30.0': '$x018.5'}, blockstack=(), active_try_block=None, outgoing_edgepushed={30: ('$x018.5',), 40: ('$x018.5',)})
281
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=30 nstack_initial=1):
282
+ AdaptBlockInfo(insts=((30, {'res': '$threshold30.1'}), (32, {'lhs': '$phi30.0', 'rhs': '$threshold30.1', 'res': '$32compare_op.2'}), (36, {'pred': '$32compare_op.2'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={38: (), 48: ()})
283
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=38 nstack_initial=0):
284
+ AdaptBlockInfo(insts=((38, {}),), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={44: ()})
285
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=40 nstack_initial=1):
286
+ AdaptBlockInfo(insts=((42, {}),), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={48: ()})
287
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=44 nstack_initial=0):
288
+ AdaptBlockInfo(insts=((44, {'res': '$const44.0'}), (46, {'value': '$const44.0'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={48: ()})
289
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=48 nstack_initial=0):
290
+ AdaptBlockInfo(insts=((48, {'res': '$x48.0'}), (50, {'res': '$const50.1'}), (52, {'index': '$const50.1', 'target': '$x48.0', 'res': '$52binary_subscr.2'}), (56, {'value': '$52binary_subscr.2'}), (58, {'res': '$threshold58.3'}), (60, {'value': '$threshold58.3', 'res': '$60unary_negative.4'}), (62, {'res': '$x162.5'}), (68, {'lhs': '$60unary_negative.4', 'rhs': '$x162.5', 'res': '$68compare_op.6'}), (72, {'pred': '$68compare_op.6'})), outgoing_phis={'$phi74.0': '$x162.5'}, blockstack=(), active_try_block=None, outgoing_edgepushed={74: ('$x162.5',), 84: ('$x162.5',)})
291
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=74 nstack_initial=1):
292
+ AdaptBlockInfo(insts=((74, {'res': '$threshold74.1'}), (76, {'lhs': '$phi74.0', 'rhs': '$threshold74.1', 'res': '$76compare_op.2'}), (80, {'pred': '$76compare_op.2'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={82: (), 92: ()})
293
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=82 nstack_initial=0):
294
+ AdaptBlockInfo(insts=((82, {}),), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={88: ()})
295
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=84 nstack_initial=1):
296
+ AdaptBlockInfo(insts=((86, {}),), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={92: ()})
297
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=88 nstack_initial=0):
298
+ AdaptBlockInfo(insts=((88, {'res': '$const88.0'}), (90, {'value': '$const88.0'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={92: ()})
299
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=92 nstack_initial=0):
300
+ AdaptBlockInfo(insts=((92, {'res': '$zero_pos92.0'}), (94, {'pred': '$zero_pos92.0'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={96: (), 182: ()})
301
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=96 nstack_initial=0):
302
+ AdaptBlockInfo(insts=((96, {'idx': 0, 'res': '$96load_global.0'}), (106, {'item': '$96load_global.0', 'res': '$106load_attr.2'}), (126, {'res': '$x0126.3'}), (128, {'func': '$106load_attr.2', 'args': ['$x0126.3'], 'kw_names': None, 'res': '$128call.4'}), (136, {'idx': 0, 'res': '$136load_global.5'}), (146, {'item': '$136load_global.5', 'res': '$146load_attr.7'}), (166, {'res': '$x1166.8'}), (168, {'func': '$146load_attr.7', 'args': ['$x1166.8'], 'kw_names': None, 'res': '$168call.9'}), (176, {'lhs': '$128call.4', 'rhs': '$168call.9', 'res': '$176compare_op.10'}), (180, {'retval': '$176compare_op.10', 'castval': '$180return_value.11'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={})
303
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=182 nstack_initial=0):
304
+ AdaptBlockInfo(insts=((182, {'idx': 0, 'res': '$182load_global.0'}), (192, {'item': '$182load_global.0', 'res': '$192load_attr.2'}), (212, {'res': '$x0212.3'}), (214, {'func': '$192load_attr.2', 'args': ['$x0212.3'], 'kw_names': None, 'res': '$214call.4'}), (222, {'idx': 0, 'res': '$222load_global.5'}), (232, {'item': '$222load_global.5', 'res': '$232load_attr.7'}), (252, {'res': '$x1252.8'}), (254, {'func': '$232load_attr.7', 'args': ['$x1252.8'], 'kw_names': None, 'res': '$254call.9'}), (262, {'lhs': '$214call.4', 'rhs': '$254call.9', 'res': '$262compare_op.10'}), (266, {'retval': '$262compare_op.10', 'castval': '$266return_value.11'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={})
305
+ DEBUG:numba.core.interpreter:label 0:
306
+ x = arg(0, name=x) ['x']
307
+ threshold = arg(1, name=threshold) ['threshold']
308
+ zero_pos = arg(2, name=zero_pos) ['zero_pos']
309
+ $const6.1 = const(int, 0) ['$const6.1']
310
+ x0 = getitem(value=x, index=$const6.1, fn=<built-in function getitem>) ['$const6.1', 'x', 'x0']
311
+ $16unary_negative.4 = unary(fn=<built-in function neg>, value=threshold) ['$16unary_negative.4', 'threshold']
312
+ $24compare_op.6 = $16unary_negative.4 <= x0 ['$16unary_negative.4', '$24compare_op.6', 'x0']
313
+ bool28 = global(bool: <class 'bool'>) ['bool28']
314
+ $28pred = call bool28($24compare_op.6, func=bool28, args=(Var($24compare_op.6, audio.py:1145),), kws=(), vararg=None, varkwarg=None, target=None) ['$24compare_op.6', '$28pred', 'bool28']
315
+ $phi30.0 = x0 ['$phi30.0', 'x0']
316
+ branch $28pred, 30, 40 ['$28pred']
317
+ label 30:
318
+ $32compare_op.2 = $phi30.0 <= threshold ['$32compare_op.2', '$phi30.0', 'threshold']
319
+ bool36 = global(bool: <class 'bool'>) ['bool36']
320
+ $36pred = call bool36($32compare_op.2, func=bool36, args=(Var($32compare_op.2, audio.py:1145),), kws=(), vararg=None, varkwarg=None, target=None) ['$32compare_op.2', '$36pred', 'bool36']
321
+ branch $36pred, 38, 48 ['$36pred']
322
+ label 38:
323
+ jump 44 []
324
+ label 40:
325
+ jump 48 []
326
+ label 44:
327
+ x0 = const(int, 0) ['x0']
328
+ jump 48 []
329
+ label 48:
330
+ $const50.1 = const(int, -1) ['$const50.1']
331
+ x1 = getitem(value=x, index=$const50.1, fn=<built-in function getitem>) ['$const50.1', 'x', 'x1']
332
+ $60unary_negative.4 = unary(fn=<built-in function neg>, value=threshold) ['$60unary_negative.4', 'threshold']
333
+ $68compare_op.6 = $60unary_negative.4 <= x1 ['$60unary_negative.4', '$68compare_op.6', 'x1']
334
+ bool72 = global(bool: <class 'bool'>) ['bool72']
335
+ $72pred = call bool72($68compare_op.6, func=bool72, args=(Var($68compare_op.6, audio.py:1149),), kws=(), vararg=None, varkwarg=None, target=None) ['$68compare_op.6', '$72pred', 'bool72']
336
+ $phi74.0 = x1 ['$phi74.0', 'x1']
337
+ branch $72pred, 74, 84 ['$72pred']
338
+ label 74:
339
+ $76compare_op.2 = $phi74.0 <= threshold ['$76compare_op.2', '$phi74.0', 'threshold']
340
+ bool80 = global(bool: <class 'bool'>) ['bool80']
341
+ $80pred = call bool80($76compare_op.2, func=bool80, args=(Var($76compare_op.2, audio.py:1149),), kws=(), vararg=None, varkwarg=None, target=None) ['$76compare_op.2', '$80pred', 'bool80']
342
+ branch $80pred, 82, 92 ['$80pred']
343
+ label 82:
344
+ jump 88 []
345
+ label 84:
346
+ jump 92 []
347
+ label 88:
348
+ x1 = const(int, 0) ['x1']
349
+ jump 92 []
350
+ label 92:
351
+ bool94 = global(bool: <class 'bool'>) ['bool94']
352
+ $94pred = call bool94(zero_pos, func=bool94, args=(Var(zero_pos, audio.py:1141),), kws=(), vararg=None, varkwarg=None, target=None) ['$94pred', 'bool94', 'zero_pos']
353
+ branch $94pred, 96, 182 ['$94pred']
354
+ label 96:
355
+ $96load_global.0 = global(np: <module 'numpy' from '/home/anhnmt2/.local/lib/python3.12/site-packages/numpy/__init__.py'>) ['$96load_global.0']
356
+ $106load_attr.2 = getattr(value=$96load_global.0, attr=signbit) ['$106load_attr.2', '$96load_global.0']
357
+ $128call.4 = call $106load_attr.2(x0, func=$106load_attr.2, args=[Var(x0, audio.py:1144)], kws=(), vararg=None, varkwarg=None, target=None) ['$106load_attr.2', '$128call.4', 'x0']
358
+ $136load_global.5 = global(np: <module 'numpy' from '/home/anhnmt2/.local/lib/python3.12/site-packages/numpy/__init__.py'>) ['$136load_global.5']
359
+ $146load_attr.7 = getattr(value=$136load_global.5, attr=signbit) ['$136load_global.5', '$146load_attr.7']
360
+ $168call.9 = call $146load_attr.7(x1, func=$146load_attr.7, args=[Var(x1, audio.py:1148)], kws=(), vararg=None, varkwarg=None, target=None) ['$146load_attr.7', '$168call.9', 'x1']
361
+ $176compare_op.10 = $128call.4 != $168call.9 ['$128call.4', '$168call.9', '$176compare_op.10']
362
+ $180return_value.11 = cast(value=$176compare_op.10) ['$176compare_op.10', '$180return_value.11']
363
+ return $180return_value.11 ['$180return_value.11']
364
+ label 182:
365
+ $182load_global.0 = global(np: <module 'numpy' from '/home/anhnmt2/.local/lib/python3.12/site-packages/numpy/__init__.py'>) ['$182load_global.0']
366
+ $192load_attr.2 = getattr(value=$182load_global.0, attr=sign) ['$182load_global.0', '$192load_attr.2']
367
+ $214call.4 = call $192load_attr.2(x0, func=$192load_attr.2, args=[Var(x0, audio.py:1144)], kws=(), vararg=None, varkwarg=None, target=None) ['$192load_attr.2', '$214call.4', 'x0']
368
+ $222load_global.5 = global(np: <module 'numpy' from '/home/anhnmt2/.local/lib/python3.12/site-packages/numpy/__init__.py'>) ['$222load_global.5']
369
+ $232load_attr.7 = getattr(value=$222load_global.5, attr=sign) ['$222load_global.5', '$232load_attr.7']
370
+ $254call.9 = call $232load_attr.7(x1, func=$232load_attr.7, args=[Var(x1, audio.py:1148)], kws=(), vararg=None, varkwarg=None, target=None) ['$232load_attr.7', '$254call.9', 'x1']
371
+ $262compare_op.10 = $214call.4 != $254call.9 ['$214call.4', '$254call.9', '$262compare_op.10']
372
+ $266return_value.11 = cast(value=$262compare_op.10) ['$262compare_op.10', '$266return_value.11']
373
+ return $266return_value.11 ['$266return_value.11']
374
+
375
+ DEBUG:numba.core.byteflow:bytecode dump:
376
+ > 0 NOP(arg=None, lineno=1039)
377
+ 2 RESUME(arg=0, lineno=1039)
378
+ 4 LOAD_FAST(arg=0, lineno=1042)
379
+ 6 LOAD_CONST(arg=1, lineno=1042)
380
+ 8 BINARY_SUBSCR(arg=None, lineno=1042)
381
+ 12 LOAD_FAST(arg=0, lineno=1042)
382
+ 14 LOAD_CONST(arg=2, lineno=1042)
383
+ 16 BINARY_SUBSCR(arg=None, lineno=1042)
384
+ 20 COMPARE_OP(arg=68, lineno=1042)
385
+ 24 LOAD_FAST(arg=0, lineno=1042)
386
+ 26 LOAD_CONST(arg=1, lineno=1042)
387
+ 28 BINARY_SUBSCR(arg=None, lineno=1042)
388
+ 32 LOAD_FAST(arg=0, lineno=1042)
389
+ 34 LOAD_CONST(arg=3, lineno=1042)
390
+ 36 BINARY_SUBSCR(arg=None, lineno=1042)
391
+ 40 COMPARE_OP(arg=92, lineno=1042)
392
+ 44 BINARY_OP(arg=1, lineno=1042)
393
+ 48 RETURN_VALUE(arg=None, lineno=1042)
394
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
395
+ DEBUG:numba.core.byteflow:stack: []
396
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=0 nstack_initial=0)
397
+ DEBUG:numba.core.byteflow:dispatch pc=0, inst=NOP(arg=None, lineno=1039)
398
+ DEBUG:numba.core.byteflow:stack []
399
+ DEBUG:numba.core.byteflow:dispatch pc=2, inst=RESUME(arg=0, lineno=1039)
400
+ DEBUG:numba.core.byteflow:stack []
401
+ DEBUG:numba.core.byteflow:dispatch pc=4, inst=LOAD_FAST(arg=0, lineno=1042)
402
+ DEBUG:numba.core.byteflow:stack []
403
+ DEBUG:numba.core.byteflow:dispatch pc=6, inst=LOAD_CONST(arg=1, lineno=1042)
404
+ DEBUG:numba.core.byteflow:stack ['$x4.0']
405
+ DEBUG:numba.core.byteflow:dispatch pc=8, inst=BINARY_SUBSCR(arg=None, lineno=1042)
406
+ DEBUG:numba.core.byteflow:stack ['$x4.0', '$const6.1']
407
+ DEBUG:numba.core.byteflow:dispatch pc=12, inst=LOAD_FAST(arg=0, lineno=1042)
408
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2']
409
+ DEBUG:numba.core.byteflow:dispatch pc=14, inst=LOAD_CONST(arg=2, lineno=1042)
410
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2', '$x12.3']
411
+ DEBUG:numba.core.byteflow:dispatch pc=16, inst=BINARY_SUBSCR(arg=None, lineno=1042)
412
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2', '$x12.3', '$const14.4']
413
+ DEBUG:numba.core.byteflow:dispatch pc=20, inst=COMPARE_OP(arg=68, lineno=1042)
414
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2', '$16binary_subscr.5']
415
+ DEBUG:numba.core.byteflow:dispatch pc=24, inst=LOAD_FAST(arg=0, lineno=1042)
416
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6']
417
+ DEBUG:numba.core.byteflow:dispatch pc=26, inst=LOAD_CONST(arg=1, lineno=1042)
418
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$x24.7']
419
+ DEBUG:numba.core.byteflow:dispatch pc=28, inst=BINARY_SUBSCR(arg=None, lineno=1042)
420
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$x24.7', '$const26.8']
421
+ DEBUG:numba.core.byteflow:dispatch pc=32, inst=LOAD_FAST(arg=0, lineno=1042)
422
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9']
423
+ DEBUG:numba.core.byteflow:dispatch pc=34, inst=LOAD_CONST(arg=3, lineno=1042)
424
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9', '$x32.10']
425
+ DEBUG:numba.core.byteflow:dispatch pc=36, inst=BINARY_SUBSCR(arg=None, lineno=1042)
426
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9', '$x32.10', '$const34.11']
427
+ DEBUG:numba.core.byteflow:dispatch pc=40, inst=COMPARE_OP(arg=92, lineno=1042)
428
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9', '$36binary_subscr.12']
429
+ DEBUG:numba.core.byteflow:dispatch pc=44, inst=BINARY_OP(arg=1, lineno=1042)
430
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$40compare_op.13']
431
+ DEBUG:numba.core.byteflow:dispatch pc=48, inst=RETURN_VALUE(arg=None, lineno=1042)
432
+ DEBUG:numba.core.byteflow:stack ['$binop_and_44.14']
433
+ DEBUG:numba.core.byteflow:end state. edges=[]
434
+ DEBUG:numba.core.byteflow:-------------------------Prune PHIs-------------------------
435
+ DEBUG:numba.core.byteflow:Used_phis: defaultdict(<class 'set'>, {State(pc_initial=0 nstack_initial=0): set()})
436
+ DEBUG:numba.core.byteflow:defmap: {}
437
+ DEBUG:numba.core.byteflow:phismap: defaultdict(<class 'set'>, {})
438
+ DEBUG:numba.core.byteflow:changing phismap: defaultdict(<class 'set'>, {})
439
+ DEBUG:numba.core.byteflow:keep phismap: {}
440
+ DEBUG:numba.core.byteflow:new_out: defaultdict(<class 'dict'>, {})
441
+ DEBUG:numba.core.byteflow:----------------------DONE Prune PHIs-----------------------
442
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=0 nstack_initial=0):
443
+ AdaptBlockInfo(insts=((0, {}), (2, {}), (4, {'res': '$x4.0'}), (6, {'res': '$const6.1'}), (8, {'index': '$const6.1', 'target': '$x4.0', 'res': '$8binary_subscr.2'}), (12, {'res': '$x12.3'}), (14, {'res': '$const14.4'}), (16, {'index': '$const14.4', 'target': '$x12.3', 'res': '$16binary_subscr.5'}), (20, {'lhs': '$8binary_subscr.2', 'rhs': '$16binary_subscr.5', 'res': '$20compare_op.6'}), (24, {'res': '$x24.7'}), (26, {'res': '$const26.8'}), (28, {'index': '$const26.8', 'target': '$x24.7', 'res': '$28binary_subscr.9'}), (32, {'res': '$x32.10'}), (34, {'res': '$const34.11'}), (36, {'index': '$const34.11', 'target': '$x32.10', 'res': '$36binary_subscr.12'}), (40, {'lhs': '$28binary_subscr.9', 'rhs': '$36binary_subscr.12', 'res': '$40compare_op.13'}), (44, {'op': '&', 'lhs': '$20compare_op.6', 'rhs': '$40compare_op.13', 'res': '$binop_and_44.14'}), (48, {'retval': '$binop_and_44.14', 'castval': '$48return_value.15'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={})
444
+ DEBUG:numba.core.interpreter:label 0:
445
+ x = arg(0, name=x) ['x']
446
+ $const6.1 = const(int, 0) ['$const6.1']
447
+ $8binary_subscr.2 = getitem(value=x, index=$const6.1, fn=<built-in function getitem>) ['$8binary_subscr.2', '$const6.1', 'x']
448
+ $const14.4 = const(int, -1) ['$const14.4']
449
+ $16binary_subscr.5 = getitem(value=x, index=$const14.4, fn=<built-in function getitem>) ['$16binary_subscr.5', '$const14.4', 'x']
450
+ $20compare_op.6 = $8binary_subscr.2 > $16binary_subscr.5 ['$16binary_subscr.5', '$20compare_op.6', '$8binary_subscr.2']
451
+ $const26.8 = const(int, 0) ['$const26.8']
452
+ $28binary_subscr.9 = getitem(value=x, index=$const26.8, fn=<built-in function getitem>) ['$28binary_subscr.9', '$const26.8', 'x']
453
+ $const34.11 = const(int, 1) ['$const34.11']
454
+ $36binary_subscr.12 = getitem(value=x, index=$const34.11, fn=<built-in function getitem>) ['$36binary_subscr.12', '$const34.11', 'x']
455
+ $40compare_op.13 = $28binary_subscr.9 >= $36binary_subscr.12 ['$28binary_subscr.9', '$36binary_subscr.12', '$40compare_op.13']
456
+ $binop_and_44.14 = $20compare_op.6 & $40compare_op.13 ['$20compare_op.6', '$40compare_op.13', '$binop_and_44.14']
457
+ $48return_value.15 = cast(value=$binop_and_44.14) ['$48return_value.15', '$binop_and_44.14']
458
+ return $48return_value.15 ['$48return_value.15']
459
+
460
+ DEBUG:numba.core.byteflow:bytecode dump:
461
+ > 0 NOP(arg=None, lineno=1045)
462
+ 2 RESUME(arg=0, lineno=1045)
463
+ 4 LOAD_FAST(arg=0, lineno=1048)
464
+ 6 LOAD_CONST(arg=1, lineno=1048)
465
+ 8 BINARY_SUBSCR(arg=None, lineno=1048)
466
+ 12 LOAD_FAST(arg=0, lineno=1048)
467
+ 14 LOAD_CONST(arg=2, lineno=1048)
468
+ 16 BINARY_SUBSCR(arg=None, lineno=1048)
469
+ 20 COMPARE_OP(arg=2, lineno=1048)
470
+ 24 LOAD_FAST(arg=0, lineno=1048)
471
+ 26 LOAD_CONST(arg=1, lineno=1048)
472
+ 28 BINARY_SUBSCR(arg=None, lineno=1048)
473
+ 32 LOAD_FAST(arg=0, lineno=1048)
474
+ 34 LOAD_CONST(arg=3, lineno=1048)
475
+ 36 BINARY_SUBSCR(arg=None, lineno=1048)
476
+ 40 COMPARE_OP(arg=26, lineno=1048)
477
+ 44 BINARY_OP(arg=1, lineno=1048)
478
+ 48 RETURN_VALUE(arg=None, lineno=1048)
479
+ DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
480
+ DEBUG:numba.core.byteflow:stack: []
481
+ DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=0 nstack_initial=0)
482
+ DEBUG:numba.core.byteflow:dispatch pc=0, inst=NOP(arg=None, lineno=1045)
483
+ DEBUG:numba.core.byteflow:stack []
484
+ DEBUG:numba.core.byteflow:dispatch pc=2, inst=RESUME(arg=0, lineno=1045)
485
+ DEBUG:numba.core.byteflow:stack []
486
+ DEBUG:numba.core.byteflow:dispatch pc=4, inst=LOAD_FAST(arg=0, lineno=1048)
487
+ DEBUG:numba.core.byteflow:stack []
488
+ DEBUG:numba.core.byteflow:dispatch pc=6, inst=LOAD_CONST(arg=1, lineno=1048)
489
+ DEBUG:numba.core.byteflow:stack ['$x4.0']
490
+ DEBUG:numba.core.byteflow:dispatch pc=8, inst=BINARY_SUBSCR(arg=None, lineno=1048)
491
+ DEBUG:numba.core.byteflow:stack ['$x4.0', '$const6.1']
492
+ DEBUG:numba.core.byteflow:dispatch pc=12, inst=LOAD_FAST(arg=0, lineno=1048)
493
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2']
494
+ DEBUG:numba.core.byteflow:dispatch pc=14, inst=LOAD_CONST(arg=2, lineno=1048)
495
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2', '$x12.3']
496
+ DEBUG:numba.core.byteflow:dispatch pc=16, inst=BINARY_SUBSCR(arg=None, lineno=1048)
497
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2', '$x12.3', '$const14.4']
498
+ DEBUG:numba.core.byteflow:dispatch pc=20, inst=COMPARE_OP(arg=2, lineno=1048)
499
+ DEBUG:numba.core.byteflow:stack ['$8binary_subscr.2', '$16binary_subscr.5']
500
+ DEBUG:numba.core.byteflow:dispatch pc=24, inst=LOAD_FAST(arg=0, lineno=1048)
501
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6']
502
+ DEBUG:numba.core.byteflow:dispatch pc=26, inst=LOAD_CONST(arg=1, lineno=1048)
503
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$x24.7']
504
+ DEBUG:numba.core.byteflow:dispatch pc=28, inst=BINARY_SUBSCR(arg=None, lineno=1048)
505
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$x24.7', '$const26.8']
506
+ DEBUG:numba.core.byteflow:dispatch pc=32, inst=LOAD_FAST(arg=0, lineno=1048)
507
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9']
508
+ DEBUG:numba.core.byteflow:dispatch pc=34, inst=LOAD_CONST(arg=3, lineno=1048)
509
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9', '$x32.10']
510
+ DEBUG:numba.core.byteflow:dispatch pc=36, inst=BINARY_SUBSCR(arg=None, lineno=1048)
511
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9', '$x32.10', '$const34.11']
512
+ DEBUG:numba.core.byteflow:dispatch pc=40, inst=COMPARE_OP(arg=26, lineno=1048)
513
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$28binary_subscr.9', '$36binary_subscr.12']
514
+ DEBUG:numba.core.byteflow:dispatch pc=44, inst=BINARY_OP(arg=1, lineno=1048)
515
+ DEBUG:numba.core.byteflow:stack ['$20compare_op.6', '$40compare_op.13']
516
+ DEBUG:numba.core.byteflow:dispatch pc=48, inst=RETURN_VALUE(arg=None, lineno=1048)
517
+ DEBUG:numba.core.byteflow:stack ['$binop_and_44.14']
518
+ DEBUG:numba.core.byteflow:end state. edges=[]
519
+ DEBUG:numba.core.byteflow:-------------------------Prune PHIs-------------------------
520
+ DEBUG:numba.core.byteflow:Used_phis: defaultdict(<class 'set'>, {State(pc_initial=0 nstack_initial=0): set()})
521
+ DEBUG:numba.core.byteflow:defmap: {}
522
+ DEBUG:numba.core.byteflow:phismap: defaultdict(<class 'set'>, {})
523
+ DEBUG:numba.core.byteflow:changing phismap: defaultdict(<class 'set'>, {})
524
+ DEBUG:numba.core.byteflow:keep phismap: {}
525
+ DEBUG:numba.core.byteflow:new_out: defaultdict(<class 'dict'>, {})
526
+ DEBUG:numba.core.byteflow:----------------------DONE Prune PHIs-----------------------
527
+ DEBUG:numba.core.byteflow:block_infos State(pc_initial=0 nstack_initial=0):
528
+ AdaptBlockInfo(insts=((0, {}), (2, {}), (4, {'res': '$x4.0'}), (6, {'res': '$const6.1'}), (8, {'index': '$const6.1', 'target': '$x4.0', 'res': '$8binary_subscr.2'}), (12, {'res': '$x12.3'}), (14, {'res': '$const14.4'}), (16, {'index': '$const14.4', 'target': '$x12.3', 'res': '$16binary_subscr.5'}), (20, {'lhs': '$8binary_subscr.2', 'rhs': '$16binary_subscr.5', 'res': '$20compare_op.6'}), (24, {'res': '$x24.7'}), (26, {'res': '$const26.8'}), (28, {'index': '$const26.8', 'target': '$x24.7', 'res': '$28binary_subscr.9'}), (32, {'res': '$x32.10'}), (34, {'res': '$const34.11'}), (36, {'index': '$const34.11', 'target': '$x32.10', 'res': '$36binary_subscr.12'}), (40, {'lhs': '$28binary_subscr.9', 'rhs': '$36binary_subscr.12', 'res': '$40compare_op.13'}), (44, {'op': '&', 'lhs': '$20compare_op.6', 'rhs': '$40compare_op.13', 'res': '$binop_and_44.14'}), (48, {'retval': '$binop_and_44.14', 'castval': '$48return_value.15'})), outgoing_phis={}, blockstack=(), active_try_block=None, outgoing_edgepushed={})
529
+ DEBUG:numba.core.interpreter:label 0:
530
+ x = arg(0, name=x) ['x']
531
+ $const6.1 = const(int, 0) ['$const6.1']
532
+ $8binary_subscr.2 = getitem(value=x, index=$const6.1, fn=<built-in function getitem>) ['$8binary_subscr.2', '$const6.1', 'x']
533
+ $const14.4 = const(int, -1) ['$const14.4']
534
+ $16binary_subscr.5 = getitem(value=x, index=$const14.4, fn=<built-in function getitem>) ['$16binary_subscr.5', '$const14.4', 'x']
535
+ $20compare_op.6 = $8binary_subscr.2 < $16binary_subscr.5 ['$16binary_subscr.5', '$20compare_op.6', '$8binary_subscr.2']
536
+ $const26.8 = const(int, 0) ['$const26.8']
537
+ $28binary_subscr.9 = getitem(value=x, index=$const26.8, fn=<built-in function getitem>) ['$28binary_subscr.9', '$const26.8', 'x']
538
+ $const34.11 = const(int, 1) ['$const34.11']
539
+ $36binary_subscr.12 = getitem(value=x, index=$const34.11, fn=<built-in function getitem>) ['$36binary_subscr.12', '$const34.11', 'x']
540
+ $40compare_op.13 = $28binary_subscr.9 <= $36binary_subscr.12 ['$28binary_subscr.9', '$36binary_subscr.12', '$40compare_op.13']
541
+ $binop_and_44.14 = $20compare_op.6 & $40compare_op.13 ['$20compare_op.6', '$40compare_op.13', '$binop_and_44.14']
542
+ $48return_value.15 = cast(value=$binop_and_44.14) ['$48return_value.15', '$binop_and_44.14']
543
+ return $48return_value.15 ['$48return_value.15']
544
+
545
+ 0%| | 14/43233 [00:59<27:10:42, 2.26s/it]Traceback (most recent call last):
546
+ {'loss': 6.6172, 'grad_norm': 10.827251434326172, 'learning_rate': 3.8550501156515035e-08, 'epoch': 0.0}
547
+ {'loss': 5.794, 'grad_norm': 14.017024040222168, 'learning_rate': 7.710100231303007e-08, 'epoch': 0.0}
548
+ {'loss': 6.8788, 'grad_norm': 13.020977020263672, 'learning_rate': 1.1565150346954511e-07, 'epoch': 0.0}
549
+ {'loss': 6.6162, 'grad_norm': 18.2950439453125, 'learning_rate': 1.5420200462606014e-07, 'epoch': 0.0}
550
+ {'loss': 6.9646, 'grad_norm': 14.263402938842773, 'learning_rate': 1.9275250578257518e-07, 'epoch': 0.0}
551
+ {'loss': 6.631, 'grad_norm': 13.121792793273926, 'learning_rate': 2.3130300693909022e-07, 'epoch': 0.0}
552
+ {'loss': 7.2093, 'grad_norm': 18.358381271362305, 'learning_rate': 2.6985350809560526e-07, 'epoch': 0.0}
553
+ {'loss': 7.7485, 'grad_norm': 24.542631149291992, 'learning_rate': 3.084040092521203e-07, 'epoch': 0.0}
554
+ {'loss': 6.2489, 'grad_norm': 12.371420860290527, 'learning_rate': 3.469545104086353e-07, 'epoch': 0.0}
555
+ {'loss': 6.6981, 'grad_norm': 22.148744583129883, 'learning_rate': 3.8550501156515036e-07, 'epoch': 0.0}
556
+ {'loss': 6.9253, 'grad_norm': 25.32149314880371, 'learning_rate': 4.240555127216654e-07, 'epoch': 0.0}
557
+ {'loss': 6.7832, 'grad_norm': 36.084407806396484, 'learning_rate': 4.6260601387818044e-07, 'epoch': 0.0}
558
+ {'loss': 6.9372, 'grad_norm': 12.946453094482422, 'learning_rate': 5.011565150346955e-07, 'epoch': 0.0}
559
+ {'loss': 6.6468, 'grad_norm': 14.272549629211426, 'learning_rate': 5.397070161912105e-07, 'epoch': 0.0}
scripts/wandb/latest-run/files/requirements.txt ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tomlkit==0.12.0
2
+ python-dotenv==1.0.1
3
+ SQLAlchemy==2.0.36
4
+ psutil==6.1.0
5
+ anyio==4.8.0
6
+ onnxruntime==1.20.1
7
+ antlr4-python3-runtime==4.9.3
8
+ httpx-sse==0.4.0
9
+ annotated-types==0.7.0
10
+ tqdm==4.66.5
11
+ simplejson==3.19.3
12
+ csvw==3.5.1
13
+ pooch==1.8.2
14
+ trl==0.9.6
15
+ more-itertools==10.5.0
16
+ jiter==0.6.1
17
+ markdown2==2.5.1
18
+ segments==2.2.1
19
+ opentelemetry-instrumentation-asgi==0.50b0
20
+ Deprecated==1.2.15
21
+ pyasn1_modules==0.4.1
22
+ bcrypt==4.2.1
23
+ opentelemetry-util-http==0.50b0
24
+ intervaltree==3.1.0
25
+ hjson==3.1.0
26
+ modelscope==1.18.1
27
+ fastapi==0.112.4
28
+ pyarrow==17.0.0
29
+ sounddevice==0.5.1
30
+ modelscope_studio==0.4.0.9
31
+ build==1.2.2.post1
32
+ oauthlib==3.2.2
33
+ gunicorn==23.0.0
34
+ pyasn1==0.6.1
35
+ matplotlib==3.9.2
36
+ speechbrain==0.5.16
37
+ joblib==1.4.2
38
+ tyro==0.8.13
39
+ rsa==4.9
40
+ numba==0.60.0
41
+ fastprogress==1.0.3
42
+ wrapt==1.17.0
43
+ PyPika==0.48.9
44
+ dacite==1.8.1
45
+ googleapis-common-protos==1.66.0
46
+ openai==1.68.0
47
+ tabulate==0.9.0
48
+ monotonic==1.6
49
+ lazy_loader==0.4
50
+ google-auth==2.37.0
51
+ fairseq==0.12.3
52
+ opentelemetry-semantic-conventions==0.50b0
53
+ sacrebleu==2.4.3
54
+ requests-toolbelt==1.0.0
55
+ ruff==0.7.0
56
+ bitsandbytes==0.43.1
57
+ tenacity==9.0.0
58
+ uvloop==0.21.0
59
+ Pygments==2.18.0
60
+ langchain==0.3.18
61
+ typer==0.12.5
62
+ uritemplate==4.1.1
63
+ rich==13.9.3
64
+ lion-pytorch==0.2.3
65
+ pydub==0.25.1
66
+ fastcore==1.7.28
67
+ encodec==0.1.1
68
+ cytoolz==1.0.1
69
+ huggingface-hub==0.26.1
70
+ python-dateutil==2.9.0.post0
71
+ duckduckgo_search==7.3.2
72
+ rfc3986==1.5.0
73
+ wavedrom==2.0.3.post3
74
+ sentence-transformers==3.3.1
75
+ httpx==0.28.1
76
+ colorlog==6.9.0
77
+ xxhash==3.5.0
78
+ termcolor==2.5.0
79
+ importlib_resources==6.4.5
80
+ lilcom==1.8.1
81
+ llamafactory==0.9.2.dev0
82
+ lhotse==1.31.0
83
+ kiwisolver==1.4.7
84
+ watchfiles==1.0.3
85
+ marshmallow==3.23.1
86
+ overrides==7.7.0
87
+ langchain-text-splitters==0.3.6
88
+ lxml==5.3.0
89
+ blinker==1.8.2
90
+ whisper==1.1.10
91
+ triton==3.1.0
92
+ python-multipart==0.0.12
93
+ isodate==0.7.2
94
+ wandb==0.19.8
95
+ nvidia-ml-py==12.560.30
96
+ h11==0.14.0
97
+ zipp==3.20.2
98
+ transformers==4.45.0
99
+ websocket-client==1.8.0
100
+ opentelemetry-instrumentation==0.50b0
101
+ pydantic==2.9.2
102
+ latex2mathml==3.77.0
103
+ numpy-rms==0.4.2
104
+ opentelemetry-exporter-otlp-proto-grpc==1.29.0
105
+ humanfriendly==10.0
106
+ decorator==5.1.1
107
+ fonttools==4.54.1
108
+ fire==0.7.0
109
+ ninja==1.11.1.1
110
+ shortuuid==1.0.13
111
+ tiktoken==0.8.0
112
+ aliyun-python-sdk-kms==2.16.5
113
+ einops==0.8.0
114
+ threadpoolctl==3.5.0
115
+ docker-pycreds==0.4.0
116
+ Flask==3.0.3
117
+ opentelemetry-sdk==1.29.0
118
+ opentelemetry-exporter-otlp-proto-common==1.29.0
119
+ pylatexenc==2.10
120
+ orjson==3.10.10
121
+ durationpy==0.9
122
+ addict==2.4.0
123
+ py-cpuinfo==9.0.0
124
+ contourpy==1.3.0
125
+ crcmod==1.7
126
+ pydantic-settings==2.6.1
127
+ pyproject_hooks==1.2.0
128
+ future==1.0.0
129
+ jsonschema-specifications==2024.10.1
130
+ coloredlogs==15.0.1
131
+ timm==0.6.13
132
+ deepspeed==0.14.5
133
+ referencing==0.35.1
134
+ binpacking==1.5.2
135
+ peft==0.12.0
136
+ language-tags==1.2.0
137
+ speechtokenizer==1.0.1
138
+ shellingham==1.5.4
139
+ primp==0.12.1
140
+ tavily-python==0.5.1
141
+ uvicorn==0.32.0
142
+ opentelemetry-proto==1.29.0
143
+ typing-inspect==0.9.0
144
+ backoff==2.2.1
145
+ sortedcontainers==2.4.0
146
+ gitdb==4.0.12
147
+ aiofiles==23.2.1
148
+ jsonschema==4.23.0
149
+ svgwrite==1.4.3
150
+ protobuf==5.29.1
151
+ starlette==0.38.6
152
+ transformers-stream-generator==0.0.5
153
+ sentry-sdk==2.22.0
154
+ toolz==1.0.0
155
+ einops-exts==0.0.4
156
+ WhisperSpeech==0.8
157
+ hydra-core==1.3.2
158
+ portalocker==2.10.1
159
+ jieba==0.42.1
160
+ pandas==2.2.3
161
+ requests==2.32.3
162
+ flash-attn==2.6.3
163
+ msgpack==1.1.0
164
+ chroma-hnswlib==0.7.6
165
+ librosa==0.10.2.post1
166
+ sniffio==1.3.1
167
+ smmap==5.0.2
168
+ opentelemetry-api==1.29.0
169
+ websockets==14.2
170
+ kubernetes==31.0.0
171
+ audioread==3.0.1
172
+ docstring_parser==0.16
173
+ scipy==1.12.0
174
+ aliyun-python-sdk-core==2.16.0
175
+ accelerate==1.0.0
176
+ dill==0.3.8
177
+ llama-omni==1.0.0
178
+ mdurl==0.1.2
179
+ chromadb==0.5.23
180
+ oss2==2.19.0
181
+ rdflib==7.1.1
182
+ bibtexparser==2.0.0b8
183
+ rpds-py==0.22.3
184
+ soundfile==0.12.1
185
+ langdetect==1.0.9
186
+ duckdb==1.2.0
187
+ numpy==1.26.3
188
+ dataclasses-json==0.6.7
189
+ tokenizers==0.20.3
190
+ cpm-kernels==1.0.11
191
+ einx==0.3.0
192
+ langchain-core==0.3.34
193
+ clldutils==3.24.0
194
+ openai-whisper==20240930
195
+ setuptools==69.5.1
196
+ requests-oauthlib==2.0.0
197
+ langchain-community==0.3.17
198
+ langsmith==0.2.3
199
+ colorama==0.4.6
200
+ omegaconf==2.3.0
201
+ asgiref==3.8.1
202
+ pydantic_core==2.23.4
203
+ ffmpy==0.4.0
204
+ multiprocess==0.70.16
205
+ mmh3==5.0.1
206
+ babel==2.16.0
207
+ phonemizer==3.3.0
208
+ pycryptodome==3.21.0
209
+ gradio==4.44.1
210
+ google-genai==1.5.0
211
+ tzdata==2024.2
212
+ llvmlite==0.43.0
213
+ cachetools==5.5.0
214
+ seaborn==0.13.2
215
+ httptools==0.6.4
216
+ GitPython==3.1.44
217
+ markdown-it-py==3.0.0
218
+ beartype==0.20.2
219
+ whisper_normalizer==0.0.10
220
+ dlinfo==1.2.1
221
+ vocos==0.1.0
222
+ itsdangerous==2.2.0
223
+ bitarray==3.0.0
224
+ opentelemetry-instrumentation-fastapi==0.50b0
225
+ setproctitle==1.3.5
226
+ cycler==0.12.1
227
+ vector-quantize-pytorch==1.18.5
228
+ jmespath==0.10.0
229
+ mypy-extensions==1.0.0
230
+ flatbuffers==24.3.25
231
+ scikit-learn==1.5.2
232
+ pytz==2024.2
233
+ pyparsing==3.2.0
234
+ posthog==3.7.4
235
+ rouge==1.0.1
236
+ semantic-version==2.10.0
237
+ httpcore==1.0.6
238
+ soxr==0.5.0.post1
239
+ importlib_metadata==8.5.0
240
+ audiomentations==0.36.1
241
+ shtab==1.7.1
242
+ Unidecode==1.3.8
243
+ click==8.1.8
244
+ tensorboardX==2.6.2.2
245
+ greenlet==3.1.1
246
+ nltk==3.9.1
247
+ gradio_client==1.3.0
248
+ datasets==2.21.0
249
+ attrdict==2.0.1
250
+ llamafactory==0.9.2.dev0
251
+ ms-swift==2.6.0.dev0
252
+ Brotli==1.0.9
253
+ Cython==3.0.10
254
+ HyperPyYAML==1.2.2
255
+ Markdown==3.6
256
+ MarkupSafe==2.1.3
257
+ PySocks==1.7.1
258
+ PyYAML==6.0.1
259
+ absl-py==2.1.0
260
+ aiohttp==3.9.5
261
+ aiosignal==1.3.1
262
+ anaconda-anon-usage==0.4.4
263
+ archspec==0.2.3
264
+ attrs==23.2.0
265
+ boltons==23.0.0
266
+ certifi==2024.6.2
267
+ cffi==1.16.0
268
+ charset-normalizer==2.0.4
269
+ click==8.1.7
270
+ conda==24.5.0
271
+ conda-content-trust==0.2.0
272
+ conda-libmamba-solver==24.1.0
273
+ conda-package-handling==2.2.0
274
+ conda_package_streaming==0.9.0
275
+ cryptography==42.0.5
276
+ distro==1.9.0
277
+ filelock==3.13.1
278
+ frozendict==2.4.2
279
+ frozenlist==1.4.1
280
+ fsspec==2024.6.0
281
+ grpcio==1.64.1
282
+ huggingface-hub==0.23.3
283
+ idna==3.7
284
+ Jinja2==3.1.4
285
+ jiwer==3.0.4
286
+ jsonargparse==4.29.0
287
+ jsonpatch==1.33
288
+ jsonpointer==2.1
289
+ kaldialign==0.9.1
290
+ libmambapy==1.5.8
291
+ lightning==2.2.5
292
+ lightning-utilities==0.11.2
293
+ llvmlite==0.42.0
294
+ menuinst==2.0.2
295
+ mkl-fft==1.3.8
296
+ mkl-random==1.2.4
297
+ mkl-service==2.4.0
298
+ mpmath==1.3.0
299
+ multidict==6.0.5
300
+ networkx==3.2.1
301
+ numba==0.59.1
302
+ numpy==1.26.4
303
+ packaging==23.2
304
+ pillow==10.3.0
305
+ pip==24.0
306
+ platformdirs==3.10.0
307
+ pluggy==1.0.0
308
+ protobuf==4.25.3
309
+ pycosat==0.6.6
310
+ pycparser==2.21
311
+ pytorch-lightning==2.2.5
312
+ rapidfuzz==3.9.3
313
+ regex==2024.5.15
314
+ requests==2.31.0
315
+ ruamel.yaml==0.18.6
316
+ ruamel.yaml.clib==0.2.8
317
+ safetensors==0.4.3
318
+ scipy==1.13.1
319
+ sentencepiece==0.2.0
320
+ setuptools==69.5.1
321
+ six==1.16.0
322
+ sympy==1.12
323
+ tensorboard==2.17.0
324
+ tensorboard-data-server==0.7.2
325
+ tokenizers==0.19.1
326
+ torch==2.2.1
327
+ torch-complex==0.4.3
328
+ torchaudio==2.2.1
329
+ torchmetrics==1.4.0.post0
330
+ torchvision==0.17.1
331
+ tqdm==4.66.2
332
+ transformers==4.41.2
333
+ truststore==0.8.0
334
+ typeguard==2.13.3
335
+ typing_extensions==4.11.0
336
+ urllib3==2.1.0
337
+ Werkzeug==3.0.3
338
+ wheel==0.43.0
339
+ yarl==1.9.4
340
+ zstandard==0.22.0
341
+ warprnnt_pytorch==0.1
scripts/wandb/latest-run/files/wandb-metadata.json ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1029-nvidia-x86_64-with-glibc2.31",
3
+ "python": "CPython 3.12.3",
4
+ "startedAt": "2025-04-10T10:19:28.834922Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--deepspeed",
8
+ "zero2.json",
9
+ "--model_name_or_path",
10
+ "/data1/speech/anhnmt2/Speech2Speech/LLaMA-Omni/models/llm/Qwen2.5-7B-Instruct",
11
+ "--pretrained_llm_path",
12
+ "/data1/speech/anhnmt2/cuongnm/EOT/Qwen2.5-0.5B-Instruct",
13
+ "--tokenizer_path",
14
+ "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6",
15
+ "--cache_dir",
16
+ "../output/cached_sft_20252502",
17
+ "--audio_encoder_path",
18
+ "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/omni_speech/model/minicpmo/MiniCPM-o-2_6",
19
+ "--llm_type",
20
+ "qwen",
21
+ "--data_path",
22
+ "/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/train_asr_mixed_500k.jsonl",
23
+ "--eval_data_path",
24
+ "/data1/speech/anhnmt2/dataset/s2s/minicpmo/asr/dev_asr_mixed.jsonl",
25
+ "--config_path",
26
+ "minicpmp_config.json",
27
+ "--remove_unused_columns",
28
+ "false",
29
+ "--prediction_loss_only",
30
+ "false",
31
+ "--bf16",
32
+ "true",
33
+ "--do_train",
34
+ "--do_eval",
35
+ "--tune_speech",
36
+ "false",
37
+ "--tune_llm",
38
+ "false",
39
+ "--model_max_length",
40
+ "2048",
41
+ "--eval_steps",
42
+ "3000",
43
+ "--output_dir",
44
+ "../checkpoints/minicpmo_whisper-medium_Qwen2.5-0.5B_pretrained-asr-projector",
45
+ "--num_train_epochs",
46
+ "3",
47
+ "--logging_strategy",
48
+ "steps",
49
+ "--per_device_train_batch_size",
50
+ "8",
51
+ "--per_device_eval_batch_size",
52
+ "8",
53
+ "--gradient_accumulation_steps",
54
+ "4",
55
+ "--evaluation_strategy",
56
+ "steps",
57
+ "--save_strategy",
58
+ "steps",
59
+ "--save_steps",
60
+ "5000",
61
+ "--save_total_limit",
62
+ "1",
63
+ "--learning_rate",
64
+ "5e-5",
65
+ "--weight_decay",
66
+ "0.",
67
+ "--warmup_ratio",
68
+ "0.03",
69
+ "--lr_scheduler_type",
70
+ "cosine",
71
+ "--logging_steps",
72
+ "1",
73
+ "--tf32",
74
+ "true",
75
+ "--gradient_checkpointing",
76
+ "true"
77
+ ],
78
+ "program": "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/../omni_speech/train/train_minicpmo_test.py",
79
+ "codePath": "omni_speech/train/train_minicpmo_test.py",
80
+ "git": {
81
+ "remote": "https://bitbucket.org/vinbdi-slp/half-streaming-speech-nlp.git",
82
+ "commit": "3876ef3c080c3ca44ad5ea0bd316241f0323ada6"
83
+ },
84
+ "email": "cuong220103@gmail.com",
85
+ "root": "/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts",
86
+ "host": "dgx-a100-5",
87
+ "executable": "/opt/conda/bin/python3",
88
+ "cpu_count": 128,
89
+ "cpu_count_logical": 256,
90
+ "gpu": "NVIDIA A100-SXM4-40GB",
91
+ "gpu_count": 1,
92
+ "disk": {
93
+ "/": {
94
+ "total": "1900954378240",
95
+ "used": "286067507200"
96
+ }
97
+ },
98
+ "memory": {
99
+ "total": "1081975545856"
100
+ },
101
+ "cpu": {
102
+ "count": 128,
103
+ "countLogical": 256
104
+ },
105
+ "gpu_nvidia": [
106
+ {
107
+ "name": "NVIDIA A100-SXM4-40GB",
108
+ "memoryTotal": "42949672960",
109
+ "cudaCores": 6912,
110
+ "architecture": "Ampere"
111
+ }
112
+ ],
113
+ "slurm": {
114
+ "cluster_name": "slurm",
115
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf",
116
+ "cpus_on_node": "24",
117
+ "cpus_per_task": "24",
118
+ "gpus_on_node": "1",
119
+ "gpus_per_node": "1",
120
+ "gtids": "0",
121
+ "job_cpus_per_node": "24",
122
+ "job_end_time": "1775042326",
123
+ "job_gid": "1400",
124
+ "job_group": "speech",
125
+ "job_id": "5154",
126
+ "job_name": "bash",
127
+ "job_nodelist": "dgx-a100-5",
128
+ "job_num_nodes": "1",
129
+ "job_partition": "defq",
130
+ "job_qos": "normal",
131
+ "job_start_time": "1743506326",
132
+ "job_uid": "1407",
133
+ "job_user": "anhnmt2",
134
+ "jobid": "5154",
135
+ "launch_node_ipaddr": "192.168.100.102",
136
+ "localid": "0",
137
+ "mpi_type": "pmix",
138
+ "nnodes": "1",
139
+ "nodeid": "0",
140
+ "nodelist": "dgx-a100-5",
141
+ "nprocs": "1",
142
+ "ntasks": "1",
143
+ "ntasks_per_node": "1",
144
+ "pmix_mapping_serv": "(vector,(0,1,1))",
145
+ "pmixp_abort_agent_port": "37119",
146
+ "prio_process": "0",
147
+ "procid": "0",
148
+ "pty_port": "45373",
149
+ "pty_win_col": "137",
150
+ "pty_win_row": "10",
151
+ "srun_comm_host": "192.168.100.102",
152
+ "srun_comm_port": "43475",
153
+ "step_gpus": "4",
154
+ "step_id": "0",
155
+ "step_launcher_port": "43475",
156
+ "step_nodelist": "dgx-a100-5",
157
+ "step_num_nodes": "1",
158
+ "step_num_tasks": "1",
159
+ "step_tasks_per_node": "1",
160
+ "stepid": "0",
161
+ "submit_dir": "/data1/speech/anhnmt2/ASR/speechgpt/slurm/submit",
162
+ "submit_host": "login-1",
163
+ "task_pid": "268175",
164
+ "tasks_per_node": "1",
165
+ "topology_addr": "dgx-a100-5",
166
+ "topology_addr_pattern": "node",
167
+ "umask": "0022",
168
+ "working_cluster": "slurm:bcm10-headnode:6817:9984:109"
169
+ },
170
+ "cudaVersion": "12.2"
171
+ }
scripts/wandb/latest-run/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-10T17:19:28.173097267+07:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpm4_vxj8m/port-1734298.txt","pid":1734298,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-04-10T17:19:28.173483898+07:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44091,"Zone":""}}
3
+ {"time":"2025-04-10T17:19:28.173583196+07:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":1734298}
4
+ {"time":"2025-04-10T17:19:28.338675346+07:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:60304"}
5
+ {"time":"2025-04-10T17:19:28.838813222+07:00","level":"INFO","msg":"handleInformInit: received","streamId":"pfaibe0c","id":"127.0.0.1:60304"}
6
+ {"time":"2025-04-10T17:19:28.960357084+07:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"pfaibe0c","id":"127.0.0.1:60304"}
7
+ {"time":"2025-04-10T17:20:36.908864225+07:00","level":"INFO","msg":"received shutdown signal","signal":15}
scripts/wandb/latest-run/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-04-10T17:19:28.842729448+07:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/data1/speech/anhnmt2/Speech2Speech/half-streaming-speech-nlp/scripts/wandb/run-20250410_171928-pfaibe0c/logs/debug-core.log"}
2
+ {"time":"2025-04-10T17:19:28.960322418+07:00","level":"INFO","msg":"created new stream","id":"pfaibe0c"}
3
+ {"time":"2025-04-10T17:19:28.960351593+07:00","level":"INFO","msg":"stream: started","id":"pfaibe0c"}
4
+ {"time":"2025-04-10T17:19:28.960375959+07:00","level":"INFO","msg":"writer: Do: started","stream_id":"pfaibe0c"}
5
+ {"time":"2025-04-10T17:19:28.960456552+07:00","level":"INFO","msg":"handler: started","stream_id":"pfaibe0c"}
6
+ {"time":"2025-04-10T17:19:28.961574927+07:00","level":"INFO","msg":"sender: started","stream_id":"pfaibe0c"}
7
+ {"time":"2025-04-10T17:19:29.497777718+07:00","level":"INFO","msg":"Starting system monitor"}