File size: 2,565 Bytes
eaf61fe
9a41897
3d1304c
 
 
b5ad8ed
3d1304c
eaf61fe
9804570
3d1304c
b5ad8ed
 
3d1304c
b5ad8ed
3d1304c
b5ad8ed
 
 
3d1304c
 
b5ad8ed
 
 
 
 
 
dc6e6db
 
9804570
f2c31e0
3d1304c
 
 
 
 
b5ad8ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d1304c
b5ad8ed
 
 
 
 
3d1304c
b5ad8ed
3d1304c
b5ad8ed
 
3d1304c
b5ad8ed
 
 
 
3d1304c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2c31e0
3d1304c
 
 
9804570
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import spaces
import gradio as gr

import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, infer_device




# Hugging Face Hub id of the Phi-4 multimodal checkpoint used by this demo.
model_path = "microsoft/Phi-4-multimodal-instruct"
# infer_device() picks the best available backend (e.g. "cuda", "cpu");
# ":0" pins the first device of that kind.
device = f"{infer_device()}:0"

# The processor bundles the tokenizer and the audio feature extractor.
# trust_remote_code is required because Phi-4 ships custom model code.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    dtype=torch.float16,
)

# Load the speech LoRA adapter shipped in the checkpoint's "speech-lora"
# subfolder and make it the active adapter for all subsequent generations.
model.load_adapter(
    model_path,
    adapter_name="speech",
    device_map=device,
    adapter_kwargs={"subfolder": 'speech-lora'}
)
model.set_adapter("speech")

@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    """Run the Phi-4 multimodal model on an uploaded audio clip.

    Args:
        audio_path: Filesystem path to the uploaded audio file; falsy when
            the user has not uploaded anything.
        instruction: Natural-language instruction to apply to the audio.

    Returns:
        The model's decoded text response, or a short user-facing message
        when a required input is missing.
    """
    if not audio_path:
        return "Please upload an audio file."
    # Guard against sending an empty prompt to the model.
    if not instruction or not instruction.strip():
        return "Please provide an instruction."

    audio, samplerate = sf.read(audio_path)
    # soundfile returns shape (frames, channels) for multi-channel files.
    # Downmix to mono — presumably the speech adapter expects a single
    # channel (TODO confirm against the Phi-4 processor documentation).
    if getattr(audio, "ndim", 1) > 1:
        audio = audio.mean(axis=1)

    # The chat template replaces the audio entry with the model's audio
    # placeholder token; the actual waveform is passed via `audios` below.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "url": audio_path},
                {"type": "text", "text": instruction},
            ],
        }
    ]

    chat_text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        return_dict=False,
    )

    inputs = processor(
        text=chat_text,
        audios=[(audio, samplerate)],
        return_tensors="pt",
    ).to(model.device)

    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        do_sample=False,  # greedy decoding for reproducible transcripts
    )
    # Strip the prompt tokens so only the newly generated text is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response


# --- Gradio interface: audio upload + instruction on the left, model
# --- response on the right, wired to run_phi4 via the Run button.
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")

    with gr.Row():
        with gr.Column():
            uploaded_audio = gr.Audio(type="filepath", label="Upload Audio")
            prompt_box = gr.Textbox(
                label="Instruction",
                value=(
                    "Transcribe the audio to text, and then translate the audio to French. "
                    "Use <sep> as a separator between the original transcript and the translation."
                ),
            )
            run_button = gr.Button("Run", variant="primary")
        with gr.Column():
            response_box = gr.Textbox(label="Model Response", lines=14)

    run_button.click(run_phi4, [uploaded_audio, prompt_box], response_box)


if __name__ == "__main__":
    # queue() serializes GPU requests; ssr_mode=False avoids server-side
    # rendering, matching the original launch configuration.
    demo.queue().launch(share=False, ssr_mode=False)