Gijs Wijngaard committed on
Commit
b5ad8ed
·
1 Parent(s): dc6e6db
Files changed (1) hide show
  1. app.py +41 -23
app.py CHANGED
@@ -1,30 +1,31 @@
1
  import spaces
2
  import gradio as gr
3
- import io
4
- from urllib.request import urlopen
5
 
6
  import soundfile as sf
7
  import torch
8
- from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
9
 
10
 
11
 
12
 
13
- MODEL_ID = "microsoft/Phi-4-multimodal-instruct"
 
14
 
15
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
16
  model = AutoModelForCausalLM.from_pretrained(
17
- MODEL_ID,
18
- device_map="cuda" if torch.cuda.is_available() else "cpu",
19
- torch_dtype="auto",
20
- _attn_implementation="flash_attention_2",
21
  )
22
 
23
- model.load_adapter(MODEL_ID, adapter_name="speech", device_map="cuda" if torch.cuda.is_available() else "cpu", adapter_kwargs={"subfolder": 'speech-lora'})
 
 
 
 
 
24
  model.set_adapter("speech")
25
 
26
- generation_config = GenerationConfig.from_pretrained(MODEL_ID)
27
-
28
  @spaces.GPU
29
  def run_phi4(audio_path: str, instruction: str) -> str:
30
  if not audio_path:
@@ -32,21 +33,38 @@ def run_phi4(audio_path: str, instruction: str) -> str:
32
 
33
  audio, samplerate = sf.read(audio_path)
34
 
35
- user_prompt = "<|user|>"
36
- assistant_prompt = "<|assistant|>"
37
- prompt_suffix = "<|end|>"
38
-
39
- prompt = f"{user_prompt}<|audio_1|>{instruction}{prompt_suffix}{assistant_prompt}"
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(model.device)
 
 
 
 
42
 
43
- output_ids = model.generate(
44
  **inputs,
45
- max_new_tokens=4096,
46
- generation_config=generation_config,
47
  )
48
- output_ids = output_ids[:, inputs["input_ids"].shape[1]:]
49
- response = processor.batch_decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
 
50
  return response
51
 
52
 
 
1
  import spaces
2
  import gradio as gr
 
 
3
 
4
  import soundfile as sf
5
  import torch
6
+ from transformers import AutoModelForCausalLM, AutoProcessor, infer_device
7
 
8
 
9
 
10
 
11
+ model_path = "microsoft/Phi-4-multimodal-instruct"
12
+ device = f"{infer_device()}:0"
13
 
14
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
15
  model = AutoModelForCausalLM.from_pretrained(
16
+ model_path,
17
+ device_map=device,
18
+ dtype=torch.float16,
 
19
  )
20
 
21
+ model.load_adapter(
22
+ model_path,
23
+ adapter_name="speech",
24
+ device_map=device,
25
+ adapter_kwargs={"subfolder": 'speech-lora'}
26
+ )
27
  model.set_adapter("speech")
28
 
 
 
29
  @spaces.GPU
30
  def run_phi4(audio_path: str, instruction: str) -> str:
31
  if not audio_path:
 
33
 
34
  audio, samplerate = sf.read(audio_path)
35
 
36
+ messages = [
37
+ {
38
+ "role": "user",
39
+ "content": [
40
+ {"type": "audio", "url": audio_path},
41
+ {"type": "text", "text": instruction},
42
+ ],
43
+ }
44
+ ]
45
+
46
+ chat_text = processor.apply_chat_template(
47
+ messages,
48
+ add_generation_prompt=True,
49
+ tokenize=False,
50
+ return_dict=False,
51
+ )
52
 
53
+ inputs = processor(
54
+ text=chat_text,
55
+ audios=[(audio, samplerate)],
56
+ return_tensors="pt",
57
+ ).to(model.device)
58
 
59
+ generate_ids = model.generate(
60
  **inputs,
61
+ max_new_tokens=1000,
62
+ do_sample=False,
63
  )
64
+ generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
65
+ response = processor.batch_decode(
66
+ generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
67
+ )[0]
68
  return response
69
 
70