Gijs Wijngaard committed on
Commit
4346fab
·
1 Parent(s): 6f64d8d
Files changed (1) hide show
  1. app.py +23 -41
app.py CHANGED
@@ -1,30 +1,29 @@
1
  import spaces
2
  import gradio as gr
 
 
3
 
4
  import soundfile as sf
5
- from transformers import AutoModelForCausalLM, AutoProcessor
 
6
 
7
 
8
 
9
 
10
- model_path = "microsoft/Phi-4-multimodal-instruct"
11
 
12
- processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
13
  model = AutoModelForCausalLM.from_pretrained(
14
- model_path,
15
- device_map="auto",
16
- trust_remote_code=True,
17
- _attn_implementation="flash_attention_2",
18
  )
19
 
20
- model.load_adapter(
21
- model_path,
22
- adapter_name="speech",
23
- device_map="auto",
24
- adapter_kwargs={"subfolder": 'speech-lora'}
25
- )
26
  model.set_adapter("speech")
27
 
 
 
28
  @spaces.GPU
29
  def run_phi4(audio_path: str, instruction: str) -> str:
30
  if not audio_path:
@@ -32,38 +31,21 @@ def run_phi4(audio_path: str, instruction: str) -> str:
32
 
33
  audio, samplerate = sf.read(audio_path)
34
 
35
- messages = [
36
- {
37
- "role": "user",
38
- "content": [
39
- {"type": "audio", "url": audio_path},
40
- {"type": "text", "text": instruction},
41
- ],
42
- }
43
- ]
44
-
45
- chat_text = processor.apply_chat_template(
46
- messages,
47
- add_generation_prompt=True,
48
- tokenize=False,
49
- return_dict=False,
50
- )
51
 
52
- inputs = processor(
53
- text=chat_text,
54
- audios=[(audio, samplerate)],
55
- return_tensors="pt",
56
- ).to(model.device)
57
 
58
- generate_ids = model.generate(
59
  **inputs,
60
- max_new_tokens=1000,
61
- do_sample=False,
62
  )
63
- generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
64
- response = processor.batch_decode(
65
- generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
66
- )[0]
67
  return response
68
 
69
 
 
1
  import spaces
2
  import gradio as gr
3
+ import io
4
+ from urllib.request import urlopen
5
 
6
  import soundfile as sf
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
9
 
10
 
11
 
12
 
13
+ MODEL_ID = "microsoft/Phi-4-multimodal-instruct"
14
 
15
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
16
  model = AutoModelForCausalLM.from_pretrained(
17
+ MODEL_ID,
18
+ device_map="cuda" if torch.cuda.is_available() else "cpu",
19
+ torch_dtype="auto",
 
20
  )
21
 
22
+ model.load_adapter(MODEL_ID, adapter_name="speech", device_map="cuda" if torch.cuda.is_available() else "cpu", adapter_kwargs={"subfolder": 'speech-lora'})
 
 
 
 
 
23
  model.set_adapter("speech")
24
 
25
+ generation_config = GenerationConfig.from_pretrained(MODEL_ID)
26
+
27
  @spaces.GPU
28
  def run_phi4(audio_path: str, instruction: str) -> str:
29
  if not audio_path:
 
31
 
32
  audio, samplerate = sf.read(audio_path)
33
 
34
+ user_prompt = "<|user|>"
35
+ assistant_prompt = "<|assistant|>"
36
+ prompt_suffix = "<|end|>"
37
+
38
+ prompt = f"{user_prompt}<|audio_1|>{instruction}{prompt_suffix}{assistant_prompt}"
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(model.device)
 
 
 
 
41
 
42
+ output_ids = model.generate(
43
  **inputs,
44
+ max_new_tokens=4096,
45
+ generation_config=generation_config,
46
  )
47
+ output_ids = output_ids[:, inputs["input_ids"].shape[1]:]
48
+ response = processor.batch_decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
 
49
  return response
50
 
51