matroks committed on
Commit
abc41b6
·
verified ·
1 Parent(s): 65fda47

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import soundfile as sf
4
+ from transformers import AutoProcessor
5
+ from peft import PeftModel
6
+ from transformers import CsmForConditionalGeneration
7
+
8
# --- Runtime configuration ---------------------------------------------------
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Sample rate (Hz) the reference audio is converted to and the output WAV is
# written with.
sampling_rate = 24_000

# Hub identifiers: the base checkpoint and the adapter loaded on top of it.
base_id = "unsloth/csm-1b"
adapter_id = "TurkishCodeMan/csm-1b-lora-fft"

# --- Processor and model -----------------------------------------------------
processor = AutoProcessor.from_pretrained(base_id)

# Load the base model with its dtype chosen by "auto", then move it to `device`.
base = CsmForConditionalGeneration.from_pretrained(base_id, torch_dtype="auto")
base = base.to(device)

# Attach the adapter to the base model, move the result to `device`, and put
# it in evaluation mode since this script only generates.
model = PeftModel.from_pretrained(base, adapter_id)
model = model.to(device)
model.eval()
18
+
19
+ def _resample_linear(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
20
+ if orig_sr == target_sr:
21
+ return audio
22
+ if audio.ndim == 2:
23
+ audio = audio.mean(axis=1)
24
+ n = audio.shape[0]
25
+ new_n = int(round(n * (target_sr / orig_sr)))
26
+ if new_n <= 1:
27
+ return audio[:1].astype(np.float32)
28
+ x_old = np.linspace(0.0, 1.0, num=n, endpoint=True)
29
+ x_new = np.linspace(0.0, 1.0, num=new_n, endpoint=True)
30
+ return np.interp(x_new, x_old, audio).astype(np.float32)
31
+
32
# Reference audio: read the WAV at `ref_path` as float32 samples.
ref_path = "reference.wav"
ref_audio, ref_sr = sf.read(ref_path, dtype="float32")

# A 2-D read is collapsed to one channel by averaging over axis 1.
if ref_audio.ndim == 2:
    ref_audio = ref_audio.mean(axis=1)
    ref_audio = ref_audio.astype(np.float32)

# Convert to the script's sampling rate only when the file's rate differs.
if sampling_rate != ref_sr:
    ref_audio = _resample_linear(ref_audio, orig_sr=ref_sr, target_sr=sampling_rate)
39
+
40
# Transcript paired with the reference audio, and the sentence to synthesise.
ref_text = "Reference transcript (optional)."
target_text = "We extend the standard NIAH task, to investigate model behavior in previously underexplored settings."

# Both turns use the same role string.
# NOTE(review): presumably the role is the speaker id, so the output matches
# the reference voice -- confirm against the processor's chat template.
speaker_role = "0"

# Turn 1: an instruction plus the reference transcript, with the reference audio.
reference_turn = {
    "role": speaker_role,
    "content": [
        {"type": "text", "text": "Please speak english\n\n" + ref_text},
        {"type": "audio", "audio": ref_audio},
    ],
}

# Turn 2: text only -- this is the turn the model is asked to voice.
target_turn = {
    "role": speaker_role,
    "content": [{"type": "text", "text": target_text}],
}

conversation = [reference_turn, target_turn]
59
+
60
# Turn the conversation into PyTorch model inputs and move them to `device`.
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(device)

# Generation settings. The `depth_decoder_*` values are passed separately from
# the plain temperature / top_k / top_p values.
# NOTE(review): presumably they control sampling in the model's depth decoder
# -- confirm in the CSM generate() documentation.
generation_kwargs = {
    "output_audio": True,
    "max_new_tokens": 200,
    "depth_decoder_temperature": 0.6,
    "depth_decoder_top_k": 0,
    "depth_decoder_top_p": 0.7,
    "temperature": 0.3,
    "top_k": 50,
    "top_p": 1.0,
}

# Gradients are not needed for generation.
with torch.no_grad():
    out = model.generate(**inputs, **generation_kwargs)

# Take the first returned item, detach it, bring it to the CPU as float32 and
# convert it to a NumPy array for soundfile.
first_clip = out[0].detach().cpu()
generated_audio = first_clip.to(torch.float32).numpy()

sf.write("generated_audio.wav", generated_audio, samplerate=sampling_rate)
print("Wrote generated_audio.wav")