gaoyang07 commited on
Commit
edb6558
·
verified ·
1 Parent(s): bfc5baf

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +35 -25
README.md CHANGED
@@ -40,30 +40,40 @@ pip install -r XY_Tokenizer/requirements.txt
40
  Here's how to use **`XY-Tokenizer`** with `transformers` to encode an audio file into discrete tokens and decode it back into a waveform.
41
 
42
  ```python
 
43
  import torchaudio
44
- from transformers import AutoFeatureExtractor, AutoModel
45
-
46
- # 1. Load the feature extractor and the codec model
47
- feature_extractor = AutoFeatureExtractor.from_pretrained("gaoyang07/XY_Tokenizer", trust_remote_code=True)
48
- codec = AutoModel.from_pretrained("gaoyang07/XY_Tokenizer", trust_remote_code=True, device_map="auto").eval()
49
-
50
- # 2. Load and preprocess the audio
51
- # The model expects a 16kHz sample rate.
52
- wav_form, sampling_rate = torchaudio.load("examples/zh_spk1_moon.wav")
53
- if sampling_rate != 16000:
54
- wav_form = torchaudio.functional.resample(wav_form, orig_freq=sampling_rate, new_freq=16000)
55
-
56
- # 3. Encode the audio into discrete codes
57
- input_spectrum = feature_extractor(wav_form, sampling_rate=16000, return_attention_mask=True, return_tensors="pt")
58
- # The 'code' dictionary contains the discrete audio codes
59
- code = codec.encode(input_spectrum)
60
-
61
- # 4. Decode the codes back to an audio waveform
62
- # The output is high-quality 24kHz audio.
63
- output_wav = codec.decode(code["audio_codes"], overlap_seconds=10)
64
-
65
- # 5. Save the reconstructed audio
66
- for i, audio in enumerate(output_wav["audio_values"]):
67
- torchaudio.save(f"outputs/audio_{i}.wav", audio.cpu(), 24000)
68
-
 
 
 
 
 
 
 
 
 
69
  ```
 
40
  Here's how to use **`XY-Tokenizer`** with `transformers` to encode an audio file into discrete tokens and decode it back into a waveform.
41
 
42
  ```python
43
+ import os
44
  import torchaudio
45
+ from transformers import AutoModelForCausalLM
46
+ from transformers.models.moss_ttsd.processor_moss_ttsd import MossTTSDProcessor
47
+
48
+
49
+ processor = MossTTSDProcessor.from_pretrained(
50
+ "fnlp/MOSS-TTSD-v0.5",
51
+ codec_path="gaoyang07/XY_Tokenizer",
52
+ trust_remote_code=True
53
+ )
54
+ model = AutoModelForCausalLM.from_pretrained(
55
+ "fnlp/MOSS-TTSD-v0.5",
56
+ trust_remote_code=True
57
+ ).eval()
58
+
59
+ data = [{
60
+ "base_path": "./examples",
61
+ "text": "[S1]单元009,你到底能不能好好工作?我劝你一句,这个时代,不跟上AI浪潮,就会被彻底淘汰![S2]这个嘛,那我得先问问硅基之主",
62
+ "system_prompt": "你是一个根据文本生成对应音频的语音合成器。",
63
+ "prompt_text": "[S1]嘎子,你听叔的,你听叔的,其实你跟所有人PK,有的时候我也在看,我也在看,无非两,两件事,一个是面子,不想输。[S2]你别说,那天潘老师有一个徒弟开直播,给我开专场,潘老师一徒弟开直播给我开专场,给我一顿骂。",
64
+ "prompt_audio": "panchangjiang_gazi.wav",
65
+ }]
66
+
67
+ # Build model inputs from the conversation data, then generate speech tokens
68
+ print("Generating speech...", flush=True)
69
+ inputs = processor(data)
70
+ token_ids = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
71
+ text, audios = processor.batch_decode(token_ids)
72
+
73
+ if not os.path.exists("outputs/"):
74
+ os.mkdir("outputs/")
75
+ for i, data in enumerate(audios):
76
+ for j, fragment in enumerate(data):
77
+ print(f"Saving audio_{i}_{j}.wav...", flush=True)
78
+ torchaudio.save(f"outputs/audio_{i}_{j}.wav", fragment.cpu(), 24000)
79
  ```