cmots commited on
Commit
efdea0e
·
verified ·
1 Parent(s): 055afce

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +56 -21
README.md CHANGED
@@ -27,13 +27,34 @@ UniSS supports English and Chinese now.
27
  - **Demo:** https://cmots.github.io/uniss.github.io
28
 
29
  ## Quick Start
30
- 1. Install the environment
31
  ```bash
32
  conda create -n uniss python=3.10.16
33
  conda activate uniss
34
- pip install uniss
 
 
 
 
35
  ```
36
- 2. Run the code
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ``` python
38
  import soundfile
39
  from uniss import UniSSTokenizer
@@ -41,44 +62,58 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
41
  import torch
42
  from uniss import process_input, process_output
43
 
 
44
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
45
 
46
  wav_path = "prompt_audio.wav"
 
47
 
48
- model_path = "cmots/UniSS"
 
 
 
49
 
50
- # load the model, text tokenizer, and speech tokenizer
51
- model = AutoModelForCausalLM.from_pretrained(model_path)
52
  tokenizer = AutoTokenizer.from_pretrained(model_path)
53
- speech_tokenizer = UniSSTokenizer.from_pretrained(model_path)
54
 
55
- # extract speech tokens
 
 
56
  glm4_tokens, bicodec_tokens = speech_tokenizer.tokenize(wav_path)
57
 
58
- tgt_lang = "<|eng|>"
59
 
60
- # process the input
61
- input_text = process_input(glm4_tokens, bicodec_tokens, "Quality", tgt_lang)
 
62
 
63
- # translate the speech
64
  output = model.generate(
65
- glm4_tokens,
66
- bicodec_tokens,
67
- max_new_tokens=100,
68
- num_beams=1,
69
- early_stopping=True,
70
  )
71
- output_text = tokenizer.decode(output, skip_special_tokens=True)
72
 
73
- audio, translation, transcription = process_output(output_text, input_text, speech_tokenizer, "Quality", device)
 
74
 
 
 
 
 
75
  soundfile.write("output_audio.wav", audio, 16000)
76
- print(translation)
77
- print(transcription)
78
 
 
 
 
79
  ```
80
 
 
 
81
  ## Citation
 
82
  ```bibtex
83
 
84
  ```
 
27
  - **Demo:** https://cmots.github.io/uniss.github.io
28
 
29
  ## Quick Start
30
+ 1. Install the environment and get the code
31
  ```bash
32
  conda create -n uniss python=3.10.16
33
  conda activate uniss
34
+ git clone https://github.com/cmots/UniSS.git
35
+ cd UniSS
36
+ pip install -r requirements.txt
37
+ # If you are in mainland China, you can set the mirror as follows:
38
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
39
  ```
40
+ 2. Download the weights
41
+ The weights of UniSS are on [HuggingFace](https://huggingface.co/cmots/UniSS).
42
+
43
+ You have to download the model manually; you can download it via the provided script:
44
+ ```
45
+ python download_weight.py
46
+ ```
47
+
48
+ or download it via git clone (skip this if you have already downloaded it via the Python script):
49
+ ``` bash
50
+ mkdir -p pretrained_models
51
+
52
+ # Make sure you have git-lfs installed (https://git-lfs.com)
53
+ git lfs install
54
+
55
+ git clone https://huggingface.co/cmots/UniSS pretrained_models/UniSS
56
+ ```
57
+ 3. Run the code
58
  ``` python
59
  import soundfile
60
  from uniss import UniSSTokenizer
 
62
  import torch
63
  from uniss import process_input, process_output
64
 
65
+ # 1. Set the device, wav path, model path
66
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
67
 
68
  wav_path = "prompt_audio.wav"
69
+ model_path = "pretrained_models/UniSS"
70
 
71
+ # 2. Set the mode and target language
72
+ mode = 'Quality' # 'Quality' or 'Performance'
73
+ tgt_lang = "<|eng|>" # for English output
74
+ # tgt_lang = "<|cmn|>" # for Chinese output
75
 
76
+ # 3. load the model, text tokenizer, and speech tokenizer
77
+ model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
78
  tokenizer = AutoTokenizer.from_pretrained(model_path)
 
79
 
80
+ speech_tokenizer = UniSSTokenizer.from_pretrained(model_path, device=device)
81
+
82
+ # 4. extract speech tokens
83
  glm4_tokens, bicodec_tokens = speech_tokenizer.tokenize(wav_path)
84
 
 
85
 
86
+ # 5. process the input
87
+ input_text = process_input(glm4_tokens, bicodec_tokens, mode, tgt_lang)
88
+ input_token_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
89
 
90
+ # 6. translate the speech
91
  output = model.generate(
92
+ input_token_ids,
93
+ max_new_tokens=1500,
94
+ temperature=0.8,
95
+ top_p=0.8,
96
+ repetition_penalty=1.1
97
  )
 
98
 
99
+ # 7. decode the output
100
+ output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
101
 
102
+ # 8. process the output
103
+ audio, translation, transcription = process_output(output_text[0], input_text, speech_tokenizer, mode, device)
104
+
105
+ # 9. save and show the results
106
  soundfile.write("output_audio.wav", audio, 16000)
 
 
107
 
108
+ if mode == 'Quality':
109
+ print("Transcription:\n", transcription)
110
+ print("Translation:\n", translation)
111
  ```
112
 
113
+ More examples and details are available on [our GitHub repo](https://github.com/cmots/UniSS).
114
+
115
  ## Citation
116
+ If you find our paper and code useful in your research, please consider giving us a like and a citation.
117
  ```bibtex
118
 
119
  ```