tezuesh committed on
Commit
4768721
·
verified ·
1 Parent(s): 3188cb2

Upload folder using huggingface_hub

Browse files
inference.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import librosa
4
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
5
+ import soundfile as sf
6
+ import os
7
+
8
+
9
class InferenceRecipe:
    """End-to-end speech-to-speech pipeline.

    Flow: input audio -> Whisper ASR -> DialoGPT text reply -> MMS/VITS TTS,
    with the synthesized reply resampled back to the caller's sample rate.

    Models are loaded from local subdirectories of ``cache_dir``:
    ``asr/``, ``llm/``, and ``tts/``.
    """

    def __init__(self, cache_dir='./models', device='cuda'):
        """Store configuration and eagerly load all three models onto `device`."""
        self.device = device
        self.asr_pipeline = None
        self.chat_tokenizer = None
        self.chat_model = None
        self.tts_model = None
        # Fallback TTS output rate only. The rate actually reported by the
        # TTS pipeline result takes precedence at inference time — the
        # bundled MMS/VITS model emits 16 kHz, not 22.05 kHz.
        self.tts_sample_rate = 22050
        self.cache_dir = cache_dir
        self.initialize_models()

    def initialize_models(self):
        """Initialize ASR, chat, and TTS models from the local cache."""
        # ASR: OpenAI Whisper
        self.asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=os.path.join(self.cache_dir, "asr"),
            device=self.device,
            torch_dtype=torch.float32,
        )

        # Chat: DialoGPT
        dialogpt_path = os.path.join(self.cache_dir, "llm")
        self.chat_tokenizer = AutoTokenizer.from_pretrained(dialogpt_path)
        self.chat_model = AutoModelForCausalLM.from_pretrained(dialogpt_path)
        self.chat_model = self.chat_model.to(self.device)

        # TTS: Facebook MMS (VITS)
        self.tts_model = pipeline(
            "text-to-speech",
            model=os.path.join(self.cache_dir, "tts"),
            device=self.device,
            torch_dtype=torch.float32,
        )

    def inference(self, audio_array, sample_rate):
        """Run the full speech-to-speech pipeline on raw audio.

        Args:
            audio_array: 1-D float waveform of the user's speech.
            sample_rate: sample rate of ``audio_array`` in Hz; the reply
                audio is resampled to this rate before being returned.

        Returns:
            1-D float32 numpy array in [-1.0, 1.0] at ``sample_rate``.
        """
        # Speech-to-Text using Whisper
        text = self.asr_pipeline({"raw": audio_array, "sampling_rate": sample_rate})["text"]

        # Generate response with an explicit attention mask (required to
        # disambiguate padding when pad_token_id == eos_token_id).
        input_ids = self.chat_tokenizer.encode(text + self.chat_tokenizer.eos_token, return_tensors="pt")
        attention_mask = torch.ones_like(input_ids)
        chat_output = self.chat_model.generate(
            input_ids.to(self.device),
            attention_mask=attention_mask.to(self.device),
            max_length=1000,
            pad_token_id=self.chat_tokenizer.eos_token_id,
        )
        # Keep only the newly generated tokens (drop the echoed prompt).
        reply = self.chat_tokenizer.decode(chat_output[:, input_ids.shape[-1]:][0], skip_special_tokens=True)

        # Text-to-Speech using the HF pipeline
        tts_output = self.tts_model(reply)
        audio_array = tts_output['audio']
        # BUG FIX: trust the rate reported by the TTS pipeline result.
        # MMS/VITS emits 16 kHz; the previous hardcoded 22050 caused
        # resampling from the wrong source rate (wrong pitch/speed).
        tts_sr = tts_output.get('sampling_rate', self.tts_sample_rate)

        # Normalize to a 1-D float32 waveform BEFORE resampling — the
        # pipeline can return a (1, n)-shaped array.
        audio_array = np.asarray(audio_array, dtype=np.float32)
        if audio_array.ndim > 1:
            audio_array = audio_array.squeeze()

        # Resample to match the caller's input rate
        if sample_rate != tts_sr:
            audio_array = librosa.resample(
                audio_array,
                orig_sr=tts_sr,
                target_sr=sample_rate,
            )

        # Clip AFTER resampling: interpolation can overshoot [-1, 1].
        audio_array = np.clip(audio_array, -1.0, 1.0)

        return audio_array
82
+
83
if __name__ == "__main__":
    # Smoke test: run the pipeline on 3 seconds of silence.
    recipe = InferenceRecipe(cache_dir="./models")  # Specify your cache directory here
    sr = 16000
    duration = 3
    # BUG FIX: np.zeros defaults to float64; Whisper's feature extractor
    # expects float32 raw audio, so cast explicitly.
    audio = np.zeros(int(sr * duration), dtype=np.float32)  # Silent input
    response_audio = recipe.inference(audio, sr)

    print(f"Audio shape: {response_audio.shape}, Range: [{response_audio.min()}, {response_audio.max()}]")

    # Save with an explicit container/subtype so the output is a
    # 32-bit float WAV regardless of soundfile defaults.
    sf.write(
        "response.wav",
        response_audio,
        sr,
        format='WAV',
        subtype='FLOAT',
    )
models/asr/config.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/whisper-small.en",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50256
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 50257,
20
+ "dropout": 0.0,
21
+ "encoder_attention_heads": 12,
22
+ "encoder_ffn_dim": 3072,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 12,
25
+ "eos_token_id": 50256,
26
+ "forced_decoder_ids": [
27
+ [
28
+ 1,
29
+ 50362
30
+ ]
31
+ ],
32
+ "init_std": 0.02,
33
+ "is_encoder_decoder": true,
34
+ "max_length": 448,
35
+ "max_source_positions": 1500,
36
+ "max_target_positions": 448,
37
+ "model_type": "whisper",
38
+ "num_hidden_layers": 12,
39
+ "num_mel_bins": 80,
40
+ "pad_token_id": 50256,
41
+ "scale_embedding": false,
42
+ "suppress_tokens": [
43
+ 1,
44
+ 2,
45
+ 7,
46
+ 8,
47
+ 9,
48
+ 10,
49
+ 14,
50
+ 25,
51
+ 26,
52
+ 27,
53
+ 28,
54
+ 29,
55
+ 31,
56
+ 58,
57
+ 59,
58
+ 60,
59
+ 61,
60
+ 62,
61
+ 63,
62
+ 90,
63
+ 91,
64
+ 92,
65
+ 93,
66
+ 357,
67
+ 366,
68
+ 438,
69
+ 532,
70
+ 685,
71
+ 705,
72
+ 796,
73
+ 930,
74
+ 1058,
75
+ 1220,
76
+ 1267,
77
+ 1279,
78
+ 1303,
79
+ 1343,
80
+ 1377,
81
+ 1391,
82
+ 1635,
83
+ 1782,
84
+ 1875,
85
+ 2162,
86
+ 2361,
87
+ 2488,
88
+ 3467,
89
+ 4008,
90
+ 4211,
91
+ 4600,
92
+ 4808,
93
+ 5299,
94
+ 5855,
95
+ 6329,
96
+ 7203,
97
+ 9609,
98
+ 9959,
99
+ 10563,
100
+ 10786,
101
+ 11420,
102
+ 11709,
103
+ 11907,
104
+ 13163,
105
+ 13697,
106
+ 13700,
107
+ 14808,
108
+ 15306,
109
+ 16410,
110
+ 16791,
111
+ 17992,
112
+ 19203,
113
+ 19510,
114
+ 20724,
115
+ 22305,
116
+ 22935,
117
+ 27007,
118
+ 30109,
119
+ 30420,
120
+ 33409,
121
+ 34949,
122
+ 40283,
123
+ 40493,
124
+ 40549,
125
+ 47282,
126
+ 49146,
127
+ 50257,
128
+ 50357,
129
+ 50358,
130
+ 50359,
131
+ 50360,
132
+ 50361
133
+ ],
134
+ "torch_dtype": "float32",
135
+ "transformers_version": "4.27.0.dev0",
136
+ "use_cache": true,
137
+ "vocab_size": 51864
138
+ }
models/asr/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6014ac49b506df900f66f4aca6b0801eed7245594ace97bcaf73e0ae5b863066
3
+ size 966992008
models/asr/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
models/asr/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/llm/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 1024,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_layer": 12,
17
+ "n_positions": 1024,
18
+ "resid_pdrop": 0.1,
19
+ "summary_activation": null,
20
+ "summary_first_dropout": 0.1,
21
+ "summary_proj_to_labels": true,
22
+ "summary_type": "cls_index",
23
+ "summary_use_proj": true,
24
+ "task_specific_params": {
25
+ "conversational": {
26
+ "max_length": 1000
27
+ }
28
+ },
29
+ "vocab_size": 50257
30
+ }
models/llm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/llm/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea50e39b0e9368b9110cc5ab48012ddbc7bd90f2b17410aed643a7404acac567
3
+ size 351256598
models/llm/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
16
+ "clean_up_tokenization_spaces": true,
17
+ "eos_token": "<|endoftext|>",
18
+ "errors": "replace",
19
+ "model_max_length": 1024,
20
+ "pad_token": null,
21
+ "tokenizer_class": "GPT2Tokenizer",
22
+ "unk_token": "<|endoftext|>"
23
+ }
models/llm/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/tts/config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "architectures": [
4
+ "VitsModel"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "depth_separable_channels": 2,
8
+ "depth_separable_num_layers": 3,
9
+ "duration_predictor_dropout": 0.5,
10
+ "duration_predictor_filter_channels": 256,
11
+ "duration_predictor_flow_bins": 10,
12
+ "duration_predictor_kernel_size": 3,
13
+ "duration_predictor_num_flows": 4,
14
+ "duration_predictor_tail_bound": 5.0,
15
+ "ffn_dim": 768,
16
+ "ffn_kernel_size": 3,
17
+ "flow_size": 192,
18
+ "hidden_act": "relu",
19
+ "hidden_dropout": 0.1,
20
+ "hidden_size": 192,
21
+ "initializer_range": 0.02,
22
+ "layer_norm_eps": 1e-05,
23
+ "layerdrop": 0.1,
24
+ "leaky_relu_slope": 0.1,
25
+ "model_type": "vits",
26
+ "noise_scale": 0.667,
27
+ "noise_scale_duration": 0.8,
28
+ "num_attention_heads": 2,
29
+ "num_hidden_layers": 6,
30
+ "num_speakers": 1,
31
+ "posterior_encoder_num_wavenet_layers": 16,
32
+ "prior_encoder_num_flows": 4,
33
+ "prior_encoder_num_wavenet_layers": 4,
34
+ "resblock_dilation_sizes": [
35
+ [
36
+ 1,
37
+ 3,
38
+ 5
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ]
50
+ ],
51
+ "resblock_kernel_sizes": [
52
+ 3,
53
+ 7,
54
+ 11
55
+ ],
56
+ "sampling_rate": 16000,
57
+ "speaker_embedding_size": 0,
58
+ "speaking_rate": 1.0,
59
+ "spectrogram_bins": 513,
60
+ "torch_dtype": "float32",
61
+ "transformers_version": "4.33.0.dev0",
62
+ "upsample_initial_channel": 512,
63
+ "upsample_kernel_sizes": [
64
+ 16,
65
+ 16,
66
+ 4,
67
+ 4
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2
74
+ ],
75
+ "use_bias": true,
76
+ "use_stochastic_duration_prediction": true,
77
+ "vocab_size": 38,
78
+ "wavenet_dilation_rate": 1,
79
+ "wavenet_dropout": 0.0,
80
+ "wavenet_kernel_size": 5,
81
+ "window_size": 4
82
+ }
models/tts/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69cf8b651c1493f1801dfd2311c298d694a38357bc9a1e41f410491ea6f0e1be
3
+ size 145227512
models/tts/special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "pad_token": "k",
3
+ "unk_token": "<unk>"
4
+ }
models/tts/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_blank": true,
3
+ "clean_up_tokenization_spaces": true,
4
+ "is_uroman": false,
5
+ "language": "eng",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "normalize": true,
8
+ "pad_token": "k",
9
+ "phonemize": false,
10
+ "tokenizer_class": "VitsTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
models/tts/vocab.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 19,
3
+ "'": 1,
4
+ "-": 14,
5
+ "0": 23,
6
+ "1": 15,
7
+ "2": 28,
8
+ "3": 11,
9
+ "4": 27,
10
+ "5": 35,
11
+ "6": 36,
12
+ "_": 30,
13
+ "a": 26,
14
+ "b": 24,
15
+ "c": 12,
16
+ "d": 5,
17
+ "e": 7,
18
+ "f": 20,
19
+ "g": 37,
20
+ "h": 6,
21
+ "i": 18,
22
+ "j": 16,
23
+ "k": 0,
24
+ "l": 21,
25
+ "m": 17,
26
+ "n": 29,
27
+ "o": 22,
28
+ "p": 13,
29
+ "q": 34,
30
+ "r": 25,
31
+ "s": 8,
32
+ "t": 33,
33
+ "u": 4,
34
+ "v": 32,
35
+ "w": 9,
36
+ "x": 31,
37
+ "y": 3,
38
+ "z": 2,
39
+ "–": 10
40
+ }
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers==4.48.1
2
+ torch==2.5.1
3
+ librosa==0.10.2.post1
4
+ soundfile
5
+ numpy