Upload folder using huggingface_hub
Browse files- README.md +28 -1
- example.py +64 -13
README.md
CHANGED
|
@@ -107,7 +107,34 @@ pip install torch torchaudio huggingface_hub
|
|
| 107 |
|
| 108 |
## Easy Usage with Automatic Download
|
| 109 |
|
| 110 |
-
See [example.py](https://huggingface.co/Tabahi/CUPE-2i/blob/main/example.py) for a bootstrap example.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
To use `huggingface_hub` to automatically download and run:
|
| 113 |
|
|
|
|
| 107 |
|
| 108 |
## Easy Usage with Automatic Download
|
| 109 |
|
| 110 |
+
See [example.py](https://huggingface.co/Tabahi/CUPE-2i/blob/main/example.py) for a bootstrap example. Running it on the bundled sample audio [109867__timkahn__butterfly.wav.wav](samples/109867__timkahn__butterfly.wav.wav) should produce the following output:
|
| 111 |
+
|
| 112 |
+
```stdout
|
| 113 |
+
Loading CUPE english model...
|
| 114 |
+
Model loaded on cpu
|
| 115 |
+
Processing audio: 1.26s duration
|
| 116 |
+
Processed 75 frames (1200ms total)
|
| 117 |
+
|
| 118 |
+
Results:
|
| 119 |
+
Phoneme predictions shape: (75,)
|
| 120 |
+
Group predictions shape: (75,)
|
| 121 |
+
Model info: {'model_name': 'english', 'sample_rate': 16000, 'frames_per_second': 62.5, 'num_phoneme_classes': 67, 'num_group_classes': 17}
|
| 122 |
+
|
| 123 |
+
First 10 frame predictions:
|
| 124 |
+
Frame 0: phoneme=66, group=16
|
| 125 |
+
Frame 1: phoneme=66, group=16
|
| 126 |
+
Frame 2: phoneme=29, group=7
|
| 127 |
+
Frame 3: phoneme=66, group=16
|
| 128 |
+
Frame 4: phoneme=66, group=16
|
| 129 |
+
Frame 5: phoneme=66, group=16
|
| 130 |
+
Frame 6: phoneme=10, group=2
|
| 131 |
+
Frame 7: phoneme=66, group=16
|
| 132 |
+
Frame 8: phoneme=66, group=16
|
| 133 |
+
Frame 9: phoneme=66, group=16
|
| 134 |
+
|
| 135 |
+
Phonemes sequence: ['b', 'ʌ', 't', 'h', 'ʌ', 'f', 'l', 'æ']...
|
| 136 |
+
Groups sequence: ['voiced_stops', 'central_vowels', 'voiceless_stops', 'voiceless_fricatives', 'central_vowels', 'voiceless_fricatives', 'laterals', 'low_vowels']...
|
| 137 |
+
```
|
| 138 |
|
| 139 |
To use `huggingface_hub` to automatically download and run:
|
| 140 |
|
example.py
CHANGED
|
@@ -41,6 +41,7 @@ def load_cupe_model(model_name="english", device="auto"):
|
|
| 41 |
|
| 42 |
model_file = hf_hub_download(repo_id=repo_id, filename="model2i.py")
|
| 43 |
windowing_file = hf_hub_download(repo_id=repo_id, filename="windowing.py")
|
|
|
|
| 44 |
checkpoint_file = hf_hub_download(repo_id=repo_id, filename=f"ckpt/{model_files[model_name]}")
|
| 45 |
|
| 46 |
# Dynamically import the modules
|
|
@@ -53,12 +54,18 @@ def load_cupe_model(model_name="english", device="auto"):
|
|
| 53 |
|
| 54 |
model2i = import_module_from_file("model2i", model_file)
|
| 55 |
windowing = import_module_from_file("windowing", windowing_file)
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# Initialize the model
|
| 58 |
extractor = model2i.CUPEEmbeddingsExtractor(checkpoint_file, device=device)
|
| 59 |
|
| 60 |
print(f"Model loaded on {device}")
|
| 61 |
-
return extractor, windowing
|
| 62 |
|
| 63 |
def predict_phonemes(audio_path, model_name="english", device="auto"):
|
| 64 |
"""
|
|
@@ -74,7 +81,7 @@ def predict_phonemes(audio_path, model_name="english", device="auto"):
|
|
| 74 |
"""
|
| 75 |
|
| 76 |
# Load model
|
| 77 |
-
extractor, windowing = load_cupe_model(model_name, device)
|
| 78 |
|
| 79 |
# Audio processing parameters
|
| 80 |
sample_rate = 16000
|
|
@@ -88,10 +95,16 @@ def predict_phonemes(audio_path, model_name="english", device="auto"):
|
|
| 88 |
if audio.shape[0] > 1:
|
| 89 |
audio = audio.mean(dim=0, keepdim=True)
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
# Move to device and add batch dimension
|
| 97 |
audio = audio.to(device)
|
|
@@ -148,18 +161,25 @@ def predict_phonemes(audio_path, model_name="english", device="auto"):
|
|
| 148 |
phoneme_preds = torch.argmax(phoneme_probs, dim=-1)
|
| 149 |
group_preds = torch.argmax(group_probs, dim=-1)
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# Calculate timestamps (approximately 16ms per frame)
|
| 152 |
num_frames = phoneme_probs.shape[0]
|
| 153 |
-
timestamps_ms = torch.arange(num_frames) * 16 # ~16ms per frame
|
| 154 |
|
| 155 |
-
print(f"
|
| 156 |
|
| 157 |
return {
|
| 158 |
'phoneme_probabilities': phoneme_probs.cpu().numpy(),
|
| 159 |
'phoneme_predictions': phoneme_preds.cpu().numpy(),
|
| 160 |
'group_probabilities': group_probs.cpu().numpy(),
|
| 161 |
'group_predictions': group_preds.cpu().numpy(),
|
| 162 |
-
'
|
|
|
|
| 163 |
'model_info': {
|
| 164 |
'model_name': model_name,
|
| 165 |
'sample_rate': sample_rate,
|
|
@@ -175,9 +195,12 @@ if __name__ == "__main__":
|
|
| 175 |
# Simple example
|
| 176 |
audio_file = "samples/109867__timkahn__butterfly.wav.wav" # Replace with your audio file
|
| 177 |
|
|
|
|
| 178 |
if not os.path.exists(audio_file):
|
| 179 |
print(f"Audio file {audio_file} does not exist. Please provide a valid path.")
|
| 180 |
sys.exit(1)
|
|
|
|
|
|
|
| 181 |
# Predict with English model
|
| 182 |
results = predict_phonemes(
|
| 183 |
audio_path=audio_file,
|
|
@@ -188,12 +211,40 @@ if __name__ == "__main__":
|
|
| 188 |
print(f"\nResults:")
|
| 189 |
print(f"Phoneme predictions shape: {results['phoneme_predictions'].shape}")
|
| 190 |
print(f"Group predictions shape: {results['group_predictions'].shape}")
|
| 191 |
-
print(f"Timestamps shape: {results['timestamps_ms'].shape}")
|
| 192 |
print(f"Model info: {results['model_info']}")
|
| 193 |
|
| 194 |
# Show first 10 predictions with timestamps
|
| 195 |
print(f"\nFirst 10 frame predictions:")
|
| 196 |
for i in range(min(10, len(results['phoneme_predictions']))):
|
| 197 |
print(f"Frame {i}: phoneme={results['phoneme_predictions'][i]}, "
|
| 198 |
-
f"group={results['group_predictions'][i]}
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
model_file = hf_hub_download(repo_id=repo_id, filename="model2i.py")
|
| 43 |
windowing_file = hf_hub_download(repo_id=repo_id, filename="windowing.py")
|
| 44 |
+
mapper_file = hf_hub_download(repo_id=repo_id, filename="mapper.py")
|
| 45 |
checkpoint_file = hf_hub_download(repo_id=repo_id, filename=f"ckpt/{model_files[model_name]}")
|
| 46 |
|
| 47 |
# Dynamically import the modules
|
|
|
|
| 54 |
|
| 55 |
model2i = import_module_from_file("model2i", model_file)
|
| 56 |
windowing = import_module_from_file("windowing", windowing_file)
|
| 57 |
+
mapper = import_module_from_file("mapper", mapper_file)
|
| 58 |
+
|
| 59 |
+
phoneme_to_token = mapper.phoneme_mapped_index
|
| 60 |
+
token_to_phoneme = {v: k for k, v in phoneme_to_token.items()}
|
| 61 |
+
group_to_token = mapper.phoneme_groups_index
|
| 62 |
+
token_to_group = {v: k for k, v in group_to_token.items()}
|
| 63 |
+
|
| 64 |
# Initialize the model
|
| 65 |
extractor = model2i.CUPEEmbeddingsExtractor(checkpoint_file, device=device)
|
| 66 |
|
| 67 |
print(f"Model loaded on {device}")
|
| 68 |
+
return extractor, windowing, token_to_phoneme, token_to_group
|
| 69 |
|
| 70 |
def predict_phonemes(audio_path, model_name="english", device="auto"):
|
| 71 |
"""
|
|
|
|
| 81 |
"""
|
| 82 |
|
| 83 |
# Load model
|
| 84 |
+
extractor, windowing, token_to_phoneme, token_to_group = load_cupe_model(model_name, device)
|
| 85 |
|
| 86 |
# Audio processing parameters
|
| 87 |
sample_rate = 16000
|
|
|
|
| 95 |
if audio.shape[0] > 1:
|
| 96 |
audio = audio.mean(dim=0, keepdim=True)
|
| 97 |
|
| 98 |
+
resampler = torchaudio.transforms.Resample(
|
| 99 |
+
sample_rate,
|
| 100 |
+
lowpass_filter_width=64,
|
| 101 |
+
rolloff=0.9475937167399596,
|
| 102 |
+
resampling_method="sinc_interp_kaiser",
|
| 103 |
+
beta=14.769656459379492,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
# Always use resampler for consistency
|
| 107 |
+
audio = resampler(audio)
|
| 108 |
|
| 109 |
# Move to device and add batch dimension
|
| 110 |
audio = audio.to(device)
|
|
|
|
| 161 |
phoneme_preds = torch.argmax(phoneme_probs, dim=-1)
|
| 162 |
group_preds = torch.argmax(group_probs, dim=-1)
|
| 163 |
|
| 164 |
+
phonemes_sequence = [token_to_phoneme[int(p)] for p in phoneme_preds.cpu().numpy()]
|
| 165 |
+
groups_sequence = [token_to_group[int(g)] for g in group_preds.cpu().numpy()]
|
| 166 |
+
# remove noise
|
| 167 |
+
phonemes_sequence = [p for p in phonemes_sequence if p != 'noise']
|
| 168 |
+
groups_sequence = [g for g in groups_sequence if g != 'noise']
|
| 169 |
+
|
| 170 |
+
|
| 171 |
# Count output frames (approximately 16ms per frame)
|
| 172 |
num_frames = phoneme_probs.shape[0]
|
|
|
|
| 173 |
|
| 174 |
+
print(f"Processed {num_frames} frames ({num_frames*16}ms total)")
|
| 175 |
|
| 176 |
return {
|
| 177 |
'phoneme_probabilities': phoneme_probs.cpu().numpy(),
|
| 178 |
'phoneme_predictions': phoneme_preds.cpu().numpy(),
|
| 179 |
'group_probabilities': group_probs.cpu().numpy(),
|
| 180 |
'group_predictions': group_preds.cpu().numpy(),
|
| 181 |
+
'phonemes_sequence': phonemes_sequence,
|
| 182 |
+
'groups_sequence': groups_sequence,
|
| 183 |
'model_info': {
|
| 184 |
'model_name': model_name,
|
| 185 |
'sample_rate': sample_rate,
|
|
|
|
| 195 |
# Simple example
|
| 196 |
audio_file = "samples/109867__timkahn__butterfly.wav.wav" # Replace with your audio file
|
| 197 |
|
| 198 |
+
|
| 199 |
if not os.path.exists(audio_file):
|
| 200 |
print(f"Audio file {audio_file} does not exist. Please provide a valid path.")
|
| 201 |
sys.exit(1)
|
| 202 |
+
|
| 203 |
+
torch.manual_seed(42) # For reproducibility
|
| 204 |
# Predict with English model
|
| 205 |
results = predict_phonemes(
|
| 206 |
audio_path=audio_file,
|
|
|
|
| 211 |
print(f"\nResults:")
|
| 212 |
print(f"Phoneme predictions shape: {results['phoneme_predictions'].shape}")
|
| 213 |
print(f"Group predictions shape: {results['group_predictions'].shape}")
|
|
|
|
| 214 |
print(f"Model info: {results['model_info']}")
|
| 215 |
|
| 216 |
# Show the first 10 frame-level predictions
|
| 217 |
print(f"\nFirst 10 frame predictions:")
|
| 218 |
for i in range(min(10, len(results['phoneme_predictions']))):
|
| 219 |
print(f"Frame {i}: phoneme={results['phoneme_predictions'][i]}, "
|
| 220 |
+
f"group={results['group_predictions'][i]}")
|
| 221 |
+
|
| 222 |
+
print(f"\nPhonemes sequence: {results['phonemes_sequence'][:10]}...") # Show first 10 phonemes
|
| 223 |
+
print(f"Groups sequence: {results['groups_sequence'][:10]}...")
|
| 224 |
+
|
| 225 |
+
''' output:
|
| 226 |
+
Loading CUPE english model...
|
| 227 |
+
Model loaded on cpu
|
| 228 |
+
Processing audio: 1.26s duration
|
| 229 |
+
Processed 75 frames (1200ms total)
|
| 230 |
+
|
| 231 |
+
Results:
|
| 232 |
+
Phoneme predictions shape: (75,)
|
| 233 |
+
Group predictions shape: (75,)
|
| 234 |
+
Model info: {'model_name': 'english', 'sample_rate': 16000, 'frames_per_second': 62.5, 'num_phoneme_classes': 67, 'num_group_classes': 17}
|
| 235 |
+
|
| 236 |
+
First 10 frame predictions:
|
| 237 |
+
Frame 0: phoneme=66, group=16
|
| 238 |
+
Frame 1: phoneme=66, group=16
|
| 239 |
+
Frame 2: phoneme=29, group=7
|
| 240 |
+
Frame 3: phoneme=66, group=16
|
| 241 |
+
Frame 4: phoneme=66, group=16
|
| 242 |
+
Frame 5: phoneme=66, group=16
|
| 243 |
+
Frame 6: phoneme=10, group=2
|
| 244 |
+
Frame 7: phoneme=66, group=16
|
| 245 |
+
Frame 8: phoneme=66, group=16
|
| 246 |
+
Frame 9: phoneme=66, group=16
|
| 247 |
+
|
| 248 |
+
Phonemes sequence: ['b', 'ʌ', 't', 'h', 'ʌ', 'f', 'l', 'æ']...
|
| 249 |
+
Groups sequence: ['voiced_stops', 'central_vowels', 'voiceless_stops', 'voiceless_fricatives', 'central_vowels', 'voiceless_fricatives', 'laterals', 'low_vowels']...
|
| 250 |
+
'''
|