Ubuntu committed on
Commit
24d0b1d
·
1 Parent(s): f3d3b39

update tokenizer

speech/tools/S3Tokenizer/.flake8 CHANGED
File without changes
speech/tools/S3Tokenizer/.github/workflows/python-publish.yml CHANGED
File without changes
speech/tools/S3Tokenizer/.github/workflows/unit_test_cpu.yaml CHANGED
File without changes
speech/tools/S3Tokenizer/.gitignore CHANGED
File without changes
speech/tools/S3Tokenizer/.pre-commit-config.yaml CHANGED
File without changes
speech/tools/S3Tokenizer/LICENSE CHANGED
File without changes
speech/tools/S3Tokenizer/MANIFEST.in CHANGED
File without changes
speech/tools/S3Tokenizer/README.md CHANGED
File without changes
speech/tools/S3Tokenizer/requirements.txt CHANGED
File without changes
speech/tools/S3Tokenizer/s3tokenizer/__init__.py CHANGED
File without changes
speech/tools/S3Tokenizer/s3tokenizer/assets/mel_filters.npz CHANGED
File without changes
speech/tools/S3Tokenizer/s3tokenizer/cli.py CHANGED
@@ -32,7 +32,6 @@ torchrun --nproc_per_node=8 --nnodes=1 \
 
 import argparse
 import os
-from pathlib import Path
 
 import torch
 import torch.distributed as dist
@@ -50,12 +49,10 @@ class AudioDataset(Dataset):
 
         # Define cache file path
         if cache_file is None:
-            cache_file = Path(root_path) / '.audio_file_cache.pkl'
-        else:
-            cache_file = Path(cache_file)
+            cache_file = os.path.join(root_path, '.audio_file_cache.pkl')
 
         # Try to load from cache first
-        if use_cache and cache_file.exists():
+        if use_cache and os.path.exists(cache_file):
             import pickle
             print(f"Loading file list from cache: {cache_file}")
             try:
@@ -80,7 +77,7 @@ class AudioDataset(Dataset):
                 with os.scandir(dirpath) as entries:
                     for entry in entries:
                         if entry.is_file() and any(entry.name.endswith(ext) for ext in extensions):
-                            files.append(Path(entry.path))
+                            files.append(entry.path)
             except PermissionError:
                 pass
             return files
@@ -112,7 +109,10 @@ class AudioDataset(Dataset):
             try:
                 import pickle
                 print(f"Saving file list to cache: {cache_file}")
-                cache_file.parent.mkdir(exist_ok=True)
+                # Ensure parent directory exists
+                cache_dir = os.path.dirname(cache_file)
+                if cache_dir and not os.path.exists(cache_dir):
+                    os.makedirs(cache_dir, exist_ok=True)
                 with open(cache_file, 'wb') as f:
                     pickle.dump(self.data, f)
             except Exception as e:
@@ -124,7 +124,7 @@ class AudioDataset(Dataset):
     def __getitem__(self, idx):
         file_path = self.data[idx]
         try:
-            audio = s3tokenizer.load_audio(str(file_path))
+            audio = s3tokenizer.load_audio(file_path)
             mel = s3tokenizer.log_mel_spectrogram(audio)
             return file_path, mel
         except Exception as e:
@@ -219,8 +219,8 @@ def get_args():
 def save_tokens(file_path, codes, codes_len):
     """Save tokens as .pt file with _fsq suffix"""
     # Remove extension and add _fsq.pt
-    output_path = file_path.with_suffix('').with_suffix('.pt')
-    output_path = output_path.parent / f"{output_path.stem}_fsq.pt"
+    base_name = os.path.splitext(file_path)[0]
+    output_path = f"{base_name}_fsq.pt"
 
     # Extract only valid codes (up to codes_len)
     valid_codes = codes[:codes_len]
@@ -248,11 +248,11 @@ def main():
         # Option 3: Load from pre-generated file list
         print(f"Loading file list from: {args.file_list}")
         with open(args.file_list, 'r') as f:
-            file_paths = [Path(line.strip()) for line in f if line.strip()]
-
-        # Filter by extensions if specified
-        if args.extensions:
-            file_paths = [f for f in file_paths if any(str(f).endswith(ext) for ext in args.extensions)]
+            file_paths = []
+            for line in f:
+                line = line.strip()
+                if line:
+                    file_paths.append(line)
 
         # Create a simple dataset
         class FileListDataset(Dataset):
@@ -261,9 +261,9 @@ def main():
                 skipped_existing = 0
                 for fp in file_paths:
                     if skip_existing:
-                        output_path = fp.with_suffix('').with_suffix('.pt')
-                        output_path = output_path.parent / f"{output_path.stem}_fsq.pt"
-                        if output_path.exists():
+                        output_path = fp.replace('.wav', '_fsq.pt')
+                        if os.path.exists(output_path):
+                            print(f'*******skip file {output_path}')
                             skipped_existing += 1
                             continue
                     self.data.append(fp)
@@ -278,17 +278,13 @@ def main():
                 file_path = self.data[idx]
                 try:
                     # Check if file exists
-                    if not file_path.exists():
+                    if not os.path.exists(file_path):
                         print(f"File not found: {file_path}")
                         return None, None
-
-                    # Check if it's a file (not directory)
-                    if not file_path.is_file():
-                        print(f"Not a file: {file_path}")
-                        return None, None
+
 
                     # Try to load audio
-                    audio = s3tokenizer.load_audio(str(file_path))
+                    audio = s3tokenizer.load_audio(file_path)
                     mel = s3tokenizer.log_mel_spectrogram(audio)
                     return file_path, mel
                 except Exception as e:
@@ -311,7 +307,7 @@ def main():
             original_count = len(dataset.data)
             dataset.data = [
                 fp for fp in dataset.data
-                if not (fp.parent / f"{fp.stem}_fsq.pt").exists()
+                if not os.path.exists(os.path.join(os.path.dirname(fp), f"{os.path.splitext(os.path.basename(fp))[0]}_fsq.pt"))
             ]
             print(f"Skipping {original_count - len(dataset.data)} already processed files")
@@ -363,7 +359,7 @@ def main():
                     processed_count += 1
                 except Exception as e:
                     failed_count += 1
-                    failed_files.append(str(file_path))
+                    failed_files.append(file_path)
                     if rank == 0:
                         tqdm.write(f"Failed to save {file_path}: {e}")
 
@@ -377,7 +373,7 @@ def main():
         print(f"Failed to process {failed_count} files")
 
         # Save failed files list
-        failed_list_path = Path(args.root_path if not args.file_list else ".") / "failed_files.txt"
+        failed_list_path = os.path.join(args.root_path if not args.file_list else ".", "failed_files.txt")
         with open(failed_list_path, 'w') as f:
            for ff in failed_files:
                f.write(f"{ff}\n")
speech/tools/S3Tokenizer/s3tokenizer/model.py CHANGED
File without changes
speech/tools/S3Tokenizer/s3tokenizer/model_v2.py CHANGED
File without changes
speech/tools/S3Tokenizer/s3tokenizer/utils.py CHANGED
File without changes
speech/tools/S3Tokenizer/setup.py CHANGED
File without changes
speech/tools/S3Tokenizer/test/test_batch_efficiency.py CHANGED
File without changes
speech/tools/S3Tokenizer/test/test_onnx.py CHANGED
File without changes