primepake committed
Commit ea8cd35 · Parent(s): 34bf06f

Edit cli s3

speech/tools/S3Tokenizer/s3tokenizer/cli.py CHANGED
@@ -15,18 +15,18 @@
 cpu:
 
     s3tokenizer --root_path /path/to/audio/files \
-        --model speech_tokenizer_v2_25hz \
+        --model speech_tokenizer_v1 \
         --device "cpu" \
         --batch_size 32
 
 gpu:
 
-    torchrun --nproc_per_node=1 --nnodes=1 \
+    torchrun --nproc_per_node=8 --nnodes=1 \
         --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
-        `which s3tokenizer` --root_path /data/dataset \
-        --model speech_tokenizer_v2_25hz \
+        `which s3tokenizer` --root_path /path/to/audio/files \
+        --model speech_tokenizer_v1 \
         --device "cuda" \
-        --batch_size 64
+        --batch_size 32
 
 """
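For context on the gpu invocation above: torchrun exports RANK, WORLD_SIZE, and LOCAL_RANK into the environment of every worker it spawns, which is how a script launched this way can decide which shard of the data each process handles. A minimal sketch (variable names are illustrative, not taken from cli.py):

    import os

    rank = int(os.environ.get("RANK", 0))              # global worker index, set by torchrun
    world_size = int(os.environ.get("WORLD_SIZE", 1))  # total workers across all nodes
    local_rank = int(os.environ.get("LOCAL_RANK", 0))  # GPU index on this node

    print(f"worker {rank}/{world_size} on local GPU {local_rank}")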
@@ -44,13 +44,60 @@ import s3tokenizer
 
 class AudioDataset(Dataset):
 
-    def __init__(self, root_path, extensions=['.wav', '.flac', '.mp3']):
+    def __init__(self, root_path, extensions=['.wav', '.flac', '.mp3'],
+                 use_cache=True, cache_file=None, max_workers=8):
         self.data = []
 
-        # Recursively find all audio files
-        root = Path(root_path)
-        for ext in extensions:
-            self.data.extend(root.rglob(f'*{ext}'))
+        # Define cache file path
+        if cache_file is None:
+            cache_file = Path(root_path) / '.audio_file_cache.pkl'
+        else:
+            cache_file = Path(cache_file)
+
+        # Try to load from cache first
+        if use_cache and cache_file.exists():
+            import pickle
+            print(f"Loading file list from cache: {cache_file}")
+            try:
+                with open(cache_file, 'rb') as f:
+                    self.data = pickle.load(f)
+                print(f"Loaded {len(self.data)} files from cache")
+                return
+            except Exception as e:
+                print(f"Failed to load cache: {e}, scanning directory...")
+
+        # Fall back to scanning with os.scandir(), which is typically
+        # faster than pathlib's rglob on large trees
+        print(f"Scanning directory: {root_path}")
+        print(f"Looking for extensions: {extensions}")
+
+        import os
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        def scan_directory(args):
+            dirpath, extensions = args
+            files = []
+            try:
+                with os.scandir(dirpath) as entries:
+                    for entry in entries:
+                        if entry.is_file() and any(entry.name.endswith(ext) for ext in extensions):
+                            files.append(Path(entry.path))
+            except PermissionError:
+                pass
+            return files
+
+        # Collect all directories first
+        all_dirs = [root_path]
+        for dirpath, dirnames, _ in os.walk(root_path):
+            all_dirs.extend(os.path.join(dirpath, d) for d in dirnames)
+
+        # Scan directories in parallel
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [executor.submit(scan_directory, (d, extensions)) for d in all_dirs]
+
+            with tqdm(total=len(all_dirs), desc="Scanning directories") as pbar:
+                for future in as_completed(futures):
+                    self.data.extend(future.result())
+                    pbar.update(1)
 
         # Sort for consistent ordering
         self.data.sort()
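The scan result is memoized in a pickle next to the audio root, so repeated runs over a large tree skip the walk entirely. A standalone round-trip of that cache, assuming the default cache location (the root path below is a placeholder):

    import pickle
    from pathlib import Path

    root = Path("/path/to/audio/files")        # placeholder; same root as --root_path
    cache = root / ".audio_file_cache.pkl"     # default location from __init__ above

    if cache.exists():
        with open(cache, "rb") as f:
            files = pickle.load(f)             # list of Path objects, as pickled above
        print(f"{len(files)} cached paths; first: {files[0] if files else 'n/a'}")

The cache goes stale when files are added or removed after the first scan; the --no_cache flag introduced further down forces a re-scan in that case.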
@@ -59,18 +106,39 @@ class AudioDataset(Dataset):
             raise ValueError(f"No audio files found in {root_path}")
 
         print(f"Found {len(self.data)} audio files")
+
+        # Save to cache
+        if use_cache:
+            try:
+                import pickle
+                print(f"Saving file list to cache: {cache_file}")
+                cache_file.parent.mkdir(exist_ok=True)
+                with open(cache_file, 'wb') as f:
+                    pickle.dump(self.data, f)
+            except Exception as e:
+                print(f"Failed to save cache: {e}")
 
     def __len__(self):
         return len(self.data)
 
     def __getitem__(self, idx):
         file_path = self.data[idx]
-        audio = s3tokenizer.load_audio(str(file_path))
-        mel = s3tokenizer.log_mel_spectrogram(audio)
-        return file_path, mel
+        try:
+            audio = s3tokenizer.load_audio(str(file_path))
+            mel = s3tokenizer.log_mel_spectrogram(audio)
+            return file_path, mel
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+            return None, None
 
 
 def collate_fn(batch):
+    # Filter out None entries (failed files)
+    batch = [item for item in batch if item[0] is not None]
+
+    if len(batch) == 0:
+        return [], None, None
+
     file_paths = [item[0] for item in batch]
     mels = [item[1] for item in batch]
     mels, mels_lens = s3tokenizer.padding(mels)
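Because __getitem__ now returns (None, None) instead of raising, a corrupt file costs one dropped sample rather than a crashed DataLoader worker; collate_fn then strips those sentinels before padding. A toy check of just the filtering step (fake mel shapes, no s3tokenizer required):

    import torch

    batch = [
        ("a.wav", torch.randn(50, 128)),   # loaded fine
        (None, None),                      # failed to decode
        ("b.wav", torch.randn(42, 128)),
    ]

    kept = [item for item in batch if item[0] is not None]
    assert [p for p, _ in kept] == ["a.wav", "b.wav"]
    print(f"kept {len(kept)} of {len(batch)} samples")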
@@ -123,6 +191,27 @@ def get_args():
                         nargs='+',
                         default=['.wav', '.flac', '.mp3'],
                         help='audio file extensions to process')
+    parser.add_argument('--use_cache',
+                        action='store_true',
+                        help='use cached file list to avoid re-scanning')
+    parser.add_argument('--no_cache',
+                        action='store_true',
+                        help='force re-scan even if cache exists')
+    parser.add_argument('--cache_file',
+                        type=str,
+                        default=None,
+                        help='path to cache file (default: root_path/.audio_file_cache.pkl)')
+    parser.add_argument('--scan_workers',
+                        type=int,
+                        default=8,
+                        help='number of workers for directory scanning')
+    parser.add_argument('--file_list',
+                        type=str,
+                        default=None,
+                        help='path to pre-generated file list (one file per line)')
+    parser.add_argument('--skip_existing',
+                        action='store_true',
+                        help='skip files that already have _fsq.pt output')
     args = parser.parse_args()
     return args
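The new --file_list flag expects a plain text file with one audio path per line. One way to pre-generate such a list with the standard library (the root path and output name below are placeholders):

    from pathlib import Path

    root = Path("/path/to/audio/files")    # placeholder; match the intended --root_path
    with open("file_list.txt", "w") as f:
        for ext in (".wav", ".flac", ".mp3"):
            for p in sorted(root.rglob(f"*{ext}")):
                f.write(f"{p}\n")

The list can then be passed via --file_list file_list.txt, bypassing both the directory scan and the cache entirely.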
@@ -135,8 +224,6 @@ def save_tokens(file_path, codes, codes_len):
 
     # Extract only valid codes (up to codes_len)
     valid_codes = codes[:codes_len]
-    # convert valid codes to list
-    valid_codes = valid_codes.tolist()
 
     # Save as tensor
     torch.save(valid_codes, output_path)
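With the .tolist() conversion dropped, torch.save now writes the token tensor itself, so downstream readers get a tensor back directly. Reading one output file (the _fsq.pt naming follows the --skip_existing help text; the filename here is a placeholder):

    import torch

    codes = torch.load("utt0001_fsq.pt")   # placeholder filename
    print(codes.shape, codes.dtype)        # 1-D tensor of token ids, truncated to codes_len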
@@ -155,7 +242,78 @@ def main():
 
     device = torch.device(args.device)
     model = s3tokenizer.load_model(args.model).to(device)
-    dataset = AudioDataset(args.root_path, args.extensions)
+
+    # Handle the different data loading methods
+    if args.file_list:
+        # Load paths from a pre-generated file list
+        print(f"Loading file list from: {args.file_list}")
+        with open(args.file_list, 'r') as f:
+            file_paths = [Path(line.strip()) for line in f if line.strip()]
+
+        # Filter by extensions if specified
+        if args.extensions:
+            file_paths = [f for f in file_paths
+                          if any(str(f).endswith(ext) for ext in args.extensions)]
+
+        # Create a simple dataset over the explicit path list
+        class FileListDataset(Dataset):
+
+            def __init__(self, file_paths, skip_existing=False):
+                self.data = []
+                skipped_existing = 0
+                for fp in file_paths:
+                    if skip_existing:
+                        # Token file sits next to the audio as <stem>_fsq.pt
+                        output_path = fp.parent / f"{fp.stem}_fsq.pt"
+                        if output_path.exists():
+                            skipped_existing += 1
+                            continue
+                    self.data.append(fp)
+                print(f"Will process {len(self.data)} files")
+                if skip_existing and skipped_existing > 0:
+                    print(f"Skipped {skipped_existing} already processed files")
+
+            def __len__(self):
+                return len(self.data)
+
+            def __getitem__(self, idx):
+                file_path = self.data[idx]
+                try:
+                    # Check that the path exists and is a regular file
+                    if not file_path.is_file():
+                        print(f"Missing or not a file: {file_path}")
+                        return None, None
+
+                    # Try to load the audio
+                    audio = s3tokenizer.load_audio(str(file_path))
+                    mel = s3tokenizer.log_mel_spectrogram(audio)
+                    return file_path, mel
+                except Exception as e:
+                    print(f"Error processing {file_path}: {e}")
+                    return None, None
+
+        dataset = FileListDataset(file_paths, skip_existing=args.skip_existing)
+    else:
+        # Use the enhanced AudioDataset with caching
+        dataset = AudioDataset(
+            args.root_path,
+            args.extensions,
+            use_cache=not args.no_cache,
+            cache_file=args.cache_file,
+            max_workers=args.scan_workers
+        )
+
+        # Filter out already-processed files if requested
+        if args.skip_existing:
+            original_count = len(dataset.data)
+            dataset.data = [
+                fp for fp in dataset.data
+                if not (fp.parent / f"{fp.stem}_fsq.pt").exists()
+            ]
+            print(f"Skipping {original_count - len(dataset.data)} already processed files")
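The skip-existing check in both branches reduces to a single path derivation: the token file sits next to the source audio with its extension replaced by _fsq.pt. In isolation (paths are placeholders):

    from pathlib import Path

    fp = Path("/data/speech/utt0001.wav")          # placeholder input path
    out = fp.parent / f"{fp.stem}_fsq.pt"
    assert str(out) == "/data/speech/utt0001_fsq.pt"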
@@ -180,28 +338,50 @@ def main():
     progress_bar = tqdm(total=total_steps, desc="Processing", unit="wavs")
 
     processed_count = 0
+    failed_count = 0
+    failed_files = []
+
     for file_paths, mels, mels_lens in dataloader:
+        # Skip empty batches (every file in the batch failed to load)
+        if len(file_paths) == 0:
+            continue
+
         codes, codes_lens = model(mels.to(device), mels_lens.to(device))
 
         # Process each file in the batch
         for i, file_path in enumerate(file_paths):
-            code = codes[i]
-            code_len = codes_lens[i].item()
-
-            # Save tokens as .pt file
-            output_path = save_tokens(file_path, code, code_len)
-
-            if rank == 0:
-                tqdm.write(f"Saved: {file_path} -> {output_path}")
-
-            processed_count += len(file_paths)
+            try:
+                code = codes[i]
+                code_len = codes_lens[i].item()
+
+                # Save tokens as .pt file
+                output_path = save_tokens(file_path, code, code_len)
+
+                if rank == 0 and processed_count < 10:  # only echo the first few to avoid spam
+                    tqdm.write(f"Saved: {file_path} -> {output_path}")
+
+                processed_count += 1
+            except Exception as e:
+                failed_count += 1
+                failed_files.append(str(file_path))
+                if rank == 0:
+                    tqdm.write(f"Failed to save {file_path}: {e}")
 
         if rank == 0:
+            # Failed saves are already counted in len(file_paths), so the
+            # batch size alone is the right increment here
             progress_bar.update(world_size * len(file_paths))
 
     if rank == 0:
         progress_bar.close()
-        print(f"\nProcessed {processed_count} files on rank {rank}")
+        print(f"\nProcessed {processed_count} files successfully on rank {rank}")
+        if failed_count > 0:
+            print(f"Failed to process {failed_count} files")
+
+            # Save the list of failed files for later inspection
+            failed_list_path = Path(args.root_path if not args.file_list else ".") / "failed_files.txt"
+            with open(failed_list_path, 'w') as f:
+                for ff in failed_files:
+                    f.write(f"{ff}\n")
+            print(f"Failed files saved to: {failed_list_path}")
 
     if args.device == "cuda":
         dist.barrier()
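The closing summary reports per-rank counts only, and the progress bar extrapolates by world_size, which is approximate when shards finish unevenly. If exact global totals were wanted, the counts could be reduced over the process group before printing; a sketch, assuming init_process_group has already run (as it must for the DDP setup above):

    import torch
    import torch.distributed as dist

    def global_sum(local_count: int, device: torch.device) -> int:
        # Sum a scalar across all ranks; every rank receives the total back
        t = torch.tensor([local_count], dtype=torch.long, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        return int(t.item())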