# StyleTTS2_vi / check_data.py
# hieuducle's picture
# Upload folder using huggingface_hub
# 84f3a60 verified
# --- Disabled earlier check, kept for reference: decode the first 10 listed audio files ---
# import soundfile as sf
# import os
#
# # Read train.txt
# train_file = "/workspace/trainTTS/data/data_train/train.txt"
# root_path = "/workspace/trainTTS/data/data_train/wavs_new"
# with open(train_file, 'r', encoding='utf-8') as f:
#     lines = f.readlines()
# print(f"Total files: {len(lines)}")
#
# # Check the first 10 files
# for i, line in enumerate(lines[:10]):
#     parts = line.strip().split('|')
#     audio_path = os.path.join(root_path, parts[0])
#     print(f"\n{i+1}. Checking: {audio_path}")
#     print(f" Exists: {os.path.exists(audio_path)}")
#     if os.path.exists(audio_path):
#         try:
#             data, sr = sf.read(audio_path)
#             print(f" ✅ OK - SR: {sr}, Duration: {len(data)/sr:.2f}s")
#         except Exception as e:
#             print(f" ❌ CORRUPT: {e}")
#     else:
#         print(f" ❌ NOT FOUND")
import os
train_file = "/workspace/trainTTS/data/data_train/train.txt"
root_path = "/workspace/trainTTS/data/wavs_new"
# 1. Check thư mục tồn tại không
print(f"📁 Root path exists: {os.path.exists(root_path)}")
print(f"📄 Train file exists: {os.path.exists(train_file)}")
# 2. Đếm files thực tế trong thư mục
if os.path.exists(root_path):
actual_files = [f for f in os.listdir(root_path) if f.endswith('.wav')]
print(f"🎵 Actual .wav files in folder: {len(actual_files)}")
print(f" First 5: {actual_files[:5]}")
else:
print("❌ Root path does NOT exist!")
exit()
# 3. Đọc train.txt
with open(train_file, 'r', encoding='utf-8') as f:
lines = f.readlines()
print(f"\n📋 Train.txt has {len(lines)} lines")
print(f" First 3 lines:")
for i, line in enumerate(lines[:3]):
print(f" {i+1}. {line.strip()[:100]}...")
# 4. Check format
first_line = lines[0].strip()
parts = first_line.split('|')
expected_filename = parts[0]
print(f"\n🔍 Expected filename from train.txt: {expected_filename}")
print(f" Full path would be: {os.path.join(root_path, expected_filename)}")
print(f" File exists: {os.path.exists(os.path.join(root_path, expected_filename))}")
# 5. Tìm file tương tự
print(f"\n🔎 Searching for similar filenames...")
for actual in actual_files[:10]:
if expected_filename in actual or actual in expected_filename:
print(f" Match candidate: {actual}")