# llmvc / cotlet / sanity_check.py
# Respair's picture
# Upload folder using huggingface_hub
# e5762f6 verified
import csv
import wave
import os
from tqdm import tqdm
def verify_wav_file(file_path: str) -> bool:
    """Return True if *file_path* can be opened as a WAV file.

    The check is a header-level sanity test: ``wave.open`` parses the RIFF
    container and the ``fmt``/``data`` chunk headers and raises if they are
    missing or malformed. The audio frames themselves are not read.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file opens cleanly; False when opening raised any
        exception, in which case the error is printed to stdout.
    """
    try:
        # Opening for read already parses channels, sample width, frame rate
        # and frame count; a successful open is the whole test.
        with wave.open(file_path, 'rb'):
            pass
    except Exception as e:
        # Deliberately broad: this runs over a large batch and a single
        # unreadable file must be reported, not abort the whole run.
        print(f"Error processing {file_path}: {e}")
        return False
    return True
def main(csv_path: str = "/home/austin/disk1/stts-zs_cleaning/data/filename.csv") -> None:
    """Check every WAV file listed in a pipe-delimited CSV and print a summary.

    The first field of each non-empty row is taken as the path of a WAV
    file. Each path is counted, tested for existence, and then passed to
    ``verify_wav_file``. Missing and invalid files are reported on stdout as
    they are found, followed by the totals.

    Args:
        csv_path: Path of the ``|``-delimited listing to read. Defaults to
            the location this script was originally hard-wired to.
    """
    total_files = 0
    valid_files = 0

    # newline="" is what the csv module requires of the file object so that
    # it can handle line endings itself; the encoding is pinned so the result
    # does not depend on the machine's locale.
    with open(csv_path, 'r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='|')
        for row in tqdm(csv_reader, desc="Verifying files", unit="file"):
            if not row:
                # Blank line in the listing: nothing to check, not counted.
                continue

            wav_path = row[0]
            total_files += 1

            if not os.path.exists(wav_path):
                print(f"File does not exist: {wav_path}")
                continue

            if verify_wav_file(wav_path):
                valid_files += 1
            else:
                print(f"File is corrupted or invalid: {wav_path}")

    print("\nVerification completed.")
    print(f"Total files checked: {total_files}")
    print(f"Valid files: {valid_files}")
    # Missing files and files that failed verification are reported together.
    print(f"Invalid or missing files: {total_files - valid_files}")
# Script entry point: run the verification only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()