piroplasmata's picture
Upload folder using huggingface_hub
924a4d4 verified
from audioset_download import Downloader
import os
import pandas as pd
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
import logging
def download_laughter_data():
# Download the audioset data
downloader = Downloader(
root_path="Data_Management",
labels=["Laughter"],
n_jobs=8,
download_type='unbalanced_train',
copy_and_replicate=False,
)
downloader.download(format='wav',quality=5)
if os.path.getsize("Data_Management/Laughter") >=3000:
return
def download_speech_data():
# Download the audioset data
downloader = Downloader(
root_path="Data_Management/Speech",
labels=["Speech"],
n_jobs=8,
download_type='balanced_train',
copy_and_replicate=False,
)
downloader.download(format='wav',quality=5)
if os.path.getsize("Data_Management/Speech") >=3000:
return
def download_evulation_data():
downloader = Downloader(
root_path="Data_Management/Test_Laughter",
labels=["Laughter"],
n_jobs=8,
download_type='eval',
copy_and_replicate=False,
)
downloader.download(format='wav',quality=5)
if os.path.getsize("Data_Management/Test_Laughter") >=3000:
return
def download_evulation_data2():
downloader = Downloader(
root_path="Data_Management/Test_Speech",
labels=["Speech"],
n_jobs=8,
download_type='eval',
copy_and_replicate=False,
)
downloader.download(format='wav',quality=5)
if os.path.getsize("Data_Management/Test_Speech/Speech") >=100:
return
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def is_file_playable(file_path):
try:
audio = AudioSegment.from_file(file_path)
return True
except CouldntDecodeError:
logging.error(f"Could not decode audio file: {file_path}")
return False
except Exception as e:
logging.error(f"An error occurred while trying to load the audio file: {file_path}. Error: {e}")
return False
def remove_corrupted_files_and_create_csv(root_path, labels):
data = {
"file": [],
"label": []
}
for label in labels:
label_path = os.path.join(root_path, label)
for root, _, files in os.walk(label_path):
for file in files:
file_path = os.path.join(root, file)
if is_file_playable(file_path):
data["file"].append(file)
data["label"].append(label)
else:
logging.info(f"File {file_path} is corrupted.")
os.remove(file_path)
df = pd.DataFrame(data)
df.to_csv("labeled_data.csv", index=False)
logging.info("CSV file created successfully.")
#download_laughter_data()
#download_speech_data()
#remove_corrupted_files_and_create_csv(root_path="Data_Management/laughter", labels=["Laughter"])
#remove_corrupted_files_and_create_csv(root_path="Data_Management/speech", labels=["Speech"])
#download_evulation_data()
download_evulation_data2()