Spaces:
Sleeping
Sleeping
| from audioset_download import Downloader | |
| import os | |
| import pandas as pd | |
| from pydub import AudioSegment | |
| from pydub.exceptions import CouldntDecodeError | |
| import logging | |
| def download_laughter_data(): | |
| # Download the audioset data | |
| downloader = Downloader( | |
| root_path="Data_Management", | |
| labels=["Laughter"], | |
| n_jobs=8, | |
| download_type='unbalanced_train', | |
| copy_and_replicate=False, | |
| ) | |
| downloader.download(format='wav',quality=5) | |
| if os.path.getsize("Data_Management/Laughter") >=3000: | |
| return | |
| def download_speech_data(): | |
| # Download the audioset data | |
| downloader = Downloader( | |
| root_path="Data_Management/Speech", | |
| labels=["Speech"], | |
| n_jobs=8, | |
| download_type='balanced_train', | |
| copy_and_replicate=False, | |
| ) | |
| downloader.download(format='wav',quality=5) | |
| if os.path.getsize("Data_Management/Speech") >=3000: | |
| return | |
| def download_evulation_data(): | |
| downloader = Downloader( | |
| root_path="Data_Management/Test_Laughter", | |
| labels=["Laughter"], | |
| n_jobs=8, | |
| download_type='eval', | |
| copy_and_replicate=False, | |
| ) | |
| downloader.download(format='wav',quality=5) | |
| if os.path.getsize("Data_Management/Test_Laughter") >=3000: | |
| return | |
| def download_evulation_data2(): | |
| downloader = Downloader( | |
| root_path="Data_Management/Test_Speech", | |
| labels=["Speech"], | |
| n_jobs=8, | |
| download_type='eval', | |
| copy_and_replicate=False, | |
| ) | |
| downloader.download(format='wav',quality=5) | |
| if os.path.getsize("Data_Management/Test_Speech/Speech") >=100: | |
| return | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
| def is_file_playable(file_path): | |
| try: | |
| audio = AudioSegment.from_file(file_path) | |
| return True | |
| except CouldntDecodeError: | |
| logging.error(f"Could not decode audio file: {file_path}") | |
| return False | |
| except Exception as e: | |
| logging.error(f"An error occurred while trying to load the audio file: {file_path}. Error: {e}") | |
| return False | |
| def remove_corrupted_files_and_create_csv(root_path, labels): | |
| data = { | |
| "file": [], | |
| "label": [] | |
| } | |
| for label in labels: | |
| label_path = os.path.join(root_path, label) | |
| for root, _, files in os.walk(label_path): | |
| for file in files: | |
| file_path = os.path.join(root, file) | |
| if is_file_playable(file_path): | |
| data["file"].append(file) | |
| data["label"].append(label) | |
| else: | |
| logging.info(f"File {file_path} is corrupted.") | |
| os.remove(file_path) | |
| df = pd.DataFrame(data) | |
| df.to_csv("labeled_data.csv", index=False) | |
| logging.info("CSV file created successfully.") | |
| #download_laughter_data() | |
| #download_speech_data() | |
| #remove_corrupted_files_and_create_csv(root_path="Data_Management/laughter", labels=["Laughter"]) | |
| #remove_corrupted_files_and_create_csv(root_path="Data_Management/speech", labels=["Speech"]) | |
| #download_evulation_data() | |
| download_evulation_data2() |