Spaces:

piroplasmata
/

Laugh_Detection

Sleeping

App Files Files Community

Laugh_Detection / Data_Management /install.py

piroplasmata

Upload folder using huggingface_hub

924a4d4 verified over 1 year ago

raw

history blame contribute delete

3.11 kB

	from audioset_download import Downloader
	import os
	import pandas as pd
	from pydub import AudioSegment
	from pydub.exceptions import CouldntDecodeError
	import logging

	def download_laughter_data():
	# Download the audioset data
	downloader = Downloader(
	root_path="Data_Management",
	labels=["Laughter"],
	n_jobs=8,
	download_type='unbalanced_train',
	copy_and_replicate=False,
	)
	downloader.download(format='wav',quality=5)
	if os.path.getsize("Data_Management/Laughter") >=3000:
	return

	def download_speech_data():
	# Download the audioset data
	downloader = Downloader(
	root_path="Data_Management/Speech",
	labels=["Speech"],
	n_jobs=8,
	download_type='balanced_train',
	copy_and_replicate=False,
	)
	downloader.download(format='wav',quality=5)
	if os.path.getsize("Data_Management/Speech") >=3000:
	return

	def download_evulation_data():
	downloader = Downloader(
	root_path="Data_Management/Test_Laughter",
	labels=["Laughter"],
	n_jobs=8,
	download_type='eval',
	copy_and_replicate=False,
	)
	downloader.download(format='wav',quality=5)
	if os.path.getsize("Data_Management/Test_Laughter") >=3000:
	return

	def download_evulation_data2():
	downloader = Downloader(
	root_path="Data_Management/Test_Speech",
	labels=["Speech"],
	n_jobs=8,
	download_type='eval',
	copy_and_replicate=False,
	)
	downloader.download(format='wav',quality=5)
	if os.path.getsize("Data_Management/Test_Speech/Speech") >=100:
	return
	# Configure logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	def is_file_playable(file_path):
	try:
	audio = AudioSegment.from_file(file_path)
	return True
	except CouldntDecodeError:
	logging.error(f"Could not decode audio file: {file_path}")
	return False
	except Exception as e:
	logging.error(f"An error occurred while trying to load the audio file: {file_path}. Error: {e}")
	return False

	def remove_corrupted_files_and_create_csv(root_path, labels):
	data = {
	"file": [],
	"label": []
	}
	for label in labels:
	label_path = os.path.join(root_path, label)
	for root, _, files in os.walk(label_path):
	for file in files:
	file_path = os.path.join(root, file)
	if is_file_playable(file_path):
	data["file"].append(file)
	data["label"].append(label)
	else:
	logging.info(f"File {file_path} is corrupted.")
	os.remove(file_path)

	df = pd.DataFrame(data)
	df.to_csv("labeled_data.csv", index=False)
	logging.info("CSV file created successfully.")

	#download_laughter_data()
	#download_speech_data()
	#remove_corrupted_files_and_create_csv(root_path="Data_Management/laughter", labels=["Laughter"])

	#remove_corrupted_files_and_create_csv(root_path="Data_Management/speech", labels=["Speech"])

	#download_evulation_data()
	download_evulation_data2()