Spaces:

build-small-hackathon
/

sound-broken

Runtime error

App Files Files Community

sound-broken / collect_data.py

mitvho09

Upload Space app

edb671a verified 19 days ago

Raw

History Blame Contribute Delete

10.1 kB

	"""Collect real appliance fault audio from YouTube.

	Searches for specific fault types, downloads audio, extracts 10-second clips,
	and saves them with labels for training.

	Usage:
	python collect_data.py # Run full collection
	python collect_data.py --dry-run # Preview search results only
	python collect_data.py --max-videos 5 # Limit videos per query
	"""
	import os
	import json
	import subprocess
	import sys
	from pathlib import Path
	from datetime import datetime

	# ffmpeg executable used for every conversion and clip extraction.
	# Resolution order (first hit wins):
	#   1. the IMAGEIO_FFMPEG_EXE environment variable, as an explicit override;
	#   2. an `ffmpeg` binary found on PATH;
	#   3. the original developer-machine binary, kept as a last-resort fallback
	#      so the existing local setup keeps working unchanged.
	# A single hard-coded Windows path makes the script fail on any other machine.
	import shutil

	_LEGACY_FFMPEG = r"C:\Users\sushr\AppData\Roaming\Python\Python313\site-packages\imageio_ffmpeg\binaries\ffmpeg-win-x86_64-v7.1.exe"
	FFMPEG = os.environ.get("IMAGEIO_FFMPEG_EXE") or shutil.which("ffmpeg") or _LEGACY_FFMPEG

	# On-disk layout, rooted in a "data" folder next to this script:
	#   data/raw                   full-length downloaded audio
	#   data/clips                 extracted clips
	#   data/labels/labels.jsonl   one JSON label record per line
	DATA_DIR = Path(__file__).parent.joinpath("data")
	RAW_DIR = DATA_DIR.joinpath("raw")
	CLIPS_DIR = DATA_DIR.joinpath("clips")
	LABELS_FILE = DATA_DIR.joinpath("labels", "labels.jsonl")

	# Fault catalogue grouped by appliance label.
	# Each row is (fault_type, search_query, urgency).
	_FAULTS_BY_APPLIANCE = {
	    "Washing machine": [
	        ("bearing_failure", "washing machine bearing noise grinding", "HIGH"),
	        ("belt_slip", "washing machine belt squeal noise", "MEDIUM"),
	        ("load_imbalance", "washing machine unbalanced load vibration", "LOW"),
	        ("foreign_object", "washing machine object stuck rattling noise", "MEDIUM"),
	        ("pump_failure", "washing machine drain pump noise grinding", "MEDIUM"),
	        ("motor_failure", "washing machine motor noise humming whining", "HIGH"),
	    ],
	    "Electric fan": [
	        ("blade_imbalance", "electric fan wobbling noise unbalanced blade", "MEDIUM"),
	        ("bearing_failure", "electric fan motor bearing noise grinding", "HIGH"),
	        ("blade_strike", "electric fan hitting noise blade strike", "HIGH"),
	    ],
	    "Refrigerator/Freezer": [
	        ("compressor_failure", "refrigerator compressor noise loud grinding", "HIGH"),
	        ("evaporator_fan", "refrigerator evaporator fan noise buzzing", "MEDIUM"),
	        ("condenser_fan", "refrigerator condenser fan noise grinding", "MEDIUM"),
	    ],
	    "Dishwasher": [
	        ("pump_bearing", "dishwasher pump noise grinding bearing", "HIGH"),
	        ("spray_arm", "dishwasher spray arm noise hitting", "LOW"),
	        ("drain_pump", "dishwasher drain pump noise cavitation", "MEDIUM"),
	    ],
	    "Tumble dryer": [
	        ("drum_roller", "dryer thumping noise drum roller worn", "HIGH"),
	        ("belt_slip", "dryer squealing noise belt slipping", "MEDIUM"),
	        ("foreign_object", "dryer rattling noise coins buttons", "LOW"),
	    ],
	    "Car engine": [
	        ("rod_knock", "car engine rod knock noise deep knocking", "CRITICAL"),
	        ("belt_squeal", "car serpentine belt squeal noise", "MEDIUM"),
	        ("exhaust_leak", "car exhaust leak tick noise", "MEDIUM"),
	    ],
	    "Electric motor (generic)": [
	        ("bearing_failure", "electric motor bearing noise grinding", "HIGH"),
	        ("brush_arcing", "electric motor brush arcing noise sparking", "MEDIUM"),
	        ("electrical_hum", "electric motor humming loose lamination", "LOW"),
	    ],
	}

	# Search queries: (fault_type, appliance, search_query, urgency).
	# Flattened in catalogue order (dicts preserve insertion order).
	QUERIES = [
	    (fault_type, appliance, search_query, urgency)
	    for appliance, rows in _FAULTS_BY_APPLIANCE.items()
	    for fault_type, search_query, urgency in rows
	]


	def search_youtube(query, max_videos=3):
	    """Search YouTube and return metadata for the top matches.

	    Args:
	        query: Free-text search string.
	        max_videos: Maximum number of results to request. A value below 1
	            returns an empty list without contacting YouTube.

	    Returns:
	        A list of dicts with the keys ``id``, ``title``, ``url`` and
	        ``duration``. ``id`` is always set; the other metadata values are
	        passed through from yt-dlp and may be ``None``. If the search raises,
	        the error is printed and whatever was gathered so far is returned.
	    """
	    if max_videos < 1:
	        return []

	    # Imported here rather than at module level, so importing this module
	    # does not require yt-dlp.
	    import yt_dlp

	    ydl_opts = {
	        'quiet': True,
	        'no_warnings': True,
	        'extract_flat': True,
	        'force_generic_extractor': False,
	        'default_search': 'ytsearch',
	    }

	    results = []
	    try:
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            info = ydl.extract_info(f"ytsearch{max_videos}:{query}", download=False)
	            for entry in (info or {}).get('entries') or []:
	                # Without an id no usable watch URL can be built
	                # (it would become ".../watch?v=None"), so skip the entry.
	                if not entry or not entry.get('id'):
	                    continue
	                video_id = entry['id']
	                results.append({
	                    'id': video_id,
	                    'title': entry.get('title'),
	                    'url': f"https://www.youtube.com/watch?v={video_id}",
	                    'duration': entry.get('duration'),
	                })
	    except Exception as e:
	        # Best-effort: a failed search must not abort the whole collection run.
	        print(f" Search error: {e}")
	    return results


	def download_audio(video_url, output_path):
	    """Download a video's audio track and convert it to 22.05 kHz mono WAV.

	    Args:
	        video_url: URL handed to yt-dlp.
	        output_path: Extension-less ``Path``. yt-dlp writes
	            ``<output_path>.<ext>`` and the converted result ends up at
	            ``output_path.with_suffix('.wav')``.

	    Returns:
	        True when the WAV file was produced by a successful ffmpeg run,
	        False otherwise. Download and conversion errors are printed, not raised.
	    """
	    import yt_dlp

	    ydl_opts = {
	        'quiet': True,
	        'no_warnings': True,
	        'format': 'bestaudio/best',
	        'outtmpl': str(output_path) + '.%(ext)s',
	    }

	    wav_path = output_path.with_suffix('.wav')
	    # Convert into a scratch file and move it into place only on success.
	    # This keeps a failed run from leaving a truncated WAV behind, and it
	    # avoids ffmpeg reading and writing the same file when the download
	    # itself is already a ".wav".
	    tmp_path = output_path.with_suffix('.converting.wav')

	    try:
	        output_path.parent.mkdir(parents=True, exist_ok=True)
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            ydl.extract_info(video_url, download=True)

	        # yt-dlp picks the container, so match "<name>.<any extension>"
	        # instead of guessing from a fixed extension list. A leftover WAV is
	        # tried last so a fresh download takes precedence.
	        downloads = sorted(
	            (p for p in output_path.parent.iterdir()
	             if p.is_file() and p.stem == output_path.name),
	            key=lambda p: (p.suffix == '.wav', p.name),
	        )
	        for source in downloads:
	            cmd = [FFMPEG, "-y", "-i", str(source),
	                   "-ar", "22050", "-ac", "1", str(tmp_path)]
	            result = subprocess.run(cmd, capture_output=True, timeout=30)
	            converted = result.returncode == 0 and tmp_path.exists()
	            if converted:
	                tmp_path.replace(wav_path)
	            # The source is no longer needed either way. Skip the unlink only
	            # when the source path now holds the freshly converted WAV.
	            if not (converted and source == wav_path):
	                source.unlink(missing_ok=True)
	            if converted:
	                return True
	        return False
	    except Exception as e:
	        print(f" Download error: {e}")
	        return False
	    finally:
	        # Covers timeouts and failed conversions; a no-op after a successful move.
	        try:
	            tmp_path.unlink(missing_ok=True)
	        except OSError:
	            pass


	def extract_clip(audio_path, output_path, start_sec=0, duration_sec=10):
	    """Extract a clip from an audio file using ffmpeg.

	    The clip is written as 22.05 kHz mono audio to ``output_path``.

	    Args:
	        audio_path: Source audio file.
	        output_path: Destination ``Path``; its parent directory is created
	            if it does not exist.
	        start_sec: Offset into the source, in seconds.
	        duration_sec: Clip length, in seconds.

	    Returns:
	        True if ffmpeg exited successfully and the output file exists,
	        False otherwise. Errors are printed, not raised.
	    """
	    cmd = [
	        FFMPEG, "-y",
	        "-i", str(audio_path),
	        "-ss", str(start_sec),
	        "-t", str(duration_sec),
	        "-ar", "22050",
	        "-ac", "1",
	        str(output_path)
	    ]
	    try:
	        # ffmpeg does not create missing output directories.
	        output_path.parent.mkdir(parents=True, exist_ok=True)
	        result = subprocess.run(cmd, capture_output=True, timeout=30)
	        if result.returncode != 0:
	            # Do not report a partial file from a failed run as a valid clip.
	            print(f" FFmpeg error: exit code {result.returncode}")
	            output_path.unlink(missing_ok=True)
	            return False
	        return output_path.exists()
	    except Exception as e:
	        print(f" FFmpeg error: {e}")
	        return False


	def _clip_video(video, raw_file, fault_type, appliance, urgency):
	    """Cut up to two 10-second clips from one downloaded video and return their label dicts."""
	    # A missing or falsy duration falls back to 60 seconds.
	    duration = video.get('duration', 60) or 60
	    new_labels = []
	    for start in (5, 15, 30, 45):  # Skip intros
	        if start + 10 > duration:
	            continue

	        clip_name = f"{video['id']}_{start:03d}.wav"
	        if not extract_clip(raw_file, CLIPS_DIR / clip_name, start, 10):
	            continue

	        new_labels.append({
	            "file": clip_name,
	            "fault_type": fault_type,
	            "appliance": appliance,
	            "urgency": urgency,
	            "source": video['url'],
	            "source_title": video['title'],
	            "start_sec": start,
	            "collected_at": datetime.now().isoformat(),
	        })
	        if len(new_labels) >= 2:  # Max 2 clips per video
	            break
	    return new_labels


	def collect_fault_audio(dry_run=False, max_videos=3):
	    """Main collection pipeline.

	    For every entry in ``QUERIES``: search YouTube, download each hit's
	    audio, cut up to two 10-second clips from it, record a label per clip,
	    and delete the full-length download. The label file is rewritten after
	    every query, so an interrupted run keeps the labels gathered so far.

	    Args:
	        dry_run: When True, only print the search results; nothing is
	            downloaded or written.
	        max_videos: Maximum number of search results to process per query.

	    Returns:
	        The list of label dicts created during this run.
	    """
	    labels = []
	    # Clip files are named after the video id, so a video returned by two
	    # queries would overwrite its own clips and be labelled with two different
	    # fault types. Only the first query that yields clips gets to label it.
	    clipped_ids = set()

	    print(f"Collecting {len(QUERIES)} fault types, max {max_videos} videos each")
	    print(f"Output: {CLIPS_DIR}")
	    print()

	    if not dry_run:
	        RAW_DIR.mkdir(parents=True, exist_ok=True)
	        CLIPS_DIR.mkdir(parents=True, exist_ok=True)

	    for i, (fault_type, appliance, query, urgency) in enumerate(QUERIES, start=1):
	        print(f"[{i}/{len(QUERIES)}] {fault_type} ({appliance})")
	        print(f" Query: {query}")

	        videos = search_youtube(query, max_videos)
	        print(f" Found: {len(videos)} videos")

	        if dry_run:
	            for v in videos:
	                title = v['title'] or "(untitled)"  # title may be None
	                print(f" {title[:60]} ({v['duration']}s)")
	            continue

	        for v in videos:
	            title = v['title'] or "(untitled)"  # title may be None
	            if v['id'] in clipped_ids:
	                print(f" Skipping already collected video: {title[:50]}")
	                continue

	            print(f" Downloading: {title[:50]}...")

	            # Download full audio
	            raw_path = RAW_DIR / f"{v['id']}"
	            if not download_audio(v['url'], raw_path):
	                continue

	            # Find the downloaded file (may have different extension)
	            raw_file = next(
	                (candidate
	                 for candidate in (raw_path.with_suffix(ext)
	                                   for ext in ('.wav', '.mp3', '.m4a', '.webm'))
	                 if candidate.exists()),
	                None,
	            )
	            if not raw_file:
	                print(f" No audio file found for {v['id']}")
	                continue

	            new_labels = _clip_video(v, raw_file, fault_type, appliance, urgency)
	            labels.extend(new_labels)
	            if new_labels:
	                clipped_ids.add(v['id'])
	            print(f" Extracted {len(new_labels)} clips")

	            # Clean up raw file; failing to delete it must not stop the run.
	            try:
	                raw_file.unlink()
	            except OSError as e:
	                print(f" Could not delete {raw_file}: {e}")

	        # Save labels after each query
	        save_labels(labels)

	    print(f"\n{'='*60}")
	    print(f"Collection complete!")
	    print(f"Total clips: {len(labels)}")
	    print(f"Labels: {LABELS_FILE}")

	    # Print summary
	    from collections import Counter
	    fault_counts = Counter(l['fault_type'] for l in labels)
	    print(f"\nClips per fault type:")
	    for fault, count in fault_counts.most_common():
	        print(f" {fault}: {count}")

	    return labels


	def save_labels(labels):
	    """Write all label records to LABELS_FILE, one JSON object per line.

	    The parent directory is created when missing and any previous file
	    content is replaced.
	    """
	    LABELS_FILE.parent.mkdir(parents=True, exist_ok=True)
	    with open(LABELS_FILE, 'w') as out:
	        out.writelines(json.dumps(record) + '\n' for record in labels)


	def load_labels():
	    """Read LABELS_FILE and return its records as a list of dicts.

	    Returns an empty list when the file does not exist; blank lines are
	    skipped.
	    """
	    if not LABELS_FILE.exists():
	        return []
	    with open(LABELS_FILE) as src:
	        return [json.loads(raw) for raw in src if raw.strip()]


	if __name__ == "__main__":
	    # Command-line entry point: parse the two options and run the pipeline.
	    import argparse

	    cli = argparse.ArgumentParser(
	        description="Collect appliance fault audio from YouTube",
	    )
	    cli.add_argument(
	        "--dry-run",
	        action="store_true",
	        help="Preview search results only",
	    )
	    cli.add_argument(
	        "--max-videos",
	        type=int,
	        default=3,
	        help="Max videos per query",
	    )
	    options = cli.parse_args()

	    collect_fault_audio(dry_run=options.dry_run, max_videos=options.max_videos)