Spaces:

xTHExBEASTx
/

Whisper-Transcriber

Runtime error

Whisper-Transcriber / utils / downloader.py

Whisper Transcriber Bot

Initial commit: Complete Whisper Transcriber implementation

4051511 4 months ago

5.98 kB

	import logging
	import os
	import re
	import tempfile
	from pathlib import Path
	from typing import Optional, Tuple
	from urllib.parse import urlparse

	import requests
	import yt_dlp

	# Configure root logging at INFO level as soon as this module is imported,
	# and create a logger named after this module for the downloader's messages.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)


	class MediaDownloader:
	    """Download media from YouTube or from a direct URL into temporary files.

	    All methods are static; the class is used purely as a namespace. The
	    download methods create their output with ``delete=False``, so the caller
	    is responsible for removing the returned file once it is no longer needed.
	    """

	    # Single pattern covering youtube.com, youtube-nocookie.com and youtu.be,
	    # with an optional scheme and an optional www/m/music subdomain.
	    _YOUTUBE_URL_RE = re.compile(
	        r'(?:https?://)?(?:(?:www|m|music)\.)?'
	        r'(?:youtube|youtu|youtube-nocookie)\.(?:com|be)/',
	        re.IGNORECASE,
	    )

	    # Extensions that is_direct_url() treats as media files.
	    _MEDIA_EXTENSIONS = (
	        '.mp3', '.mp4', '.wav', '.m4a', '.avi',
	        '.mkv', '.mov', '.webm', '.flac', '.ogg',
	    )

	    # Extensions probed, in order, when locating the file yt-dlp produced.
	    _YOUTUBE_OUTPUT_EXTENSIONS = ('.m4a', '.mp3', '.webm', '.opus')

	    @staticmethod
	    def _remove_quietly(path: str) -> None:
	        """Delete *path* on a best-effort basis; failures are ignored."""
	        try:
	            os.remove(path)
	        except OSError:
	            # Cleanup must never mask the outcome of the download itself.
	            pass

	    @staticmethod
	    def _suffix_from_url(url: str, default: str = '.mp4') -> str:
	        """Return the extension of the URL's path component, or *default*.

	        Only the path is inspected, so query strings and fragments never end
	        up in the suffix and a bare host such as ``http://example.com`` does
	        not yield ``.com``.
	        """
	        try:
	            url_path = urlparse(url).path
	        except ValueError:
	            # Malformed URL: let the HTTP request report the real error later.
	            return default
	        return Path(url_path).suffix or default

	    @staticmethod
	    def is_youtube_url(url: str) -> bool:
	        """Check if URL is a YouTube link.

	        Args:
	            url: Candidate URL; the scheme is optional.

	        Returns:
	            True when the URL begins with a recognised YouTube host.
	        """
	        return MediaDownloader._YOUTUBE_URL_RE.match(url) is not None

	    @staticmethod
	    def is_direct_url(url: str) -> bool:
	        """Check if URL is a direct file link.

	        Args:
	            url: Candidate URL or file reference.

	        Returns:
	            True when the string ends with a known media extension
	            (case-insensitive) or starts with ``http``. Note that any
	            ``http``/``https`` URL is accepted, with or without a media
	            extension.
	        """
	        url_lower = url.lower()
	        return (
	            url_lower.endswith(MediaDownloader._MEDIA_EXTENSIONS)
	            or url.startswith('http')
	        )

	    @staticmethod
	    def download_youtube(url: str, progress_callback=None) -> str:
	        """
	        Download audio from YouTube video

	        Args:
	            url: YouTube video URL
	            progress_callback: Optional callable taking one status string,
	                invoked with progress updates

	        Returns:
	            Path to downloaded audio file

	        Raises:
	            Exception: If the download or audio extraction fails; the
	                original error is attached as ``__cause__``.
	        """
	        if progress_callback:
	            progress_callback("Downloading from YouTube...")

	        # Reserve a unique temp path, closing the handle immediately so the
	        # descriptor is not leaked. The empty file stays as a placeholder.
	        with tempfile.NamedTemporaryFile(delete=False, suffix='.m4a') as placeholder:
	            output_file = placeholder.name
	        # Strip only the trailing suffix; str.replace would also alter any
	        # other '.m4a' occurring earlier in the path.
	        output_template = output_file[:-len('.m4a')]

	        # Progress hook for yt-dlp
	        def progress_hook(d):
	            if not progress_callback:
	                return
	            status = d['status']
	            if status == 'downloading':
	                percent = d.get('_percent_str', 'N/A')
	                speed = d.get('_speed_str', 'N/A')
	                eta = d.get('_eta_str', 'N/A')
	                progress_callback(f"Downloading: {percent} | Speed: {speed} | ETA: {eta}")
	            elif status == 'finished':
	                progress_callback("Download complete, processing...")

	        ydl_opts = {
	            'format': 'bestaudio/best',
	            'outtmpl': output_template,
	            'postprocessors': [{
	                'key': 'FFmpegExtractAudio',
	                'preferredcodec': 'm4a',
	            }],
	            'progress_hooks': [progress_hook],
	            'quiet': True,
	            'no_warnings': True,
	        }

	        try:
	            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	                info = ydl.extract_info(url, download=True)
	            title = (info or {}).get('title', 'video')
	            logger.info(f"Downloaded: {title}")

	            if progress_callback:
	                progress_callback("YouTube download complete")

	            # yt-dlp might save with a different extension. Require a
	            # non-empty file: the placeholder always exists, so a plain
	            # existence check would always pick it.
	            for ext in MediaDownloader._YOUTUBE_OUTPUT_EXTENSIONS:
	                candidate = output_template + ext
	                if os.path.isfile(candidate) and os.path.getsize(candidate) > 0:
	                    if candidate != output_file:
	                        # The real output lives elsewhere; drop the placeholder.
	                        MediaDownloader._remove_quietly(output_file)
	                    return candidate

	            # If still not found, return original path
	            return output_file

	        except Exception as e:
	            logger.error(f"YouTube download failed: {e}")
	            # Do not leave the placeholder behind after a failed download.
	            MediaDownloader._remove_quietly(output_file)
	            raise Exception(f"Failed to download from YouTube: {str(e)}") from e

	    @staticmethod
	    def download_direct_url(url: str, progress_callback=None) -> str:
	        """
	        Download file from direct URL

	        Args:
	            url: Direct URL to media file
	            progress_callback: Optional callable taking one status string,
	                invoked with progress updates

	        Returns:
	            Path to downloaded file

	        Raises:
	            Exception: If the HTTP request or transfer fails; the original
	                ``requests`` error is attached as ``__cause__``.
	        """
	        if progress_callback:
	            progress_callback("Downloading from URL...")

	        # Determine file extension from the URL path (ignoring any query string)
	        ext = MediaDownloader._suffix_from_url(url)
	        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as placeholder:
	            output_file = placeholder.name

	        try:
	            # The context manager releases the streamed connection even when
	            # the transfer is interrupted.
	            with requests.get(url, stream=True, timeout=30) as response:
	                response.raise_for_status()

	                total_size = int(response.headers.get('content-length', 0))
	                downloaded = 0

	                with open(output_file, 'wb') as f:
	                    for chunk in response.iter_content(chunk_size=8192):
	                        if not chunk:
	                            continue
	                        f.write(chunk)
	                        downloaded += len(chunk)

	                        # Percentages need a known total size.
	                        if progress_callback and total_size > 0:
	                            percent = (downloaded / total_size) * 100
	                            progress_callback(f"Downloading: {percent:.1f}% ({downloaded / 1024 / 1024:.1f}MB)")

	            if progress_callback:
	                progress_callback("Download complete")

	            logger.info(f"Downloaded from URL: {output_file}")
	            return output_file

	        except requests.exceptions.RequestException as e:
	            logger.error(f"Direct URL download failed: {e}")
	            MediaDownloader._remove_quietly(output_file)
	            raise Exception(f"Failed to download from URL: {str(e)}") from e
	        except Exception:
	            # Any other failure (e.g. disk full): remove the partial file and
	            # let the original error propagate unchanged.
	            MediaDownloader._remove_quietly(output_file)
	            raise

	    @staticmethod
	    def download_media(input_source: str, progress_callback=None) -> Tuple[str, str]:
	        """
	        Download media from URL (YouTube or direct link)

	        Args:
	            input_source: URL to download from; surrounding whitespace is
	                stripped before classification
	            progress_callback: Optional callable taking one status string,
	                invoked with progress updates

	        Returns:
	            Tuple of (file_path, source_type), where source_type is
	            'youtube' or 'direct_url'

	        Raises:
	            ValueError: If the input is neither a YouTube URL nor a direct link.
	            Exception: If the selected download fails.
	        """
	        input_source = input_source.strip()

	        # YouTube is checked first because is_direct_url() accepts any http URL.
	        if MediaDownloader.is_youtube_url(input_source):
	            file_path = MediaDownloader.download_youtube(input_source, progress_callback)
	            return file_path, 'youtube'

	        if MediaDownloader.is_direct_url(input_source):
	            file_path = MediaDownloader.download_direct_url(input_source, progress_callback)
	            return file_path, 'direct_url'

	        raise ValueError("Invalid URL. Please provide a YouTube URL or direct media file link.")