# Whisper-Transcriber / utils/downloader.py
# Whisper Transcriber Bot
# Initial commit: Complete Whisper Transcriber implementation
# 4051511
import logging
import os
import re
import tempfile
from pathlib import Path
from typing import Optional, Tuple
from urllib.parse import urlparse

import requests
import yt_dlp
# Configure the root logger at import time so INFO-level records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by MediaDownloader below.
logger = logging.getLogger(__name__)
class MediaDownloader:
    """Download media from YouTube or a direct HTTP(S) URL into temp files.

    Every method is static; the class only serves as a namespace. The
    download methods return the path of a file created in the system
    temporary directory, and the caller is responsible for deleting it.
    """

    # youtube.com / youtube-nocookie.com behind any subdomain (www., m.,
    # music., ...) plus youtu.be short links. The scheme is optional and the
    # match is case-insensitive because host names are case-insensitive.
    _YOUTUBE_URL_RE = re.compile(
        r'(?:https?://)?(?:[a-z0-9-]+\.)*'
        r'(?:youtube\.com|youtube-nocookie\.com|youtu\.be)/',
        re.IGNORECASE,
    )
    # A URL path suffix is only reused as a temp-file suffix when it looks
    # like a plain file extension.
    _PLAIN_EXTENSION_RE = re.compile(r'\.[A-Za-z0-9]{1,8}')
    # Suffix used when the URL does not reveal a usable extension.
    _DEFAULT_EXTENSION = '.mp4'
    # Extensions probed, in order of preference, after a YouTube download.
    _YOUTUBE_AUDIO_EXTENSIONS = ('.m4a', '.mp3', '.webm', '.opus')

    @staticmethod
    def is_youtube_url(url: str) -> bool:
        """Return True if *url* points at a YouTube host.

        Args:
            url: Candidate URL, with or without an ``http(s)://`` scheme.
        """
        return MediaDownloader._YOUTUBE_URL_RE.match(url) is not None

    @staticmethod
    def is_direct_url(url: str) -> bool:
        """Return True if *url* is an absolute ``http``/``https`` URL.

        Any HTTP(S) URL is accepted regardless of its file extension.
        Strings without an HTTP(S) scheme or without a host are rejected
        because ``download_direct_url`` fetches the URL over HTTP.

        Args:
            url: Candidate URL.
        """
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
            return False
        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

    @staticmethod
    def _extension_from_url(url: str) -> str:
        """Return the file extension of the URL's path, or ``'.mp4'``.

        Only the path component is inspected, so query strings, fragments
        and the host name never leak into the result.
        """
        try:
            suffix = Path(urlparse(url).path).suffix
        except ValueError:
            return MediaDownloader._DEFAULT_EXTENSION
        if MediaDownloader._PLAIN_EXTENSION_RE.fullmatch(suffix):
            return suffix
        return MediaDownloader._DEFAULT_EXTENSION

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete *path*, ignoring the error if it cannot be removed."""
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: the file may already be gone.
            pass

    @staticmethod
    def _files_with_prefix(prefix: str) -> list:
        """Return the sorted paths of entries whose path starts with *prefix*."""
        directory, stem = os.path.split(prefix)
        if not stem:
            # An empty stem would match every entry in the directory.
            return []
        try:
            names = os.listdir(directory or '.')
        except OSError:
            return []
        return sorted(
            os.path.join(directory, name)
            for name in names
            if name.startswith(stem)
        )

    @staticmethod
    def _remove_files_with_prefix(prefix: str) -> None:
        """Delete every file whose path starts with *prefix* (best effort)."""
        for leftover in MediaDownloader._files_with_prefix(prefix):
            MediaDownloader._remove_quietly(leftover)

    @staticmethod
    def download_youtube(url: str, progress_callback=None) -> str:
        """Download the audio track of a YouTube video to a temp file.

        Args:
            url: YouTube video URL.
            progress_callback: Optional callable receiving human-readable
                status strings while the download progresses.

        Returns:
            Path to the downloaded audio file.

        Raises:
            Exception: If yt-dlp fails or produces no output file. Partial
                files are removed before the exception is raised.
        """
        if progress_callback:
            progress_callback("Downloading from YouTube...")

        # Reserve a unique name, then delete the placeholder: only the name is
        # needed, and a pre-existing empty ``.m4a`` file would be
        # indistinguishable from a real result in the lookup below.
        fd, placeholder = tempfile.mkstemp(suffix='.m4a')
        os.close(fd)
        os.remove(placeholder)
        output_template = placeholder[:-len('.m4a')]

        def progress_hook(d):
            """Translate yt-dlp progress dictionaries into status strings."""
            if not progress_callback:
                return
            status = d.get('status')
            if status == 'downloading':
                percent = d.get('_percent_str', 'N/A')
                speed = d.get('_speed_str', 'N/A')
                eta = d.get('_eta_str', 'N/A')
                progress_callback(
                    f"Downloading: {percent} | Speed: {speed} | ETA: {eta}"
                )
            elif status == 'finished':
                progress_callback("Download complete, processing...")

        ydl_opts = {
            'format': 'bestaudio/best',
            # Literal '%' must be doubled in an output template; '%(ext)s'
            # lets yt-dlp name each stage's file with its real extension.
            'outtmpl': output_template.replace('%', '%%') + '.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'm4a',
            }],
            'progress_hooks': [progress_hook],
            'quiet': True,
            'no_warnings': True,
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)
        except Exception as e:
            logger.error("YouTube download failed: %s", e)
            MediaDownloader._remove_files_with_prefix(output_template)
            raise Exception(
                f"Failed to download from YouTube: {str(e)}"
            ) from e

        title = (info or {}).get('title', 'video')
        logger.info("Downloaded: %s", title)

        # Prefer the expected audio extensions, then fall back to whatever
        # file was written under the reserved name.
        produced = None
        for ext in MediaDownloader._YOUTUBE_AUDIO_EXTENSIONS:
            candidate = output_template + ext
            if os.path.exists(candidate):
                produced = candidate
                break
        if produced is None:
            leftovers = MediaDownloader._files_with_prefix(output_template)
            if leftovers:
                produced = leftovers[0]
        if produced is None:
            logger.error("YouTube download failed: no output file was produced")
            raise Exception(
                "Failed to download from YouTube: no output file was produced"
            )

        if progress_callback:
            progress_callback("YouTube download complete")
        return produced

    @staticmethod
    def download_direct_url(url: str, progress_callback=None) -> str:
        """Stream a file from a direct HTTP(S) URL into a temp file.

        Args:
            url: Direct URL to the media file.
            progress_callback: Optional callable receiving human-readable
                status strings while the download progresses.

        Returns:
            Path to the downloaded file. Its suffix is taken from the URL
            path, or ``'.mp4'`` when the URL has no usable extension.

        Raises:
            Exception: If the HTTP request fails. The partially written temp
                file is removed on any failure.
        """
        if progress_callback:
            progress_callback("Downloading from URL...")

        ext = MediaDownloader._extension_from_url(url)
        fd, output_file = tempfile.mkstemp(suffix=ext)
        completed = False
        try:
            with os.fdopen(fd, 'wb') as f:
                # The context manager releases the connection even when the
                # transfer aborts part-way through.
                with requests.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    try:
                        total_size = int(
                            response.headers.get('content-length', 0)
                        )
                    except ValueError:
                        # Malformed header: download without percentages.
                        total_size = 0
                    downloaded = 0
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        if progress_callback and total_size > 0:
                            percent = (downloaded / total_size) * 100
                            progress_callback(
                                f"Downloading: {percent:.1f}% "
                                f"({downloaded / 1024 / 1024:.1f}MB)"
                            )
            completed = True
        except requests.exceptions.RequestException as e:
            logger.error("Direct URL download failed: %s", e)
            raise Exception(f"Failed to download from URL: {str(e)}") from e
        finally:
            if not completed:
                # Never leave a truncated file behind for the caller to find.
                MediaDownloader._remove_quietly(output_file)

        if progress_callback:
            progress_callback("Download complete")
        logger.info("Downloaded from URL: %s", output_file)
        return output_file

    @staticmethod
    def download_media(input_source: str, progress_callback=None) -> Tuple[str, str]:
        """Download media from a YouTube URL or a direct HTTP(S) link.

        Args:
            input_source: URL to download from; surrounding whitespace is
                ignored.
            progress_callback: Optional callable receiving status strings.

        Returns:
            Tuple of ``(file_path, source_type)`` where ``source_type`` is
            ``'youtube'`` or ``'direct_url'``.

        Raises:
            ValueError: If *input_source* is neither a YouTube URL nor an
                HTTP(S) URL.
            Exception: If the download itself fails.
        """
        input_source = input_source.strip()
        if MediaDownloader.is_youtube_url(input_source):
            file_path = MediaDownloader.download_youtube(input_source, progress_callback)
            return file_path, 'youtube'
        if MediaDownloader.is_direct_url(input_source):
            file_path = MediaDownloader.download_direct_url(input_source, progress_callback)
            return file_path, 'direct_url'
        raise ValueError("Invalid URL. Please provide a YouTube URL or direct media file link.")