Spaces:
Running
Running
Commit
·
d09ee28
1
Parent(s):
21f2df3
Add Whisper transcription support and config
Browse files- .env.example +9 -6
- Dockerfile +1 -0
- app/apis/subtitles/service.py +90 -76
- app/apis/subtitles/utils.py +6 -72
- app/core/config.py +4 -2
- poetry.lock +254 -2
- pyproject.toml +1 -0
- tests/conftest.py +0 -24
- tests/test_subtitles.py +31 -69
.env.example
CHANGED
|
@@ -14,14 +14,15 @@ RATE_LIMIT_WINDOW=60
|
|
| 14 |
# Logging configuration
|
| 15 |
LOG_LEVEL=INFO
|
| 16 |
|
| 17 |
-
# yt-dlp configuration
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Embedding model configuration
|
| 22 |
EMBEDDING_MODEL=mixedbread-ai/mxbai-embed-large-v1
|
| 23 |
-
# Models cache directory (set by Docker, optional for local dev)
|
| 24 |
-
# SENTENCE_TRANSFORMERS_HOME=models
|
| 25 |
|
| 26 |
# Server configuration
|
| 27 |
HOST=0.0.0.0
|
|
@@ -29,4 +30,6 @@ PORT=8000
|
|
| 29 |
RELOAD=true
|
| 30 |
|
| 31 |
# Container-specific (set automatically in Dockerfile)
|
| 32 |
-
# DISABLE_FILE_LOGGING=true
|
|
|
|
|
|
|
|
|
| 14 |
# Logging configuration
|
| 15 |
LOG_LEVEL=INFO
|
| 16 |
|
| 17 |
+
# yt-dlp configuration (audio download timeout in seconds)
|
| 18 |
+
YT_DLP_TIMEOUT_DOWNLOAD=120
|
| 19 |
+
|
| 20 |
+
# Whisper configuration
|
| 21 |
+
# Models: tiny, base, small, medium, large-v2, large-v3
|
| 22 |
+
WHISPER_MODEL=base
|
| 23 |
|
| 24 |
# Embedding model configuration
|
| 25 |
EMBEDDING_MODEL=mixedbread-ai/mxbai-embed-large-v1
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# Server configuration
|
| 28 |
HOST=0.0.0.0
|
|
|
|
| 30 |
RELOAD=true
|
| 31 |
|
| 32 |
# Container-specific (set automatically in Dockerfile)
|
| 33 |
+
# DISABLE_FILE_LOGGING=true
|
| 34 |
+
# SENTENCE_TRANSFORMERS_HOME=/app/models
|
| 35 |
+
# WHISPER_MODELS_DIR=/app/models
|
Dockerfile
CHANGED
|
@@ -45,6 +45,7 @@ ENV PYTHONPATH=/app
|
|
| 45 |
ENV HF_HOME=/app/models
|
| 46 |
ENV TRANSFORMERS_CACHE=/app/models
|
| 47 |
ENV SENTENCE_TRANSFORMERS_HOME=/app/models
|
|
|
|
| 48 |
|
| 49 |
# Disable file logging in container
|
| 50 |
ENV DISABLE_FILE_LOGGING=true
|
|
|
|
| 45 |
ENV HF_HOME=/app/models
|
| 46 |
ENV TRANSFORMERS_CACHE=/app/models
|
| 47 |
ENV SENTENCE_TRANSFORMERS_HOME=/app/models
|
| 48 |
+
ENV WHISPER_MODELS_DIR=/app/models
|
| 49 |
|
| 50 |
# Disable file logging in container
|
| 51 |
ENV DISABLE_FILE_LOGGING=true
|
app/apis/subtitles/service.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
-
"""Subtitle extraction service using yt-dlp
|
| 2 |
|
| 3 |
import asyncio
|
|
|
|
| 4 |
import sys
|
| 5 |
import tempfile
|
|
|
|
| 6 |
from pathlib import Path
|
| 7 |
-
from typing import List,
|
| 8 |
|
| 9 |
from cachetools import TTLCache
|
| 10 |
|
|
@@ -15,109 +17,89 @@ from app.core.exceptions import (
|
|
| 15 |
SubtitleExtractionError,
|
| 16 |
InvalidVideoURLError
|
| 17 |
)
|
| 18 |
-
from app.apis.subtitles.utils import extract_video_id
|
|
|
|
| 19 |
|
|
|
|
| 20 |
|
| 21 |
SUBTITLE_CACHE: TTLCache = TTLCache(maxsize=100, ttl=3600)
|
| 22 |
-
|
| 23 |
-
ALTERNATIVE_LANGUAGES = {
|
| 24 |
-
"en": ["en-US", "en-GB", "en-orig"],
|
| 25 |
-
"es": ["es-ES", "es-MX", "es-419"],
|
| 26 |
-
"fr": ["fr-FR", "fr-CA"],
|
| 27 |
-
"de": ["de-DE"],
|
| 28 |
-
"it": ["it-IT"],
|
| 29 |
-
"pt": ["pt-BR", "pt-PT"],
|
| 30 |
-
"ja": ["ja-JP"],
|
| 31 |
-
"ko": ["ko-KR"],
|
| 32 |
-
"zh": ["zh-CN", "zh-TW", "zh-Hans", "zh-Hant"]
|
| 33 |
-
}
|
| 34 |
|
| 35 |
|
| 36 |
class SubtitleService:
|
| 37 |
-
"""Service for extracting subtitles from
|
| 38 |
|
| 39 |
def __init__(self) -> None:
|
| 40 |
-
self.
|
|
|
|
|
|
|
| 41 |
self.timeout_download = settings.yt_dlp_timeout_download
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
async def extract_subtitles(self, url: str, lang: str = "en") -> Tuple[str, List[str]]:
|
| 44 |
"""
|
| 45 |
-
Extract subtitles from a
|
| 46 |
|
| 47 |
Args:
|
| 48 |
url: YouTube video URL
|
| 49 |
-
lang: Language code for
|
| 50 |
|
| 51 |
Returns:
|
| 52 |
Tuple of (video_id, subtitle_lines)
|
| 53 |
-
|
| 54 |
-
Raises:
|
| 55 |
-
SubtitlesNotFoundError: If no subtitles are found
|
| 56 |
-
DownloadTimeoutError: If the operation times out
|
| 57 |
-
SubtitleExtractionError: If extraction fails
|
| 58 |
"""
|
| 59 |
video_id = extract_video_id(url)
|
| 60 |
cache_key = f"{video_id}:{lang}"
|
| 61 |
|
| 62 |
if cache_key in SUBTITLE_CACHE:
|
|
|
|
| 63 |
return SUBTITLE_CACHE[cache_key]
|
| 64 |
|
| 65 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 66 |
-
|
| 67 |
|
| 68 |
-
if not
|
| 69 |
-
|
| 70 |
|
| 71 |
-
|
| 72 |
-
raise SubtitlesNotFoundError(f"No subtitles available in language '{lang}' or alternatives")
|
| 73 |
|
| 74 |
-
|
|
|
|
| 75 |
|
| 76 |
-
|
| 77 |
-
raise SubtitlesNotFoundError("Subtitles found but appear to be empty after cleaning")
|
| 78 |
-
|
| 79 |
-
result = (video_id, clean_lines)
|
| 80 |
SUBTITLE_CACHE[cache_key] = result
|
| 81 |
return result
|
| 82 |
|
| 83 |
-
async def
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
"""Try downloading subtitles in alternative language codes concurrently."""
|
| 87 |
-
alt_langs = ALTERNATIVE_LANGUAGES.get(lang, [f"{lang}-{lang.upper()}"])
|
| 88 |
-
|
| 89 |
-
tasks = [
|
| 90 |
-
self._download_subtitles(url, alt_lang, temp_dir, video_id)
|
| 91 |
-
for alt_lang in alt_langs
|
| 92 |
-
]
|
| 93 |
-
|
| 94 |
-
results = await asyncio.gather(*tasks, return_exceptions=True)
|
| 95 |
-
|
| 96 |
-
for result in results:
|
| 97 |
-
if isinstance(result, str) and result:
|
| 98 |
-
return result
|
| 99 |
-
|
| 100 |
-
return None
|
| 101 |
-
|
| 102 |
-
async def _download_subtitles(
|
| 103 |
-
self, url: str, lang: str, temp_dir: str, video_id: str
|
| 104 |
-
) -> Optional[str]:
|
| 105 |
-
"""Download subtitles for a specific language."""
|
| 106 |
-
output_template = str(Path(temp_dir) / f"{video_id}.%(ext)s")
|
| 107 |
|
| 108 |
cmd = [
|
| 109 |
sys.executable, "-m", "yt_dlp",
|
| 110 |
-
"--
|
| 111 |
-
"--
|
| 112 |
-
"--
|
| 113 |
-
"--skip-download",
|
| 114 |
"--no-warnings",
|
| 115 |
-
"--output",
|
| 116 |
-
"--user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
| 117 |
url
|
| 118 |
]
|
| 119 |
|
| 120 |
try:
|
|
|
|
| 121 |
process = await asyncio.create_subprocess_exec(
|
| 122 |
*cmd,
|
| 123 |
stdout=asyncio.subprocess.PIPE,
|
|
@@ -131,24 +113,56 @@ class SubtitleService:
|
|
| 131 |
|
| 132 |
if process.returncode != 0:
|
| 133 |
error_msg = stderr.decode('utf-8', errors='ignore')
|
|
|
|
| 134 |
if "Video unavailable" in error_msg or "Private video" in error_msg:
|
| 135 |
raise InvalidVideoURLError("Video is unavailable, private, or does not exist")
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
temp_path = Path(temp_dir)
|
| 139 |
-
subtitle_files = list(temp_path.glob(f"{video_id}*.vtt"))
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
-
|
| 145 |
|
| 146 |
except asyncio.TimeoutError:
|
| 147 |
-
raise DownloadTimeoutError(
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
|
| 154 |
subtitle_service = SubtitleService()
|
|
|
|
| 1 |
+
"""Subtitle extraction service using yt-dlp audio download and Whisper transcription."""
|
| 2 |
|
| 3 |
import asyncio
|
| 4 |
+
import os
|
| 5 |
import sys
|
| 6 |
import tempfile
|
| 7 |
+
import threading
|
| 8 |
from pathlib import Path
|
| 9 |
+
from typing import List, Tuple
|
| 10 |
|
| 11 |
from cachetools import TTLCache
|
| 12 |
|
|
|
|
| 17 |
SubtitleExtractionError,
|
| 18 |
InvalidVideoURLError
|
| 19 |
)
|
| 20 |
+
from app.apis.subtitles.utils import extract_video_id
|
| 21 |
+
from app.core.logging import get_logger
|
| 22 |
|
| 23 |
+
logger = get_logger(__name__)
|
| 24 |
|
| 25 |
SUBTITLE_CACHE: TTLCache = TTLCache(maxsize=100, ttl=3600)
|
| 26 |
+
MODELS_DIR = os.environ.get("WHISPER_MODELS_DIR", os.environ.get("SENTENCE_TRANSFORMERS_HOME", "models"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
class SubtitleService:
|
| 30 |
+
"""Service for extracting subtitles from videos via Whisper transcription."""
|
| 31 |
|
| 32 |
def __init__(self) -> None:
    """Read tunables from application settings and prepare lazy model state."""
    # Values taken from the application settings object.
    self.timeout_download = settings.yt_dlp_timeout_download
    self._model_name = settings.whisper_model

    # The Whisper model is built later by _load_whisper_model(); the lock
    # guards that one-time construction.
    self._lock = threading.Lock()
    self._whisper_model = None
|
| 37 |
|
| 38 |
+
def _load_whisper_model(self):
    """Build the shared Whisper model the first time it is requested.

    Later calls return immediately. The ``None`` check is repeated after
    acquiring ``self._lock`` so that callers racing on the first use
    construct the model only once.
    """
    if self._whisper_model is not None:
        return

    with self._lock:
        if self._whisper_model is not None:
            # Someone else finished loading while we waited for the lock.
            return

        logger.info(f"Loading Whisper model: {self._model_name}")
        logger.info(f"Models directory: {MODELS_DIR}")

        # Imported on first use rather than at module import time.
        from faster_whisper import WhisperModel

        model_options = {
            "device": "cpu",
            "compute_type": "int8",
            "download_root": MODELS_DIR,
        }
        self._whisper_model = WhisperModel(self._model_name, **model_options)
        logger.info("Whisper model loaded successfully")
|
| 53 |
+
|
| 54 |
async def extract_subtitles(self, url: str, lang: str = "en") -> Tuple[str, List[str]]:
    """
    Extract subtitles from a video by downloading audio and transcribing with Whisper.

    Results are memoised in ``SUBTITLE_CACHE`` under ``"<video_id>:<lang>"``.

    Args:
        url: YouTube video URL
        lang: Language code for transcription

    Returns:
        Tuple of (video_id, subtitle_lines)

    Raises:
        InvalidVideoURLError: If yt-dlp reports the video as unavailable or private
        DownloadTimeoutError: If the audio download times out
        SubtitleExtractionError: If the audio could not be downloaded
        SubtitlesNotFoundError: If transcription produced no text
    """
    video_id = extract_video_id(url)
    cache_key = f"{video_id}:{lang}"

    # extract_video_id() returns the placeholder "unknown" when it cannot
    # find an id. Caching under that key would hand one URL's transcript to
    # every other unrecognised URL, so such requests bypass the cache.
    cacheable = video_id != "unknown"

    if cacheable:
        # One lookup instead of "in" + indexing: a TTL entry can expire
        # between the two operations and raise KeyError.
        cached = SUBTITLE_CACHE.get(cache_key)
        if cached is not None:
            logger.info(f"Cache hit for {cache_key}")
            return cached

    with tempfile.TemporaryDirectory() as temp_dir:
        audio_path = await self._download_audio(url, temp_dir, video_id)

        if not audio_path or not audio_path.exists():
            raise SubtitleExtractionError("Failed to download audio from video")

        subtitle_lines = await self._transcribe_audio(audio_path, lang)

        if not subtitle_lines:
            raise SubtitlesNotFoundError("Transcription produced no text")

        result = (video_id, subtitle_lines)
        if cacheable:
            SUBTITLE_CACHE[cache_key] = result
        return result
|
| 86 |
|
| 87 |
+
async def _download_audio(self, url: str, temp_dir: str, video_id: str) -> Path:
|
| 88 |
+
"""Download audio from video URL using yt-dlp."""
|
| 89 |
+
output_path = Path(temp_dir) / f"{video_id}.mp3"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
cmd = [
|
| 92 |
sys.executable, "-m", "yt_dlp",
|
| 93 |
+
"--extract-audio",
|
| 94 |
+
"--audio-format", "mp3",
|
| 95 |
+
"--audio-quality", "5",
|
|
|
|
| 96 |
"--no-warnings",
|
| 97 |
+
"--output", str(Path(temp_dir) / f"{video_id}.%(ext)s"),
|
|
|
|
| 98 |
url
|
| 99 |
]
|
| 100 |
|
| 101 |
try:
|
| 102 |
+
logger.info(f"Downloading audio for video: {video_id}")
|
| 103 |
process = await asyncio.create_subprocess_exec(
|
| 104 |
*cmd,
|
| 105 |
stdout=asyncio.subprocess.PIPE,
|
|
|
|
| 113 |
|
| 114 |
if process.returncode != 0:
|
| 115 |
error_msg = stderr.decode('utf-8', errors='ignore')
|
| 116 |
+
logger.error(f"yt-dlp error: {error_msg}")
|
| 117 |
if "Video unavailable" in error_msg or "Private video" in error_msg:
|
| 118 |
raise InvalidVideoURLError("Video is unavailable, private, or does not exist")
|
| 119 |
+
raise SubtitleExtractionError(f"Failed to download audio: {error_msg[:200]}")
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
+
# Find the downloaded audio file
|
| 122 |
+
audio_files = list(Path(temp_dir).glob(f"{video_id}.*"))
|
| 123 |
+
if audio_files:
|
| 124 |
+
logger.info(f"Audio downloaded: {audio_files[0]}")
|
| 125 |
+
return audio_files[0]
|
| 126 |
|
| 127 |
+
raise SubtitleExtractionError("Audio file not found after download")
|
| 128 |
|
| 129 |
except asyncio.TimeoutError:
|
| 130 |
+
raise DownloadTimeoutError("Timeout while downloading audio")
|
| 131 |
+
|
| 132 |
+
async def _transcribe_audio(self, audio_path: Path, lang: str) -> List[str]:
    """Transcribe an audio file with Whisper without blocking the event loop.

    Args:
        audio_path: Path of the downloaded audio file.
        lang: Language code forwarded to ``_run_transcription``.

    Returns:
        The list of text lines produced by ``_run_transcription``.
    """
    # Inside a coroutine the running loop is the one to use;
    # asyncio.get_event_loop() is discouraged here.
    loop = asyncio.get_running_loop()

    # Constructing the model is blocking work (and may need to fetch model
    # files on first use), so it goes to the default executor as well
    # instead of stalling every other request on the event loop.
    await loop.run_in_executor(None, self._load_whisper_model)

    logger.info(f"Transcribing audio: {audio_path}")

    # Run transcription in thread pool to not block event loop
    return await loop.run_in_executor(
        None,
        self._run_transcription,
        audio_path,
        lang,
    )
|
| 148 |
+
|
| 149 |
+
def _run_transcription(self, audio_path: Path, lang: str) -> List[str]:
    """Transcribe ``audio_path`` synchronously and return the non-empty segment texts.

    This call blocks; ``_transcribe_audio`` dispatches it through
    ``run_in_executor``. It expects ``self._whisper_model`` to be loaded.
    """
    # "auto" is passed to the model as ``None``
    # (presumably enabling language detection -- confirm against faster-whisper).
    language = None if lang == "auto" else lang

    segments, _info = self._whisper_model.transcribe(
        str(audio_path),
        language=language,
        beam_size=5,
        vad_filter=True,
    )

    # Strip each segment and keep only those with remaining text.
    stripped = (segment.text.strip() for segment in segments)
    lines = [text for text in stripped if text]

    logger.info(f"Transcription complete: {len(lines)} segments")
    return lines
|
| 166 |
|
| 167 |
|
| 168 |
subtitle_service = SubtitleService()
|
app/apis/subtitles/utils.py
CHANGED
|
@@ -4,96 +4,30 @@ import re
|
|
| 4 |
from typing import List
|
| 5 |
|
| 6 |
|
| 7 |
-
def clean_subtitle_text(lines: List[str]) -> List[str]:
|
| 8 |
-
"""
|
| 9 |
-
Clean subtitle text by removing timestamps, duplicates, and empty lines.
|
| 10 |
-
|
| 11 |
-
Args:
|
| 12 |
-
lines: Raw subtitle lines
|
| 13 |
-
|
| 14 |
-
Returns:
|
| 15 |
-
Cleaned subtitle lines
|
| 16 |
-
"""
|
| 17 |
-
clean_lines = []
|
| 18 |
-
seen_recently = set() # Track recent lines to avoid duplicates
|
| 19 |
-
|
| 20 |
-
for line in lines:
|
| 21 |
-
line = line.strip()
|
| 22 |
-
|
| 23 |
-
# Skip VTT header lines
|
| 24 |
-
if line.startswith(("WEBVTT", "NOTE")):
|
| 25 |
-
continue
|
| 26 |
-
|
| 27 |
-
# Skip timestamp lines (format: 00:00:00.000 --> 00:00:00.000)
|
| 28 |
-
if "-->" in line and re.match(r'\d+:\d+:\d+\.\d+', line):
|
| 29 |
-
continue
|
| 30 |
-
|
| 31 |
-
# Skip empty lines and sequence numbers
|
| 32 |
-
if not line or line.isdigit():
|
| 33 |
-
continue
|
| 34 |
-
|
| 35 |
-
# Remove HTML tags that might be in subtitles
|
| 36 |
-
line = re.sub(r'<[^>]+>', '', line)
|
| 37 |
-
|
| 38 |
-
# Remove common subtitle formatting
|
| 39 |
-
line = re.sub(r'^\d+$', '', line) # Remove standalone numbers
|
| 40 |
-
line = re.sub(r'^-\s*', '', line) # Remove leading dashes
|
| 41 |
-
|
| 42 |
-
# Skip if line is too short or just punctuation
|
| 43 |
-
if len(line.strip()) < 3:
|
| 44 |
-
continue
|
| 45 |
-
|
| 46 |
-
# Avoid recent duplicates (check last 5 lines)
|
| 47 |
-
if line not in seen_recently:
|
| 48 |
-
clean_lines.append(line)
|
| 49 |
-
seen_recently.add(line)
|
| 50 |
-
|
| 51 |
-
# Keep only last 5 lines in memory for duplicate checking
|
| 52 |
-
if len(seen_recently) > 5:
|
| 53 |
-
seen_recently.clear()
|
| 54 |
-
|
| 55 |
-
return clean_lines
|
| 56 |
-
|
| 57 |
-
|
| 58 |
def extract_video_id(url: str) -> str:
|
| 59 |
"""
|
| 60 |
Extract video ID from YouTube URL.
|
| 61 |
-
|
| 62 |
Args:
|
| 63 |
url: YouTube URL
|
| 64 |
-
|
| 65 |
Returns:
|
| 66 |
-
Video ID
|
| 67 |
"""
|
| 68 |
-
# Handle different YouTube URL formats
|
| 69 |
patterns = [
|
| 70 |
r'(?:youtube\.com/watch\?v=|youtu\.be/)([a-zA-Z0-9_-]+)',
|
| 71 |
r'youtube\.com/embed/([a-zA-Z0-9_-]+)',
|
| 72 |
r'youtube\.com/v/([a-zA-Z0-9_-]+)'
|
| 73 |
]
|
| 74 |
-
|
| 75 |
for pattern in patterns:
|
| 76 |
match = re.search(pattern, url)
|
| 77 |
if match:
|
| 78 |
return match.group(1)
|
| 79 |
-
|
| 80 |
# Fallback: try to extract anything that looks like a video ID
|
| 81 |
match = re.search(r'[a-zA-Z0-9_-]{11}', url)
|
| 82 |
if match:
|
| 83 |
return match.group(0)
|
| 84 |
-
|
| 85 |
-
return "unknown"
|
| 86 |
|
| 87 |
-
|
| 88 |
-
def convert_vtt_to_text(vtt_content: str) -> List[str]:
|
| 89 |
-
"""
|
| 90 |
-
Convert VTT subtitle content to clean text lines.
|
| 91 |
-
|
| 92 |
-
Args:
|
| 93 |
-
vtt_content: Raw VTT file content
|
| 94 |
-
|
| 95 |
-
Returns:
|
| 96 |
-
Cleaned text lines
|
| 97 |
-
"""
|
| 98 |
-
lines = vtt_content.split('\n')
|
| 99 |
-
return clean_subtitle_text(lines)
|
|
|
|
| 4 |
from typing import List
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
def extract_video_id(url: str) -> str:
    """
    Extract video ID from YouTube URL.

    Recognised forms are ``watch?v=<id>`` (with ``v`` anywhere in the query
    string), ``youtu.be/<id>``, ``/embed/<id>``, ``/v/<id>``, ``/shorts/<id>``
    and ``/live/<id>``. If none match, the first run of 11 id-like
    characters in the URL is used.

    Args:
        url: YouTube URL

    Returns:
        Video ID string, or the literal ``"unknown"`` when nothing in the
        URL looks like an ID.
    """
    patterns = [
        # "(?:[^#\s&]+&)*?" lazily skips earlier query parameters, so
        # "watch?list=...&v=ID" yields ID rather than falling through to the
        # fallback below, which would return a slice of the playlist id.
        r'(?:youtube\.com/watch\?(?:[^#\s&]+&)*?v=|youtu\.be/)([a-zA-Z0-9_-]+)',
        r'youtube\.com/embed/([a-zA-Z0-9_-]+)',
        r'youtube\.com/v/([a-zA-Z0-9_-]+)',
        r'youtube\.com/(?:shorts|live)/([a-zA-Z0-9_-]+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    # Fallback: try to extract anything that looks like a video ID
    match = re.search(r'[a-zA-Z0-9_-]{11}', url)
    if match:
        return match.group(0)

    return "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/core/config.py
CHANGED
|
@@ -22,8 +22,10 @@ class Settings(BaseSettings):
|
|
| 22 |
log_level: str = "INFO"
|
| 23 |
|
| 24 |
# yt-dlp configuration
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Embedding configuration
|
| 29 |
embedding_model: str = "mixedbread-ai/mxbai-embed-large-v1"
|
|
|
|
| 22 |
log_level: str = "INFO"
|
| 23 |
|
| 24 |
# yt-dlp configuration
|
| 25 |
+
yt_dlp_timeout_download: int = 120
|
| 26 |
+
|
| 27 |
+
# Whisper configuration
|
| 28 |
+
whisper_model: str = "base"
|
| 29 |
|
| 30 |
# Embedding configuration
|
| 31 |
embedding_model: str = "mixedbread-ai/mxbai-embed-large-v1"
|
poetry.lock
CHANGED
|
@@ -33,6 +33,66 @@ doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-
|
|
| 33 |
test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4) ; python_version < \"3.8\"", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17) ; python_version < \"3.12\" and platform_python_implementation == \"CPython\" and platform_system != \"Windows\""]
|
| 34 |
trio = ["trio (<0.22)"]
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
[[package]]
|
| 37 |
name = "black"
|
| 38 |
version = "23.12.1"
|
|
@@ -253,6 +313,24 @@ files = [
|
|
| 253 |
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
| 254 |
]
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
[[package]]
|
| 257 |
name = "coverage"
|
| 258 |
version = "7.10.6"
|
|
@@ -354,6 +432,56 @@ files = [
|
|
| 354 |
[package.extras]
|
| 355 |
toml = ["tomli ; python_full_version <= \"3.11.0a6\""]
|
| 356 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
[[package]]
|
| 358 |
name = "deprecated"
|
| 359 |
version = "1.3.1"
|
|
@@ -393,6 +521,29 @@ typing-extensions = ">=4.8.0"
|
|
| 393 |
[package.extras]
|
| 394 |
all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
|
| 395 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
[[package]]
|
| 397 |
name = "filelock"
|
| 398 |
version = "3.20.3"
|
|
@@ -422,6 +573,17 @@ mccabe = ">=0.7.0,<0.8.0"
|
|
| 422 |
pycodestyle = ">=2.11.0,<2.12.0"
|
| 423 |
pyflakes = ">=3.1.0,<3.2.0"
|
| 424 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
[[package]]
|
| 426 |
name = "fsspec"
|
| 427 |
version = "2026.1.0"
|
|
@@ -652,6 +814,21 @@ testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.
|
|
| 652 |
torch = ["safetensors[torch]", "torch"]
|
| 653 |
typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
|
| 654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
[[package]]
|
| 656 |
name = "idna"
|
| 657 |
version = "3.10"
|
|
@@ -1299,6 +1476,46 @@ files = [
|
|
| 1299 |
{file = "nvidia_nvtx_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:2fb11a4af04a5e6c84073e6404d26588a34afd35379f0855a99797897efa75c0"},
|
| 1300 |
]
|
| 1301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1302 |
[[package]]
|
| 1303 |
name = "packaging"
|
| 1304 |
version = "25.0"
|
|
@@ -1465,6 +1682,26 @@ files = [
|
|
| 1465 |
dev = ["pre-commit", "tox"]
|
| 1466 |
testing = ["coverage", "pytest", "pytest-benchmark"]
|
| 1467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1468 |
[[package]]
|
| 1469 |
name = "pycodestyle"
|
| 1470 |
version = "2.11.1"
|
|
@@ -1647,6 +1884,22 @@ files = [
|
|
| 1647 |
{file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"},
|
| 1648 |
]
|
| 1649 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1650 |
[[package]]
|
| 1651 |
name = "pytest"
|
| 1652 |
version = "7.4.4"
|
|
@@ -2152,7 +2405,6 @@ description = "Easily download, build, install, upgrade, and uninstall Python pa
|
|
| 2152 |
optional = false
|
| 2153 |
python-versions = ">=3.9"
|
| 2154 |
groups = ["main"]
|
| 2155 |
-
markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" or python_version >= \"3.12\""
|
| 2156 |
files = [
|
| 2157 |
{file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
|
| 2158 |
{file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
|
|
@@ -2954,4 +3206,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
|
|
| 2954 |
[metadata]
|
| 2955 |
lock-version = "2.1"
|
| 2956 |
python-versions = "^3.11"
|
| 2957 |
-
content-hash = "
|
|
|
|
| 33 |
test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4) ; python_version < \"3.8\"", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17) ; python_version < \"3.12\" and platform_python_implementation == \"CPython\" and platform_system != \"Windows\""]
|
| 34 |
trio = ["trio (<0.22)"]
|
| 35 |
|
| 36 |
+
[[package]]
|
| 37 |
+
name = "av"
|
| 38 |
+
version = "16.1.0"
|
| 39 |
+
description = "Pythonic bindings for FFmpeg's libraries."
|
| 40 |
+
optional = false
|
| 41 |
+
python-versions = ">=3.10"
|
| 42 |
+
groups = ["main"]
|
| 43 |
+
files = [
|
| 44 |
+
{file = "av-16.1.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:2395748b0c34fe3a150a1721e4f3d4487b939520991b13e7b36f8926b3b12295"},
|
| 45 |
+
{file = "av-16.1.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:72d7ac832710a158eeb7a93242370aa024a7646516291c562ee7f14a7ea881fd"},
|
| 46 |
+
{file = "av-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6cbac833092e66b6b0ac4d81ab077970b8ca874951e9c3974d41d922aaa653ed"},
|
| 47 |
+
{file = "av-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:eb990672d97c18f99c02f31c8d5750236f770ffe354b5a52c5f4d16c5e65f619"},
|
| 48 |
+
{file = "av-16.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05ad70933ac3b8ef896a820ea64b33b6cca91a5fac5259cb9ba7fa010435be15"},
|
| 49 |
+
{file = "av-16.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d831a1062a3c47520bf99de6ec682bd1d64a40dfa958e5457bb613c5270e7ce3"},
|
| 50 |
+
{file = "av-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:358ab910fef3c5a806c55176f2b27e5663b33c4d0a692dafeb049c6ed71f8aff"},
|
| 51 |
+
{file = "av-16.1.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:e88ad64ee9d2b9c4c5d891f16c22ae78e725188b8926eb88187538d9dd0b232f"},
|
| 52 |
+
{file = "av-16.1.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cb296073fa6935724de72593800ba86ae49ed48af03960a4aee34f8a611f442b"},
|
| 53 |
+
{file = "av-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:720edd4d25aa73723c1532bb0597806d7b9af5ee34fc02358782c358cfe2f879"},
|
| 54 |
+
{file = "av-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c7f2bc703d0df260a1fdf4de4253c7f5500ca9fc57772ea241b0cb241bcf972e"},
|
| 55 |
+
{file = "av-16.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d69c393809babada7d54964d56099e4b30a3e1f8b5736ca5e27bd7be0e0f3c83"},
|
| 56 |
+
{file = "av-16.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:441892be28582356d53f282873c5a951592daaf71642c7f20165e3ddcb0b4c63"},
|
| 57 |
+
{file = "av-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:273a3e32de64819e4a1cd96341824299fe06f70c46f2288b5dc4173944f0fd62"},
|
| 58 |
+
{file = "av-16.1.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:640f57b93f927fba8689f6966c956737ee95388a91bd0b8c8b5e0481f73513d6"},
|
| 59 |
+
{file = "av-16.1.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:ae3fb658eec00852ebd7412fdc141f17f3ddce8afee2d2e1cf366263ad2a3b35"},
|
| 60 |
+
{file = "av-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:27ee558d9c02a142eebcbe55578a6d817fedfde42ff5676275504e16d07a7f86"},
|
| 61 |
+
{file = "av-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7ae547f6d5fa31763f73900d43901e8c5fa6367bb9a9840978d57b5a7ae14ed2"},
|
| 62 |
+
{file = "av-16.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8cf065f9d438e1921dc31fc7aa045790b58aee71736897866420d80b5450f62a"},
|
| 63 |
+
{file = "av-16.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a345877a9d3cc0f08e2bc4ec163ee83176864b92587afb9d08dff50f37a9a829"},
|
| 64 |
+
{file = "av-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:f49243b1d27c91cd8c66fdba90a674e344eb8eb917264f36117bf2b6879118fd"},
|
| 65 |
+
{file = "av-16.1.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:ce2a1b3d8bf619f6c47a9f28cfa7518ff75ddd516c234a4ee351037b05e6a587"},
|
| 66 |
+
{file = "av-16.1.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:408dbe6a2573ca58a855eb8cd854112b33ea598651902c36709f5f84c991ed8e"},
|
| 67 |
+
{file = "av-16.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:57f657f86652a160a8a01887aaab82282f9e629abf94c780bbdbb01595d6f0f7"},
|
| 68 |
+
{file = "av-16.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:adbad2b355c2ee4552cac59762809d791bda90586d134a33c6f13727fb86cb3a"},
|
| 69 |
+
{file = "av-16.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f42e1a68ec2aebd21f7eb6895be69efa6aa27eec1670536876399725bbda4b99"},
|
| 70 |
+
{file = "av-16.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58fe47aeaef0f100c40ec8a5de9abbd37f118d3ca03829a1009cf288e9aef67c"},
|
| 71 |
+
{file = "av-16.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:565093ebc93b2f4b76782589564869dadfa83af5b852edebedd8fee746457d06"},
|
| 72 |
+
{file = "av-16.1.0-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:574081a24edb98343fd9f473e21ae155bf61443d4ec9d7708987fa597d6b04b2"},
|
| 73 |
+
{file = "av-16.1.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:9ab00ea29c25ebf2ea1d1e928d7babb3532d562481c5d96c0829212b70756ad0"},
|
| 74 |
+
{file = "av-16.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a84a91188c1071f238a9523fd42dbe567fb2e2607b22b779851b2ce0eac1b560"},
|
| 75 |
+
{file = "av-16.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c2cd0de4dd022a7225ff224fde8e7971496d700be41c50adaaa26c07bb50bf97"},
|
| 76 |
+
{file = "av-16.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0816143530624a5a93bc5494f8c6eeaf77549b9366709c2ac8566c1e9bff6df5"},
|
| 77 |
+
{file = "av-16.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e3a28053af29644696d0c007e897d19b1197585834660a54773e12a40b16974c"},
|
| 78 |
+
{file = "av-16.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e3e67144a202b95ed299d165232533989390a9ea3119d37eccec697dc6dbb0c"},
|
| 79 |
+
{file = "av-16.1.0-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:39a634d8e5a87e78ea80772774bfd20c0721f0d633837ff185f36c9d14ffede4"},
|
| 80 |
+
{file = "av-16.1.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:0ba32fb9e9300948a7fa9f8a3fc686e6f7f77599a665c71eb2118fdfd2c743f9"},
|
| 81 |
+
{file = "av-16.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:ca04d17815182d34ce3edc53cbda78a4f36e956c0fd73e3bab249872a831c4d7"},
|
| 82 |
+
{file = "av-16.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ee0e8de2e124a9ef53c955fe2add6ee7c56cc8fd83318265549e44057db77142"},
|
| 83 |
+
{file = "av-16.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:22bf77a2f658827043a1e184b479c3bf25c4c43ab32353677df2d119f080e28f"},
|
| 84 |
+
{file = "av-16.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2dd419d262e6a71cab206d80bbf28e0a10d0f227b671cdf5e854c028faa2d043"},
|
| 85 |
+
{file = "av-16.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:53585986fd431cd436f290fba662cfb44d9494fbc2949a183de00acc5b33fa88"},
|
| 86 |
+
{file = "av-16.1.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:76f5ed8495cf41e1209a5775d3699dc63fdc1740b94a095e2485f13586593205"},
|
| 87 |
+
{file = "av-16.1.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:8d55397190f12a1a3ae7538be58c356cceb2bf50df1b33523817587748ce89e5"},
|
| 88 |
+
{file = "av-16.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:9d51d9037437218261b4bbf9df78a95e216f83d7774fbfe8d289230b5b2e28e2"},
|
| 89 |
+
{file = "av-16.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0ce07a89c15644407f49d942111ca046e323bbab0a9078ff43ee57c9b4a50dad"},
|
| 90 |
+
{file = "av-16.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:cac0c074892ea97113b53556ff41c99562db7b9f09f098adac1f08318c2acad5"},
|
| 91 |
+
{file = "av-16.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7dec3dcbc35a187ce450f65a2e0dda820d5a9e6553eea8344a1459af11c98649"},
|
| 92 |
+
{file = "av-16.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6f90dc082ff2068ddbe77618400b44d698d25d9c4edac57459e250c16b33d700"},
|
| 93 |
+
{file = "av-16.1.0.tar.gz", hash = "sha256:a094b4fd87a3721dacf02794d3d2c82b8d712c85b9534437e82a8a978c175ffd"},
|
| 94 |
+
]
|
| 95 |
+
|
| 96 |
[[package]]
|
| 97 |
name = "black"
|
| 98 |
version = "23.12.1"
|
|
|
|
| 313 |
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
| 314 |
]
|
| 315 |
|
| 316 |
+
[[package]]
|
| 317 |
+
name = "coloredlogs"
|
| 318 |
+
version = "15.0.1"
|
| 319 |
+
description = "Colored terminal output for Python's logging module"
|
| 320 |
+
optional = false
|
| 321 |
+
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
| 322 |
+
groups = ["main"]
|
| 323 |
+
files = [
|
| 324 |
+
{file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
|
| 325 |
+
{file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
|
| 326 |
+
]
|
| 327 |
+
|
| 328 |
+
[package.dependencies]
|
| 329 |
+
humanfriendly = ">=9.1"
|
| 330 |
+
|
| 331 |
+
[package.extras]
|
| 332 |
+
cron = ["capturer (>=2.4)"]
|
| 333 |
+
|
| 334 |
[[package]]
|
| 335 |
name = "coverage"
|
| 336 |
version = "7.10.6"
|
|
|
|
| 432 |
[package.extras]
|
| 433 |
toml = ["tomli ; python_full_version <= \"3.11.0a6\""]
|
| 434 |
|
| 435 |
+
[[package]]
|
| 436 |
+
name = "ctranslate2"
|
| 437 |
+
version = "4.6.3"
|
| 438 |
+
description = "Fast inference engine for Transformer models"
|
| 439 |
+
optional = false
|
| 440 |
+
python-versions = ">=3.9"
|
| 441 |
+
groups = ["main"]
|
| 442 |
+
files = [
|
| 443 |
+
{file = "ctranslate2-4.6.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d75d79e55a3a26964320445c03a56af60d7215d95561b744d93d04bad24c268a"},
|
| 444 |
+
{file = "ctranslate2-4.6.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:13ccb5011e67b831354c9a01bf4d824b4dc5535c54abcf492e0ae4e41894518e"},
|
| 445 |
+
{file = "ctranslate2-4.6.3-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:259ab216d4de93723f3db1805f2bac48b1a5732ce3de0e5a163b570821fcb063"},
|
| 446 |
+
{file = "ctranslate2-4.6.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7a5e59a5a67c3f48133ffe6fe2a557922283c16eb4233e6dbb82e0b9a20782f2"},
|
| 447 |
+
{file = "ctranslate2-4.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:6be735c7904ea98c22d7d02b338299c0a7f4cd4b1d0e9dd528e319e52bd78d66"},
|
| 448 |
+
{file = "ctranslate2-4.6.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ac0d2bec0961f0f9ee00cd5c55b4d5904ee309d9269778d9f9edd23c46c87ff"},
|
| 449 |
+
{file = "ctranslate2-4.6.3-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:db5f82661fa960a6a1bc0e738acf135a22da94a32cda198d8fb782d37ef4caa8"},
|
| 450 |
+
{file = "ctranslate2-4.6.3-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f1ec2cd9546f02ff9f1b2d21b115eadcce45c8ae5ac5811e7d382f9d9736aa4"},
|
| 451 |
+
{file = "ctranslate2-4.6.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:67f4b5802349a8cfa2e6105b161bf015e97aadab0f58a7034c97e78283cb29b8"},
|
| 452 |
+
{file = "ctranslate2-4.6.3-cp311-cp311-win_amd64.whl", hash = "sha256:fa2f3dcda893a3f4dedeb32b5059e4085738934d93ea8dccdce4bbef2be5d3dc"},
|
| 453 |
+
{file = "ctranslate2-4.6.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:32022dcf0ee2eace0b00345899b0e2be2f5a8b57d8467b1f5ecee40bb3e18746"},
|
| 454 |
+
{file = "ctranslate2-4.6.3-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:df88e7ac821b2def12ae6c71ba4180c13abc13713c1d1ae819e92f2db8556564"},
|
| 455 |
+
{file = "ctranslate2-4.6.3-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:487f57da179057e1a8498d3b61f2fcd826ddfe989ce43ff3b500ec805ca55d56"},
|
| 456 |
+
{file = "ctranslate2-4.6.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a857a42b091f9e0b8b1f63cf1fb356822bb4905d555039f542ff95cf90fd592b"},
|
| 457 |
+
{file = "ctranslate2-4.6.3-cp312-cp312-win_amd64.whl", hash = "sha256:05ec48b44bb2f1e623e30acc57d34d22000d969e8998cae7762137231fae0d25"},
|
| 458 |
+
{file = "ctranslate2-4.6.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:95ff7fdd70bd64d40834cb6ba82bcec15228a9f34dff587babd03a1c3064c302"},
|
| 459 |
+
{file = "ctranslate2-4.6.3-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:a562ef2fd48287423dd6158a0c7921b6c238a052f690bce510b998bba82fd3e2"},
|
| 460 |
+
{file = "ctranslate2-4.6.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6cc539ed7c3531354971c78938da50f29ac08b8dc9140bc7ac377e8344bc63e2"},
|
| 461 |
+
{file = "ctranslate2-4.6.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f08efa826707d095ade28410dca27f8d377520f3068843e00b349d5ca15cf174"},
|
| 462 |
+
{file = "ctranslate2-4.6.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a6b6e80d79242761d0583bc0ad7e7ba4d09745d2b23e814bc35f6c842b0ca45"},
|
| 463 |
+
{file = "ctranslate2-4.6.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:75f3e9d3ca7b3d91c87f67972f20998fc318a22d49c25b6d7144b947b5e3240e"},
|
| 464 |
+
{file = "ctranslate2-4.6.3-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:a0657885219e05a6575bb9d8ac4c055da25110d6c897dfed7a322f8c01267fb1"},
|
| 465 |
+
{file = "ctranslate2-4.6.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53e975acf49bab2cd00290a2ece56925d087f8300d5bd7463b96c60002146034"},
|
| 466 |
+
{file = "ctranslate2-4.6.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e411c7212f42899f12522b4d9a4b5a59542aa27d5b8e87e7e7bd2f52194fa984"},
|
| 467 |
+
{file = "ctranslate2-4.6.3-cp314-cp314-win_amd64.whl", hash = "sha256:40749b5ad208eb5224ea7ec9516ff290e77373974be0f41697eccf3cef2a44eb"},
|
| 468 |
+
{file = "ctranslate2-4.6.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:dd117643e9bae19d53e3fea4415862841c4e69fcff86dbc4dd397f6864390d84"},
|
| 469 |
+
{file = "ctranslate2-4.6.3-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:e058b51372faee95780c0d0af513e7c5df268fffcd435a856476d998e65ebf67"},
|
| 470 |
+
{file = "ctranslate2-4.6.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4eca886e30e658bece2bd0fc331a37f4a5ad1e29a590d43d5082c7896eba59d7"},
|
| 471 |
+
{file = "ctranslate2-4.6.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5345d0d259383ddc106343744be5ada9646f0e2632a6676482fd9de6114c9ee2"},
|
| 472 |
+
{file = "ctranslate2-4.6.3-cp314-cp314t-win_amd64.whl", hash = "sha256:53ab04edc3f7280465cd54e6a359f26960eb63961eeae27cb9726f449b4b217e"},
|
| 473 |
+
{file = "ctranslate2-4.6.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f229dfcd14be23c4f76542873562ab9d8006e6e045fa585be83f82d224c30be"},
|
| 474 |
+
{file = "ctranslate2-4.6.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:11f88fe0d2d081f1fc4f7442477a7089a3fac9ad28c98fa2df1d9739a114524e"},
|
| 475 |
+
{file = "ctranslate2-4.6.3-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8d0b028e161a8374467a7b77a4675a0aa88cd2dd24e0700c8277418cc31be4d9"},
|
| 476 |
+
{file = "ctranslate2-4.6.3-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7d0c858a4fecf288211b488ed281c00b93b77155e39e6d496646cc1ddbecda1"},
|
| 477 |
+
{file = "ctranslate2-4.6.3-cp39-cp39-win_amd64.whl", hash = "sha256:875fd4dcd7185589f07197b667ce547ec532ef42d68feed76615543e87f289ef"},
|
| 478 |
+
]
|
| 479 |
+
|
| 480 |
+
[package.dependencies]
|
| 481 |
+
numpy = "*"
|
| 482 |
+
pyyaml = ">=5.3,<7"
|
| 483 |
+
setuptools = "*"
|
| 484 |
+
|
| 485 |
[[package]]
|
| 486 |
name = "deprecated"
|
| 487 |
version = "1.3.1"
|
|
|
|
| 521 |
[package.extras]
|
| 522 |
all = ["email-validator (>=2.0.0)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
|
| 523 |
|
| 524 |
+
[[package]]
|
| 525 |
+
name = "faster-whisper"
|
| 526 |
+
version = "1.2.1"
|
| 527 |
+
description = "Faster Whisper transcription with CTranslate2"
|
| 528 |
+
optional = false
|
| 529 |
+
python-versions = ">=3.9"
|
| 530 |
+
groups = ["main"]
|
| 531 |
+
files = [
|
| 532 |
+
{file = "faster_whisper-1.2.1-py3-none-any.whl", hash = "sha256:79a66ad50688c0b794dd501dc340a736992a6342f7f95e5811be60b5224a26a7"},
|
| 533 |
+
]
|
| 534 |
+
|
| 535 |
+
[package.dependencies]
|
| 536 |
+
av = ">=11"
|
| 537 |
+
ctranslate2 = ">=4.0,<5"
|
| 538 |
+
huggingface-hub = ">=0.21"
|
| 539 |
+
onnxruntime = ">=1.14,<2"
|
| 540 |
+
tokenizers = ">=0.13,<1"
|
| 541 |
+
tqdm = "*"
|
| 542 |
+
|
| 543 |
+
[package.extras]
|
| 544 |
+
conversion = ["transformers[torch] (>=4.23)"]
|
| 545 |
+
dev = ["black (==23.*)", "flake8 (==6.*)", "isort (==5.*)", "pytest (==7.*)"]
|
| 546 |
+
|
| 547 |
[[package]]
|
| 548 |
name = "filelock"
|
| 549 |
version = "3.20.3"
|
|
|
|
| 573 |
pycodestyle = ">=2.11.0,<2.12.0"
|
| 574 |
pyflakes = ">=3.1.0,<3.2.0"
|
| 575 |
|
| 576 |
+
[[package]]
|
| 577 |
+
name = "flatbuffers"
|
| 578 |
+
version = "25.12.19"
|
| 579 |
+
description = "The FlatBuffers serialization format for Python"
|
| 580 |
+
optional = false
|
| 581 |
+
python-versions = "*"
|
| 582 |
+
groups = ["main"]
|
| 583 |
+
files = [
|
| 584 |
+
{file = "flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4"},
|
| 585 |
+
]
|
| 586 |
+
|
| 587 |
[[package]]
|
| 588 |
name = "fsspec"
|
| 589 |
version = "2026.1.0"
|
|
|
|
| 814 |
torch = ["safetensors[torch]", "torch"]
|
| 815 |
typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
|
| 816 |
|
| 817 |
+
[[package]]
|
| 818 |
+
name = "humanfriendly"
|
| 819 |
+
version = "10.0"
|
| 820 |
+
description = "Human friendly output for text interfaces using Python"
|
| 821 |
+
optional = false
|
| 822 |
+
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
| 823 |
+
groups = ["main"]
|
| 824 |
+
files = [
|
| 825 |
+
{file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
|
| 826 |
+
{file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
|
| 827 |
+
]
|
| 828 |
+
|
| 829 |
+
[package.dependencies]
|
| 830 |
+
pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""}
|
| 831 |
+
|
| 832 |
[[package]]
|
| 833 |
name = "idna"
|
| 834 |
version = "3.10"
|
|
|
|
| 1476 |
{file = "nvidia_nvtx_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:2fb11a4af04a5e6c84073e6404d26588a34afd35379f0855a99797897efa75c0"},
|
| 1477 |
]
|
| 1478 |
|
| 1479 |
+
[[package]]
|
| 1480 |
+
name = "onnxruntime"
|
| 1481 |
+
version = "1.23.2"
|
| 1482 |
+
description = "ONNX Runtime is a runtime accelerator for Machine Learning models"
|
| 1483 |
+
optional = false
|
| 1484 |
+
python-versions = ">=3.10"
|
| 1485 |
+
groups = ["main"]
|
| 1486 |
+
files = [
|
| 1487 |
+
{file = "onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3"},
|
| 1488 |
+
{file = "onnxruntime-1.23.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b28740f4ecef1738ea8f807461dd541b8287d5650b5be33bca7b474e3cbd1f36"},
|
| 1489 |
+
{file = "onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f7d1fe034090a1e371b7f3ca9d3ccae2fabae8c1d8844fb7371d1ea38e8e8d2"},
|
| 1490 |
+
{file = "onnxruntime-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ca88747e708e5c67337b0f65eed4b7d0dd70d22ac332038c9fc4635760018f7"},
|
| 1491 |
+
{file = "onnxruntime-1.23.2-cp310-cp310-win_amd64.whl", hash = "sha256:0be6a37a45e6719db5120e9986fcd30ea205ac8103fd1fb74b6c33348327a0cc"},
|
| 1492 |
+
{file = "onnxruntime-1.23.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f91d2c9b0965e86827a5ba01531d5b669770b01775b23199565d6c1f136616c"},
|
| 1493 |
+
{file = "onnxruntime-1.23.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:87d8b6eaf0fbeb6835a60a4265fde7a3b60157cf1b2764773ac47237b4d48612"},
|
| 1494 |
+
{file = "onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbfd2fca76c855317568c1b36a885ddea2272c13cb0e395002c402f2360429a6"},
|
| 1495 |
+
{file = "onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da44b99206e77734c5819aa2142c69e64f3b46edc3bd314f6a45a932defc0b3e"},
|
| 1496 |
+
{file = "onnxruntime-1.23.2-cp311-cp311-win_amd64.whl", hash = "sha256:902c756d8b633ce0dedd889b7c08459433fbcf35e9c38d1c03ddc020f0648c6e"},
|
| 1497 |
+
{file = "onnxruntime-1.23.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:b8f029a6b98d3cf5be564d52802bb50a8489ab73409fa9db0bf583eabb7c2321"},
|
| 1498 |
+
{file = "onnxruntime-1.23.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:218295a8acae83905f6f1aed8cacb8e3eb3bd7513a13fe4ba3b2664a19fc4a6b"},
|
| 1499 |
+
{file = "onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76ff670550dc23e58ea9bc53b5149b99a44e63b34b524f7b8547469aaa0dcb8c"},
|
| 1500 |
+
{file = "onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f9b4ae77f8e3c9bee50c27bc1beede83f786fe1d52e99ac85aa8d65a01e9b77"},
|
| 1501 |
+
{file = "onnxruntime-1.23.2-cp312-cp312-win_amd64.whl", hash = "sha256:25de5214923ce941a3523739d34a520aac30f21e631de53bba9174dc9c004435"},
|
| 1502 |
+
{file = "onnxruntime-1.23.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2ff531ad8496281b4297f32b83b01cdd719617e2351ffe0dba5684fb283afa1f"},
|
| 1503 |
+
{file = "onnxruntime-1.23.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:162f4ca894ec3de1a6fd53589e511e06ecdc3ff646849b62a9da7489dee9ce95"},
|
| 1504 |
+
{file = "onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45d127d6e1e9b99d1ebeae9bcd8f98617a812f53f46699eafeb976275744826b"},
|
| 1505 |
+
{file = "onnxruntime-1.23.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bace4e0d46480fbeeb7bbe1ffe1f080e6663a42d1086ff95c1551f2d39e7872"},
|
| 1506 |
+
{file = "onnxruntime-1.23.2-cp313-cp313-win_amd64.whl", hash = "sha256:1f9cc0a55349c584f083c1c076e611a7c35d5b867d5d6e6d6c823bf821978088"},
|
| 1507 |
+
{file = "onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466"},
|
| 1508 |
+
{file = "onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2b9233c4947907fd1818d0e581c049c41ccc39b2856cc942ff6d26317cee145"},
|
| 1509 |
+
]
|
| 1510 |
+
|
| 1511 |
+
[package.dependencies]
|
| 1512 |
+
coloredlogs = "*"
|
| 1513 |
+
flatbuffers = "*"
|
| 1514 |
+
numpy = ">=1.21.6"
|
| 1515 |
+
packaging = "*"
|
| 1516 |
+
protobuf = "*"
|
| 1517 |
+
sympy = "*"
|
| 1518 |
+
|
| 1519 |
[[package]]
|
| 1520 |
name = "packaging"
|
| 1521 |
version = "25.0"
|
|
|
|
| 1682 |
dev = ["pre-commit", "tox"]
|
| 1683 |
testing = ["coverage", "pytest", "pytest-benchmark"]
|
| 1684 |
|
| 1685 |
+
[[package]]
|
| 1686 |
+
name = "protobuf"
|
| 1687 |
+
version = "6.33.4"
|
| 1688 |
+
description = ""
|
| 1689 |
+
optional = false
|
| 1690 |
+
python-versions = ">=3.9"
|
| 1691 |
+
groups = ["main"]
|
| 1692 |
+
files = [
|
| 1693 |
+
{file = "protobuf-6.33.4-cp310-abi3-win32.whl", hash = "sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d"},
|
| 1694 |
+
{file = "protobuf-6.33.4-cp310-abi3-win_amd64.whl", hash = "sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc"},
|
| 1695 |
+
{file = "protobuf-6.33.4-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:2fe67f6c014c84f655ee06f6f66213f9254b3a8b6bda6cda0ccd4232c73c06f0"},
|
| 1696 |
+
{file = "protobuf-6.33.4-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:757c978f82e74d75cba88eddec479df9b99a42b31193313b75e492c06a51764e"},
|
| 1697 |
+
{file = "protobuf-6.33.4-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c7c64f259c618f0bef7bee042075e390debbf9682334be2b67408ec7c1c09ee6"},
|
| 1698 |
+
{file = "protobuf-6.33.4-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:3df850c2f8db9934de4cf8f9152f8dc2558f49f298f37f90c517e8e5c84c30e9"},
|
| 1699 |
+
{file = "protobuf-6.33.4-cp39-cp39-win32.whl", hash = "sha256:955478a89559fa4568f5a81dce77260eabc5c686f9e8366219ebd30debf06aa6"},
|
| 1700 |
+
{file = "protobuf-6.33.4-cp39-cp39-win_amd64.whl", hash = "sha256:0f12ddbf96912690c3582f9dffb55530ef32015ad8e678cd494312bd78314c4f"},
|
| 1701 |
+
{file = "protobuf-6.33.4-py3-none-any.whl", hash = "sha256:1fe3730068fcf2e595816a6c34fe66eeedd37d51d0400b72fabc848811fdc1bc"},
|
| 1702 |
+
{file = "protobuf-6.33.4.tar.gz", hash = "sha256:dc2e61bca3b10470c1912d166fe0af67bfc20eb55971dcef8dfa48ce14f0ed91"},
|
| 1703 |
+
]
|
| 1704 |
+
|
| 1705 |
[[package]]
|
| 1706 |
name = "pycodestyle"
|
| 1707 |
version = "2.11.1"
|
|
|
|
| 1884 |
{file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"},
|
| 1885 |
]
|
| 1886 |
|
| 1887 |
+
[[package]]
|
| 1888 |
+
name = "pyreadline3"
|
| 1889 |
+
version = "3.5.4"
|
| 1890 |
+
description = "A python implementation of GNU readline."
|
| 1891 |
+
optional = false
|
| 1892 |
+
python-versions = ">=3.8"
|
| 1893 |
+
groups = ["main"]
|
| 1894 |
+
markers = "sys_platform == \"win32\""
|
| 1895 |
+
files = [
|
| 1896 |
+
{file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"},
|
| 1897 |
+
{file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"},
|
| 1898 |
+
]
|
| 1899 |
+
|
| 1900 |
+
[package.extras]
|
| 1901 |
+
dev = ["build", "flake8", "mypy", "pytest", "twine"]
|
| 1902 |
+
|
| 1903 |
[[package]]
|
| 1904 |
name = "pytest"
|
| 1905 |
version = "7.4.4"
|
|
|
|
| 2405 |
optional = false
|
| 2406 |
python-versions = ">=3.9"
|
| 2407 |
groups = ["main"]
|
|
|
|
| 2408 |
files = [
|
| 2409 |
{file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
|
| 2410 |
{file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
|
|
|
|
| 3206 |
[metadata]
|
| 3207 |
lock-version = "2.1"
|
| 3208 |
python-versions = "^3.11"
|
| 3209 |
+
content-hash = "ebb16163670f970e4d506a8230fbb256edc8d5a749255bbbc9dc9cd17fc870eb"
|
pyproject.toml
CHANGED
|
@@ -19,6 +19,7 @@ slowapi = "^0.1.9"
|
|
| 19 |
cachetools = "^5.3.0"
|
| 20 |
sentence-transformers = "^2.2.2"
|
| 21 |
torch = "^2.0.0"
|
|
|
|
| 22 |
|
| 23 |
[tool.poetry.group.dev.dependencies]
|
| 24 |
pytest = "^7.4.3"
|
|
|
|
| 19 |
cachetools = "^5.3.0"
|
| 20 |
sentence-transformers = "^2.2.2"
|
| 21 |
torch = "^2.0.0"
|
| 22 |
+
faster-whisper = "^1.0.0"
|
| 23 |
|
| 24 |
[tool.poetry.group.dev.dependencies]
|
| 25 |
pytest = "^7.4.3"
|
tests/conftest.py
CHANGED
|
@@ -46,30 +46,6 @@ def temp_dir():
|
|
| 46 |
yield tmp_dir
|
| 47 |
|
| 48 |
|
| 49 |
-
@pytest.fixture
|
| 50 |
-
def sample_vtt_content():
|
| 51 |
-
"""Sample VTT subtitle content for testing."""
|
| 52 |
-
return """WEBVTT
|
| 53 |
-
Kind: captions
|
| 54 |
-
Language: en
|
| 55 |
-
|
| 56 |
-
00:00:00.000 --> 00:00:03.000
|
| 57 |
-
Never gonna give you up
|
| 58 |
-
|
| 59 |
-
00:00:03.000 --> 00:00:06.000
|
| 60 |
-
Never gonna let you down
|
| 61 |
-
|
| 62 |
-
00:00:06.000 --> 00:00:09.000
|
| 63 |
-
Never gonna run around and desert you
|
| 64 |
-
|
| 65 |
-
00:00:09.000 --> 00:00:12.000
|
| 66 |
-
Never gonna make you cry
|
| 67 |
-
|
| 68 |
-
00:00:12.000 --> 00:00:15.000
|
| 69 |
-
Never gonna say goodbye
|
| 70 |
-
"""
|
| 71 |
-
|
| 72 |
-
|
| 73 |
@pytest.fixture
|
| 74 |
def sample_youtube_url():
|
| 75 |
"""Sample YouTube URL for testing."""
|
|
|
|
| 46 |
yield tmp_dir
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
@pytest.fixture
|
| 50 |
def sample_youtube_url():
|
| 51 |
"""Sample YouTube URL for testing."""
|
tests/test_subtitles.py
CHANGED
|
@@ -1,19 +1,11 @@
|
|
| 1 |
"""Tests for subtitle extraction functionality."""
|
| 2 |
|
| 3 |
import pytest
|
| 4 |
-
from unittest.mock import
|
| 5 |
-
import asyncio
|
| 6 |
|
| 7 |
from app.apis.subtitles.service import SubtitleService, SUBTITLE_CACHE
|
| 8 |
-
from app.apis.subtitles.utils import
|
| 9 |
-
|
| 10 |
-
extract_video_id,
|
| 11 |
-
convert_vtt_to_text
|
| 12 |
-
)
|
| 13 |
-
from app.core.exceptions import (
|
| 14 |
-
SubtitlesNotFoundError,
|
| 15 |
-
DownloadTimeoutError
|
| 16 |
-
)
|
| 17 |
|
| 18 |
|
| 19 |
class TestSubtitleUtils:
|
|
@@ -34,38 +26,6 @@ class TestSubtitleUtils:
|
|
| 34 |
url = "https://www.youtube.com/embed/dQw4w9WgXcQ"
|
| 35 |
assert extract_video_id(url) == "dQw4w9WgXcQ"
|
| 36 |
|
| 37 |
-
def test_clean_subtitle_text(self):
|
| 38 |
-
"""Test cleaning subtitle text."""
|
| 39 |
-
raw_lines = [
|
| 40 |
-
"WEBVTT",
|
| 41 |
-
"",
|
| 42 |
-
"1",
|
| 43 |
-
"00:00:00.000 --> 00:00:03.000",
|
| 44 |
-
"Never gonna give you up",
|
| 45 |
-
"",
|
| 46 |
-
"2",
|
| 47 |
-
"00:00:03.000 --> 00:00:06.000",
|
| 48 |
-
"Never gonna let you down",
|
| 49 |
-
"Never gonna give you up",
|
| 50 |
-
""
|
| 51 |
-
]
|
| 52 |
-
|
| 53 |
-
cleaned = clean_subtitle_text(raw_lines)
|
| 54 |
-
assert "Never gonna give you up" in cleaned
|
| 55 |
-
assert "Never gonna let you down" in cleaned
|
| 56 |
-
assert "WEBVTT" not in cleaned
|
| 57 |
-
assert "00:00:00.000 --> 00:00:03.000" not in cleaned
|
| 58 |
-
assert len([line for line in cleaned if line == "Never gonna give you up"]) == 1
|
| 59 |
-
|
| 60 |
-
def test_convert_vtt_to_text(self, sample_vtt_content):
|
| 61 |
-
"""Test converting VTT content to clean text."""
|
| 62 |
-
result = convert_vtt_to_text(sample_vtt_content)
|
| 63 |
-
|
| 64 |
-
assert "Never gonna give you up" in result
|
| 65 |
-
assert "Never gonna let you down" in result
|
| 66 |
-
assert "WEBVTT" not in result
|
| 67 |
-
assert "00:00:00.000 --> 00:00:03.000" not in result
|
| 68 |
-
|
| 69 |
|
| 70 |
class TestSubtitleService:
|
| 71 |
"""Test subtitle extraction service."""
|
|
@@ -81,33 +41,28 @@ class TestSubtitleService:
|
|
| 81 |
return SubtitleService()
|
| 82 |
|
| 83 |
@pytest.mark.asyncio
|
| 84 |
-
async def test_extract_subtitles_success(self, service, sample_youtube_url
|
| 85 |
"""Test successful subtitle extraction."""
|
| 86 |
-
with patch.object(service, '
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
video_id, subtitles = await service.extract_subtitles(sample_youtube_url, "en")
|
| 90 |
|
| 91 |
assert video_id == "dQw4w9WgXcQ"
|
| 92 |
-
assert len(subtitles)
|
| 93 |
-
assert "
|
| 94 |
-
|
| 95 |
-
@pytest.mark.asyncio
|
| 96 |
-
async def test_extract_subtitles_not_found(self, service, sample_youtube_url):
|
| 97 |
-
"""Test subtitle extraction when no subtitles are found."""
|
| 98 |
-
with patch.object(service, '_download_subtitles') as mock_download, \
|
| 99 |
-
patch.object(service, '_try_alternative_languages') as mock_alt:
|
| 100 |
-
mock_download.return_value = None
|
| 101 |
-
mock_alt.return_value = None
|
| 102 |
-
|
| 103 |
-
with pytest.raises(SubtitlesNotFoundError):
|
| 104 |
-
await service.extract_subtitles(sample_youtube_url, "en")
|
| 105 |
|
| 106 |
@pytest.mark.asyncio
|
| 107 |
-
async def test_extract_subtitles_uses_cache(self, service, sample_youtube_url
|
| 108 |
"""Test that cached results are returned."""
|
| 109 |
-
with patch.object(service, '
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
result1 = await service.extract_subtitles(sample_youtube_url, "en")
|
| 113 |
result2 = await service.extract_subtitles(sample_youtube_url, "en")
|
|
@@ -115,14 +70,26 @@ class TestSubtitleService:
|
|
| 115 |
assert result1 == result2
|
| 116 |
assert mock_download.call_count == 1
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
class TestSubtitleAPI:
|
| 120 |
"""Test subtitle API endpoints."""
|
| 121 |
|
| 122 |
-
def test_extract_subtitles_endpoint_success(self, client, api_key
|
| 123 |
"""Test successful subtitle extraction via API."""
|
| 124 |
with patch('app.apis.subtitles.service.subtitle_service.extract_subtitles') as mock_extract:
|
| 125 |
-
mock_extract.return_value = ("dQw4w9WgXcQ", ["Never gonna give you up"
|
| 126 |
|
| 127 |
response = client.post(
|
| 128 |
"/api/v1/subtitles/extract",
|
|
@@ -134,7 +101,6 @@ class TestSubtitleAPI:
|
|
| 134 |
data = response.json()
|
| 135 |
assert data["status"] == "success"
|
| 136 |
assert data["video_id"] == "dQw4w9WgXcQ"
|
| 137 |
-
assert "Never gonna give you up" in data["subtitles"]
|
| 138 |
|
| 139 |
def test_extract_subtitles_endpoint_invalid_api_key(self, client, invalid_api_key):
|
| 140 |
"""Test API endpoint with invalid API key."""
|
|
@@ -143,7 +109,6 @@ class TestSubtitleAPI:
|
|
| 143 |
json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
|
| 144 |
headers={"x-api-key": invalid_api_key}
|
| 145 |
)
|
| 146 |
-
|
| 147 |
assert response.status_code == 401
|
| 148 |
|
| 149 |
def test_extract_subtitles_endpoint_missing_api_key(self, client):
|
|
@@ -152,7 +117,6 @@ class TestSubtitleAPI:
|
|
| 152 |
"/api/v1/subtitles/extract",
|
| 153 |
json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"}
|
| 154 |
)
|
| 155 |
-
|
| 156 |
assert response.status_code == 401
|
| 157 |
|
| 158 |
def test_extract_subtitles_endpoint_invalid_url(self, client, api_key):
|
|
@@ -162,13 +126,11 @@ class TestSubtitleAPI:
|
|
| 162 |
json={"url": "https://example.com/not-youtube", "lang": "en"},
|
| 163 |
headers={"x-api-key": api_key}
|
| 164 |
)
|
| 165 |
-
|
| 166 |
assert response.status_code == 422
|
| 167 |
|
| 168 |
def test_subtitles_health_endpoint(self, client):
|
| 169 |
"""Test subtitles health check endpoint."""
|
| 170 |
response = client.get("/api/v1/subtitles/health")
|
| 171 |
-
|
| 172 |
assert response.status_code == 200
|
| 173 |
data = response.json()
|
| 174 |
assert data["status"] == "healthy"
|
|
|
|
| 1 |
"""Tests for subtitle extraction functionality."""
|
| 2 |
|
| 3 |
import pytest
|
| 4 |
+
from unittest.mock import patch, MagicMock
|
|
|
|
| 5 |
|
| 6 |
from app.apis.subtitles.service import SubtitleService, SUBTITLE_CACHE
|
| 7 |
+
from app.apis.subtitles.utils import extract_video_id
|
| 8 |
+
from app.core.exceptions import SubtitlesNotFoundError
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
class TestSubtitleUtils:
|
|
|
|
| 26 |
url = "https://www.youtube.com/embed/dQw4w9WgXcQ"
|
| 27 |
assert extract_video_id(url) == "dQw4w9WgXcQ"
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
class TestSubtitleService:
|
| 31 |
"""Test subtitle extraction service."""
|
|
|
|
| 41 |
return SubtitleService()
|
| 42 |
|
| 43 |
@pytest.mark.asyncio
|
| 44 |
+
async def test_extract_subtitles_success(self, service, sample_youtube_url):
|
| 45 |
"""Test successful subtitle extraction."""
|
| 46 |
+
with patch.object(service, '_download_audio') as mock_download, \
|
| 47 |
+
patch.object(service, '_transcribe_audio') as mock_transcribe:
|
| 48 |
+
mock_download.return_value = MagicMock()
|
| 49 |
+
mock_download.return_value.exists.return_value = True
|
| 50 |
+
mock_transcribe.return_value = ["Test subtitle line 1", "Test subtitle line 2"]
|
| 51 |
|
| 52 |
video_id, subtitles = await service.extract_subtitles(sample_youtube_url, "en")
|
| 53 |
|
| 54 |
assert video_id == "dQw4w9WgXcQ"
|
| 55 |
+
assert len(subtitles) == 2
|
| 56 |
+
assert "Test subtitle line 1" in subtitles
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
@pytest.mark.asyncio
|
| 59 |
+
async def test_extract_subtitles_uses_cache(self, service, sample_youtube_url):
|
| 60 |
"""Test that cached results are returned."""
|
| 61 |
+
with patch.object(service, '_download_audio') as mock_download, \
|
| 62 |
+
patch.object(service, '_transcribe_audio') as mock_transcribe:
|
| 63 |
+
mock_download.return_value = MagicMock()
|
| 64 |
+
mock_download.return_value.exists.return_value = True
|
| 65 |
+
mock_transcribe.return_value = ["Cached subtitle"]
|
| 66 |
|
| 67 |
result1 = await service.extract_subtitles(sample_youtube_url, "en")
|
| 68 |
result2 = await service.extract_subtitles(sample_youtube_url, "en")
|
|
|
|
| 70 |
assert result1 == result2
|
| 71 |
assert mock_download.call_count == 1
|
| 72 |
|
| 73 |
+
@pytest.mark.asyncio
|
| 74 |
+
async def test_extract_subtitles_empty_transcription(self, service, sample_youtube_url):
|
| 75 |
+
"""Test error when transcription produces no text."""
|
| 76 |
+
with patch.object(service, '_download_audio') as mock_download, \
|
| 77 |
+
patch.object(service, '_transcribe_audio') as mock_transcribe:
|
| 78 |
+
mock_download.return_value = MagicMock()
|
| 79 |
+
mock_download.return_value.exists.return_value = True
|
| 80 |
+
mock_transcribe.return_value = []
|
| 81 |
+
|
| 82 |
+
with pytest.raises(SubtitlesNotFoundError):
|
| 83 |
+
await service.extract_subtitles(sample_youtube_url, "en")
|
| 84 |
+
|
| 85 |
|
| 86 |
class TestSubtitleAPI:
|
| 87 |
"""Test subtitle API endpoints."""
|
| 88 |
|
| 89 |
+
def test_extract_subtitles_endpoint_success(self, client, api_key):
|
| 90 |
"""Test successful subtitle extraction via API."""
|
| 91 |
with patch('app.apis.subtitles.service.subtitle_service.extract_subtitles') as mock_extract:
|
| 92 |
+
mock_extract.return_value = ("dQw4w9WgXcQ", ["Never gonna give you up"])
|
| 93 |
|
| 94 |
response = client.post(
|
| 95 |
"/api/v1/subtitles/extract",
|
|
|
|
| 101 |
data = response.json()
|
| 102 |
assert data["status"] == "success"
|
| 103 |
assert data["video_id"] == "dQw4w9WgXcQ"
|
|
|
|
| 104 |
|
| 105 |
def test_extract_subtitles_endpoint_invalid_api_key(self, client, invalid_api_key):
|
| 106 |
"""Test API endpoint with invalid API key."""
|
|
|
|
| 109 |
json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
|
| 110 |
headers={"x-api-key": invalid_api_key}
|
| 111 |
)
|
|
|
|
| 112 |
assert response.status_code == 401
|
| 113 |
|
| 114 |
def test_extract_subtitles_endpoint_missing_api_key(self, client):
|
|
|
|
| 117 |
"/api/v1/subtitles/extract",
|
| 118 |
json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"}
|
| 119 |
)
|
|
|
|
| 120 |
assert response.status_code == 401
|
| 121 |
|
| 122 |
def test_extract_subtitles_endpoint_invalid_url(self, client, api_key):
|
|
|
|
| 126 |
json={"url": "https://example.com/not-youtube", "lang": "en"},
|
| 127 |
headers={"x-api-key": api_key}
|
| 128 |
)
|
|
|
|
| 129 |
assert response.status_code == 422
|
| 130 |
|
| 131 |
def test_subtitles_health_endpoint(self, client):
|
| 132 |
"""Test subtitles health check endpoint."""
|
| 133 |
response = client.get("/api/v1/subtitles/health")
|
|
|
|
| 134 |
assert response.status_code == 200
|
| 135 |
data = response.json()
|
| 136 |
assert data["status"] == "healthy"
|