sathishkumarbsk commited on
Commit
30c1053
·
1 Parent(s): 229c8e9

Initial transcription service

Browse files
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Transcription Service Dockerfile
2
+ # Uses official OpenAI Whisper for transcription
3
+
4
+ FROM python:3.11-slim
5
+
6
+ WORKDIR /app
7
+
8
+ # Install system dependencies
9
+ # - ffmpeg: required for audio processing
10
+ # - git: needed by some pip packages
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ ffmpeg \
13
+ git \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
+ # Install Python dependencies
17
+ COPY requirements.txt .
18
+ RUN pip install --no-cache-dir -r requirements.txt
19
+
20
+ # Create temp directory
21
+ RUN mkdir -p /tmp/transcription
22
+
23
+ # Copy application code
24
+ COPY app/ ./app/
25
+
26
+ # Environment variables with defaults
27
+ ENV HOST=0.0.0.0
28
+ ENV PORT=8000
29
+ ENV ASR_MODEL_SIZE=base
30
+ ENV ASR_DEVICE=cpu
31
+ ENV MAX_FILE_SIZE_MB=200
32
+ ENV DOWNLOAD_TIMEOUT=300
33
+ ENV YOUTUBE_TIMEOUT=600
34
+ ENV ASR_TIMEOUT=1800
35
+ ENV LOG_LEVEL=INFO
36
+ ENV TEMP_DIR_BASE=/tmp/transcription
37
+
38
+ # Expose port
39
+ EXPOSE 8000
40
+
41
+ # Health check
42
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
43
+ CMD python -c "import httpx; httpx.get('http://localhost:8000/health').raise_for_status()"
44
+
45
+ # Run with uvicorn
46
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
app/.DS_Store ADDED
Binary file (6.15 kB). View file
 
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Transcription Service App
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (182 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (19.3 kB). View file
 
app/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Core module
app/core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes). View file
 
app/core/__pycache__/config.cpython-311.pyc ADDED
Binary file (2.69 kB). View file
 
app/core/__pycache__/logging.cpython-311.pyc ADDED
Binary file (3.31 kB). View file
 
app/core/__pycache__/security.cpython-311.pyc ADDED
Binary file (5.35 kB). View file
 
app/core/config.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration management via environment variables.
3
+ """
4
+ import os
5
+ from typing import List
6
+
7
+
8
+ class Settings:
9
+ """Application settings loaded from environment variables."""
10
+
11
+ # Server
12
+ HOST: str = os.getenv("HOST", "0.0.0.0")
13
+ PORT: int = int(os.getenv("PORT", "8000"))
14
+
15
+ # ASR Model (OpenAI Whisper)
16
+ # Options: tiny, base, small, medium, large, large-v2, large-v3
17
+ ASR_MODEL_SIZE: str = os.getenv("ASR_MODEL_SIZE", "base")
18
+ ASR_DEVICE: str = os.getenv("ASR_DEVICE", "cpu") # cpu or cuda
19
+
20
+ # File limits
21
+ MAX_FILE_SIZE_MB: int = int(os.getenv("MAX_FILE_SIZE_MB", "200"))
22
+ MAX_FILE_SIZE_BYTES: int = MAX_FILE_SIZE_MB * 1024 * 1024
23
+
24
+ # Timeouts (seconds)
25
+ DOWNLOAD_TIMEOUT: int = int(os.getenv("DOWNLOAD_TIMEOUT", "300"))
26
+ YOUTUBE_TIMEOUT: int = int(os.getenv("YOUTUBE_TIMEOUT", "600"))
27
+ ASR_TIMEOUT: int = int(os.getenv("ASR_TIMEOUT", "1800")) # 30 min for long files
28
+
29
+ # YouTube allowlist
30
+ YOUTUBE_ALLOWED_DOMAINS: List[str] = os.getenv(
31
+ "YOUTUBE_ALLOWED_DOMAINS",
32
+ "youtube.com,youtu.be,www.youtube.com,m.youtube.com"
33
+ ).split(",")
34
+
35
+ # YouTube cookies - set to browser name (chrome, firefox, safari, edge) to use browser cookies
36
+ # This helps avoid 403 errors by using your logged-in session
37
+ YOUTUBE_COOKIES_FROM_BROWSER: str = os.getenv("YOUTUBE_COOKIES_FROM_BROWSER", "")
38
+
39
+ # Security
40
+ ALLOWED_MEDIA_EXTENSIONS: List[str] = [".mp3", ".mp4", ".m4a", ".wav", ".webm", ".ogg", ".flac"]
41
+ ALLOWED_CONTENT_TYPES: List[str] = [
42
+ "audio/mpeg", "audio/mp4", "audio/x-m4a", "audio/wav", "audio/webm",
43
+ "audio/ogg", "audio/flac", "video/mp4", "video/webm",
44
+ "application/octet-stream" # fallback for some servers
45
+ ]
46
+
47
+ # Temp directory base
48
+ TEMP_DIR_BASE: str = os.getenv("TEMP_DIR_BASE", "/tmp/transcription")
49
+
50
+ # Logging
51
+ LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
52
+
53
+
54
+ settings = Settings()
app/core/logging.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Logging configuration for the application.
3
+ """
4
+ import logging
5
+ import sys
6
+ from app.core.config import settings
7
+
8
+
9
+ class RequestIdFilter(logging.Filter):
10
+ """Filter that adds request_id to log records if not present."""
11
+
12
+ def filter(self, record):
13
+ if not hasattr(record, 'request_id'):
14
+ record.request_id = 'no-request'
15
+ return True
16
+
17
+
18
+ def setup_logging() -> logging.Logger:
19
+ """Configure and return the application logger."""
20
+ logger = logging.getLogger("transcription")
21
+ logger.setLevel(getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO))
22
+
23
+ if not logger.handlers:
24
+ handler = logging.StreamHandler(sys.stdout)
25
+ handler.setLevel(logging.DEBUG)
26
+
27
+ # Add filter to ensure request_id is always present
28
+ handler.addFilter(RequestIdFilter())
29
+
30
+ formatter = logging.Formatter(
31
+ "%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s",
32
+ datefmt="%Y-%m-%d %H:%M:%S"
33
+ )
34
+ handler.setFormatter(formatter)
35
+ logger.addHandler(handler)
36
+
37
+ return logger
38
+
39
+
40
+ class RequestLoggerAdapter(logging.LoggerAdapter):
41
+ """Logger adapter that includes request_id in all log messages."""
42
+
43
+ def process(self, msg, kwargs):
44
+ kwargs.setdefault("extra", {})
45
+ kwargs["extra"]["request_id"] = self.extra.get("request_id", "no-request")
46
+ return msg, kwargs
47
+
48
+
49
+ def get_request_logger(request_id: str) -> RequestLoggerAdapter:
50
+ """Get a logger adapter with request_id context."""
51
+ logger = logging.getLogger("transcription")
52
+ return RequestLoggerAdapter(logger, {"request_id": request_id})
53
+
54
+
55
+ # Initialize logger at module load
56
+ app_logger = setup_logging()
app/core/security.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Security utilities: SSRF protection, URL validation, etc.
3
+ """
4
+ import ipaddress
5
+ import socket
6
+ from urllib.parse import urlparse
7
+ from typing import Tuple, Optional
8
+ from app.core.config import settings
9
+
10
+
11
+ class SecurityError(Exception):
12
+ """Raised when a security check fails."""
13
+ pass
14
+
15
+
16
+ def is_private_ip(ip: str) -> bool:
17
+ """Check if an IP address is private, loopback, or link-local."""
18
+ try:
19
+ ip_obj = ipaddress.ip_address(ip)
20
+ return (
21
+ ip_obj.is_private or
22
+ ip_obj.is_loopback or
23
+ ip_obj.is_link_local or
24
+ ip_obj.is_multicast or
25
+ ip_obj.is_reserved or
26
+ ip_obj.is_unspecified
27
+ )
28
+ except ValueError:
29
+ return True # Invalid IP, treat as unsafe
30
+
31
+
32
+ def resolve_and_validate_url(url: str) -> Tuple[str, str]:
33
+ """
34
+ Resolve URL hostname and validate it's not pointing to private/internal IPs.
35
+ Returns (hostname, resolved_ip) if safe.
36
+ Raises SecurityError if URL is unsafe.
37
+ """
38
+ parsed = urlparse(url)
39
+
40
+ # Only allow http/https
41
+ if parsed.scheme not in ("http", "https"):
42
+ raise SecurityError(f"Invalid URL scheme: {parsed.scheme}. Only http/https allowed.")
43
+
44
+ hostname = parsed.hostname
45
+ if not hostname:
46
+ raise SecurityError("Invalid URL: no hostname found.")
47
+
48
+ # Check for IP address directly in URL
49
+ try:
50
+ ip_obj = ipaddress.ip_address(hostname)
51
+ if is_private_ip(str(ip_obj)):
52
+ raise SecurityError(f"Direct IP addresses to private networks are not allowed: {hostname}")
53
+ return hostname, str(ip_obj)
54
+ except ValueError:
55
+ pass # Not an IP, it's a hostname - continue with DNS resolution
56
+
57
+ # Resolve hostname to IP
58
+ try:
59
+ resolved_ip = socket.gethostbyname(hostname)
60
+ except socket.gaierror as e:
61
+ raise SecurityError(f"Failed to resolve hostname: {hostname}") from e
62
+
63
+ # Check resolved IP
64
+ if is_private_ip(resolved_ip):
65
+ raise SecurityError(
66
+ f"URL resolves to private/internal IP: {hostname} -> {resolved_ip}"
67
+ )
68
+
69
+ return hostname, resolved_ip
70
+
71
+
72
+ def validate_youtube_url(url: str) -> bool:
73
+ """
74
+ Validate that a YouTube URL is from an allowed domain.
75
+ """
76
+ parsed = urlparse(url)
77
+ hostname = parsed.hostname
78
+
79
+ if not hostname:
80
+ return False
81
+
82
+ # Remove www. prefix for comparison
83
+ hostname_clean = hostname.lower()
84
+ if hostname_clean.startswith("www."):
85
+ hostname_clean = hostname_clean[4:]
86
+
87
+ # Check against allowlist
88
+ allowed = [d.lower().replace("www.", "") for d in settings.YOUTUBE_ALLOWED_DOMAINS]
89
+ return hostname_clean in allowed or hostname in settings.YOUTUBE_ALLOWED_DOMAINS
90
+
91
+
92
+ def validate_file_extension(filename: str) -> bool:
93
+ """Validate file has an allowed extension."""
94
+ if not filename:
95
+ return False
96
+ lower = filename.lower()
97
+ return any(lower.endswith(ext) for ext in settings.ALLOWED_MEDIA_EXTENSIONS)
98
+
99
+
100
+ def validate_content_type(content_type: Optional[str]) -> bool:
101
+ """Validate content type is allowed."""
102
+ if not content_type:
103
+ return True # Some servers don't send content-type
104
+ # Extract main type (ignore charset etc.)
105
+ main_type = content_type.split(";")[0].strip().lower()
106
+ return main_type in settings.ALLOWED_CONTENT_TYPES
app/main.py ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Transcription Service API
3
+ FastAPI application for audio/video transcription.
4
+ """
5
+ import uuid
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from fastapi import FastAPI, File, UploadFile, Request, HTTPException
10
+ from fastapi.responses import HTMLResponse, PlainTextResponse, JSONResponse
11
+ from pydantic import BaseModel
12
+
13
+ from app.core.config import settings
14
+ from app.core.logging import setup_logging, get_request_logger
15
+ from app.services.cleanup import cleanup_temp_dir
16
+ from app.services.ingest import (
17
+ create_request_temp_dir,
18
+ ingest_upload,
19
+ ingest_media_url,
20
+ ingest_youtube,
21
+ IngestError,
22
+ )
23
+ from app.services.ffmpeg import normalize_audio, FFmpegError
24
+ from app.services.asr import transcribe_audio, ASRError
25
+
26
+ # Initialize logging
27
+ setup_logging()
28
+
29
+ # Create FastAPI app
30
+ app = FastAPI(
31
+ title="Transcription Service",
32
+ description="Audio/Video transcription API using Whisper",
33
+ version="1.0.0",
34
+ )
35
+
36
+
37
+ # Request models
38
+ class TranscribeRequest(BaseModel):
39
+ youtube_url: Optional[str] = None
40
+ media_url: Optional[str] = None
41
+
42
+
43
+ # HTML Test Page (inline for simplicity)
44
+ HTML_PAGE = """
45
+ <!DOCTYPE html>
46
+ <html lang="en">
47
+ <head>
48
+ <meta charset="UTF-8">
49
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
50
+ <title>Transcription Service</title>
51
+ <style>
52
+ * { box-sizing: border-box; }
53
+ body {
54
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
55
+ max-width: 900px;
56
+ margin: 0 auto;
57
+ padding: 20px;
58
+ background: #f5f5f5;
59
+ }
60
+ h1 { color: #333; }
61
+ .section {
62
+ background: white;
63
+ padding: 20px;
64
+ border-radius: 8px;
65
+ margin-bottom: 20px;
66
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
67
+ }
68
+ .section h2 {
69
+ margin-top: 0;
70
+ color: #555;
71
+ border-bottom: 1px solid #eee;
72
+ padding-bottom: 10px;
73
+ }
74
+ label {
75
+ display: block;
76
+ margin-bottom: 5px;
77
+ font-weight: 600;
78
+ color: #444;
79
+ }
80
+ input[type="text"], input[type="url"], input[type="file"] {
81
+ width: 100%;
82
+ padding: 10px;
83
+ margin-bottom: 15px;
84
+ border: 1px solid #ddd;
85
+ border-radius: 4px;
86
+ font-size: 14px;
87
+ }
88
+ button {
89
+ background: #007bff;
90
+ color: white;
91
+ border: none;
92
+ padding: 12px 24px;
93
+ border-radius: 4px;
94
+ cursor: pointer;
95
+ font-size: 14px;
96
+ margin-right: 10px;
97
+ margin-bottom: 10px;
98
+ }
99
+ button:hover { background: #0056b3; }
100
+ button:disabled { background: #ccc; cursor: not-allowed; }
101
+ button.secondary {
102
+ background: #6c757d;
103
+ }
104
+ button.secondary:hover { background: #545b62; }
105
+ button.success {
106
+ background: #28a745;
107
+ }
108
+ button.success:hover { background: #1e7e34; }
109
+ #result {
110
+ margin-top: 20px;
111
+ padding: 15px;
112
+ background: #f8f9fa;
113
+ border: 1px solid #dee2e6;
114
+ border-radius: 4px;
115
+ white-space: pre-wrap;
116
+ word-break: break-word;
117
+ font-family: monospace;
118
+ font-size: 13px;
119
+ max-height: 400px;
120
+ overflow-y: auto;
121
+ }
122
+ .status {
123
+ padding: 8px 12px;
124
+ border-radius: 4px;
125
+ margin-bottom: 10px;
126
+ font-weight: 600;
127
+ }
128
+ .status.loading { background: #fff3cd; color: #856404; }
129
+ .status.success { background: #d4edda; color: #155724; }
130
+ .status.error { background: #f8d7da; color: #721c24; }
131
+ .request-id { font-size: 12px; color: #666; margin-top: 5px; }
132
+ .tabs {
133
+ display: flex;
134
+ border-bottom: 2px solid #dee2e6;
135
+ margin-bottom: 15px;
136
+ }
137
+ .tab {
138
+ padding: 10px 20px;
139
+ cursor: pointer;
140
+ border: none;
141
+ background: none;
142
+ font-size: 14px;
143
+ color: #666;
144
+ margin: 0;
145
+ }
146
+ .tab.active {
147
+ color: #007bff;
148
+ border-bottom: 2px solid #007bff;
149
+ margin-bottom: -2px;
150
+ }
151
+ .tab-content { display: none; }
152
+ .tab-content.active { display: block; }
153
+ </style>
154
+ </head>
155
+ <body>
156
+ <h1>🎙️ Transcription Service</h1>
157
+ <p>Convert audio/video to text using AI. <a href="/docs" target="_blank">API Docs</a></p>
158
+
159
+ <div class="section">
160
+ <h2>Transcribe</h2>
161
+ <div class="tabs">
162
+ <button class="tab active" onclick="showTab('upload')">File Upload</button>
163
+ <button class="tab" onclick="showTab('media')">Media URL</button>
164
+ <button class="tab" onclick="showTab('youtube')">YouTube</button>
165
+ </div>
166
+
167
+ <div id="tab-upload" class="tab-content active">
168
+ <label for="file">Select audio/video file:</label>
169
+ <input type="file" id="file" accept=".mp3,.mp4,.m4a,.wav,.webm,.ogg,.flac">
170
+ <button onclick="submitFile()">Submit</button>
171
+ </div>
172
+
173
+ <div id="tab-media" class="tab-content">
174
+ <label for="media_url">Direct URL to audio/video:</label>
175
+ <input type="url" id="media_url" placeholder="https://example.com/audio.mp3">
176
+ <button onclick="submitMediaUrl()">Submit</button>
177
+ </div>
178
+
179
+ <div id="tab-youtube" class="tab-content">
180
+ <label for="youtube_url">YouTube URL:</label>
181
+ <input type="url" id="youtube_url" placeholder="https://www.youtube.com/watch?v=...">
182
+ <button onclick="submitYoutube()">Submit</button>
183
+ </div>
184
+ </div>
185
+
186
+ <div class="section">
187
+ <h2>Concurrency Test</h2>
188
+ <p>Test concurrent request handling:</p>
189
+ <button class="success" onclick="runParallelRequests()">Run 2 Parallel Requests</button>
190
+ <button class="secondary" onclick="checkHealth()">Health Check</button>
191
+ </div>
192
+
193
+ <div class="section">
194
+ <h2>Result</h2>
195
+ <div id="status" class="status" style="display:none;"></div>
196
+ <div id="result">Results will appear here...</div>
197
+ </div>
198
+
199
+ <script>
200
+ function showTab(tabName) {
201
+ document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
202
+ document.querySelectorAll('.tab-content').forEach(t => t.classList.remove('active'));
203
+ document.querySelector(`[onclick="showTab('${tabName}')"]`).classList.add('active');
204
+ document.getElementById(`tab-${tabName}`).classList.add('active');
205
+ }
206
+
207
+ function setStatus(type, message) {
208
+ const status = document.getElementById('status');
209
+ status.style.display = 'block';
210
+ status.className = `status ${type}`;
211
+ status.textContent = message;
212
+ }
213
+
214
+ function setResult(text) {
215
+ document.getElementById('result').textContent = text;
216
+ }
217
+
218
+ async function submitFile() {
219
+ const file = document.getElementById('file').files[0];
220
+ if (!file) {
221
+ setStatus('error', 'Please select a file');
222
+ return;
223
+ }
224
+
225
+ setStatus('loading', 'Uploading and processing...');
226
+ const formData = new FormData();
227
+ formData.append('file', file);
228
+
229
+ try {
230
+ const response = await fetch('/transcribe', {
231
+ method: 'POST',
232
+ body: formData
233
+ });
234
+ const requestId = response.headers.get('X-Request-ID') || 'N/A';
235
+ const text = await response.text();
236
+
237
+ if (response.ok) {
238
+ setStatus('success', `Success! Request ID: ${requestId}`);
239
+ setResult(text);
240
+ } else {
241
+ setStatus('error', `Error (${response.status}). Request ID: ${requestId}`);
242
+ setResult(text);
243
+ }
244
+ } catch (e) {
245
+ setStatus('error', `Network error: ${e.message}`);
246
+ setResult(e.toString());
247
+ }
248
+ }
249
+
250
+ async function submitMediaUrl() {
251
+ const url = document.getElementById('media_url').value.trim();
252
+ if (!url) {
253
+ setStatus('error', 'Please enter a URL');
254
+ return;
255
+ }
256
+
257
+ setStatus('loading', 'Downloading and processing...');
258
+
259
+ try {
260
+ const response = await fetch('/transcribe', {
261
+ method: 'POST',
262
+ headers: { 'Content-Type': 'application/json' },
263
+ body: JSON.stringify({ media_url: url })
264
+ });
265
+ const requestId = response.headers.get('X-Request-ID') || 'N/A';
266
+ const text = await response.text();
267
+
268
+ if (response.ok) {
269
+ setStatus('success', `Success! Request ID: ${requestId}`);
270
+ setResult(text);
271
+ } else {
272
+ setStatus('error', `Error (${response.status}). Request ID: ${requestId}`);
273
+ setResult(text);
274
+ }
275
+ } catch (e) {
276
+ setStatus('error', `Network error: ${e.message}`);
277
+ setResult(e.toString());
278
+ }
279
+ }
280
+
281
+ async function submitYoutube() {
282
+ const url = document.getElementById('youtube_url').value.trim();
283
+ if (!url) {
284
+ setStatus('error', 'Please enter a YouTube URL');
285
+ return;
286
+ }
287
+
288
+ setStatus('loading', 'Downloading YouTube audio and processing...');
289
+
290
+ try {
291
+ const response = await fetch('/transcribe', {
292
+ method: 'POST',
293
+ headers: { 'Content-Type': 'application/json' },
294
+ body: JSON.stringify({ youtube_url: url })
295
+ });
296
+ const requestId = response.headers.get('X-Request-ID') || 'N/A';
297
+ const text = await response.text();
298
+
299
+ if (response.ok) {
300
+ setStatus('success', `Success! Request ID: ${requestId}`);
301
+ setResult(text);
302
+ } else {
303
+ setStatus('error', `Error (${response.status}). Request ID: ${requestId}`);
304
+ setResult(text);
305
+ }
306
+ } catch (e) {
307
+ setStatus('error', `Network error: ${e.message}`);
308
+ setResult(e.toString());
309
+ }
310
+ }
311
+
312
+ async function runParallelRequests() {
313
+ setStatus('loading', 'Running 2 parallel requests...');
314
+
315
+ // Create two simple JSON requests
316
+ const makeRequest = async (id) => {
317
+ const start = Date.now();
318
+ try {
319
+ const response = await fetch('/transcribe', {
320
+ method: 'POST',
321
+ headers: { 'Content-Type': 'application/json' },
322
+ body: JSON.stringify({}) // Empty request to trigger validation error - quick test
323
+ });
324
+ const requestId = response.headers.get('X-Request-ID') || 'N/A';
325
+ const text = await response.text();
326
+ const elapsed = Date.now() - start;
327
+ return `Request ${id}: Status ${response.status}, Request-ID: ${requestId}, Time: ${elapsed}ms\\n${text}`;
328
+ } catch (e) {
329
+ return `Request ${id}: Error - ${e.message}`;
330
+ }
331
+ };
332
+
333
+ try {
334
+ const results = await Promise.all([
335
+ makeRequest(1),
336
+ makeRequest(2)
337
+ ]);
338
+
339
+ setStatus('success', 'Parallel requests completed!');
340
+ setResult(results.join('\\n\\n---\\n\\n'));
341
+ } catch (e) {
342
+ setStatus('error', `Error: ${e.message}`);
343
+ setResult(e.toString());
344
+ }
345
+ }
346
+
347
+ async function checkHealth() {
348
+ setStatus('loading', 'Checking health...');
349
+
350
+ try {
351
+ const response = await fetch('/health');
352
+ const data = await response.json();
353
+ setStatus('success', 'Health check passed!');
354
+ setResult(JSON.stringify(data, null, 2));
355
+ } catch (e) {
356
+ setStatus('error', `Health check failed: ${e.message}`);
357
+ setResult(e.toString());
358
+ }
359
+ }
360
+ </script>
361
+ </body>
362
+ </html>
363
+ """
364
+
365
+
366
+ @app.get("/", response_class=HTMLResponse)
367
+ async def home():
368
+ """Serve the HTML test page."""
369
+ return HTML_PAGE
370
+
371
+
372
+ @app.get("/health")
373
+ async def health():
374
+ """Health check endpoint."""
375
+ return {"status": "ok"}
376
+
377
+
378
+ @app.post("/transcribe")
379
+ async def transcribe(
380
+ request: Request,
381
+ file: Optional[UploadFile] = File(None),
382
+ ):
383
+ """
384
+ Transcribe audio/video to plain text.
385
+
386
+ Accepts:
387
+ - JSON body with youtube_url or media_url
388
+ - Multipart form with file upload
389
+
390
+ Returns plain text transcription.
391
+ """
392
+ # Generate unique request ID for concurrency safety
393
+ request_id = str(uuid.uuid4())
394
+ logger = get_request_logger(request_id)
395
+ temp_dir: Optional[Path] = None
396
+
397
+ try:
398
+ # Create per-request temp directory
399
+ temp_dir = create_request_temp_dir(request_id)
400
+ logger.info(f"Created temp directory: {temp_dir}")
401
+
402
+ # Determine input type
403
+ input_path: Optional[Path] = None
404
+
405
+ # Check if it's a file upload
406
+ if file and file.filename:
407
+ logger.info(f"Processing file upload: {file.filename}")
408
+ input_path = await ingest_upload(file, request_id, temp_dir)
409
+
410
+ # Check if it's a JSON request
411
+ else:
412
+ content_type = request.headers.get("content-type", "")
413
+ if "application/json" in content_type:
414
+ try:
415
+ body = await request.json()
416
+ req = TranscribeRequest(**body)
417
+ except Exception as e:
418
+ return JSONResponse(
419
+ status_code=400,
420
+ content={"error": "Invalid JSON", "detail": str(e)},
421
+ headers={"X-Request-ID": request_id}
422
+ )
423
+
424
+ if req.youtube_url:
425
+ logger.info(f"Processing YouTube URL: {req.youtube_url}")
426
+ input_path = await ingest_youtube(req.youtube_url, request_id, temp_dir)
427
+
428
+ elif req.media_url:
429
+ logger.info(f"Processing media URL: {req.media_url}")
430
+ input_path = await ingest_media_url(req.media_url, request_id, temp_dir)
431
+
432
+ # Validate we have input
433
+ if input_path is None:
434
+ return JSONResponse(
435
+ status_code=400,
436
+ content={
437
+ "error": "No input provided",
438
+ "detail": "Provide youtube_url, media_url (JSON), or file (multipart)"
439
+ },
440
+ headers={"X-Request-ID": request_id}
441
+ )
442
+
443
+ # Normalize audio with FFmpeg
444
+ logger.info("Normalizing audio...")
445
+ normalized_path = await normalize_audio(input_path, request_id, temp_dir)
446
+
447
+ # Transcribe with ASR
448
+ logger.info("Starting transcription...")
449
+ transcript = await transcribe_audio(normalized_path, request_id)
450
+
451
+ logger.info("Transcription complete")
452
+ return PlainTextResponse(
453
+ content=transcript,
454
+ headers={"X-Request-ID": request_id}
455
+ )
456
+
457
+ except IngestError as e:
458
+ logger.error(f"Ingestion error: {e}")
459
+ return JSONResponse(
460
+ status_code=400,
461
+ content={"error": "Ingestion failed", "detail": str(e)},
462
+ headers={"X-Request-ID": request_id}
463
+ )
464
+
465
+ except FFmpegError as e:
466
+ logger.error(f"FFmpeg error: {e}")
467
+ return JSONResponse(
468
+ status_code=500,
469
+ content={"error": "Audio processing failed", "detail": str(e)},
470
+ headers={"X-Request-ID": request_id}
471
+ )
472
+
473
+ except ASRError as e:
474
+ logger.error(f"ASR error: {e}")
475
+ return JSONResponse(
476
+ status_code=500,
477
+ content={"error": "Transcription failed", "detail": str(e)},
478
+ headers={"X-Request-ID": request_id}
479
+ )
480
+
481
+ except Exception as e:
482
+ logger.exception(f"Unexpected error: {e}")
483
+ return JSONResponse(
484
+ status_code=500,
485
+ content={"error": "Internal server error", "detail": str(e)},
486
+ headers={"X-Request-ID": request_id}
487
+ )
488
+
489
+ finally:
490
+ # Always cleanup temp directory
491
+ cleanup_temp_dir(temp_dir, request_id)
492
+
493
+
494
+ if __name__ == "__main__":
495
+ import uvicorn
496
+ uvicorn.run(app, host=settings.HOST, port=settings.PORT)
app/services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Services module
app/services/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
app/services/__pycache__/asr.cpython-311.pyc ADDED
Binary file (6.21 kB). View file
 
app/services/__pycache__/cleanup.cpython-311.pyc ADDED
Binary file (1.93 kB). View file
 
app/services/__pycache__/ffmpeg.cpython-311.pyc ADDED
Binary file (3.29 kB). View file
 
app/services/__pycache__/ingest.cpython-311.pyc ADDED
Binary file (13.1 kB). View file
 
app/services/asr.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASR (Automatic Speech Recognition) service using official OpenAI Whisper.
3
+ Thread-safe model loading with singleton pattern.
4
+ """
5
+ import asyncio
6
+ import threading
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from app.core.config import settings
11
+ from app.core.logging import get_request_logger
12
+
13
+
14
+ class ASRError(Exception):
15
+ """Raised when ASR processing fails."""
16
+ pass
17
+
18
+
19
+ class ASRService:
20
+ """
21
+ Singleton ASR service with thread-safe model loading.
22
+ Model is loaded once at startup and reused for all requests.
23
+ """
24
+ _instance: Optional["ASRService"] = None
25
+ _lock = threading.Lock()
26
+ _model = None
27
+ _model_loaded = False
28
+ _model_lock = threading.Lock()
29
+
30
+ def __new__(cls) -> "ASRService":
31
+ if cls._instance is None:
32
+ with cls._lock:
33
+ if cls._instance is None:
34
+ cls._instance = super().__new__(cls)
35
+ return cls._instance
36
+
37
+ def _load_model(self):
38
+ """Load the Whisper model (called once, protected by lock)."""
39
+ if self._model_loaded:
40
+ return
41
+
42
+ with self._model_lock:
43
+ if self._model_loaded:
44
+ return
45
+
46
+ import whisper
47
+
48
+ model_size = settings.ASR_MODEL_SIZE
49
+ device = settings.ASR_DEVICE
50
+
51
+ # Log model loading
52
+ import logging
53
+ logger = logging.getLogger("transcription")
54
+ logger.info(f"Loading Whisper model: {model_size} on {device}")
55
+
56
+ self._model = whisper.load_model(model_size, device=device)
57
+ self._model_loaded = True
58
+ logger.info("Whisper model loaded successfully")
59
+
60
+ async def transcribe(
61
+ self,
62
+ audio_path: Path,
63
+ request_id: str
64
+ ) -> str:
65
+ """
66
+ Transcribe audio file to plain text.
67
+ Returns the full transcription as a string.
68
+ """
69
+ logger = get_request_logger(request_id)
70
+ logger.info(f"Starting transcription: {audio_path}")
71
+
72
+ # Ensure model is loaded
73
+ self._load_model()
74
+
75
+ # Run transcription in thread pool to not block event loop
76
+ loop = asyncio.get_event_loop()
77
+
78
+ try:
79
+ result = await asyncio.wait_for(
80
+ loop.run_in_executor(None, self._transcribe_sync, audio_path, request_id),
81
+ timeout=settings.ASR_TIMEOUT
82
+ )
83
+ return result
84
+ except asyncio.TimeoutError:
85
+ raise ASRError(f"Transcription timeout after {settings.ASR_TIMEOUT}s")
86
+
87
+ def _transcribe_sync(self, audio_path: Path, request_id: str) -> str:
88
+ """Synchronous transcription (runs in thread pool)."""
89
+ logger = get_request_logger(request_id)
90
+
91
+ try:
92
+ # Transcribe with auto language detection
93
+ result = self._model.transcribe(
94
+ str(audio_path),
95
+ task="transcribe",
96
+ verbose=False
97
+ )
98
+
99
+ detected_lang = result.get("language", "unknown")
100
+ logger.info(f"Detected language: {detected_lang}")
101
+
102
+ # Get the full text
103
+ full_text = result.get("text", "").strip()
104
+
105
+ # Clean up extra whitespace
106
+ import re
107
+ full_text = re.sub(r'\s+', ' ', full_text).strip()
108
+
109
+ logger.info(f"Transcription complete: {len(full_text)} characters")
110
+ return full_text
111
+
112
+ except Exception as e:
113
+ raise ASRError(f"Transcription failed: {e}")
114
+
115
+
116
+ # Global ASR service instance
117
+ asr_service = ASRService()
118
+
119
+
120
+ async def transcribe_audio(audio_path: Path, request_id: str) -> str:
121
+ """Convenience function to transcribe audio."""
122
+ return await asr_service.transcribe(audio_path, request_id)
app/services/cleanup.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cleanup utilities for temp directories.
3
+ """
4
+ import shutil
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Optional
8
+ from app.core.logging import get_request_logger
9
+
10
+
11
+ def cleanup_temp_dir(temp_dir: Optional[Path], request_id: str) -> None:
12
+ """
13
+ Safely remove a temporary directory and all its contents.
14
+ Always called in finally blocks to ensure cleanup.
15
+ """
16
+ if temp_dir is None:
17
+ return
18
+
19
+ logger = get_request_logger(request_id)
20
+
21
+ try:
22
+ if temp_dir.exists():
23
+ shutil.rmtree(temp_dir, ignore_errors=True)
24
+ logger.info(f"Cleaned up temp directory: {temp_dir}")
25
+ except Exception as e:
26
+ logger.warning(f"Failed to cleanup temp directory {temp_dir}: {e}")
27
+
28
+
29
+ def ensure_temp_base_exists() -> Path:
30
+ """Ensure the base temp directory exists."""
31
+ from app.core.config import settings
32
+ base = Path(settings.TEMP_DIR_BASE)
33
+ base.mkdir(parents=True, exist_ok=True)
34
+ return base
app/services/ffmpeg.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FFmpeg audio processing: normalize and extract audio to 16kHz mono WAV.
3
+ """
4
+ import asyncio
5
+ import uuid
6
+ from pathlib import Path
7
+
8
+ from app.core.config import settings
9
+ from app.core.logging import get_request_logger
10
+
11
+
12
+ class FFmpegError(Exception):
13
+ """Raised when FFmpeg processing fails."""
14
+ pass
15
+
16
+
17
+ async def normalize_audio(
18
+ input_path: Path,
19
+ request_id: str,
20
+ temp_dir: Path
21
+ ) -> Path:
22
+ """
23
+ Convert audio to 16kHz mono WAV format required by Whisper.
24
+ Returns the path to the normalized audio file.
25
+ """
26
+ logger = get_request_logger(request_id)
27
+
28
+ output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_normalized.wav"
29
+
30
+ logger.info(f"Normalizing audio: {input_path} -> {output_path}")
31
+
32
+ cmd = [
33
+ "ffmpeg",
34
+ "-i", str(input_path),
35
+ "-ar", "16000", # 16kHz sample rate
36
+ "-ac", "1", # Mono
37
+ "-c:a", "pcm_s16le", # 16-bit PCM
38
+ "-y", # Overwrite output
39
+ "-loglevel", "error",
40
+ str(output_path)
41
+ ]
42
+
43
+ try:
44
+ process = await asyncio.create_subprocess_exec(
45
+ *cmd,
46
+ stdout=asyncio.subprocess.PIPE,
47
+ stderr=asyncio.subprocess.PIPE
48
+ )
49
+
50
+ try:
51
+ stdout, stderr = await asyncio.wait_for(
52
+ process.communicate(),
53
+ timeout=300 # 5 min timeout for ffmpeg
54
+ )
55
+ except asyncio.TimeoutError:
56
+ process.kill()
57
+ raise FFmpegError("FFmpeg timeout after 300s")
58
+
59
+ if process.returncode != 0:
60
+ error_msg = stderr.decode() if stderr else "Unknown error"
61
+ raise FFmpegError(f"FFmpeg failed: {error_msg}")
62
+
63
+ except FileNotFoundError:
64
+ raise FFmpegError("FFmpeg not installed. Please install it.")
65
+
66
+ if not output_path.exists():
67
+ raise FFmpegError("FFmpeg completed but output file not found")
68
+
69
+ logger.info(f"Normalized audio: {output_path} ({output_path.stat().st_size} bytes)")
70
+ return output_path
app/services/ingest.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ingestion services: file upload, media URL download, YouTube download.
3
+ """
4
+ import os
5
+ import asyncio
6
+ import uuid
7
+ from pathlib import Path
8
+ from typing import Optional, Tuple
9
+ from urllib.parse import urlparse
10
+
11
+ import httpx
12
+ from fastapi import UploadFile
13
+
14
+ from app.core.config import settings
15
+ from app.core.security import (
16
+ SecurityError,
17
+ resolve_and_validate_url,
18
+ validate_youtube_url,
19
+ validate_file_extension,
20
+ validate_content_type,
21
+ )
22
+ from app.core.logging import get_request_logger
23
+ from app.services.cleanup import ensure_temp_base_exists
24
+
25
+
26
+ class IngestError(Exception):
27
+ """Raised when ingestion fails."""
28
+ pass
29
+
30
+
31
+ def create_request_temp_dir(request_id: str) -> Path:
32
+ """Create a unique temporary directory for a request."""
33
+ base = ensure_temp_base_exists()
34
+ temp_dir = base / request_id
35
+ temp_dir.mkdir(parents=True, exist_ok=True)
36
+ return temp_dir
37
+
38
+
39
+ async def ingest_upload(
40
+ file: UploadFile,
41
+ request_id: str,
42
+ temp_dir: Path
43
+ ) -> Path:
44
+ """
45
+ Save an uploaded file to the temp directory.
46
+ Returns the path to the saved file.
47
+ """
48
+ logger = get_request_logger(request_id)
49
+
50
+ if not file.filename:
51
+ raise IngestError("No filename provided")
52
+
53
+ # Validate extension
54
+ if not validate_file_extension(file.filename):
55
+ raise IngestError(
56
+ f"Invalid file type. Allowed: {', '.join(settings.ALLOWED_MEDIA_EXTENSIONS)}"
57
+ )
58
+
59
+ # Validate content type
60
+ if not validate_content_type(file.content_type):
61
+ raise IngestError(f"Invalid content type: {file.content_type}")
62
+
63
+ # Safe filename with UUID prefix to avoid collisions
64
+ safe_filename = f"{uuid.uuid4().hex[:8]}_{Path(file.filename).name}"
65
+ output_path = temp_dir / safe_filename
66
+
67
+ logger.info(f"Saving uploaded file: {file.filename} -> {output_path}")
68
+
69
+ # Read and write with size limit check
70
+ total_size = 0
71
+ try:
72
+ with open(output_path, "wb") as f:
73
+ while True:
74
+ chunk = await file.read(1024 * 1024) # 1MB chunks
75
+ if not chunk:
76
+ break
77
+ total_size += len(chunk)
78
+ if total_size > settings.MAX_FILE_SIZE_BYTES:
79
+ raise IngestError(
80
+ f"File too large. Maximum size: {settings.MAX_FILE_SIZE_MB}MB"
81
+ )
82
+ f.write(chunk)
83
+ except IngestError:
84
+ if output_path.exists():
85
+ output_path.unlink()
86
+ raise
87
+
88
+ logger.info(f"Saved file: {output_path} ({total_size} bytes)")
89
+ return output_path
90
+
91
+
92
+ async def ingest_media_url(
93
+ url: str,
94
+ request_id: str,
95
+ temp_dir: Path
96
+ ) -> Path:
97
+ """
98
+ Download media from a URL with SSRF protection and size limits.
99
+ Returns the path to the downloaded file.
100
+ """
101
+ logger = get_request_logger(request_id)
102
+
103
+ # SSRF protection: validate URL and resolve DNS
104
+ try:
105
+ hostname, resolved_ip = resolve_and_validate_url(url)
106
+ logger.info(f"Validated URL: {hostname} -> {resolved_ip}")
107
+ except SecurityError as e:
108
+ raise IngestError(f"Security check failed: {e}")
109
+
110
+ # Extract filename from URL or generate one
111
+ parsed = urlparse(url)
112
+ path_name = Path(parsed.path).name if parsed.path else ""
113
+ if not validate_file_extension(path_name):
114
+ # Try to get extension from content-type later, use default for now
115
+ path_name = f"media_{uuid.uuid4().hex[:8]}.mp4"
116
+
117
+ output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_{path_name}"
118
+
119
+ logger.info(f"Downloading media from: {url}")
120
+
121
+ total_size = 0
122
+ try:
123
+ async with httpx.AsyncClient(
124
+ timeout=httpx.Timeout(settings.DOWNLOAD_TIMEOUT),
125
+ follow_redirects=True,
126
+ max_redirects=5
127
+ ) as client:
128
+ async with client.stream("GET", url) as response:
129
+ response.raise_for_status()
130
+
131
+ # Validate content type
132
+ content_type = response.headers.get("content-type", "")
133
+ if not validate_content_type(content_type):
134
+ raise IngestError(f"Invalid content type: {content_type}")
135
+
136
+ # Check content-length if available
137
+ content_length = response.headers.get("content-length")
138
+ if content_length and int(content_length) > settings.MAX_FILE_SIZE_BYTES:
139
+ raise IngestError(
140
+ f"File too large ({int(content_length) // (1024*1024)}MB). "
141
+ f"Maximum: {settings.MAX_FILE_SIZE_MB}MB"
142
+ )
143
+
144
+ with open(output_path, "wb") as f:
145
+ async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
146
+ total_size += len(chunk)
147
+ if total_size > settings.MAX_FILE_SIZE_BYTES:
148
+ raise IngestError(
149
+ f"File too large. Maximum size: {settings.MAX_FILE_SIZE_MB}MB"
150
+ )
151
+ f.write(chunk)
152
+
153
+ except httpx.TimeoutException:
154
+ if output_path.exists():
155
+ output_path.unlink()
156
+ raise IngestError(f"Download timeout after {settings.DOWNLOAD_TIMEOUT}s")
157
+ except httpx.HTTPStatusError as e:
158
+ if output_path.exists():
159
+ output_path.unlink()
160
+ raise IngestError(f"HTTP error {e.response.status_code}: {e.response.reason_phrase}")
161
+ except IngestError:
162
+ if output_path.exists():
163
+ output_path.unlink()
164
+ raise
165
+ except Exception as e:
166
+ if output_path.exists():
167
+ output_path.unlink()
168
+ raise IngestError(f"Download failed: {e}")
169
+
170
+ logger.info(f"Downloaded: {output_path} ({total_size} bytes)")
171
+ return output_path
172
+
173
+
174
+ async def ingest_youtube(
175
+ url: str,
176
+ request_id: str,
177
+ temp_dir: Path
178
+ ) -> Path:
179
+ """
180
+ Download audio from YouTube using yt-dlp.
181
+ Returns the path to the downloaded audio file.
182
+ """
183
+ logger = get_request_logger(request_id)
184
+
185
+ # Validate YouTube domain
186
+ if not validate_youtube_url(url):
187
+ raise IngestError(
188
+ f"Invalid YouTube URL. Allowed domains: {', '.join(settings.YOUTUBE_ALLOWED_DOMAINS)}"
189
+ )
190
+
191
+ output_template = str(temp_dir / f"{uuid.uuid4().hex[:8]}_%(title).50s.%(ext)s")
192
+
193
+ logger.info(f"Downloading YouTube audio: {url}")
194
+
195
+ # Use yt-dlp with audio-only extraction
196
+ # Added options to help avoid 403 errors from YouTube
197
+ cmd = [
198
+ "yt-dlp",
199
+ "--extract-audio",
200
+ "--audio-format", "mp3",
201
+ "--audio-quality", "0", # Best quality
202
+ "--no-playlist",
203
+ "--no-warnings",
204
+ "--quiet",
205
+ "--max-filesize", f"{settings.MAX_FILE_SIZE_MB}M",
206
+ # Options to help avoid 403 Forbidden errors
207
+ "--user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
208
+ "--referer", "https://www.youtube.com/",
209
+ "--extractor-args", "youtube:player_client=android",
210
+ "--no-check-certificates",
211
+ "-o", output_template,
212
+ ]
213
+
214
+ # Add browser cookies if configured (helps with age-restricted/login-required videos)
215
+ if settings.YOUTUBE_COOKIES_FROM_BROWSER:
216
+ cmd.extend(["--cookies-from-browser", settings.YOUTUBE_COOKIES_FROM_BROWSER])
217
+
218
+ cmd.append(url)
219
+
220
+ try:
221
+ process = await asyncio.create_subprocess_exec(
222
+ *cmd,
223
+ stdout=asyncio.subprocess.PIPE,
224
+ stderr=asyncio.subprocess.PIPE
225
+ )
226
+
227
+ try:
228
+ stdout, stderr = await asyncio.wait_for(
229
+ process.communicate(),
230
+ timeout=settings.YOUTUBE_TIMEOUT
231
+ )
232
+ except asyncio.TimeoutError:
233
+ process.kill()
234
+ raise IngestError(f"YouTube download timeout after {settings.YOUTUBE_TIMEOUT}s")
235
+
236
+ if process.returncode != 0:
237
+ error_msg = stderr.decode() if stderr else "Unknown error"
238
+ raise IngestError(f"yt-dlp failed: {error_msg}")
239
+
240
+ except FileNotFoundError:
241
+ raise IngestError("yt-dlp not installed. Please install it: pip install yt-dlp")
242
+
243
+ # Find the downloaded file
244
+ downloaded_files = list(temp_dir.glob("*.mp3")) + list(temp_dir.glob("*.m4a")) + list(temp_dir.glob("*.webm"))
245
+ if not downloaded_files:
246
+ raise IngestError("yt-dlp completed but no audio file found")
247
+
248
+ # Return the most recently created file
249
+ output_path = max(downloaded_files, key=lambda p: p.stat().st_mtime)
250
+ logger.info(f"Downloaded YouTube audio: {output_path}")
251
+ return output_path
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web framework
2
+ fastapi>=0.104.0
3
+ uvicorn[standard]>=0.24.0
4
+ python-multipart>=0.0.6
5
+
6
+ # HTTP client with async support
7
+ httpx>=0.25.0
8
+
9
+ # ASR - Official OpenAI Whisper
10
+ openai-whisper>=20231117
11
+
12
+ # Pin NumPy to 1.x for compatibility with Whisper dependencies
13
+ numpy<2
14
+
15
+ # YouTube download
16
+ yt-dlp>=2023.11.0
17
+
18
+ # Testing
19
+ pytest>=7.4.0
20
+ pytest-asyncio>=0.21.0
21
+
22
+ # Type hints
23
+ pydantic>=2.5.0