Spaces:

sathish-93
/

demo

Runtime error

App Files Files Community

sathishkumarbsk committed on Jan 29

Commit

30c1053

1 Parent(s): 229c8e9

Initial transcription service

Browse files

Files changed (25) hide show

Dockerfile +46 -0
app/.DS_Store +0 -0
app/__init__.py +1 -0
app/__pycache__/__init__.cpython-311.pyc +0 -0
app/__pycache__/main.cpython-311.pyc +0 -0
app/core/__init__.py +1 -0
app/core/__pycache__/__init__.cpython-311.pyc +0 -0
app/core/__pycache__/config.cpython-311.pyc +0 -0
app/core/__pycache__/logging.cpython-311.pyc +0 -0
app/core/__pycache__/security.cpython-311.pyc +0 -0
app/core/config.py +54 -0
app/core/logging.py +56 -0
app/core/security.py +106 -0
app/main.py +496 -0
app/services/__init__.py +1 -0
app/services/__pycache__/__init__.cpython-311.pyc +0 -0
app/services/__pycache__/asr.cpython-311.pyc +0 -0
app/services/__pycache__/cleanup.cpython-311.pyc +0 -0
app/services/__pycache__/ffmpeg.cpython-311.pyc +0 -0
app/services/__pycache__/ingest.cpython-311.pyc +0 -0
app/services/asr.py +122 -0
app/services/cleanup.py +34 -0
app/services/ffmpeg.py +70 -0
app/services/ingest.py +251 -0
requirements.txt +23 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,46 @@

+# Transcription Service Dockerfile
+# Uses official OpenAI Whisper for transcription
+FROM python:3.11-slim
+WORKDIR /app
+# Install system dependencies
+# - ffmpeg: required for audio processing
+# - git: needed by some pip packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies before copying the code so this layer is
+# reused when only application files change
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Create temp directory
+RUN mkdir -p /tmp/transcription
+# Copy application code
+COPY app/ ./app/
+# Environment variables with defaults (override with `docker run -e ...`)
+ENV HOST=0.0.0.0
+ENV PORT=8000
+ENV ASR_MODEL_SIZE=base
+ENV ASR_DEVICE=cpu
+ENV MAX_FILE_SIZE_MB=200
+ENV DOWNLOAD_TIMEOUT=300
+ENV YOUTUBE_TIMEOUT=600
+ENV ASR_TIMEOUT=1800
+ENV LOG_LEVEL=INFO
+ENV TEMP_DIR_BASE=/tmp/transcription
+# Expose the default port (documentation only; the server binds to $PORT)
+EXPOSE 8000
+# Health check: standard library only, and it follows $PORT so it keeps
+# working when the port is overridden. urlopen raises on 4xx/5xx responses.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD python -c "import os, urllib.request; urllib.request.urlopen('http://localhost:%s/health' % os.environ.get('PORT', '8000'), timeout=5)"
+# Run with uvicorn through a shell so HOST/PORT are expanded at start-up;
+# `exec` replaces the shell so uvicorn receives container stop signals directly.
+CMD ["sh", "-c", "exec uvicorn app.main:app --host \"$HOST\" --port \"$PORT\""]

app/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

app/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Transcription Service App

app/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (182 Bytes). View file

app/__pycache__/main.cpython-311.pyc ADDED Viewed

Binary file (19.3 kB). View file

app/core/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Core module

app/core/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (187 Bytes). View file

app/core/__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (2.69 kB). View file

app/core/__pycache__/logging.cpython-311.pyc ADDED Viewed

Binary file (3.31 kB). View file

app/core/__pycache__/security.cpython-311.pyc ADDED Viewed

Binary file (5.35 kB). View file

app/core/config.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""
+Configuration management via environment variables.
+"""
+import os
+from typing import List
+class Settings:
+    """Application settings loaded from environment variables.
+    Every value is a class attribute evaluated once, when this module is
+    imported; changing the environment afterwards has no effect. Integer
+    settings raise ValueError at import time if the variable is not numeric.
+    """
+    # Server
+    HOST: str = os.getenv("HOST", "0.0.0.0")
+    PORT: int = int(os.getenv("PORT", "8000"))
+    # ASR Model (OpenAI Whisper)
+    # Options: tiny, base, small, medium, large, large-v2, large-v3
+    ASR_MODEL_SIZE: str = os.getenv("ASR_MODEL_SIZE", "base")
+    ASR_DEVICE: str = os.getenv("ASR_DEVICE", "cpu")  # cpu or cuda
+    # File limits
+    MAX_FILE_SIZE_MB: int = int(os.getenv("MAX_FILE_SIZE_MB", "200"))
+    MAX_FILE_SIZE_BYTES: int = MAX_FILE_SIZE_MB * 1024 * 1024
+    # Timeouts (seconds)
+    DOWNLOAD_TIMEOUT: int = int(os.getenv("DOWNLOAD_TIMEOUT", "300"))
+    YOUTUBE_TIMEOUT: int = int(os.getenv("YOUTUBE_TIMEOUT", "600"))
+    ASR_TIMEOUT: int = int(os.getenv("ASR_TIMEOUT", "1800"))  # 30 min for long files
+    # YouTube allowlist (comma-separated). Entries are stripped and empty
+    # items dropped so "a.com, b.com" or a trailing comma cannot produce
+    # entries that never match a hostname.
+    YOUTUBE_ALLOWED_DOMAINS: List[str] = [
+        domain.strip()
+        for domain in os.getenv(
+            "YOUTUBE_ALLOWED_DOMAINS",
+            "youtube.com,youtu.be,www.youtube.com,m.youtube.com"
+        ).split(",")
+        if domain.strip()
+    ]
+    # YouTube cookies - set to browser name (chrome, firefox, safari, edge) to use browser cookies
+    # This helps avoid 403 errors by using your logged-in session
+    YOUTUBE_COOKIES_FROM_BROWSER: str = os.getenv("YOUTUBE_COOKIES_FROM_BROWSER", "")
+    # Security
+    ALLOWED_MEDIA_EXTENSIONS: List[str] = [".mp3", ".mp4", ".m4a", ".wav", ".webm", ".ogg", ".flac"]
+    ALLOWED_CONTENT_TYPES: List[str] = [
+        "audio/mpeg", "audio/mp4", "audio/x-m4a", "audio/wav", "audio/webm",
+        "audio/ogg", "audio/flac", "video/mp4", "video/webm",
+        "application/octet-stream"  # fallback for some servers
+    ]
+    # Temp directory base
+    TEMP_DIR_BASE: str = os.getenv("TEMP_DIR_BASE", "/tmp/transcription")
+    # Logging
+    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
+# Shared settings instance imported by the rest of the application
+settings = Settings()

app/core/logging.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""
+Logging configuration for the application.
+"""
+import logging
+import sys
+from app.core.config import settings
+class RequestIdFilter(logging.Filter):
+    """Logging filter that guarantees a ``request_id`` attribute on every record."""
+    def filter(self, record):
+        # Keep an id that is already set; otherwise fall back to the placeholder.
+        record.request_id = getattr(record, 'request_id', 'no-request')
+        # Never drop a record: this filter only decorates.
+        return True
+def setup_logging() -> logging.Logger:
+    """Return the ``transcription`` logger, attaching a stdout handler on first use.
+    The logger level comes from ``settings.LOG_LEVEL`` (INFO when the name is
+    not a known level). Calling this again does not add a second handler.
+    """
+    log = logging.getLogger("transcription")
+    level = getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO)
+    log.setLevel(level)
+    # A handler is already attached: an earlier call did the set-up.
+    if log.handlers:
+        return log
+    stream = logging.StreamHandler(sys.stdout)
+    stream.setLevel(logging.DEBUG)
+    # The format string below references request_id, so make sure it exists.
+    stream.addFilter(RequestIdFilter())
+    stream.setFormatter(
+        logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S"
+        )
+    )
+    log.addHandler(stream)
+    return log
+class RequestLoggerAdapter(logging.LoggerAdapter):
+    """LoggerAdapter that stamps this adapter's request_id onto each log call."""
+    def process(self, msg, kwargs):
+        # Reuse the caller's ``extra`` mapping when one was passed, else create it.
+        extra = kwargs.setdefault("extra", {})
+        extra["request_id"] = self.extra.get("request_id", "no-request")
+        return msg, kwargs
+def get_request_logger(request_id: str) -> RequestLoggerAdapter:
+    """Wrap the ``transcription`` logger so its records carry ``request_id``."""
+    base = logging.getLogger("transcription")
+    context = {"request_id": request_id}
+    return RequestLoggerAdapter(base, context)
+# Initialize logger at module load
+app_logger = setup_logging()

app/core/security.py ADDED Viewed

	@@ -0,0 +1,106 @@

+"""
+Security utilities: SSRF protection, URL validation, etc.
+"""
+import ipaddress
+import socket
+from urllib.parse import urlparse
+from typing import Tuple, Optional
+from app.core.config import settings
+class SecurityError(Exception):
+    """Signals that a URL or input was rejected by a security check."""
+def is_private_ip(ip: str) -> bool:
+    """Return True when ``ip`` is not a publicly routable address.
+    Covers private, loopback, link-local, multicast, reserved and
+    unspecified addresses, plus anything ``ipaddress`` does not consider
+    global. A string that is not a valid IP address is treated as unsafe
+    and also returns True.
+    """
+    try:
+        ip_obj = ipaddress.ip_address(ip)
+    except ValueError:
+        return True  # Invalid IP, treat as unsafe
+    return (
+        ip_obj.is_private or
+        ip_obj.is_loopback or
+        ip_obj.is_link_local or
+        ip_obj.is_multicast or
+        ip_obj.is_reserved or
+        ip_obj.is_unspecified or
+        # is_private is False for the shared / carrier-grade NAT range
+        # 100.64.0.0/10, but is_global is False for it as well.
+        not ip_obj.is_global
+    )
+def resolve_and_validate_url(url: str) -> Tuple[str, str]:
+    """
+    Resolve the URL's hostname and reject it if any address is private/internal.
+    Returns (hostname, resolved_ip) if safe; an IPv4 address is preferred for
+    resolved_ip when the host has one.
+    Raises SecurityError if the URL is malformed, uses a scheme other than
+    http/https, cannot be resolved, or resolves to an unsafe address.
+    NOTE(review): this check is only effective if the caller connects to the
+    returned IP; a second DNS lookup could return a different address.
+    """
+    try:
+        parsed = urlparse(url)
+        hostname = parsed.hostname
+    except ValueError as e:
+        # urlparse raises for malformed netlocs such as an unclosed IPv6 bracket
+        raise SecurityError("Invalid URL: could not be parsed.") from e
+    # Only allow http/https
+    if parsed.scheme not in ("http", "https"):
+        raise SecurityError(f"Invalid URL scheme: {parsed.scheme}. Only http/https allowed.")
+    if not hostname:
+        raise SecurityError("Invalid URL: no hostname found.")
+    # Check for IP address directly in URL
+    try:
+        ip_obj = ipaddress.ip_address(hostname)
+    except ValueError:
+        ip_obj = None  # Not an IP, it's a hostname - continue with DNS resolution
+    if ip_obj is not None:
+        if is_private_ip(str(ip_obj)):
+            raise SecurityError(f"Direct IP addresses to private networks are not allowed: {hostname}")
+        return hostname, str(ip_obj)
+    # Resolve every A/AAAA record: a host may publish several addresses and
+    # a single private one is enough to reach an internal service.
+    try:
+        infos = socket.getaddrinfo(hostname, None, type=socket.SOCK_STREAM)
+    except (socket.gaierror, UnicodeError) as e:
+        # UnicodeError: the hostname could not be IDNA-encoded
+        raise SecurityError(f"Failed to resolve hostname: {hostname}") from e
+    addresses = [info[4][0] for info in infos]
+    if not addresses:
+        raise SecurityError(f"Failed to resolve hostname: {hostname}")
+    # Check resolved IPs
+    for candidate in addresses:
+        if is_private_ip(candidate):
+            raise SecurityError(
+                f"URL resolves to private/internal IP: {hostname} -> {candidate}"
+            )
+    # Prefer IPv4 so the returned value matches what gethostbyname produced
+    ipv4_addresses = [info[4][0] for info in infos if info[0] == socket.AF_INET]
+    resolved_ip = ipv4_addresses[0] if ipv4_addresses else addresses[0]
+    return hostname, resolved_ip
+def validate_youtube_url(url: str) -> bool:
+    """
+    Return True when the URL's hostname is in settings.YOUTUBE_ALLOWED_DOMAINS.
+    Comparison is case-insensitive and ignores a leading "www." on either
+    side. Returns False for a URL with no hostname or one that cannot be parsed.
+    """
+    def _normalize(host: str) -> str:
+        # Strip only a leading "www."; replacing it anywhere in the name
+        # would make unrelated hostnames compare equal.
+        host = host.strip().lower()
+        return host[4:] if host.startswith("www.") else host
+    try:
+        hostname = urlparse(url).hostname
+    except ValueError:
+        return False  # malformed URL (e.g. unclosed IPv6 bracket)
+    if not hostname:
+        return False
+    # Check against allowlist
+    allowed = {_normalize(d) for d in settings.YOUTUBE_ALLOWED_DOMAINS}
+    return _normalize(hostname) in allowed
+def validate_file_extension(filename: str) -> bool:
+    """Return True when ``filename`` ends with an allowed media extension (case-insensitive)."""
+    if not filename:
+        return False
+    name = filename.lower()
+    for ext in settings.ALLOWED_MEDIA_EXTENSIONS:
+        if name.endswith(ext):
+            return True
+    return False
+def validate_content_type(content_type: Optional[str]) -> bool:
+    """Return True when the Content-Type header is absent or names an allowed media type."""
+    # A missing header is accepted: some servers don't send content-type
+    if not content_type:
+        return True
+    # Keep only the media type; parameters such as charset follow the first ";"
+    media_type, _, _params = content_type.partition(";")
+    return media_type.strip().lower() in settings.ALLOWED_CONTENT_TYPES

app/main.py ADDED Viewed

	@@ -0,0 +1,496 @@

+"""
+Transcription Service API
+FastAPI application for audio/video transcription.
+"""
+import uuid
+from pathlib import Path
+from typing import Optional
+from fastapi import FastAPI, File, UploadFile, Request, HTTPException
+from fastapi.responses import HTMLResponse, PlainTextResponse, JSONResponse
+from pydantic import BaseModel
+from app.core.config import settings
+from app.core.logging import setup_logging, get_request_logger
+from app.services.cleanup import cleanup_temp_dir
+from app.services.ingest import (
+    create_request_temp_dir,
+    ingest_upload,
+    ingest_media_url,
+    ingest_youtube,
+    IngestError,
+)
+from app.services.ffmpeg import normalize_audio, FFmpegError
+from app.services.asr import transcribe_audio, ASRError
+# Initialize logging
+setup_logging()
+# Create FastAPI app
+app = FastAPI(
+    title="Transcription Service",
+    description="Audio/Video transcription API using Whisper",
+    version="1.0.0",
+)
+# Request models
+class TranscribeRequest(BaseModel):
+    # JSON body accepted by POST /transcribe. Both fields are optional; when
+    # both are set the handler uses youtube_url and ignores media_url.
+    youtube_url: Optional[str] = None
+    media_url: Optional[str] = None
+# HTML Test Page (inline for simplicity)
+HTML_PAGE = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Transcription Service</title>
+    <style>
+        * { box-sizing: border-box; }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+            max-width: 900px;
+            margin: 0 auto;
+            padding: 20px;
+            background: #f5f5f5;
+        }
+        h1 { color: #333; }
+        .section {
+            background: white;
+            padding: 20px;
+            border-radius: 8px;
+            margin-bottom: 20px;
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        }
+        .section h2 {
+            margin-top: 0;
+            color: #555;
+            border-bottom: 1px solid #eee;
+            padding-bottom: 10px;
+        }
+        label {
+            display: block;
+            margin-bottom: 5px;
+            font-weight: 600;
+            color: #444;
+        }
+        input[type="text"], input[type="url"], input[type="file"] {
+            width: 100%;
+            padding: 10px;
+            margin-bottom: 15px;
+            border: 1px solid #ddd;
+            border-radius: 4px;
+            font-size: 14px;
+        }
+        button {
+            background: #007bff;
+            color: white;
+            border: none;
+            padding: 12px 24px;
+            border-radius: 4px;
+            cursor: pointer;
+            font-size: 14px;
+            margin-right: 10px;
+            margin-bottom: 10px;
+        }
+        button:hover { background: #0056b3; }
+        button:disabled { background: #ccc; cursor: not-allowed; }
+        button.secondary {
+            background: #6c757d;
+        }
+        button.secondary:hover { background: #545b62; }
+        button.success {
+            background: #28a745;
+        }
+        button.success:hover { background: #1e7e34; }
+        #result {
+            margin-top: 20px;
+            padding: 15px;
+            background: #f8f9fa;
+            border: 1px solid #dee2e6;
+            border-radius: 4px;
+            white-space: pre-wrap;
+            word-break: break-word;
+            font-family: monospace;
+            font-size: 13px;
+            max-height: 400px;
+            overflow-y: auto;
+        }
+        .status {
+            padding: 8px 12px;
+            border-radius: 4px;
+            margin-bottom: 10px;
+            font-weight: 600;
+        }
+        .status.loading { background: #fff3cd; color: #856404; }
+        .status.success { background: #d4edda; color: #155724; }
+        .status.error { background: #f8d7da; color: #721c24; }
+        .request-id { font-size: 12px; color: #666; margin-top: 5px; }
+        .tabs {
+            display: flex;
+            border-bottom: 2px solid #dee2e6;
+            margin-bottom: 15px;
+        }
+        .tab {
+            padding: 10px 20px;
+            cursor: pointer;
+            border: none;
+            background: none;
+            font-size: 14px;
+            color: #666;
+            margin: 0;
+        }
+        .tab.active {
+            color: #007bff;
+            border-bottom: 2px solid #007bff;
+            margin-bottom: -2px;
+        }
+        .tab-content { display: none; }
+        .tab-content.active { display: block; }
+    </style>
+</head>
+<body>
+    <h1>🎙️ Transcription Service</h1>
+    <p>Convert audio/video to text using AI. <a href="/docs" target="_blank">API Docs</a></p>
+    <div class="section">
+        <h2>Transcribe</h2>
+        <div class="tabs">
+            <button class="tab active" onclick="showTab('upload')">File Upload</button>
+            <button class="tab" onclick="showTab('media')">Media URL</button>
+            <button class="tab" onclick="showTab('youtube')">YouTube</button>
+        </div>
+        <div id="tab-upload" class="tab-content active">
+            <label for="file">Select audio/video file:</label>
+            <input type="file" id="file" accept=".mp3,.mp4,.m4a,.wav,.webm,.ogg,.flac">
+            <button onclick="submitFile()">Submit</button>
+        </div>
+        <div id="tab-media" class="tab-content">
+            <label for="media_url">Direct URL to audio/video:</label>
+            <input type="url" id="media_url" placeholder="https://example.com/audio.mp3">
+            <button onclick="submitMediaUrl()">Submit</button>
+        </div>
+        <div id="tab-youtube" class="tab-content">
+            <label for="youtube_url">YouTube URL:</label>
+            <input type="url" id="youtube_url" placeholder="https://www.youtube.com/watch?v=...">
+            <button onclick="submitYoutube()">Submit</button>
+        </div>
+    </div>
+    <div class="section">
+        <h2>Concurrency Test</h2>
+        <p>Test concurrent request handling:</p>
+        <button class="success" onclick="runParallelRequests()">Run 2 Parallel Requests</button>
+        <button class="secondary" onclick="checkHealth()">Health Check</button>
+    </div>
+    <div class="section">
+        <h2>Result</h2>
+        <div id="status" class="status" style="display:none;"></div>
+        <div id="result">Results will appear here...</div>
+    </div>
+    <script>
+        function showTab(tabName) {
+            document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
+            document.querySelectorAll('.tab-content').forEach(t => t.classList.remove('active'));
+            document.querySelector(`[onclick="showTab('${tabName}')"]`).classList.add('active');
+            document.getElementById(`tab-${tabName}`).classList.add('active');
+        }
+        function setStatus(type, message) {
+            const status = document.getElementById('status');
+            status.style.display = 'block';
+            status.className = `status ${type}`;
+            status.textContent = message;
+        }
+        function setResult(text) {
+            document.getElementById('result').textContent = text;
+        }
+        async function submitFile() {
+            const file = document.getElementById('file').files[0];
+            if (!file) {
+                setStatus('error', 'Please select a file');
+                return;
+            }
+            setStatus('loading', 'Uploading and processing...');
+            const formData = new FormData();
+            formData.append('file', file);
+            try {
+                const response = await fetch('/transcribe', {
+                    method: 'POST',
+                    body: formData
+                });
+                const requestId = response.headers.get('X-Request-ID') || 'N/A';
+                const text = await response.text();
+                if (response.ok) {
+                    setStatus('success', `Success! Request ID: ${requestId}`);
+                    setResult(text);
+                } else {
+                    setStatus('error', `Error (${response.status}). Request ID: ${requestId}`);
+                    setResult(text);
+                }
+            } catch (e) {
+                setStatus('error', `Network error: ${e.message}`);
+                setResult(e.toString());
+            }
+        }
+        async function submitMediaUrl() {
+            const url = document.getElementById('media_url').value.trim();
+            if (!url) {
+                setStatus('error', 'Please enter a URL');
+                return;
+            }
+            setStatus('loading', 'Downloading and processing...');
+            try {
+                const response = await fetch('/transcribe', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ media_url: url })
+                });
+                const requestId = response.headers.get('X-Request-ID') || 'N/A';
+                const text = await response.text();
+                if (response.ok) {
+                    setStatus('success', `Success! Request ID: ${requestId}`);
+                    setResult(text);
+                } else {
+                    setStatus('error', `Error (${response.status}). Request ID: ${requestId}`);
+                    setResult(text);
+                }
+            } catch (e) {
+                setStatus('error', `Network error: ${e.message}`);
+                setResult(e.toString());
+            }
+        }
+        async function submitYoutube() {
+            const url = document.getElementById('youtube_url').value.trim();
+            if (!url) {
+                setStatus('error', 'Please enter a YouTube URL');
+                return;
+            }
+            setStatus('loading', 'Downloading YouTube audio and processing...');
+            try {
+                const response = await fetch('/transcribe', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ youtube_url: url })
+                });
+                const requestId = response.headers.get('X-Request-ID') || 'N/A';
+                const text = await response.text();
+                if (response.ok) {
+                    setStatus('success', `Success! Request ID: ${requestId}`);
+                    setResult(text);
+                } else {
+                    setStatus('error', `Error (${response.status}). Request ID: ${requestId}`);
+                    setResult(text);
+                }
+            } catch (e) {
+                setStatus('error', `Network error: ${e.message}`);
+                setResult(e.toString());
+            }
+        }
+        async function runParallelRequests() {
+            setStatus('loading', 'Running 2 parallel requests...');
+            // Create two simple JSON requests
+            const makeRequest = async (id) => {
+                const start = Date.now();
+                try {
+                    const response = await fetch('/transcribe', {
+                        method: 'POST',
+                        headers: { 'Content-Type': 'application/json' },
+                        body: JSON.stringify({})  // Empty request to trigger validation error - quick test
+                    });
+                    const requestId = response.headers.get('X-Request-ID') || 'N/A';
+                    const text = await response.text();
+                    const elapsed = Date.now() - start;
+                    return `Request ${id}: Status ${response.status}, Request-ID: ${requestId}, Time: ${elapsed}ms\\n${text}`;
+                } catch (e) {
+                    return `Request ${id}: Error - ${e.message}`;
+                }
+            };
+            try {
+                const results = await Promise.all([
+                    makeRequest(1),
+                    makeRequest(2)
+                ]);
+                setStatus('success', 'Parallel requests completed!');
+                setResult(results.join('\\n\\n---\\n\\n'));
+            } catch (e) {
+                setStatus('error', `Error: ${e.message}`);
+                setResult(e.toString());
+            }
+        }
+        async function checkHealth() {
+            setStatus('loading', 'Checking health...');
+            try {
+                const response = await fetch('/health');
+                const data = await response.json();
+                setStatus('success', 'Health check passed!');
+                setResult(JSON.stringify(data, null, 2));
+            } catch (e) {
+                setStatus('error', `Health check failed: ${e.message}`);
+                setResult(e.toString());
+            }
+        }
+    </script>
+</body>
+</html>
+"""
+@app.get("/", response_class=HTMLResponse)
+async def home():
+    """Return the inline HTML test page for the root path."""
+    page = HTML_PAGE
+    return page
+@app.get("/health")
+async def health():
+    """Liveness probe: always reports status ok."""
+    payload = {"status": "ok"}
+    return payload
+def _error_response(request_id: str, status_code: int, error: str, detail: str) -> JSONResponse:
+    """Build a JSON error body for /transcribe, tagged with the request id header."""
+    return JSONResponse(
+        status_code=status_code,
+        content={"error": error, "detail": detail},
+        headers={"X-Request-ID": request_id}
+    )
+@app.post("/transcribe")
+async def transcribe(
+    request: Request,
+    file: Optional[UploadFile] = File(None),
+):
+    """
+    Transcribe audio/video and return the transcript as plain text.
+    Input is taken from, in order of precedence:
+    - a multipart form upload in the ``file`` field
+    - a JSON body with ``youtube_url``
+    - a JSON body with ``media_url``
+    Errors are returned as JSON with ``error`` and ``detail`` keys: 400 for
+    bad or missing input and ingestion failures, 500 otherwise. Every
+    response carries an ``X-Request-ID`` header.
+    """
+    # One id per request keeps logs and temp files of concurrent requests apart
+    request_id = str(uuid.uuid4())
+    logger = get_request_logger(request_id)
+    temp_dir: Optional[Path] = None
+    try:
+        # Per-request working directory; removed in the finally clause
+        temp_dir = create_request_temp_dir(request_id)
+        logger.info(f"Created temp directory: {temp_dir}")
+        input_path: Optional[Path] = None
+        if file and file.filename:
+            # Multipart upload
+            logger.info(f"Processing file upload: {file.filename}")
+            input_path = await ingest_upload(file, request_id, temp_dir)
+        elif "application/json" in request.headers.get("content-type", ""):
+            # JSON body naming a remote source
+            try:
+                body = await request.json()
+                req = TranscribeRequest(**body)
+            except Exception as e:
+                return _error_response(request_id, 400, "Invalid JSON", str(e))
+            if req.youtube_url:
+                logger.info(f"Processing YouTube URL: {req.youtube_url}")
+                input_path = await ingest_youtube(req.youtube_url, request_id, temp_dir)
+            elif req.media_url:
+                logger.info(f"Processing media URL: {req.media_url}")
+                input_path = await ingest_media_url(req.media_url, request_id, temp_dir)
+        # Nothing usable was supplied
+        if input_path is None:
+            return _error_response(
+                request_id,
+                400,
+                "No input provided",
+                "Provide youtube_url, media_url (JSON), or file (multipart)",
+            )
+        # Normalize audio with FFmpeg
+        logger.info("Normalizing audio...")
+        normalized_path = await normalize_audio(input_path, request_id, temp_dir)
+        # Transcribe with ASR
+        logger.info("Starting transcription...")
+        transcript = await transcribe_audio(normalized_path, request_id)
+        logger.info("Transcription complete")
+        return PlainTextResponse(
+            content=transcript,
+            headers={"X-Request-ID": request_id}
+        )
+    except IngestError as e:
+        logger.error(f"Ingestion error: {e}")
+        return _error_response(request_id, 400, "Ingestion failed", str(e))
+    except FFmpegError as e:
+        logger.error(f"FFmpeg error: {e}")
+        return _error_response(request_id, 500, "Audio processing failed", str(e))
+    except ASRError as e:
+        logger.error(f"ASR error: {e}")
+        return _error_response(request_id, 500, "Transcription failed", str(e))
+    except Exception as e:
+        logger.exception(f"Unexpected error: {e}")
+        return _error_response(request_id, 500, "Internal server error", str(e))
+    finally:
+        # Runs on every exit path, including the early returns above
+        cleanup_temp_dir(temp_dir, request_id)
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host=settings.HOST, port=settings.PORT)

app/services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Services module

app/services/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (191 Bytes). View file

app/services/__pycache__/asr.cpython-311.pyc ADDED Viewed

Binary file (6.21 kB). View file

app/services/__pycache__/cleanup.cpython-311.pyc ADDED Viewed

Binary file (1.93 kB). View file

app/services/__pycache__/ffmpeg.cpython-311.pyc ADDED Viewed

Binary file (3.29 kB). View file

app/services/__pycache__/ingest.cpython-311.pyc ADDED Viewed

Binary file (13.1 kB). View file

app/services/asr.py ADDED Viewed

	@@ -0,0 +1,122 @@

+"""
+ASR (Automatic Speech Recognition) service using official OpenAI Whisper.
+Thread-safe model loading with singleton pattern.
+"""
+import asyncio
+import threading
+from pathlib import Path
+from typing import Optional
+from app.core.config import settings
+from app.core.logging import get_request_logger
+class ASRError(Exception):
+    """Signals a failure or timeout during speech recognition."""
+class ASRService:
+    """
+    Singleton ASR service with lock-protected model loading.
+    The Whisper model is loaded lazily, on the first transcription request,
+    and then reused for all later requests.
+    """
+    _instance: Optional["ASRService"] = None
+    _lock = threading.Lock()
+    _model = None
+    _model_loaded = False
+    _model_lock = threading.Lock()
+    def __new__(cls) -> "ASRService":
+        # Check, lock, check again: only one instance is ever created even
+        # when several threads construct the service at the same time.
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+        return cls._instance
+    def _load_model(self):
+        """Load the Whisper model once; later calls return immediately."""
+        if self._model_loaded:
+            return
+        with self._model_lock:
+            # Another thread may have finished loading while we waited
+            if self._model_loaded:
+                return
+            # Imported here so importing this module does not require whisper
+            import whisper
+            model_size = settings.ASR_MODEL_SIZE
+            device = settings.ASR_DEVICE
+            # Log model loading
+            import logging
+            logger = logging.getLogger("transcription")
+            logger.info(f"Loading Whisper model: {model_size} on {device}")
+            self._model = whisper.load_model(model_size, device=device)
+            self._model_loaded = True
+            logger.info("Whisper model loaded successfully")
+    async def transcribe(
+        self,
+        audio_path: Path,
+        request_id: str
+    ) -> str:
+        """
+        Transcribe an audio file to plain text.
+        Returns the full transcription as a single string.
+        Raises ASRError if transcription fails or exceeds settings.ASR_TIMEOUT.
+        """
+        logger = get_request_logger(request_id)
+        logger.info(f"Starting transcription: {audio_path}")
+        loop = asyncio.get_running_loop()
+        # Load the model in a worker thread: the first load can take a long
+        # time and would otherwise stall every other request on the event loop.
+        await loop.run_in_executor(None, self._load_model)
+        # Run transcription in thread pool to not block event loop
+        try:
+            return await asyncio.wait_for(
+                loop.run_in_executor(None, self._transcribe_sync, audio_path, request_id),
+                timeout=settings.ASR_TIMEOUT
+            )
+        except asyncio.TimeoutError as e:
+            # NOTE: the timeout stops the wait only; a worker thread that has
+            # already started cannot be interrupted and runs to completion.
+            raise ASRError(f"Transcription timeout after {settings.ASR_TIMEOUT}s") from e
+    def _transcribe_sync(self, audio_path: Path, request_id: str) -> str:
+        """Synchronous transcription (runs in thread pool); wraps any failure in ASRError."""
+        logger = get_request_logger(request_id)
+        try:
+            # Transcribe with auto language detection
+            result = self._model.transcribe(
+                str(audio_path),
+                task="transcribe",
+                verbose=False
+            )
+            detected_lang = result.get("language", "unknown")
+            logger.info(f"Detected language: {detected_lang}")
+            # Get the full text
+            full_text = result.get("text", "").strip()
+            # Collapse runs of whitespace (including newlines) to single spaces
+            import re
+            full_text = re.sub(r'\s+', ' ', full_text).strip()
+            logger.info(f"Transcription complete: {len(full_text)} characters")
+            return full_text
+        except Exception as e:
+            # Chain the cause so the original traceback is kept
+            raise ASRError(f"Transcription failed: {e}") from e
+# Global ASR service instance
+asr_service = ASRService()
+async def transcribe_audio(audio_path: Path, request_id: str) -> str:
+    """Transcribe ``audio_path`` with the shared ASR service and return the text."""
+    transcript = await asr_service.transcribe(audio_path, request_id)
+    return transcript

app/services/cleanup.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""
+Cleanup utilities for temp directories.
+"""
+import shutil
+import os
+from pathlib import Path
+from typing import Optional
+from app.core.logging import get_request_logger
+def cleanup_temp_dir(temp_dir: Optional[Path], request_id: str) -> None:
+    """
+    Safely remove a temporary directory and all its contents.
+    Always called in finally blocks to ensure cleanup.
+    """
+    if temp_dir is None:
+        return
+    logger = get_request_logger(request_id)
+    try:
+        if temp_dir.exists():
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            logger.info(f"Cleaned up temp directory: {temp_dir}")
+    except Exception as e:
+        logger.warning(f"Failed to cleanup temp directory {temp_dir}: {e}")
+def ensure_temp_base_exists() -> Path:
+    """Ensure the base temp directory exists."""
+    from app.core.config import settings
+    base = Path(settings.TEMP_DIR_BASE)
+    base.mkdir(parents=True, exist_ok=True)
+    return base

app/services/ffmpeg.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""
+FFmpeg audio processing: normalize and extract audio to 16kHz mono WAV.
+"""
+import asyncio
+import uuid
+from pathlib import Path
+from app.core.config import settings
+from app.core.logging import get_request_logger
+class FFmpegError(Exception):
+    """Raised when FFmpeg processing fails."""
+    pass
+async def normalize_audio(
+    input_path: Path,
+    request_id: str,
+    temp_dir: Path
+) -> Path:
+    """
+    Convert audio to 16kHz mono WAV format required by Whisper.
+    Returns the path to the normalized audio file.
+    """
+    logger = get_request_logger(request_id)
+    output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_normalized.wav"
+    logger.info(f"Normalizing audio: {input_path} -> {output_path}")
+    cmd = [
+        "ffmpeg",
+        "-i", str(input_path),
+        "-ar", "16000",      # 16kHz sample rate
+        "-ac", "1",          # Mono
+        "-c:a", "pcm_s16le", # 16-bit PCM
+        "-y",                # Overwrite output
+        "-loglevel", "error",
+        str(output_path)
+    ]
+    try:
+        process = await asyncio.create_subprocess_exec(
+            *cmd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE
+        )
+        try:
+            stdout, stderr = await asyncio.wait_for(
+                process.communicate(),
+                timeout=300  # 5 min timeout for ffmpeg
+            )
+        except asyncio.TimeoutError:
+            process.kill()
+            raise FFmpegError("FFmpeg timeout after 300s")
+        if process.returncode != 0:
+            error_msg = stderr.decode() if stderr else "Unknown error"
+            raise FFmpegError(f"FFmpeg failed: {error_msg}")
+    except FileNotFoundError:
+        raise FFmpegError("FFmpeg not installed. Please install it.")
+    if not output_path.exists():
+        raise FFmpegError("FFmpeg completed but output file not found")
+    logger.info(f"Normalized audio: {output_path} ({output_path.stat().st_size} bytes)")
+    return output_path

app/services/ingest.py ADDED Viewed

	@@ -0,0 +1,251 @@

+"""
+Ingestion services: file upload, media URL download, YouTube download.
+"""
+import os
+import asyncio
+import uuid
+from pathlib import Path
+from typing import Optional, Tuple
+from urllib.parse import urlparse
+import httpx
+from fastapi import UploadFile
+from app.core.config import settings
+from app.core.security import (
+    SecurityError,
+    resolve_and_validate_url,
+    validate_youtube_url,
+    validate_file_extension,
+    validate_content_type,
+)
+from app.core.logging import get_request_logger
+from app.services.cleanup import ensure_temp_base_exists
+class IngestError(Exception):
+    """Raised when ingestion fails."""
+    pass
+def create_request_temp_dir(request_id: str) -> Path:
+    """Create a unique temporary directory for a request."""
+    base = ensure_temp_base_exists()
+    temp_dir = base / request_id
+    temp_dir.mkdir(parents=True, exist_ok=True)
+    return temp_dir
+async def ingest_upload(
+    file: UploadFile,
+    request_id: str,
+    temp_dir: Path
+) -> Path:
+    """
+    Save an uploaded file to the temp directory.
+    Returns the path to the saved file.
+    """
+    logger = get_request_logger(request_id)
+    if not file.filename:
+        raise IngestError("No filename provided")
+    # Validate extension
+    if not validate_file_extension(file.filename):
+        raise IngestError(
+            f"Invalid file type. Allowed: {', '.join(settings.ALLOWED_MEDIA_EXTENSIONS)}"
+        )
+    # Validate content type
+    if not validate_content_type(file.content_type):
+        raise IngestError(f"Invalid content type: {file.content_type}")
+    # Safe filename with UUID prefix to avoid collisions
+    safe_filename = f"{uuid.uuid4().hex[:8]}_{Path(file.filename).name}"
+    output_path = temp_dir / safe_filename
+    logger.info(f"Saving uploaded file: {file.filename} -> {output_path}")
+    # Read and write with size limit check
+    total_size = 0
+    try:
+        with open(output_path, "wb") as f:
+            while True:
+                chunk = await file.read(1024 * 1024)  # 1MB chunks
+                if not chunk:
+                    break
+                total_size += len(chunk)
+                if total_size > settings.MAX_FILE_SIZE_BYTES:
+                    raise IngestError(
+                        f"File too large. Maximum size: {settings.MAX_FILE_SIZE_MB}MB"
+                    )
+                f.write(chunk)
+    except IngestError:
+        if output_path.exists():
+            output_path.unlink()
+        raise
+    logger.info(f"Saved file: {output_path} ({total_size} bytes)")
+    return output_path
+async def ingest_media_url(
+    url: str,
+    request_id: str,
+    temp_dir: Path
+) -> Path:
+    """
+    Download media from a URL with SSRF protection and size limits.
+    Returns the path to the downloaded file.
+    """
+    logger = get_request_logger(request_id)
+    # SSRF protection: validate URL and resolve DNS
+    try:
+        hostname, resolved_ip = resolve_and_validate_url(url)
+        logger.info(f"Validated URL: {hostname} -> {resolved_ip}")
+    except SecurityError as e:
+        raise IngestError(f"Security check failed: {e}")
+    # Extract filename from URL or generate one
+    parsed = urlparse(url)
+    path_name = Path(parsed.path).name if parsed.path else ""
+    if not validate_file_extension(path_name):
+        # Try to get extension from content-type later, use default for now
+        path_name = f"media_{uuid.uuid4().hex[:8]}.mp4"
+    output_path = temp_dir / f"{uuid.uuid4().hex[:8]}_{path_name}"
+    logger.info(f"Downloading media from: {url}")
+    total_size = 0
+    try:
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(settings.DOWNLOAD_TIMEOUT),
+            follow_redirects=True,
+            max_redirects=5
+        ) as client:
+            async with client.stream("GET", url) as response:
+                response.raise_for_status()
+                # Validate content type
+                content_type = response.headers.get("content-type", "")
+                if not validate_content_type(content_type):
+                    raise IngestError(f"Invalid content type: {content_type}")
+                # Check content-length if available
+                content_length = response.headers.get("content-length")
+                if content_length and int(content_length) > settings.MAX_FILE_SIZE_BYTES:
+                    raise IngestError(
+                        f"File too large ({int(content_length) // (1024*1024)}MB). "
+                        f"Maximum: {settings.MAX_FILE_SIZE_MB}MB"
+                    )
+                with open(output_path, "wb") as f:
+                    async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
+                        total_size += len(chunk)
+                        if total_size > settings.MAX_FILE_SIZE_BYTES:
+                            raise IngestError(
+                                f"File too large. Maximum size: {settings.MAX_FILE_SIZE_MB}MB"
+                            )
+                        f.write(chunk)
+    except httpx.TimeoutException:
+        if output_path.exists():
+            output_path.unlink()
+        raise IngestError(f"Download timeout after {settings.DOWNLOAD_TIMEOUT}s")
+    except httpx.HTTPStatusError as e:
+        if output_path.exists():
+            output_path.unlink()
+        raise IngestError(f"HTTP error {e.response.status_code}: {e.response.reason_phrase}")
+    except IngestError:
+        if output_path.exists():
+            output_path.unlink()
+        raise
+    except Exception as e:
+        if output_path.exists():
+            output_path.unlink()
+        raise IngestError(f"Download failed: {e}")
+    logger.info(f"Downloaded: {output_path} ({total_size} bytes)")
+    return output_path
+async def ingest_youtube(
+    url: str,
+    request_id: str,
+    temp_dir: Path
+) -> Path:
+    """
+    Download audio from YouTube using yt-dlp.
+    Returns the path to the downloaded audio file.
+    """
+    logger = get_request_logger(request_id)
+    # Validate YouTube domain
+    if not validate_youtube_url(url):
+        raise IngestError(
+            f"Invalid YouTube URL. Allowed domains: {', '.join(settings.YOUTUBE_ALLOWED_DOMAINS)}"
+        )
+    output_template = str(temp_dir / f"{uuid.uuid4().hex[:8]}_%(title).50s.%(ext)s")
+    logger.info(f"Downloading YouTube audio: {url}")
+    # Use yt-dlp with audio-only extraction
+    # Added options to help avoid 403 errors from YouTube
+    cmd = [
+        "yt-dlp",
+        "--extract-audio",
+        "--audio-format", "mp3",
+        "--audio-quality", "0",  # Best quality
+        "--no-playlist",
+        "--no-warnings",
+        "--quiet",
+        "--max-filesize", f"{settings.MAX_FILE_SIZE_MB}M",
+        # Options to help avoid 403 Forbidden errors
+        "--user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "--referer", "https://www.youtube.com/",
+        "--extractor-args", "youtube:player_client=android",
+        "--no-check-certificates",
+        "-o", output_template,
+    ]
+    # Add browser cookies if configured (helps with age-restricted/login-required videos)
+    if settings.YOUTUBE_COOKIES_FROM_BROWSER:
+        cmd.extend(["--cookies-from-browser", settings.YOUTUBE_COOKIES_FROM_BROWSER])
+    cmd.append(url)
+    try:
+        process = await asyncio.create_subprocess_exec(
+            *cmd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE
+        )
+        try:
+            stdout, stderr = await asyncio.wait_for(
+                process.communicate(),
+                timeout=settings.YOUTUBE_TIMEOUT
+            )
+        except asyncio.TimeoutError:
+            process.kill()
+            raise IngestError(f"YouTube download timeout after {settings.YOUTUBE_TIMEOUT}s")
+        if process.returncode != 0:
+            error_msg = stderr.decode() if stderr else "Unknown error"
+            raise IngestError(f"yt-dlp failed: {error_msg}")
+    except FileNotFoundError:
+        raise IngestError("yt-dlp not installed. Please install it: pip install yt-dlp")
+    # Find the downloaded file
+    downloaded_files = list(temp_dir.glob("*.mp3")) + list(temp_dir.glob("*.m4a")) + list(temp_dir.glob("*.webm"))
+    if not downloaded_files:
+        raise IngestError("yt-dlp completed but no audio file found")
+    # Return the most recently created file
+    output_path = max(downloaded_files, key=lambda p: p.stat().st_mtime)
+    logger.info(f"Downloaded YouTube audio: {output_path}")
+    return output_path

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+# Web framework
+fastapi>=0.104.0
+uvicorn[standard]>=0.24.0
+python-multipart>=0.0.6
+# HTTP client with async support
+httpx>=0.25.0
+# ASR - Official OpenAI Whisper
+openai-whisper>=20231117
+# Pin NumPy to 1.x for compatibility with Whisper dependencies
+numpy<2
+# YouTube download
+yt-dlp>=2023.11.0
+# Testing
+pytest>=7.4.0
+pytest-asyncio>=0.21.0
+# Data validation
+pydantic>=2.5.0