abhisheksan committed on
Commit
781f7b0
·
1 Parent(s): 0e5ada8

Add embeddings API, caching, and Docker revamp

Browse files

Implement embeddings endpoint (router, service, schemas) using
sentence-transformers and configurable embedding model. Add a TTL cache
and concurrent alternative-language fallback to the subtitles service.
Introduce rate limiting (slowapi), timing-safe API key verification and
updated API key middleware. Replace Dockerfile with a multi-stage
builder/runtime image, set model cache dirs, bump package version to
1.0.0, and update README, tests, .env.example, and .gitignore (ignore
models/)

.env.example CHANGED
@@ -1,18 +1,32 @@
1
- # Environment variables template for the multi-utility server
2
  # Copy this file to .env and update the values
3
 
4
- # API Security - Comma-separated list of valid API keys
5
- API_KEYS=your-secret-key-1,your-secret-key-2,your-secret-key-3
 
 
 
 
 
 
 
6
 
7
  # Logging configuration
8
  LOG_LEVEL=INFO
9
 
10
  # yt-dlp configuration
11
- YT_DLP_BINARY=python -m yt_dlp
12
  YT_DLP_TIMEOUT_LIST=30
13
  YT_DLP_TIMEOUT_DOWNLOAD=60
14
 
 
 
 
 
 
15
  # Server configuration
16
  HOST=0.0.0.0
17
  PORT=8000
18
- RELOAD=true
 
 
 
 
1
+ # Environment variables for the multi-utility server
2
  # Copy this file to .env and update the values
3
 
4
+ # API Security - Comma-separated list of valid API keys (REQUIRED)
5
+ API_KEYS=your-secret-key-1,your-secret-key-2
6
+
7
+ # CORS Configuration - Comma-separated list of allowed origins (optional, defaults to *)
8
+ CORS_ORIGINS=http://localhost:3000,https://yourdomain.com
9
+
10
+ # Rate Limiting
11
+ RATE_LIMIT_REQUESTS=100
12
+ RATE_LIMIT_WINDOW=60
13
 
14
  # Logging configuration
15
  LOG_LEVEL=INFO
16
 
17
  # yt-dlp configuration
 
18
  YT_DLP_TIMEOUT_LIST=30
19
  YT_DLP_TIMEOUT_DOWNLOAD=60
20
 
21
+ # Embedding model configuration
22
+ EMBEDDING_MODEL=mixedbread-ai/mxbai-embed-large-v1
23
+ # Models cache directory (set by Docker, optional for local dev)
24
+ # SENTENCE_TRANSFORMERS_HOME=models
25
+
26
  # Server configuration
27
  HOST=0.0.0.0
28
  PORT=8000
29
+ RELOAD=true
30
+
31
+ # Container-specific (set automatically in Dockerfile)
32
+ # DISABLE_FILE_LOGGING=true
.gitignore CHANGED
@@ -124,6 +124,7 @@ dmypy.json
124
 
125
  # Project specific
126
  logs/
 
127
  *.log
128
  .DS_Store
129
  Thumbs.db
 
124
 
125
  # Project specific
126
  logs/
127
+ models/
128
  *.log
129
  .DS_Store
130
  Thumbs.db
Dockerfile CHANGED
@@ -1,62 +1,58 @@
1
- # Use Python 3.11 slim image
2
- FROM python:3.11-slim
3
-
4
- # Set environment variables
5
- ENV PYTHONUNBUFFERED=1 \
6
- PYTHONDONTWRITEBYTECODE=1 \
7
- POETRY_VERSION=1.7.1 \
8
- POETRY_HOME="/opt/poetry" \
9
- POETRY_VIRTUALENVS_IN_PROJECT=true \
10
- POETRY_NO_INTERACTION=1
11
-
12
- # Add Poetry to PATH
13
- ENV PATH="$POETRY_HOME/bin:$PATH"
14
-
15
- # Install system dependencies including curl for health checks
16
- RUN apt-get update \
17
- && apt-get install -y --no-install-recommends \
18
- curl \
19
- build-essential \
20
- && rm -rf /var/lib/apt/lists/*
21
 
22
  # Install Poetry
23
- RUN curl -sSL https://install.python-poetry.org | python3 -
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- # Set work directory
26
  WORKDIR /app
27
 
28
- # Copy pyproject.toml first
29
- COPY pyproject.toml ./
 
 
 
 
30
 
31
- # Copy poetry.lock if it exists, otherwise generate it
32
- COPY poetry.loc[k] ./
33
- RUN if [ ! -f poetry.lock ]; then \
34
- echo "poetry.lock not found, generating it..." && \
35
- poetry lock; \
36
- fi
37
 
38
- # Install Python dependencies
39
- RUN poetry install --only=main --no-dev
40
 
41
- # Copy application code
42
- COPY app/ ./app/
43
 
44
- # Create non-root user and set up logging directory
45
- RUN useradd --create-home --shell /bin/bash app \
46
- && mkdir -p /tmp/app-logs \
47
- && chown -R app:app /app /tmp/app-logs
48
 
49
- USER app
 
 
 
50
 
51
- # Disable file logging for container environment (stdout only)
52
  ENV DISABLE_FILE_LOGGING=true
53
 
54
- # Expose port
55
- EXPOSE 8000
56
 
57
- # Health check (temporarily disabled for debugging)
58
- # HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
59
- # CMD curl -f http://localhost:8000/health || exit 1
60
 
61
- # Run the application
62
- CMD ["poetry", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
 
1
+ # Stage 1: Builder - Install dependencies
2
+ FROM python:3.11-slim AS builder
3
+
4
+ WORKDIR /app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  # Install Poetry
7
+ RUN pip install --no-cache-dir poetry==1.8.0
8
+
9
+ # Configure Poetry for non-interactive installation
10
+ RUN poetry config virtualenvs.create false
11
+
12
+ # Copy only dependency files first (cache layer)
13
+ COPY pyproject.toml poetry.lock ./
14
+
15
+ # Install only production dependencies
16
+ RUN poetry install --only main --no-interaction --no-ansi --no-root
17
+
18
+ # Stage 2: Runtime - Minimal production image
19
+ FROM python:3.11-slim AS runtime
20
 
 
21
  WORKDIR /app
22
 
23
+ # Copy installed packages from builder
24
+ COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
25
+ COPY --from=builder /usr/local/bin /usr/local/bin
26
+
27
+ # Create non-root user for security
28
+ RUN useradd --create-home --shell /bin/bash appuser
29
 
30
+ # Create directories with proper ownership
31
+ RUN mkdir -p /app/logs /app/models && chown -R appuser:appuser /app
 
 
 
 
32
 
33
+ # Copy application code (separate layer for faster rebuilds)
34
+ COPY --chown=appuser:appuser app/ ./app/
35
 
36
+ # Switch to non-root user
37
+ USER appuser
38
 
39
+ # Environment configuration
40
+ ENV PYTHONDONTWRITEBYTECODE=1
41
+ ENV PYTHONUNBUFFERED=1
42
+ ENV PYTHONPATH=/app
43
 
44
+ # Hugging Face model cache directory
45
+ ENV HF_HOME=/app/models
46
+ ENV TRANSFORMERS_CACHE=/app/models
47
+ ENV SENTENCE_TRANSFORMERS_HOME=/app/models
48
 
49
+ # Disable file logging in container
50
  ENV DISABLE_FILE_LOGGING=true
51
 
52
+ EXPOSE 7860
 
53
 
54
+ # Health check
55
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
56
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
57
 
58
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
README.md CHANGED
@@ -1,392 +1,147 @@
1
  # Multi-Utility FastAPI Server
2
 
3
- A centralized, extensible FastAPI server that provides reusable APIs for different projects. Instead of spinning up multiple servers, this single server hosts multiple modular APIs with robust authentication, logging, and error handling.
4
 
5
- ## 🚀 Features
 
 
6
 
7
- - **Modular Architecture**: Easy to add new APIs and utilities
8
- - **API Key Authentication**: Secure access with configurable API keys
9
- - **Comprehensive Logging**: Structured logging with Loguru
10
- - **Robust Error Handling**: Custom exceptions with proper HTTP status codes
11
- - **YouTube Subtitle Extractor**: First utility - extract and clean YouTube subtitles
12
- - **Async Support**: Full async/await support for better performance
13
- - **Comprehensive Testing**: Unit and integration tests with pytest
14
- - **Development Tools**: Pre-configured with Black, isort, flake8, and mypy
15
 
16
- ## 📋 Current APIs
 
 
 
 
 
17
 
18
- ### 1. YouTube Subtitle Extractor
19
 
20
- Extract and clean subtitles from YouTube videos using yt-dlp.
 
 
 
21
 
22
- **Endpoint**: `POST /api/v1/subtitles/extract`
23
 
24
- **Features**:
25
- - Supports multiple language codes
26
- - Automatic fallback to alternative language variants
27
- - Text cleaning and deduplication
28
- - Timeout protection
29
- - Direct JSON response (no file storage)
30
 
31
- ## 🛠️ Installation
32
-
33
- ### Prerequisites
34
-
35
- - Python 3.11+
36
- - Poetry (for dependency management)
37
-
38
- ### Setup
39
-
40
- 1. **Clone the repository**:
41
- ```bash
42
- git clone <repository-url>
43
- cd multi-utility-server
44
- ```
45
-
46
- 2. **Install dependencies**:
47
- ```bash
48
- poetry install
49
- ```
50
-
51
- 3. **Configure environment**:
52
- ```bash
53
- cp .env.example .env
54
- # Edit .env file with your API keys and settings
55
- ```
56
-
57
- 4. **Run development server**:
58
- ```bash
59
- # On Linux/macOS
60
- ./scripts/run_dev.sh
61
-
62
- # On Windows
63
- scripts\run_dev.bat
64
-
65
- # Or manually
66
- poetry run uvicorn app.main:app --reload
67
- ```
68
-
69
- ## 🔧 Configuration
70
-
71
- ### Environment Variables
72
-
73
- Create a `.env` file (copy from `.env.example`):
74
 
75
- ```env
76
- # API Security - Comma-separated list of valid API keys
77
- API_KEYS=your-secret-key-1,your-secret-key-2,your-secret-key-3
78
 
79
- # Logging configuration
80
- LOG_LEVEL=INFO
 
81
 
82
- # yt-dlp configuration
83
- YT_DLP_BINARY=python -m yt_dlp
84
- YT_DLP_TIMEOUT_LIST=30
85
- YT_DLP_TIMEOUT_DOWNLOAD=60
86
 
87
- # Server configuration
88
- HOST=0.0.0.0
89
- PORT=8000
90
- RELOAD=true
91
  ```
92
 
93
- ### API Key Configuration
94
-
95
- The server uses API key authentication. Configure your keys in the `.env` file:
96
 
97
- ```env
98
- API_KEYS=production-key-1,development-key-2,client-key-3
99
- ```
 
 
 
 
100
 
101
- ## 📖 API Documentation
102
 
103
  ### Authentication
104
 
105
- All API endpoints (except health checks and documentation) require authentication:
106
 
107
  ```bash
108
- # Include API key in request header
109
- curl -H "x-api-key: your-api-key" ...
110
  ```
111
 
112
- ### YouTube Subtitle Extractor
113
-
114
- #### Extract Subtitles
115
-
116
- **POST** `/api/v1/subtitles/extract`
117
 
118
- Extract subtitles from a YouTube video.
119
-
120
- **Request**:
121
- ```json
122
- {
123
- "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
124
- "lang": "en"
125
- }
126
  ```
127
 
128
- **Response** (200 OK):
129
  ```json
130
  {
131
  "status": "success",
 
132
  "language": "en",
133
- "video_id": "dQw4w9WgXcQ",
134
- "subtitles": [
135
- "Never gonna give you up",
136
- "Never gonna let you down",
137
- "Never gonna run around and desert you"
138
- ]
139
  }
140
  ```
141
 
142
- **Error Responses**:
143
- - `400 Bad Request`: Invalid YouTube URL or parameters
144
- - `401 Unauthorized`: Missing or invalid API key
145
- - `404 Not Found`: No subtitles available in requested language
146
- - `408 Request Timeout`: Subtitle extraction timed out
147
- - `500 Internal Server Error`: yt-dlp error or unexpected failure
148
-
149
- **Example Usage**:
150
 
151
  ```bash
152
- # Extract English subtitles
153
- curl -X POST "http://localhost:8000/api/v1/subtitles/extract" \
154
  -H "Content-Type: application/json" \
155
- -H "x-api-key: your-api-key" \
156
- -d '{
157
- "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
158
- "lang": "en"
159
- }'
160
-
161
- # Extract Spanish subtitles
162
- curl -X POST "http://localhost:8000/api/v1/subtitles/extract" \
163
- -H "Content-Type: application/json" \
164
- -H "x-api-key: your-api-key" \
165
- -d '{
166
- "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
167
- "lang": "es"
168
- }'
169
  ```
170
 
171
- #### Health Check
172
-
173
- **GET** `/api/v1/subtitles/health`
174
-
175
- Check subtitles service health (no authentication required).
176
-
177
- **Response**:
178
  ```json
179
  {
180
- "status": "healthy",
181
- "service": "subtitles",
182
- "yt_dlp_binary": "python -m yt_dlp"
 
183
  }
184
  ```
185
 
186
- ## 🧪 Testing
187
-
188
- Run the test suite:
189
-
190
- ```bash
191
- # Run all tests
192
- poetry run pytest
193
-
194
- # Run with coverage
195
- poetry run pytest --cov=app
196
-
197
- # Run specific test file
198
- poetry run pytest tests/test_subtitles.py
199
-
200
- # Run with verbose output
201
- poetry run pytest -v
202
- ```
203
-
204
- ## 🏗️ Development
205
-
206
- ### Project Structure
207
 
208
  ```
209
- multi_utility_server/
210
- ├── pyproject.toml # Poetry config
211
- ├── poetry.lock # Dependency lock file
212
- ├── README.md
213
- ├── .env.example # Environment variable template
214
- ├── app/
215
- │ ├── main.py # FastAPI app entrypoint
216
- │ ├── core/
217
- │ │ ├── config.py # Environment-based config loader
218
- │ │ ├── security.py # API key verification
219
- │ │ ├── logging.py # Structured logging setup
220
- │ │ └── exceptions.py # Custom exception definitions
221
- │ ├── apis/
222
- │ │ ├── __init__.py
223
- │ │ ├── subtitles/
224
- │ │ │ ├── router.py # FastAPI routes
225
- │ │ │ ├── service.py # Business logic (yt-dlp, cleaning)
226
- │ │ │ ├── schemas.py # Request/response models
227
- │ │ │ └── utils.py # Helpers for text cleaning
228
- │ │ └── <future_api>/
229
- │ └── middleware/
230
- │ └── api_key_auth.py # API key auth middleware
231
- ├── tests/
232
- │ ├── test_subtitles.py
233
- │ ├── test_security.py
234
- │ └── conftest.py
235
- └── scripts/
236
- ├── run_dev.sh # Linux/macOS dev script
237
- └── run_dev.bat # Windows dev script
238
  ```
239
 
240
- ### Adding New APIs
241
 
242
- 1. **Create new API module**:
243
- ```bash
244
- mkdir app/apis/your_new_api
245
- touch app/apis/your_new_api/__init__.py
246
- touch app/apis/your_new_api/router.py
247
- touch app/apis/your_new_api/service.py
248
- touch app/apis/your_new_api/schemas.py
249
- ```
250
 
251
- 2. **Implement your API** following the same pattern as the subtitles API
 
 
252
 
253
- 3. **Register the router** in `app/main.py`:
254
- ```python
255
- from app.apis.your_new_api.router import router as your_new_api_router
256
- app.include_router(your_new_api_router)
257
- ```
258
-
259
- 4. **Add tests** in `tests/test_your_new_api.py`
260
-
261
- ### Code Quality
262
-
263
- The project includes several code quality tools:
264
 
265
  ```bash
266
- # Format code
267
- poetry run black .
268
- poetry run isort .
269
-
270
- # Lint code
271
- poetry run flake8 .
272
-
273
- # Type checking
274
- poetry run mypy app/
275
-
276
- # Run all quality checks
277
- poetry run black . && poetry run isort . && poetry run flake8 . && poetry run mypy app/
278
  ```
279
 
280
- ## 🚀 Deployment
281
-
282
- ### Using Docker
283
-
284
- The project includes Docker support for easy deployment:
285
-
286
- #### Quick Start with Docker Compose
287
-
288
- 1. **Build and run**:
289
- ```bash
290
- # Production deployment
291
- docker-compose up --build
292
-
293
- # Development with hot reload
294
- docker-compose -f docker-compose.dev.yml up --build
295
- ```
296
-
297
- 2. **Access the application**:
298
- - API: http://localhost:8000
299
- - Documentation: http://localhost:8000/docs
300
- - Health check: http://localhost:8000/health
301
-
302
- #### Docker Files Included
303
-
304
- - `Dockerfile`: Production-ready container
305
- - `Dockerfile.dev`: Development container with hot reload
306
- - `docker-compose.yml`: Basic production setup
307
- - `docker-compose.dev.yml`: Development setup
308
- - `docker-compose.prod.yml`: Production setup with resource limits
309
- - `.dockerignore`: Optimized build context
310
-
311
- #### Manual Docker Build
312
 
313
  ```bash
314
- # Build production image
315
- docker build -t multiutility-server .
316
-
317
- # Run container
318
- docker run -p 8000:8000 --env-file .env multiutility-server
319
- ```
320
 
321
- ### Using Gunicorn
 
322
 
323
- ```bash
324
- # Production deployment with Gunicorn
325
- poetry run gunicorn -k uvicorn.workers.UvicornWorker app.main:app \
326
- --bind 0.0.0.0:8000 \
327
- --workers 4 \
328
- --worker-class uvicorn.workers.UvicornWorker \
329
- --access-logfile - \
330
- --error-logfile -
331
  ```
332
 
333
- ## 🔐 Security Considerations
334
-
335
- - **API Keys**: Store API keys securely and rotate them regularly
336
- - **CORS**: Configure CORS appropriately for your use case
337
- - **Rate Limiting**: Consider adding rate limiting for production use
338
- - **HTTPS**: Always use HTTPS in production
339
- - **Input Validation**: All inputs are validated using Pydantic schemas
340
-
341
- ## 📊 Monitoring and Logging
342
-
343
- The server includes comprehensive logging:
344
-
345
- - **Console Logs**: Colored output for development
346
- - **File Logs**: Rotating log files in `logs/` directory
347
- - **Error Logs**: Separate error log file
348
- - **Request Logging**: All HTTP requests are logged
349
- - **Structured Format**: JSON-like format for easy parsing
350
-
351
- Log files:
352
- - `logs/app.log`: General application logs
353
- - `logs/error.log`: Error-only logs
354
-
355
- ## 🤝 Contributing
356
-
357
- 1. Fork the repository
358
- 2. Create a feature branch
359
- 3. Make your changes
360
- 4. Add tests for new functionality
361
- 5. Run quality checks and tests
362
- 6. Submit a pull request
363
-
364
- ## 📝 License
365
-
366
- This project is licensed under the MIT License - see the LICENSE file for details.
367
-
368
- ## 🆘 Troubleshooting
369
-
370
- ### Common Issues
371
-
372
- 1. **yt-dlp not found**: Ensure yt-dlp is installed: `pip install yt-dlp`
373
- 2. **Permission denied on scripts**: Make scripts executable: `chmod +x scripts/run_dev.sh`
374
- 3. **Port already in use**: Change the port in `.env` file or stop the conflicting service
375
- 4. **API key errors**: Verify your API key is correctly set in the `.env` file
376
-
377
- ### Getting Help
378
-
379
- - Check the logs in `logs/` directory
380
- - Use the health check endpoints to verify service status
381
- - Run tests to ensure everything is working: `poetry run pytest`
382
- - Check the interactive API documentation at `/docs`
383
-
384
- ## 🔮 Future Enhancements
385
 
386
- - **Rate Limiting**: Per-API-key rate limiting
387
- - **Metrics**: Prometheus metrics integration
388
- - **Caching**: Redis caching for frequently requested data
389
- - **File Upload APIs**: Handle file processing utilities
390
- - **Webhook Support**: Async webhook notifications
391
- - **Admin Dashboard**: Web-based administration interface
392
- - **OAuth2/JWT**: Advanced authentication options
 
1
  # Multi-Utility FastAPI Server
2
 
3
+ A centralized, extensible FastAPI server providing reusable APIs with robust authentication, rate limiting, and logging.
4
 
5
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
6
+ [![FastAPI](https://img.shields.io/badge/FastAPI-0.104+-green.svg)](https://fastapi.tiangolo.com/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
8
 
9
+ ## Features
 
 
 
 
 
 
 
10
 
11
+ - **Modular Architecture** - Easy to add new APIs
12
+ - **API Key Authentication** - Secure, timing-safe key validation
13
+ - **Rate Limiting** - Configurable per-endpoint limits with `slowapi`
14
+ - **Result Caching** - TTL-based caching with `cachetools`
15
+ - **Structured Logging** - Loguru with console/file output
16
+ - **Docker Ready** - Multi-stage, cache-optimized Dockerfile
17
 
18
+ ## APIs
19
 
20
+ | API | Endpoint | Description |
21
+ |-----|----------|-------------|
22
+ | **Subtitles** | `POST /api/v1/subtitles/extract` | Extract YouTube subtitles |
23
+ | **Embeddings** | `POST /api/v1/embeddings/generate` | Generate text embeddings (1024-dim) |
24
 
25
+ ## Quick Start
26
 
27
+ ### Local Development
 
 
 
 
 
28
 
29
+ ```bash
30
+ # Install dependencies
31
+ poetry install
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ # Configure environment
34
+ cp .env.example .env
35
+ # Edit .env with your API keys
36
 
37
+ # Run server
38
+ poetry run uvicorn app.main:app --reload
39
+ ```
40
 
41
+ ### Docker
 
 
 
42
 
43
+ ```bash
44
+ docker build -t multiutility-server .
45
+ docker run -p 7860:7860 -e API_KEYS=your-key multiutility-server
 
46
  ```
47
 
48
+ ## Configuration
 
 
49
 
50
+ | Variable | Description | Default |
51
+ |----------|-------------|---------|
52
+ | `API_KEYS` | Comma-separated API keys (required) | - |
53
+ | `CORS_ORIGINS` | Allowed origins | `*` |
54
+ | `RATE_LIMIT_REQUESTS` | Requests per minute | `100` |
55
+ | `LOG_LEVEL` | Logging level | `INFO` |
56
+ | `EMBEDDING_MODEL` | HuggingFace model | `mixedbread-ai/mxbai-embed-large-v1` |
57
 
58
+ ## API Usage
59
 
60
  ### Authentication
61
 
62
+ All endpoints (except health checks) require the `x-api-key` header:
63
 
64
  ```bash
65
+ curl -H "x-api-key: your-api-key" http://localhost:8000/api/v1/...
 
66
  ```
67
 
68
+ ### Subtitles API
 
 
 
 
69
 
70
+ ```bash
71
+ curl -X POST http://localhost:8000/api/v1/subtitles/extract \
72
+ -H "Content-Type: application/json" \
73
+ -H "x-api-key: your-key" \
74
+ -d '{"url": "https://youtube.com/watch?v=VIDEO_ID", "lang": "en"}'
 
 
 
75
  ```
76
 
77
+ **Response:**
78
  ```json
79
  {
80
  "status": "success",
81
+ "video_id": "VIDEO_ID",
82
  "language": "en",
83
+ "subtitles": ["Line 1", "Line 2", "..."]
 
 
 
 
 
84
  }
85
  ```
86
 
87
+ ### Embeddings API
 
 
 
 
 
 
 
88
 
89
  ```bash
90
+ curl -X POST http://localhost:8000/api/v1/embeddings/generate \
 
91
  -H "Content-Type: application/json" \
92
+ -H "x-api-key: your-key" \
93
+ -d '{"texts": ["Hello world", "Another text"], "normalize": true}'
 
 
 
 
 
 
 
 
 
 
 
 
94
  ```
95
 
96
+ **Response:**
 
 
 
 
 
 
97
  ```json
98
  {
99
+ "status": "success",
100
+ "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]],
101
+ "model": "mixedbread-ai/mxbai-embed-large-v1",
102
+ "dimensions": 1024
103
  }
104
  ```
105
 
106
+ ## Project Structure
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  ```
109
+ app/
110
+ ├── main.py # FastAPI application
111
+ ├── core/ # Config, logging, exceptions
112
+ ├── middleware/ # Auth, rate limiting
113
+ └── apis/
114
+ ├── subtitles/ # YouTube subtitle extraction
115
+ └── embeddings/ # Text embedding generation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  ```
117
 
118
+ ## Deployment
119
 
120
+ ### Hugging Face Spaces
 
 
 
 
 
 
 
121
 
122
+ 1. Create a Docker Space
123
+ 2. Set `API_KEYS` secret in Space settings
124
+ 3. Push repository
125
 
126
+ ### Docker Compose
 
 
 
 
 
 
 
 
 
 
127
 
128
  ```bash
129
+ docker-compose up --build
 
 
 
 
 
 
 
 
 
 
 
130
  ```
131
 
132
+ ## Development
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  ```bash
135
+ # Run tests
136
+ poetry run pytest
 
 
 
 
137
 
138
+ # Type checking
139
+ poetry run mypy app/
140
 
141
+ # Format code
142
+ poetry run black . && poetry run isort .
 
 
 
 
 
 
143
  ```
144
 
145
+ ## License
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
+ MIT License - see [LICENSE](LICENSE) for details.
 
 
 
 
 
 
app/__init__.py CHANGED
@@ -1 +1,3 @@
1
- # Multi-utility FastAPI Server
 
 
 
1
+ """Multi-utility FastAPI Server."""
2
+
3
+ __version__ = "0.1.0"
app/apis/__init__.py CHANGED
@@ -1 +1,6 @@
1
- # API modules
 
 
 
 
 
 
1
+ """API modules."""
2
+
3
+ from app.apis.subtitles.router import router as subtitles_router
4
+ from app.apis.embeddings.router import router as embeddings_router
5
+
6
+ __all__ = ["subtitles_router", "embeddings_router"]
app/apis/embeddings/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Embeddings API module."""
2
+
3
+ from app.apis.embeddings.router import router
4
+ from app.apis.embeddings.service import embedding_service
5
+
6
+ __all__ = ["router", "embedding_service"]
app/apis/embeddings/router.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI router for embeddings API."""
2
+
3
+ from fastapi import APIRouter
4
+
5
+ from app.apis.embeddings.schemas import EmbeddingRequest, EmbeddingResponse
6
+ from app.apis.embeddings.service import embedding_service
7
+
8
+
9
+ router = APIRouter(prefix="/api/v1/embeddings", tags=["embeddings"])
10
+
11
+
12
+ @router.post(
13
+ "/generate",
14
+ response_model=EmbeddingResponse,
15
+ summary="Generate text embeddings",
16
+ description="Generate 1024-dimensional embeddings for a list of texts using a local sentence-transformers model."
17
+ )
18
+ async def generate_embeddings(request: EmbeddingRequest) -> EmbeddingResponse:
19
+ """
20
+ Generate embeddings for a list of texts.
21
+
22
+ Args:
23
+ request: Contains texts list and optional normalize flag
24
+
25
+ Returns:
26
+ Embedding vectors for each input text
27
+ """
28
+ embeddings = embedding_service.generate_embeddings(
29
+ request.texts,
30
+ normalize=request.normalize
31
+ )
32
+
33
+ return EmbeddingResponse(
34
+ embeddings=embeddings,
35
+ model=embedding_service.model_name,
36
+ dimensions=embedding_service.dimensions
37
+ )
38
+
39
+
40
+ @router.get(
41
+ "/health",
42
+ summary="Health check for embeddings service",
43
+ description="Check if the embeddings service is operational"
44
+ )
45
+ async def health_check():
46
+ """Health check endpoint for the embeddings service."""
47
+ return {
48
+ "status": "healthy",
49
+ "service": "embeddings",
50
+ "model": embedding_service.model_name
51
+ }
app/apis/embeddings/schemas.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for embeddings API."""
2
+
3
+ from typing import List
4
+ from pydantic import BaseModel, field_validator
5
+
6
+
7
+ class EmbeddingRequest(BaseModel):
8
+ """Request model for embedding generation."""
9
+
10
+ texts: List[str]
11
+ normalize: bool = True
12
+
13
+ @field_validator("texts")
14
+ @classmethod
15
+ def validate_texts(cls, v: List[str]) -> List[str]:
16
+ """Validate that texts list is not empty and has valid content."""
17
+ if not v:
18
+ raise ValueError("texts list cannot be empty")
19
+ if len(v) > 100:
20
+ raise ValueError("Maximum 100 texts per request")
21
+ for i, text in enumerate(v):
22
+ if not text or not text.strip():
23
+ raise ValueError(f"Text at index {i} is empty")
24
+ return v
25
+
26
+
27
+ class EmbeddingResponse(BaseModel):
28
+ """Response model for embedding generation."""
29
+
30
+ status: str = "success"
31
+ embeddings: List[List[float]]
32
+ model: str
33
+ dimensions: int
34
+
35
+ model_config = {
36
+ "json_schema_extra": {
37
+ "example": {
38
+ "status": "success",
39
+ "embeddings": [[0.1, 0.2, 0.3]],
40
+ "model": "mixedbread-ai/mxbai-embed-large-v1",
41
+ "dimensions": 1024
42
+ }
43
+ }
44
+ }
app/apis/embeddings/service.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Embedding generation service using sentence-transformers."""
2
+
3
+ import os
4
+ from typing import List
5
+ import threading
6
+
7
+ from app.core.config import settings
8
+ from app.core.logging import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
12
+ MODELS_DIR = os.environ.get("SENTENCE_TRANSFORMERS_HOME", "models")
13
+
14
+
15
+ class EmbeddingService:
16
+ """Service for generating text embeddings using local models."""
17
+
18
+ def __init__(self) -> None:
19
+ self._model = None
20
+ self._model_name = settings.embedding_model
21
+ self._lock = threading.Lock()
22
+
23
+ def _load_model(self):
24
+ """Lazy load the embedding model on first use."""
25
+ if self._model is None:
26
+ with self._lock:
27
+ if self._model is None:
28
+ logger.info(f"Loading embedding model: {self._model_name}")
29
+ logger.info(f"Models directory: {MODELS_DIR}")
30
+ from sentence_transformers import SentenceTransformer
31
+ self._model = SentenceTransformer(
32
+ self._model_name,
33
+ cache_folder=MODELS_DIR
34
+ )
35
+ logger.info(f"Model loaded. Dimensions: {self._model.get_sentence_embedding_dimension()}")
36
+
37
+ def generate_embeddings(self, texts: List[str], normalize: bool = True) -> List[List[float]]:
38
+ """
39
+ Generate embeddings for a list of texts.
40
+
41
+ Args:
42
+ texts: List of text strings to embed
43
+ normalize: Whether to normalize embeddings to unit length
44
+
45
+ Returns:
46
+ List of embedding vectors (each 1024-dimensional)
47
+ """
48
+ self._load_model()
49
+
50
+ embeddings = self._model.encode(
51
+ texts,
52
+ normalize_embeddings=normalize,
53
+ convert_to_numpy=True
54
+ )
55
+
56
+ return embeddings.tolist()
57
+
58
+ @property
59
+ def model_name(self) -> str:
60
+ """Get the model name."""
61
+ return self._model_name
62
+
63
+ @property
64
+ def dimensions(self) -> int:
65
+ """Get the embedding dimensions."""
66
+ self._load_model()
67
+ return self._model.get_sentence_embedding_dimension()
68
+
69
+
70
+ embedding_service = EmbeddingService()
app/apis/subtitles/__init__.py CHANGED
@@ -1 +1,7 @@
1
- # Subtitles API module
 
 
 
 
 
 
 
1
+ """Subtitles API module."""
2
+
3
+ from app.apis.subtitles.router import router
4
+ from app.apis.subtitles.service import subtitle_service
5
+ from app.apis.subtitles.schemas import SubtitleExtractRequest, SubtitleExtractResponse
6
+
7
+ __all__ = ["router", "subtitle_service", "SubtitleExtractRequest", "SubtitleExtractResponse"]
app/apis/subtitles/router.py CHANGED
@@ -1,18 +1,9 @@
1
  """FastAPI router for subtitles API."""
2
 
3
- from fastapi import APIRouter, HTTPException, Header
4
- from typing import Optional
5
 
6
- from app.apis.subtitles.schemas import SubtitleExtractRequest, SubtitleExtractResponse, SubtitleErrorResponse
7
  from app.apis.subtitles.service import subtitle_service
8
- from app.core.exceptions import (
9
- InvalidVideoURLError,
10
- SubtitlesNotFoundError,
11
- DownloadTimeoutError,
12
- SubtitleExtractionError,
13
- AuthenticationError
14
- )
15
- from app.core.security import get_api_key_from_header
16
 
17
 
18
  router = APIRouter(prefix="/api/v1/subtitles", tags=["subtitles"])
@@ -21,59 +12,29 @@ router = APIRouter(prefix="/api/v1/subtitles", tags=["subtitles"])
21
  @router.post(
22
  "/extract",
23
  response_model=SubtitleExtractResponse,
24
- responses={
25
- 400: {"model": SubtitleErrorResponse, "description": "Bad Request"},
26
- 401: {"model": SubtitleErrorResponse, "description": "Unauthorized"},
27
- 404: {"model": SubtitleErrorResponse, "description": "Not Found"},
28
- 408: {"model": SubtitleErrorResponse, "description": "Request Timeout"},
29
- 500: {"model": SubtitleErrorResponse, "description": "Internal Server Error"},
30
- },
31
  summary="Extract subtitles from YouTube video",
32
  description="Extract and clean subtitles from a YouTube video URL. Returns subtitles as a list of text lines."
33
  )
34
- async def extract_subtitles(
35
- request: SubtitleExtractRequest,
36
- x_api_key: Optional[str] = Header(None, description="API key for authentication")
37
- ) -> SubtitleExtractResponse:
38
  """
39
  Extract subtitles from a YouTube video.
40
-
41
- - **url**: YouTube video URL (youtube.com or youtu.be)
42
- - **lang**: Language code for subtitles (default: "en")
43
-
44
- Returns cleaned subtitle text as a list of strings.
 
45
  """
46
- try:
47
- # Validate API key
48
- get_api_key_from_header(x_api_key)
49
-
50
- # Extract subtitles
51
- video_id, subtitle_lines = await subtitle_service.extract_subtitles(
52
- str(request.url),
53
- request.lang
54
- )
55
-
56
- return SubtitleExtractResponse(
57
- language=request.lang,
58
- video_id=video_id,
59
- subtitles=subtitle_lines
60
- )
61
-
62
- except AuthenticationError as e:
63
- raise HTTPException(status_code=401, detail={"status": "error", "message": e.message})
64
- except InvalidVideoURLError as e:
65
- raise HTTPException(status_code=400, detail={"status": "error", "message": e.message})
66
- except SubtitlesNotFoundError as e:
67
- raise HTTPException(status_code=404, detail={"status": "error", "message": e.message})
68
- except DownloadTimeoutError as e:
69
- raise HTTPException(status_code=408, detail={"status": "error", "message": e.message})
70
- except SubtitleExtractionError as e:
71
- raise HTTPException(status_code=500, detail={"status": "error", "message": e.message})
72
- except Exception as e:
73
- raise HTTPException(
74
- status_code=500,
75
- detail={"status": "error", "message": "An unexpected error occurred"}
76
- )
77
 
78
 
79
  @router.get(
@@ -85,6 +46,5 @@ async def health_check():
85
  """Health check endpoint for the subtitles service."""
86
  return {
87
  "status": "healthy",
88
- "service": "subtitles",
89
- "yt_dlp_binary": subtitle_service.yt_dlp_binary
90
  }
 
1
  """FastAPI router for subtitles API."""
2
 
3
+ from fastapi import APIRouter
 
4
 
5
+ from app.apis.subtitles.schemas import SubtitleExtractRequest, SubtitleExtractResponse
6
  from app.apis.subtitles.service import subtitle_service
 
 
 
 
 
 
 
 
7
 
8
 
9
  router = APIRouter(prefix="/api/v1/subtitles", tags=["subtitles"])
 
12
  @router.post(
13
  "/extract",
14
  response_model=SubtitleExtractResponse,
 
 
 
 
 
 
 
15
  summary="Extract subtitles from YouTube video",
16
  description="Extract and clean subtitles from a YouTube video URL. Returns subtitles as a list of text lines."
17
  )
18
+ async def extract_subtitles(request: SubtitleExtractRequest) -> SubtitleExtractResponse:
 
 
 
19
  """
20
  Extract subtitles from a YouTube video.
21
+
22
+ Args:
23
+ request: Contains url and optional lang parameter
24
+
25
+ Returns:
26
+ Cleaned subtitle text as a list of strings
27
  """
28
+ video_id, subtitle_lines = await subtitle_service.extract_subtitles(
29
+ str(request.url),
30
+ request.lang
31
+ )
32
+
33
+ return SubtitleExtractResponse(
34
+ language=request.lang,
35
+ video_id=video_id,
36
+ subtitles=subtitle_lines
37
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  @router.get(
 
46
  """Health check endpoint for the subtitles service."""
47
  return {
48
  "status": "healthy",
49
+ "service": "subtitles"
 
50
  }
app/apis/subtitles/schemas.py CHANGED
@@ -1,15 +1,15 @@
1
  """Pydantic schemas for subtitles API."""
2
 
3
- from typing import List, Optional
4
  from pydantic import BaseModel, HttpUrl, field_validator
5
 
6
 
7
  class SubtitleExtractRequest(BaseModel):
8
  """Request model for subtitle extraction."""
9
-
10
  url: HttpUrl
11
  lang: str = "en"
12
-
13
  @field_validator("url")
14
  @classmethod
15
  def validate_youtube_url(cls, v: HttpUrl) -> HttpUrl:
@@ -18,7 +18,7 @@ class SubtitleExtractRequest(BaseModel):
18
  if not any(domain in url_str for domain in ["youtube.com", "youtu.be"]):
19
  raise ValueError("URL must be a valid YouTube URL")
20
  return v
21
-
22
  @field_validator("lang")
23
  @classmethod
24
  def validate_language(cls, v: str) -> str:
@@ -30,14 +30,14 @@ class SubtitleExtractRequest(BaseModel):
30
 
31
  class SubtitleExtractResponse(BaseModel):
32
  """Response model for successful subtitle extraction."""
33
-
34
  status: str = "success"
35
  language: str
36
  video_id: str
37
  subtitles: List[str]
38
-
39
- class Config:
40
- json_schema_extra = {
41
  "example": {
42
  "status": "success",
43
  "language": "en",
@@ -49,18 +49,4 @@ class SubtitleExtractResponse(BaseModel):
49
  ]
50
  }
51
  }
52
-
53
-
54
- class SubtitleErrorResponse(BaseModel):
55
- """Response model for subtitle extraction errors."""
56
-
57
- status: str = "error"
58
- message: str
59
-
60
- class Config:
61
- json_schema_extra = {
62
- "example": {
63
- "status": "error",
64
- "message": "No subtitles available in the requested language"
65
- }
66
- }
 
1
  """Pydantic schemas for subtitles API."""
2
 
3
+ from typing import List
4
  from pydantic import BaseModel, HttpUrl, field_validator
5
 
6
 
7
  class SubtitleExtractRequest(BaseModel):
8
  """Request model for subtitle extraction."""
9
+
10
  url: HttpUrl
11
  lang: str = "en"
12
+
13
  @field_validator("url")
14
  @classmethod
15
  def validate_youtube_url(cls, v: HttpUrl) -> HttpUrl:
 
18
  if not any(domain in url_str for domain in ["youtube.com", "youtu.be"]):
19
  raise ValueError("URL must be a valid YouTube URL")
20
  return v
21
+
22
  @field_validator("lang")
23
  @classmethod
24
  def validate_language(cls, v: str) -> str:
 
30
 
31
  class SubtitleExtractResponse(BaseModel):
32
  """Response model for successful subtitle extraction."""
33
+
34
  status: str = "success"
35
  language: str
36
  video_id: str
37
  subtitles: List[str]
38
+
39
+ model_config = {
40
+ "json_schema_extra": {
41
  "example": {
42
  "status": "success",
43
  "language": "en",
 
49
  ]
50
  }
51
  }
52
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/apis/subtitles/service.py CHANGED
@@ -1,183 +1,154 @@
1
- """Subtitle extraction service using yt-dlp."""
2
 
3
  import asyncio
4
- import subprocess
5
  import sys
6
  import tempfile
7
  from pathlib import Path
8
- from typing import List, Optional
9
- import os
 
10
 
11
  from app.core.config import settings
12
  from app.core.exceptions import (
13
- InvalidVideoURLError,
14
- SubtitlesNotFoundError,
15
- DownloadTimeoutError,
16
- SubtitleExtractionError
17
  )
18
  from app.apis.subtitles.utils import extract_video_id, convert_vtt_to_text
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class SubtitleService:
22
  """Service for extracting subtitles from YouTube videos."""
23
-
24
- def __init__(self):
25
- self.yt_dlp_binary = settings.yt_dlp_binary
26
  self.timeout_list = settings.yt_dlp_timeout_list
27
  self.timeout_download = settings.yt_dlp_timeout_download
28
-
29
- async def extract_subtitles(self, url: str, lang: str = "en") -> tuple[str, List[str]]:
30
  """
31
  Extract subtitles from a YouTube video.
32
-
33
  Args:
34
  url: YouTube video URL
35
  lang: Language code for subtitles
36
-
37
  Returns:
38
  Tuple of (video_id, subtitle_lines)
39
-
40
  Raises:
41
- InvalidVideoURLError: If the URL is invalid
42
  SubtitlesNotFoundError: If no subtitles are found
43
  DownloadTimeoutError: If the operation times out
44
  SubtitleExtractionError: If extraction fails
45
  """
46
  video_id = extract_video_id(url)
47
-
48
- # Create temporary directory for subtitle files
 
 
 
49
  with tempfile.TemporaryDirectory() as temp_dir:
50
- try:
51
- # First, try to list available subtitles
52
- await self._check_available_subtitles(url)
53
-
54
- # Download subtitles
55
- subtitle_content = await self._download_subtitles(url, lang, temp_dir, video_id)
56
-
57
- if not subtitle_content:
58
- # Try alternative language codes
59
- alt_langs = self._get_alternative_languages(lang)
60
- for alt_lang in alt_langs:
61
- subtitle_content = await self._download_subtitles(url, alt_lang, temp_dir, video_id)
62
- if subtitle_content:
63
- lang = alt_lang # Update the language that worked
64
- break
65
-
66
- if not subtitle_content:
67
- raise SubtitlesNotFoundError(f"No subtitles available in language '{lang}' or alternatives")
68
-
69
- # Convert and clean subtitles
70
- clean_lines = convert_vtt_to_text(subtitle_content)
71
-
72
- if not clean_lines:
73
- raise SubtitlesNotFoundError("Subtitles found but appear to be empty after cleaning")
74
-
75
- return video_id, clean_lines
76
-
77
- except (InvalidVideoURLError, SubtitlesNotFoundError, DownloadTimeoutError):
78
- raise
79
- except Exception as e:
80
- raise SubtitleExtractionError(f"Unexpected error during subtitle extraction: {str(e)}")
81
-
82
- async def _check_available_subtitles(self, url: str) -> None:
83
- """Check if subtitles are available for the video."""
84
- try:
85
- cmd = [
86
- sys.executable, "-m", "yt_dlp",
87
- "--list-subs",
88
- "--no-warnings",
89
- url
90
- ]
91
-
92
- process = await asyncio.create_subprocess_exec(
93
- *cmd,
94
- stdout=asyncio.subprocess.PIPE,
95
- stderr=asyncio.subprocess.PIPE
96
- )
97
-
98
- stdout, stderr = await asyncio.wait_for(
99
- process.communicate(),
100
- timeout=self.timeout_list
101
- )
102
-
103
- if process.returncode != 0:
104
- error_msg = stderr.decode('utf-8', errors='ignore')
105
- if "Video unavailable" in error_msg or "Private video" in error_msg:
106
- raise InvalidVideoURLError("Video is unavailable, private, or does not exist")
107
- raise SubtitleExtractionError(f"Failed to check subtitles: {error_msg}")
108
-
109
- except asyncio.TimeoutError:
110
- raise DownloadTimeoutError("Timeout while checking available subtitles")
111
- except (InvalidVideoURLError, SubtitleExtractionError):
112
- raise
113
- except Exception as e:
114
- raise SubtitleExtractionError(f"Error checking subtitles: {str(e)}")
115
-
116
- async def _download_subtitles(self, url: str, lang: str, temp_dir: str, video_id: str) -> Optional[str]:
117
  """Download subtitles for a specific language."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  try:
119
- output_template = os.path.join(temp_dir, f"{video_id}.%(ext)s")
120
-
121
- cmd = [
122
- sys.executable, "-m", "yt_dlp",
123
- "--write-subs",
124
- "--write-auto-subs", # Also try auto-generated subs
125
- "--sub-lang", lang,
126
- "--skip-download",
127
- "--no-warnings",
128
- "--output", output_template,
129
- "--user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
130
- url
131
- ]
132
-
133
  process = await asyncio.create_subprocess_exec(
134
  *cmd,
135
  stdout=asyncio.subprocess.PIPE,
136
  stderr=asyncio.subprocess.PIPE
137
  )
138
-
139
  stdout, stderr = await asyncio.wait_for(
140
  process.communicate(),
141
  timeout=self.timeout_download
142
  )
143
-
144
  if process.returncode != 0:
145
- # This is expected if subtitles aren't available in this language
 
 
146
  return None
147
-
148
- # Look for downloaded subtitle files
149
  temp_path = Path(temp_dir)
150
  subtitle_files = list(temp_path.glob(f"{video_id}*.vtt"))
151
-
152
  if not subtitle_files:
153
  return None
154
-
155
- # Read the first subtitle file found
156
- with open(subtitle_files[0], 'r', encoding='utf-8', errors='ignore') as f:
157
- return f.read()
158
-
159
  except asyncio.TimeoutError:
160
  raise DownloadTimeoutError(f"Timeout while downloading subtitles for language '{lang}'")
161
- except Exception:
162
- # Return None to allow trying other languages
163
- return None
164
-
165
- def _get_alternative_languages(self, lang: str) -> List[str]:
166
- """Get alternative language codes to try."""
167
- alternatives = {
168
- "en": ["en-US", "en-GB", "en-orig"],
169
- "es": ["es-ES", "es-MX", "es-419"],
170
- "fr": ["fr-FR", "fr-CA"],
171
- "de": ["de-DE"],
172
- "it": ["it-IT"],
173
- "pt": ["pt-BR", "pt-PT"],
174
- "ja": ["ja-JP"],
175
- "ko": ["ko-KR"],
176
- "zh": ["zh-CN", "zh-TW", "zh-Hans", "zh-Hant"]
177
- }
178
-
179
- return alternatives.get(lang, [f"{lang}-{lang.upper()}"])
180
-
181
-
182
- # Global service instance
183
  subtitle_service = SubtitleService()
 
1
+ """Subtitle extraction service using yt-dlp with caching."""
2
 
3
  import asyncio
 
4
  import sys
5
  import tempfile
6
  from pathlib import Path
7
+ from typing import List, Optional, Tuple
8
+
9
+ from cachetools import TTLCache
10
 
11
  from app.core.config import settings
12
  from app.core.exceptions import (
13
+ SubtitlesNotFoundError,
14
+ DownloadTimeoutError,
15
+ SubtitleExtractionError,
16
+ InvalidVideoURLError
17
  )
18
  from app.apis.subtitles.utils import extract_video_id, convert_vtt_to_text
19
 
20
 
21
+ SUBTITLE_CACHE: TTLCache = TTLCache(maxsize=100, ttl=3600)
22
+
23
+ ALTERNATIVE_LANGUAGES = {
24
+ "en": ["en-US", "en-GB", "en-orig"],
25
+ "es": ["es-ES", "es-MX", "es-419"],
26
+ "fr": ["fr-FR", "fr-CA"],
27
+ "de": ["de-DE"],
28
+ "it": ["it-IT"],
29
+ "pt": ["pt-BR", "pt-PT"],
30
+ "ja": ["ja-JP"],
31
+ "ko": ["ko-KR"],
32
+ "zh": ["zh-CN", "zh-TW", "zh-Hans", "zh-Hant"]
33
+ }
34
+
35
+
36
  class SubtitleService:
37
  """Service for extracting subtitles from YouTube videos."""
38
+
39
+ def __init__(self) -> None:
 
40
  self.timeout_list = settings.yt_dlp_timeout_list
41
  self.timeout_download = settings.yt_dlp_timeout_download
42
+
43
async def extract_subtitles(self, url: str, lang: str = "en") -> Tuple[str, List[str]]:
    """
    Extract subtitles from a YouTube video, reusing a cached result when present.

    Args:
        url: YouTube video URL
        lang: Language code for subtitles

    Returns:
        Tuple of (video_id, subtitle_lines)

    Raises:
        SubtitlesNotFoundError: If no subtitles are found
        DownloadTimeoutError: If the operation times out
        SubtitleExtractionError: If extraction fails
    """
    video_id = extract_video_id(url)
    cache_key = f"{video_id}:{lang}"

    # One lookup instead of `in` followed by `[]`: a TTL entry may expire
    # between the two operations, which would surface as a KeyError.
    cached = SUBTITLE_CACHE.get(cache_key)
    if cached is not None:
        return cached

    with tempfile.TemporaryDirectory() as temp_dir:
        subtitle_content = await self._download_subtitles(url, lang, temp_dir, video_id)

        # Fall back to regional variants only when the requested code yielded nothing.
        if not subtitle_content:
            subtitle_content = await self._try_alternative_languages(url, lang, temp_dir, video_id)

        if not subtitle_content:
            raise SubtitlesNotFoundError(f"No subtitles available in language '{lang}' or alternatives")

        clean_lines = convert_vtt_to_text(subtitle_content)

        if not clean_lines:
            raise SubtitlesNotFoundError("Subtitles found but appear to be empty after cleaning")

        # Cached under the requested language code, even if an alternative
        # code supplied the content.
        result = (video_id, clean_lines)
        SUBTITLE_CACHE[cache_key] = result
        return result
82
+
83
async def _try_alternative_languages(
    self, url: str, lang: str, temp_dir: str, video_id: str
) -> Optional[str]:
    """
    Try downloading subtitles in alternative language codes concurrently.

    Args:
        url: YouTube video URL
        lang: Language code that was originally requested
        temp_dir: Directory handed to each download attempt
        video_id: Video identifier used for the subtitle file names

    Returns:
        The first non-empty subtitle text, taken in the order the
        alternatives are listed, or None when every attempt finished
        without producing subtitles.

    Raises:
        Exception: The first error raised by a download attempt, re-raised
            only when no alternative produced subtitles.
    """
    # Unknown languages fall back to a single "<lang>-<LANG>" guess.
    alt_langs = ALTERNATIVE_LANGUAGES.get(lang, [f"{lang}-{lang.upper()}"])

    results = await asyncio.gather(
        *(
            self._download_subtitles(url, alt_lang, temp_dir, video_id)
            for alt_lang in alt_langs
        ),
        return_exceptions=True,
    )

    first_error = None
    for result in results:
        if isinstance(result, str) and result:
            return result
        if first_error is None and isinstance(result, BaseException):
            first_error = result

    # Do not report "no subtitles" when the real cause was a timeout or an
    # extraction failure: surface that error to the caller instead.
    if first_error is not None:
        raise first_error

    return None
101
+
102
async def _download_subtitles(
    self, url: str, lang: str, temp_dir: str, video_id: str
) -> Optional[str]:
    """
    Download subtitles for a specific language.

    Args:
        url: YouTube video URL
        lang: Language code passed to yt-dlp via --sub-lang
        temp_dir: Existing directory under which the subtitle file is written
        video_id: Video identifier used as the output file stem

    Returns:
        Text of the first downloaded .vtt file, or None when yt-dlp exits
        with a non-zero status or writes no .vtt file.

    Raises:
        InvalidVideoURLError: If yt-dlp reports the video as unavailable or private
        DownloadTimeoutError: If yt-dlp does not finish within timeout_download
        SubtitleExtractionError: If any other error occurs
    """
    try:
        # Each call writes into its own sub-directory: calls for different
        # languages run concurrently with the same video_id, and a shared
        # directory would let one call pick up another call's (possibly
        # half-written) file. The parent directory's cleanup removes it.
        work_dir = Path(tempfile.mkdtemp(dir=temp_dir))
        output_template = str(work_dir / f"{video_id}.%(ext)s")

        cmd = [
            sys.executable, "-m", "yt_dlp",
            "--write-subs",
            "--write-auto-subs",
            "--sub-lang", lang,
            "--skip-download",
            "--no-warnings",
            "--output", output_template,
            "--user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            url
        ]

        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        try:
            _stdout, stderr = await asyncio.wait_for(
                process.communicate(),
                timeout=self.timeout_download
            )
        except asyncio.TimeoutError:
            # wait_for() only cancels communicate(); the child keeps running
            # unless it is killed and reaped explicitly.
            try:
                process.kill()
            except ProcessLookupError:
                pass  # the process exited on its own in the meantime
            await process.wait()
            raise

        if process.returncode != 0:
            error_msg = stderr.decode('utf-8', errors='ignore')
            if "Video unavailable" in error_msg or "Private video" in error_msg:
                raise InvalidVideoURLError("Video is unavailable, private, or does not exist")
            # Any other failure is treated as "not available in this language".
            return None

        subtitle_files = list(work_dir.glob(f"{video_id}*.vtt"))

        if not subtitle_files:
            return None

        return subtitle_files[0].read_text(encoding='utf-8', errors='ignore')

    except asyncio.TimeoutError:
        raise DownloadTimeoutError(
            f"Timeout while downloading subtitles for language '{lang}'"
        ) from None
    except (InvalidVideoURLError, DownloadTimeoutError):
        raise
    except Exception as e:
        raise SubtitleExtractionError(f"Error downloading subtitles: {str(e)}") from e
152
+
153
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  subtitle_service = SubtitleService()
app/core/__init__.py CHANGED
@@ -1 +1,26 @@
1
- # Core modules
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core module exports."""
2
+
3
+ from app.core.config import settings
4
+ from app.core.exceptions import (
5
+ MultiUtilityServerException,
6
+ AuthenticationError,
7
+ InvalidVideoURLError,
8
+ SubtitlesNotFoundError,
9
+ DownloadTimeoutError,
10
+ SubtitleExtractionError
11
+ )
12
+ from app.core.logging import setup_logging, get_logger
13
+ from app.core.security import verify_api_key
14
+
15
+ __all__ = [
16
+ "settings",
17
+ "MultiUtilityServerException",
18
+ "AuthenticationError",
19
+ "InvalidVideoURLError",
20
+ "SubtitlesNotFoundError",
21
+ "DownloadTimeoutError",
22
+ "SubtitleExtractionError",
23
+ "setup_logging",
24
+ "get_logger",
25
+ "verify_api_key"
26
+ ]
app/core/config.py CHANGED
@@ -1,56 +1,65 @@
1
  """Configuration module for the multi-utility server."""
2
 
3
- import os
4
- from typing import Set
5
  from pydantic_settings import BaseSettings
6
- from pydantic import field_validator
7
 
8
 
9
  class Settings(BaseSettings):
10
  """Application settings loaded from environment variables."""
11
-
12
- # API Security
13
- api_keys: str = "default-key-1,default-key-2"
14
-
 
 
 
 
 
 
 
15
  # Logging
16
  log_level: str = "INFO"
17
-
18
  # yt-dlp configuration
19
- yt_dlp_binary: str = "python -m yt_dlp"
20
- yt_dlp_timeout_list: int = 30 # seconds for listing subtitles
21
- yt_dlp_timeout_download: int = 60 # seconds for downloading subtitles
22
-
 
 
23
  # Server configuration
24
  host: str = "0.0.0.0"
25
  port: int = 8000
26
- reload: bool = False # Default to False for production
27
-
28
- class Config:
29
- env_file = ".env"
30
- env_file_encoding = "utf-8"
31
-
32
- @field_validator("api_keys")
33
- @classmethod
34
- def parse_api_keys(cls, v: str) -> Set[str]:
35
- """Convert comma-separated API keys to a set."""
36
- return {key.strip() for key in v.split(",") if key.strip()}
37
-
38
  @property
39
  def api_keys_set(self) -> Set[str]:
40
  """Get API keys as a set."""
41
- if isinstance(self.api_keys, str):
42
- return {key.strip() for key in self.api_keys.split(",") if key.strip()}
43
- return self.api_keys
44
-
 
 
 
 
 
 
 
 
45
  @field_validator("log_level")
46
  @classmethod
47
  def validate_log_level(cls, v: str) -> str:
48
  """Validate log level."""
49
  valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
50
- if v.upper() not in valid_levels:
 
51
  raise ValueError(f"Invalid log level: {v}. Must be one of {valid_levels}")
52
- return v.upper()
53
 
54
 
55
- # Global settings instance
56
  settings = Settings()
 
1
  """Configuration module for the multi-utility server."""
2
 
3
+ from typing import List, Set
4
+ from pydantic import computed_field, field_validator
5
  from pydantic_settings import BaseSettings
 
6
 
7
 
8
  class Settings(BaseSettings):
9
  """Application settings loaded from environment variables."""
10
+
11
+ # API Security - no default keys for security
12
+ api_keys: str = ""
13
+
14
+ # CORS Configuration
15
+ cors_origins: str = ""
16
+
17
+ # Rate Limiting
18
+ rate_limit_requests: int = 100
19
+ rate_limit_window: int = 60
20
+
21
  # Logging
22
  log_level: str = "INFO"
23
+
24
  # yt-dlp configuration
25
+ yt_dlp_timeout_list: int = 30
26
+ yt_dlp_timeout_download: int = 60
27
+
28
+ # Embedding configuration
29
+ embedding_model: str = "mixedbread-ai/mxbai-embed-large-v1"
30
+
31
  # Server configuration
32
  host: str = "0.0.0.0"
33
  port: int = 8000
34
+ reload: bool = False
35
+
36
+ model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}
37
+
38
+ @computed_field
 
 
 
 
 
 
 
39
  @property
40
  def api_keys_set(self) -> Set[str]:
41
  """Get API keys as a set."""
42
+ if not self.api_keys:
43
+ return set()
44
+ return {key.strip() for key in self.api_keys.split(",") if key.strip()}
45
+
46
+ @computed_field
47
+ @property
48
+ def cors_origins_list(self) -> List[str]:
49
+ """Get CORS origins as a list."""
50
+ if not self.cors_origins:
51
+ return ["*"]
52
+ return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()]
53
+
54
  @field_validator("log_level")
55
  @classmethod
56
  def validate_log_level(cls, v: str) -> str:
57
  """Validate log level."""
58
  valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
59
+ upper_v = v.upper()
60
+ if upper_v not in valid_levels:
61
  raise ValueError(f"Invalid log level: {v}. Must be one of {valid_levels}")
62
+ return upper_v
63
 
64
 
 
65
  settings = Settings()
app/core/security.py CHANGED
@@ -1,44 +1,31 @@
1
  """Security utilities for API key verification."""
2
 
3
- from typing import Optional
4
  from app.core.config import settings
5
  from app.core.exceptions import AuthenticationError
6
 
7
 
8
- def verify_api_key(api_key: Optional[str]) -> bool:
9
  """
10
- Verify if the provided API key is valid.
11
-
12
  Args:
13
  api_key: The API key to verify
14
-
15
  Returns:
16
  True if the API key is valid
17
-
18
  Raises:
19
  AuthenticationError: If the API key is invalid or missing
20
  """
21
  if not api_key:
22
  raise AuthenticationError("Missing API key")
23
-
24
- if api_key not in settings.api_keys_set:
25
- raise AuthenticationError("Invalid API key")
26
-
27
- return True
28
 
 
 
29
 
30
- def get_api_key_from_header(x_api_key: Optional[str]) -> str:
31
- """
32
- Extract and validate API key from header.
33
-
34
- Args:
35
- x_api_key: The X-API-Key header value
36
-
37
- Returns:
38
- The validated API key
39
-
40
- Raises:
41
- AuthenticationError: If the API key is invalid or missing
42
- """
43
- verify_api_key(x_api_key)
44
- return x_api_key
 
1
  """Security utilities for API key verification."""
2
 
3
+ import secrets
4
  from app.core.config import settings
5
  from app.core.exceptions import AuthenticationError
6
 
7
 
8
def verify_api_key(api_key: str) -> bool:
    """
    Verify if the provided API key is valid using timing-safe comparison.

    Args:
        api_key: The API key to verify

    Returns:
        True if the API key is valid

    Raises:
        AuthenticationError: If the API key is invalid or missing, or if
            no API keys are configured on the server
    """
    if not api_key:
        raise AuthenticationError("Missing API key")

    if not settings.api_keys_set:
        raise AuthenticationError("No API keys configured on server")

    # compare_digest() raises TypeError for str arguments that contain
    # non-ASCII characters, so compare the UTF-8 encoded bytes instead.
    candidate = api_key.encode("utf-8")

    is_valid = False
    for valid_key in settings.api_keys_set:
        # No early exit: every configured key is compared on every call.
        if secrets.compare_digest(candidate, valid_key.encode("utf-8")):
            is_valid = True

    if not is_valid:
        raise AuthenticationError("Invalid API key")

    return True
 
 
 
 
 
 
 
 
 
 
app/main.py CHANGED
@@ -1,18 +1,21 @@
1
  """Main FastAPI application for the multi-utility server."""
2
 
3
- from fastapi import FastAPI, Request, HTTPException
4
- from fastapi.responses import JSONResponse
5
- from fastapi.middleware.cors import CORSMiddleware
6
  import time
7
  from contextlib import asynccontextmanager
8
 
 
 
 
 
 
 
9
  from app.core.config import settings
10
  from app.core.logging import setup_logging, get_logger
11
  from app.core.exceptions import MultiUtilityServerException
12
- from app.middleware.api_key_auth import APIKeyMiddleware
13
  from app.apis.subtitles.router import router as subtitles_router
 
14
 
15
- # Set up logging
16
  setup_logging()
17
  logger = get_logger(__name__)
18
 
@@ -23,11 +26,11 @@ async def lifespan(app: FastAPI):
23
  logger.info("Starting multi-utility server...")
24
  logger.info(f"Log level: {settings.log_level}")
25
  logger.info(f"API keys configured: {len(settings.api_keys_set)}")
 
26
  yield
27
  logger.info("Shutting down multi-utility server...")
28
 
29
 
30
- # Create FastAPI application
31
  app = FastAPI(
32
  title="Multi-Utility Server",
33
  description="Centralized FastAPI server providing reusable APIs for different projects",
@@ -37,20 +40,20 @@ app = FastAPI(
37
  lifespan=lifespan
38
  )
39
 
40
- # Add CORS middleware
 
 
41
  app.add_middleware(
42
  CORSMiddleware,
43
- allow_origins=["*"], # Configure appropriately for production
44
- allow_credentials=True,
45
- allow_methods=["*"],
46
  allow_headers=["*"],
47
  )
48
 
49
- # Add API key authentication middleware
50
  app.add_middleware(APIKeyMiddleware)
51
 
52
 
53
- # Custom exception handler
54
  @app.exception_handler(MultiUtilityServerException)
55
  async def custom_exception_handler(request: Request, exc: MultiUtilityServerException):
56
  """Handle custom application exceptions."""
@@ -61,7 +64,6 @@ async def custom_exception_handler(request: Request, exc: MultiUtilityServerExce
61
  )
62
 
63
 
64
- # Global exception handler
65
  @app.exception_handler(Exception)
66
  async def global_exception_handler(request: Request, exc: Exception):
67
  """Handle unexpected exceptions."""
@@ -72,18 +74,14 @@ async def global_exception_handler(request: Request, exc: Exception):
72
  )
73
 
74
 
75
- # Request logging middleware
76
  @app.middleware("http")
77
  async def log_requests(request: Request, call_next):
78
  """Log all HTTP requests."""
79
  start_time = time.time()
80
-
81
- # Log request
82
  logger.info(f"Request: {request.method} {request.url.path}")
83
 
84
  response = await call_next(request)
85
 
86
- # Log response
87
  process_time = time.time() - start_time
88
  logger.info(
89
  f"Response: {response.status_code} | "
@@ -94,7 +92,6 @@ async def log_requests(request: Request, call_next):
94
  return response
95
 
96
 
97
- # Health check endpoint
98
  @app.get("/health")
99
  async def health_check():
100
  """Health check endpoint."""
@@ -105,23 +102,23 @@ async def health_check():
105
  }
106
 
107
 
108
- # Root endpoint
109
  @app.get("/")
110
  async def root():
111
- """Root endpoint with basic information."""
112
  return {
113
  "message": "Multi-Utility FastAPI Server",
114
  "version": "0.1.0",
115
  "docs": "/docs",
116
  "health": "/health",
117
  "apis": {
118
- "subtitles": "/api/v1/subtitles"
 
119
  }
120
  }
121
 
122
 
123
- # Include API routers
124
  app.include_router(subtitles_router)
 
125
 
126
 
127
  if __name__ == "__main__":
 
1
  """Main FastAPI application for the multi-utility server."""
2
 
 
 
 
3
  import time
4
  from contextlib import asynccontextmanager
5
 
6
+ from fastapi import FastAPI, Request
7
+ from fastapi.responses import JSONResponse
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from slowapi import _rate_limit_exceeded_handler
10
+ from slowapi.errors import RateLimitExceeded
11
+
12
  from app.core.config import settings
13
  from app.core.logging import setup_logging, get_logger
14
  from app.core.exceptions import MultiUtilityServerException
15
+ from app.middleware import APIKeyMiddleware, limiter
16
  from app.apis.subtitles.router import router as subtitles_router
17
+ from app.apis.embeddings.router import router as embeddings_router
18
 
 
19
  setup_logging()
20
  logger = get_logger(__name__)
21
 
 
26
  logger.info("Starting multi-utility server...")
27
  logger.info(f"Log level: {settings.log_level}")
28
  logger.info(f"API keys configured: {len(settings.api_keys_set)}")
29
+ logger.info(f"CORS origins: {settings.cors_origins_list}")
30
  yield
31
  logger.info("Shutting down multi-utility server...")
32
 
33
 
 
34
  app = FastAPI(
35
  title="Multi-Utility Server",
36
  description="Centralized FastAPI server providing reusable APIs for different projects",
 
40
  lifespan=lifespan
41
  )
42
 
43
+ app.state.limiter = limiter
44
+ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
45
+
46
  app.add_middleware(
47
  CORSMiddleware,
48
+ allow_origins=settings.cors_origins_list,
49
+ allow_credentials=True if settings.cors_origins else False,
50
+ allow_methods=["GET", "POST", "PUT", "DELETE"],
51
  allow_headers=["*"],
52
  )
53
 
 
54
  app.add_middleware(APIKeyMiddleware)
55
 
56
 
 
57
  @app.exception_handler(MultiUtilityServerException)
58
  async def custom_exception_handler(request: Request, exc: MultiUtilityServerException):
59
  """Handle custom application exceptions."""
 
64
  )
65
 
66
 
 
67
  @app.exception_handler(Exception)
68
  async def global_exception_handler(request: Request, exc: Exception):
69
  """Handle unexpected exceptions."""
 
74
  )
75
 
76
 
 
77
  @app.middleware("http")
78
  async def log_requests(request: Request, call_next):
79
  """Log all HTTP requests."""
80
  start_time = time.time()
 
 
81
  logger.info(f"Request: {request.method} {request.url.path}")
82
 
83
  response = await call_next(request)
84
 
 
85
  process_time = time.time() - start_time
86
  logger.info(
87
  f"Response: {response.status_code} | "
 
92
  return response
93
 
94
 
 
95
  @app.get("/health")
96
  async def health_check():
97
  """Health check endpoint."""
 
102
  }
103
 
104
 
 
105
  @app.get("/")
106
  async def root():
107
+ """Root endpoint with API information."""
108
  return {
109
  "message": "Multi-Utility FastAPI Server",
110
  "version": "0.1.0",
111
  "docs": "/docs",
112
  "health": "/health",
113
  "apis": {
114
+ "subtitles": "/api/v1/subtitles",
115
+ "embeddings": "/api/v1/embeddings"
116
  }
117
  }
118
 
119
 
 
120
  app.include_router(subtitles_router)
121
+ app.include_router(embeddings_router)
122
 
123
 
124
  if __name__ == "__main__":
app/middleware/__init__.py CHANGED
@@ -1 +1,6 @@
1
- # Middleware modules
 
 
 
 
 
 
1
+ """Middleware module exports."""
2
+
3
+ from app.middleware.api_key_auth import APIKeyMiddleware
4
+ from app.middleware.rate_limit import limiter, rate_limit_exceeded_handler
5
+
6
+ __all__ = ["APIKeyMiddleware", "limiter", "rate_limit_exceeded_handler"]
app/middleware/api_key_auth.py CHANGED
@@ -1,72 +1,56 @@
1
  """API key authentication middleware."""
2
 
3
- from typing import Optional
4
- from fastapi import Request, HTTPException
5
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
6
  from starlette.middleware.base import BaseHTTPMiddleware
7
  from starlette.responses import JSONResponse
8
 
9
  from app.core.config import settings
10
- from app.core.exceptions import AuthenticationError
 
 
 
 
 
 
 
 
 
 
11
 
12
 
13
  class APIKeyMiddleware(BaseHTTPMiddleware):
14
  """Middleware to enforce API key authentication."""
15
-
16
  async def dispatch(self, request: Request, call_next):
17
- # Skip authentication for health check and docs endpoints
18
- if request.url.path in ["/", "/health", "/docs", "/redoc", "/openapi.json"]:
19
- response = await call_next(request)
20
- return response
21
-
22
- # Extract API key from header
23
  api_key = request.headers.get("x-api-key")
24
-
25
  if not api_key:
26
  return JSONResponse(
27
  status_code=401,
28
- content={
29
- "status": "error",
30
- "message": "Missing API key. Include 'x-api-key' header."
31
- }
32
  )
33
-
34
- if api_key not in settings.api_keys_set:
35
  return JSONResponse(
36
- status_code=401,
37
- content={
38
- "status": "error",
39
- "message": "Invalid API key"
40
- }
41
  )
42
-
43
- # Add API key to request state for potential logging
44
- request.state.api_key = api_key
45
-
46
- response = await call_next(request)
47
- return response
48
 
 
 
 
 
49
 
50
- # Alternative dependency-based approach for specific routes
51
- api_key_header = HTTPBearer(scheme_name="API Key", description="API Key required in Authorization header")
52
-
 
 
53
 
54
- async def get_api_key(credentials: HTTPAuthorizationCredentials = api_key_header) -> str:
55
- """
56
- Dependency to extract and validate API key from Authorization header.
57
-
58
- Args:
59
- credentials: HTTP authorization credentials
60
-
61
- Returns:
62
- The validated API key
63
-
64
- Raises:
65
- HTTPException: If the API key is invalid
66
- """
67
- if not credentials or credentials.credentials not in settings.api_keys_set:
68
- raise HTTPException(
69
- status_code=401,
70
- detail="Invalid API key"
71
- )
72
- return credentials.credentials
 
1
  """API key authentication middleware."""
2
 
3
+ import secrets
4
+ from fastapi import Request
 
5
  from starlette.middleware.base import BaseHTTPMiddleware
6
  from starlette.responses import JSONResponse
7
 
8
  from app.core.config import settings
9
+
10
+
11
+ EXEMPT_PATHS = frozenset([
12
+ "/",
13
+ "/health",
14
+ "/docs",
15
+ "/redoc",
16
+ "/openapi.json",
17
+ "/api/v1/subtitles/health",
18
+ "/api/v1/embeddings/health"
19
+ ])
20
 
21
 
22
  class APIKeyMiddleware(BaseHTTPMiddleware):
23
  """Middleware to enforce API key authentication."""
24
+
25
  async def dispatch(self, request: Request, call_next):
26
+ """Process request and validate API key for protected endpoints."""
27
+ if request.url.path in EXEMPT_PATHS:
28
+ return await call_next(request)
29
+
 
 
30
  api_key = request.headers.get("x-api-key")
31
+
32
  if not api_key:
33
  return JSONResponse(
34
  status_code=401,
35
+ content={"status": "error", "message": "Missing API key. Include 'x-api-key' header."}
 
 
 
36
  )
37
+
38
+ if not settings.api_keys_set:
39
  return JSONResponse(
40
+ status_code=500,
41
+ content={"status": "error", "message": "No API keys configured on server"}
 
 
 
42
  )
 
 
 
 
 
 
43
 
44
+ is_valid = any(
45
+ secrets.compare_digest(api_key, valid_key)
46
+ for valid_key in settings.api_keys_set
47
+ )
48
 
49
+ if not is_valid:
50
+ return JSONResponse(
51
+ status_code=401,
52
+ content={"status": "error", "message": "Invalid API key"}
53
+ )
54
 
55
+ request.state.api_key = api_key
56
+ return await call_next(request)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/middleware/rate_limit.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Rate limiting middleware using slowapi."""
2
+
3
+ from slowapi import Limiter
4
+ from slowapi.util import get_remote_address
5
+ from slowapi.errors import RateLimitExceeded
6
+ from slowapi.middleware import SlowAPIMiddleware
7
+ from starlette.responses import JSONResponse
8
+ from starlette.requests import Request
9
+
10
+ from app.core.config import settings
11
+
12
+
13
# Window length in seconds for the default limit. Read defensively:
# assumes settings exposes `rate_limit_window` (RATE_LIMIT_WINDOW in the
# environment) - TODO confirm; falls back to 60 seconds, which matches the
# previous hard-coded "/minute" behaviour.
_RATE_LIMIT_WINDOW_SECONDS = int(getattr(settings, "rate_limit_window", 60))

# Shared limiter, keyed by client address, applying
# "<requests> per <window> seconds" to every route by default.
limiter = Limiter(
    key_func=get_remote_address,
    default_limits=[
        f"{settings.rate_limit_requests}/{_RATE_LIMIT_WINDOW_SECONDS} second"
    ]
)
17
+
18
+
19
def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
    """Build the 429 response returned when a client exceeds its rate limit.

    Args:
        request: The request that was rejected (unused; required by the
            exception-handler signature).
        exc: The rate-limit exception. Its ``detail`` is the description of
            the limit that was hit (e.g. "100 per 1 minute"), not a number
            of seconds, so it is reported as the limit rather than as a
            retry delay.

    Returns:
        A JSON error response with status code 429.
    """
    return JSONResponse(
        status_code=429,
        content={
            "status": "error",
            "message": f"Rate limit exceeded: {exc.detail}"
        }
    )
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "multi-utility-server"
3
- version = "0.1.0"
4
  description = "Centralized multi-utility FastAPI server with reusable APIs"
5
  authors = ["Abhishek Sharma <abhishek@abhisheksan.com>"]
6
  readme = "README.md"
@@ -15,6 +15,10 @@ pydantic-settings = "^2.1.0"
15
  yt-dlp = "^2025.9.5"
16
  python-dotenv = "^1.0.0"
17
  loguru = "^0.7.2"
 
 
 
 
18
 
19
  [tool.poetry.group.dev.dependencies]
20
  pytest = "^7.4.3"
 
1
  [tool.poetry]
2
  name = "multi-utility-server"
3
+ version = "1.0.0"
4
  description = "Centralized multi-utility FastAPI server with reusable APIs"
5
  authors = ["Abhishek Sharma <abhishek@abhisheksan.com>"]
6
  readme = "README.md"
 
15
  yt-dlp = "^2025.9.5"
16
  python-dotenv = "^1.0.0"
17
  loguru = "^0.7.2"
18
+ slowapi = "^0.1.9"
19
+ cachetools = "^5.3.0"
20
+ sentence-transformers = "^2.2.2"
21
+ torch = "^2.0.0"
22
 
23
  [tool.poetry.group.dev.dependencies]
24
  pytest = "^7.4.3"
scripts/run_dev.bat DELETED
@@ -1,38 +0,0 @@
1
- @echo off
2
- REM Development script for running the multi-utility server on Windows
3
-
4
- echo Starting Multi-Utility Server Development Environment
5
-
6
- REM Check if Poetry is installed
7
- poetry --version >nul 2>&1
8
- if %errorlevel% neq 0 (
9
- echo Poetry is not installed. Please install Poetry first.
10
- echo Visit: https://python-poetry.org/docs/#installation
11
- pause
12
- exit /b 1
13
- )
14
-
15
- REM Install dependencies if not already installed
16
- if not exist ".venv" (
17
- echo Installing dependencies...
18
- poetry install
19
- )
20
-
21
- REM Create .env file if it doesn't exist
22
- if not exist ".env" (
23
- echo Creating .env file from template...
24
- copy .env.example .env
25
- echo Please edit .env file to configure your API keys and settings.
26
- )
27
-
28
- REM Create logs directory
29
- if not exist "logs" mkdir logs
30
-
31
- REM Start the development server
32
- echo Starting development server...
33
- echo Server will be available at: http://localhost:8000
34
- echo API documentation: http://localhost:8000/docs
35
- echo Press Ctrl+C to stop the server
36
- echo.
37
-
38
- poetry run python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/run_dev.sh DELETED
@@ -1,42 +0,0 @@
1
- #!/bin/bash
2
- # Development script for running the multi-utility server
3
-
4
- # Colors for output
5
- GREEN='\033[0;32m'
6
- YELLOW='\033[1;33m'
7
- RED='\033[0;31m'
8
- NC='\033[0m' # No Color
9
-
10
- echo -e "${GREEN}Starting Multi-Utility Server Development Environment${NC}"
11
-
12
- # Check if Poetry is installed
13
- if ! command -v poetry &> /dev/null; then
14
- echo -e "${RED}Poetry is not installed. Please install Poetry first.${NC}"
15
- echo "Visit: https://python-poetry.org/docs/#installation"
16
- exit 1
17
- fi
18
-
19
- # Install dependencies if not already installed
20
- if [ ! -d ".venv" ]; then
21
- echo -e "${YELLOW}Installing dependencies...${NC}"
22
- poetry install
23
- fi
24
-
25
- # Create .env file if it doesn't exist
26
- if [ ! -f ".env" ]; then
27
- echo -e "${YELLOW}Creating .env file from template...${NC}"
28
- cp .env.example .env
29
- echo -e "${YELLOW}Please edit .env file to configure your API keys and settings.${NC}"
30
- fi
31
-
32
- # Create logs directory
33
- mkdir -p logs
34
-
35
- # Start the development server
36
- echo -e "${GREEN}Starting development server...${NC}"
37
- echo -e "${YELLOW}Server will be available at: http://localhost:8000${NC}"
38
- echo -e "${YELLOW}API documentation: http://localhost:8000/docs${NC}"
39
- echo -e "${YELLOW}Press Ctrl+C to stop the server${NC}"
40
- echo ""
41
-
42
- poetry run python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/run_docker.bat DELETED
@@ -1,20 +0,0 @@
1
- @echo off
2
- echo Building and running Multi-Utility Server with Docker...
3
- echo.
4
-
5
- REM Change to project root directory
6
- cd /d "%~dp0\.."
7
-
8
- REM Check if .env file exists
9
- if not exist .env (
10
- echo Warning: .env file not found. Copying from .env.example...
11
- copy .env.example .env
12
- echo Please edit .env file with your API keys before running again.
13
- pause
14
- exit /b 1
15
- )
16
-
17
- REM Build and run with docker-compose
18
- docker-compose up --build
19
-
20
- pause
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/run_docker.sh DELETED
@@ -1,18 +0,0 @@
1
- #!/bin/bash
2
-
3
- echo "Building and running Multi-Utility Server with Docker..."
4
- echo
5
-
6
- # Change to project root directory
7
- cd "$(dirname "$0")/.."
8
-
9
- # Check if .env file exists
10
- if [ ! -f .env ]; then
11
- echo "Warning: .env file not found. Copying from .env.example..."
12
- cp .env.example .env
13
- echo "Please edit .env file with your API keys before running again."
14
- exit 1
15
- fi
16
-
17
- # Build and run with docker-compose
18
- docker-compose up --build
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/conftest.py CHANGED
@@ -2,10 +2,13 @@
2
 
3
  import pytest
4
  from fastapi.testclient import TestClient
5
- from unittest.mock import Mock, patch
6
  import tempfile
7
  import os
8
 
 
 
 
9
  from app.main import app
10
  from app.core.config import settings
11
 
@@ -29,13 +32,6 @@ def invalid_api_key():
29
  return "invalid-key-for-testing"
30
 
31
 
32
- @pytest.fixture
33
- def mock_subprocess():
34
- """Mock subprocess calls for yt-dlp."""
35
- with patch('subprocess.run') as mock_run:
36
- yield mock_run
37
-
38
-
39
  @pytest.fixture
40
  def mock_asyncio_subprocess():
41
  """Mock asyncio subprocess calls for yt-dlp."""
 
2
 
3
  import pytest
4
  from fastapi.testclient import TestClient
5
+ from unittest.mock import patch
6
  import tempfile
7
  import os
8
 
9
+ # Ensure API_KEYS is set for testing
10
+ os.environ.setdefault("API_KEYS", "test-key-1,test-key-2")
11
+
12
  from app.main import app
13
  from app.core.config import settings
14
 
 
32
  return "invalid-key-for-testing"
33
 
34
 
 
 
 
 
 
 
 
35
  @pytest.fixture
36
  def mock_asyncio_subprocess():
37
  """Mock asyncio subprocess calls for yt-dlp."""
tests/test_security.py CHANGED
@@ -4,57 +4,46 @@ from unittest import mock
4
  import pytest
5
  from fastapi.testclient import TestClient
6
 
7
- from app.core.security import verify_api_key, get_api_key_from_header
8
  from app.core.exceptions import AuthenticationError
9
- from app.main import app
10
 
11
 
12
  class TestAPIKeySecurity:
13
  """Test API key security functions."""
14
-
15
  def test_verify_api_key_valid(self, api_key):
16
  """Test verifying a valid API key."""
17
  assert verify_api_key(api_key) is True
18
-
19
  def test_verify_api_key_invalid(self, invalid_api_key):
20
  """Test verifying an invalid API key."""
21
  with pytest.raises(AuthenticationError):
22
  verify_api_key(invalid_api_key)
23
-
24
- def test_verify_api_key_none(self):
25
- """Test verifying None API key."""
26
- with pytest.raises(AuthenticationError):
27
- verify_api_key(None)
28
-
29
- def test_get_api_key_from_header_valid(self, api_key):
30
- """Test getting API key from valid header."""
31
- result = get_api_key_from_header(api_key)
32
- assert result == api_key
33
-
34
- def test_get_api_key_from_header_invalid(self, invalid_api_key):
35
- """Test getting API key from invalid header."""
36
  with pytest.raises(AuthenticationError):
37
- get_api_key_from_header(invalid_api_key)
38
 
39
 
40
  class TestAPIKeyMiddleware:
41
  """Test API key middleware functionality."""
42
-
43
  def test_middleware_allows_health_endpoint(self, client):
44
  """Test that middleware allows access to health endpoint without API key."""
45
  response = client.get("/health")
46
  assert response.status_code == 200
47
-
48
  def test_middleware_allows_docs_endpoint(self, client):
49
  """Test that middleware allows access to docs endpoint without API key."""
50
  response = client.get("/docs")
51
  assert response.status_code == 200
52
-
53
  def test_middleware_allows_root_endpoint(self, client):
54
  """Test that middleware allows access to root endpoint without API key."""
55
  response = client.get("/")
56
  assert response.status_code == 200
57
-
58
  def test_middleware_blocks_api_without_key(self, client):
59
  """Test that middleware blocks API access without API key."""
60
  response = client.post(
@@ -65,7 +54,7 @@ class TestAPIKeyMiddleware:
65
  data = response.json()
66
  assert data["status"] == "error"
67
  assert "Missing API key" in data["message"]
68
-
69
  def test_middleware_blocks_api_with_invalid_key(self, client, invalid_api_key):
70
  """Test that middleware blocks API access with invalid API key."""
71
  response = client.post(
@@ -77,13 +66,12 @@ class TestAPIKeyMiddleware:
77
  data = response.json()
78
  assert data["status"] == "error"
79
  assert "Invalid API key" in data["message"]
80
-
81
  def test_middleware_allows_api_with_valid_key(self, client, api_key):
82
  """Test that middleware allows API access with valid API key."""
83
- # Mock the subtitle service to avoid actual yt-dlp calls
84
  with mock.patch('app.apis.subtitles.service.subtitle_service.extract_subtitles') as mock_extract:
85
  mock_extract.return_value = ("dQw4w9WgXcQ", ["Test subtitle"])
86
-
87
  response = client.post(
88
  "/api/v1/subtitles/extract",
89
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
 
4
  import pytest
5
  from fastapi.testclient import TestClient
6
 
7
+ from app.core.security import verify_api_key
8
  from app.core.exceptions import AuthenticationError
 
9
 
10
 
11
class TestAPIKeySecurity:
    """Checks for the ``verify_api_key`` helper."""

    def test_verify_api_key_valid(self, api_key):
        """A configured key is accepted."""
        outcome = verify_api_key(api_key)
        assert outcome is True

    def test_verify_api_key_invalid(self, invalid_api_key):
        """An unknown key is rejected with AuthenticationError."""
        pytest.raises(AuthenticationError, verify_api_key, invalid_api_key)

    def test_verify_api_key_empty(self):
        """An empty key is rejected with AuthenticationError."""
        pytest.raises(AuthenticationError, verify_api_key, "")
27
 
28
 
29
  class TestAPIKeyMiddleware:
30
  """Test API key middleware functionality."""
31
+
32
  def test_middleware_allows_health_endpoint(self, client):
33
  """Test that middleware allows access to health endpoint without API key."""
34
  response = client.get("/health")
35
  assert response.status_code == 200
36
+
37
  def test_middleware_allows_docs_endpoint(self, client):
38
  """Test that middleware allows access to docs endpoint without API key."""
39
  response = client.get("/docs")
40
  assert response.status_code == 200
41
+
42
  def test_middleware_allows_root_endpoint(self, client):
43
  """Test that middleware allows access to root endpoint without API key."""
44
  response = client.get("/")
45
  assert response.status_code == 200
46
+
47
  def test_middleware_blocks_api_without_key(self, client):
48
  """Test that middleware blocks API access without API key."""
49
  response = client.post(
 
54
  data = response.json()
55
  assert data["status"] == "error"
56
  assert "Missing API key" in data["message"]
57
+
58
  def test_middleware_blocks_api_with_invalid_key(self, client, invalid_api_key):
59
  """Test that middleware blocks API access with invalid API key."""
60
  response = client.post(
 
66
  data = response.json()
67
  assert data["status"] == "error"
68
  assert "Invalid API key" in data["message"]
69
+
70
  def test_middleware_allows_api_with_valid_key(self, client, api_key):
71
  """Test that middleware allows API access with valid API key."""
 
72
  with mock.patch('app.apis.subtitles.service.subtitle_service.extract_subtitles') as mock_extract:
73
  mock_extract.return_value = ("dQw4w9WgXcQ", ["Test subtitle"])
74
+
75
  response = client.post(
76
  "/api/v1/subtitles/extract",
77
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
tests/test_subtitles.py CHANGED
@@ -1,42 +1,39 @@
1
  """Tests for subtitle extraction functionality."""
2
 
3
  import pytest
4
- from unittest.mock import Mock, AsyncMock, patch, mock_open
5
  import asyncio
6
- from pathlib import Path
7
 
8
- from app.apis.subtitles.service import SubtitleService
9
  from app.apis.subtitles.utils import (
10
- clean_subtitle_text,
11
- extract_video_id,
12
  convert_vtt_to_text
13
  )
14
  from app.core.exceptions import (
15
- InvalidVideoURLError,
16
- SubtitlesNotFoundError,
17
- DownloadTimeoutError,
18
- SubtitleExtractionError
19
  )
20
 
21
 
22
  class TestSubtitleUtils:
23
  """Test subtitle utility functions."""
24
-
25
  def test_extract_video_id_standard_url(self):
26
  """Test extracting video ID from standard YouTube URL."""
27
  url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
28
  assert extract_video_id(url) == "dQw4w9WgXcQ"
29
-
30
  def test_extract_video_id_short_url(self):
31
  """Test extracting video ID from short YouTube URL."""
32
  url = "https://youtu.be/dQw4w9WgXcQ"
33
  assert extract_video_id(url) == "dQw4w9WgXcQ"
34
-
35
  def test_extract_video_id_embed_url(self):
36
  """Test extracting video ID from embed URL."""
37
  url = "https://www.youtube.com/embed/dQw4w9WgXcQ"
38
  assert extract_video_id(url) == "dQw4w9WgXcQ"
39
-
40
  def test_clean_subtitle_text(self):
41
  """Test cleaning subtitle text."""
42
  raw_lines = [
@@ -46,24 +43,24 @@ class TestSubtitleUtils:
46
  "00:00:00.000 --> 00:00:03.000",
47
  "Never gonna give you up",
48
  "",
49
- "2",
50
  "00:00:03.000 --> 00:00:06.000",
51
  "Never gonna let you down",
52
- "Never gonna give you up", # Duplicate
53
  ""
54
  ]
55
-
56
  cleaned = clean_subtitle_text(raw_lines)
57
  assert "Never gonna give you up" in cleaned
58
  assert "Never gonna let you down" in cleaned
59
  assert "WEBVTT" not in cleaned
60
  assert "00:00:00.000 --> 00:00:03.000" not in cleaned
61
  assert len([line for line in cleaned if line == "Never gonna give you up"]) == 1
62
-
63
  def test_convert_vtt_to_text(self, sample_vtt_content):
64
  """Test converting VTT content to clean text."""
65
  result = convert_vtt_to_text(sample_vtt_content)
66
-
67
  assert "Never gonna give you up" in result
68
  assert "Never gonna let you down" in result
69
  assert "WEBVTT" not in result
@@ -72,109 +69,73 @@ class TestSubtitleUtils:
72
 
73
  class TestSubtitleService:
74
  """Test subtitle extraction service."""
75
-
 
 
 
 
 
76
  @pytest.fixture
77
  def service(self):
78
  """Create a subtitle service instance."""
79
  return SubtitleService()
80
-
81
  @pytest.mark.asyncio
82
  async def test_extract_subtitles_success(self, service, sample_youtube_url, sample_vtt_content):
83
  """Test successful subtitle extraction."""
84
- with patch.object(service, '_check_available_subtitles') as mock_check, \
85
- patch.object(service, '_download_subtitles') as mock_download:
86
-
87
- mock_check.return_value = None
88
  mock_download.return_value = sample_vtt_content
89
-
90
  video_id, subtitles = await service.extract_subtitles(sample_youtube_url, "en")
91
-
92
  assert video_id == "dQw4w9WgXcQ"
93
  assert len(subtitles) > 0
94
  assert "Never gonna give you up" in subtitles
95
-
96
  @pytest.mark.asyncio
97
  async def test_extract_subtitles_not_found(self, service, sample_youtube_url):
98
  """Test subtitle extraction when no subtitles are found."""
99
- with patch.object(service, '_check_available_subtitles') as mock_check, \
100
- patch.object(service, '_download_subtitles') as mock_download:
101
-
102
- mock_check.return_value = None
103
  mock_download.return_value = None
104
-
 
105
  with pytest.raises(SubtitlesNotFoundError):
106
  await service.extract_subtitles(sample_youtube_url, "en")
107
-
108
- @pytest.mark.asyncio
109
- async def test_extract_subtitles_timeout(self, service, sample_youtube_url):
110
- """Test subtitle extraction timeout."""
111
- with patch.object(service, '_check_available_subtitles') as mock_check:
112
- mock_check.side_effect = asyncio.TimeoutError()
113
-
114
- with pytest.raises(DownloadTimeoutError):
115
- await service.extract_subtitles(sample_youtube_url, "en")
116
-
117
- @pytest.mark.asyncio
118
- async def test_check_available_subtitles_success(self, service, sample_youtube_url):
119
- """Test checking available subtitles."""
120
- mock_process = AsyncMock()
121
- mock_process.returncode = 0
122
- mock_process.communicate.return_value = (b"Subtitle output", b"")
123
-
124
- with patch('asyncio.create_subprocess_exec', return_value=mock_process), \
125
- patch('asyncio.wait_for', return_value=(b"Subtitle output", b"")):
126
-
127
- # Should not raise any exception
128
- await service._check_available_subtitles(sample_youtube_url)
129
-
130
  @pytest.mark.asyncio
131
- async def test_check_available_subtitles_invalid_video(self, service):
132
- """Test checking subtitles for invalid video."""
133
- mock_process = AsyncMock()
134
- mock_process.returncode = 1
135
- mock_process.communicate.return_value = (b"", b"Video unavailable")
136
-
137
- with patch('asyncio.create_subprocess_exec', return_value=mock_process), \
138
- patch('asyncio.wait_for', return_value=(b"", b"Video unavailable")):
139
-
140
- with pytest.raises(InvalidVideoURLError):
141
- await service._check_available_subtitles("https://www.youtube.com/watch?v=invalid")
142
-
143
- def test_get_alternative_languages(self, service):
144
- """Test getting alternative language codes."""
145
- alternatives = service._get_alternative_languages("en")
146
- assert "en-US" in alternatives
147
- assert "en-GB" in alternatives
148
-
149
- alternatives = service._get_alternative_languages("es")
150
- assert "es-ES" in alternatives
151
- assert "es-MX" in alternatives
152
-
153
- # Test unknown language
154
- alternatives = service._get_alternative_languages("unknown")
155
- assert "unknown-UNKNOWN" in alternatives
156
 
157
 
158
  class TestSubtitleAPI:
159
  """Test subtitle API endpoints."""
160
-
161
  def test_extract_subtitles_endpoint_success(self, client, api_key, sample_vtt_content):
162
  """Test successful subtitle extraction via API."""
163
  with patch('app.apis.subtitles.service.subtitle_service.extract_subtitles') as mock_extract:
164
  mock_extract.return_value = ("dQw4w9WgXcQ", ["Never gonna give you up", "Never gonna let you down"])
165
-
166
  response = client.post(
167
  "/api/v1/subtitles/extract",
168
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
169
  headers={"x-api-key": api_key}
170
  )
171
-
172
  assert response.status_code == 200
173
  data = response.json()
174
  assert data["status"] == "success"
175
  assert data["video_id"] == "dQw4w9WgXcQ"
176
  assert "Never gonna give you up" in data["subtitles"]
177
-
178
  def test_extract_subtitles_endpoint_invalid_api_key(self, client, invalid_api_key):
179
  """Test API endpoint with invalid API key."""
180
  response = client.post(
@@ -182,18 +143,18 @@ class TestSubtitleAPI:
182
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
183
  headers={"x-api-key": invalid_api_key}
184
  )
185
-
186
  assert response.status_code == 401
187
-
188
  def test_extract_subtitles_endpoint_missing_api_key(self, client):
189
  """Test API endpoint with missing API key."""
190
  response = client.post(
191
  "/api/v1/subtitles/extract",
192
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"}
193
  )
194
-
195
  assert response.status_code == 401
196
-
197
  def test_extract_subtitles_endpoint_invalid_url(self, client, api_key):
198
  """Test API endpoint with invalid URL."""
199
  response = client.post(
@@ -201,28 +162,13 @@ class TestSubtitleAPI:
201
  json={"url": "https://example.com/not-youtube", "lang": "en"},
202
  headers={"x-api-key": api_key}
203
  )
204
-
205
- assert response.status_code == 422 # Validation error
206
-
207
- def test_extract_subtitles_endpoint_not_found(self, client, api_key):
208
- """Test API endpoint when subtitles are not found."""
209
- with patch('app.apis.subtitles.service.subtitle_service.extract_subtitles') as mock_extract:
210
- mock_extract.side_effect = SubtitlesNotFoundError("No subtitles available")
211
-
212
- response = client.post(
213
- "/api/v1/subtitles/extract",
214
- json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
215
- headers={"x-api-key": api_key}
216
- )
217
-
218
- assert response.status_code == 404
219
- data = response.json()
220
- assert data["detail"]["status"] == "error"
221
-
222
  def test_subtitles_health_endpoint(self, client):
223
  """Test subtitles health check endpoint."""
224
  response = client.get("/api/v1/subtitles/health")
225
-
226
  assert response.status_code == 200
227
  data = response.json()
228
  assert data["status"] == "healthy"
 
1
  """Tests for subtitle extraction functionality."""
2
 
3
  import pytest
4
+ from unittest.mock import AsyncMock, patch
5
  import asyncio
 
6
 
7
+ from app.apis.subtitles.service import SubtitleService, SUBTITLE_CACHE
8
  from app.apis.subtitles.utils import (
9
+ clean_subtitle_text,
10
+ extract_video_id,
11
  convert_vtt_to_text
12
  )
13
  from app.core.exceptions import (
14
+ SubtitlesNotFoundError,
15
+ DownloadTimeoutError
 
 
16
  )
17
 
18
 
19
  class TestSubtitleUtils:
20
  """Test subtitle utility functions."""
21
+
22
  def test_extract_video_id_standard_url(self):
23
  """Test extracting video ID from standard YouTube URL."""
24
  url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
25
  assert extract_video_id(url) == "dQw4w9WgXcQ"
26
+
27
  def test_extract_video_id_short_url(self):
28
  """Test extracting video ID from short YouTube URL."""
29
  url = "https://youtu.be/dQw4w9WgXcQ"
30
  assert extract_video_id(url) == "dQw4w9WgXcQ"
31
+
32
  def test_extract_video_id_embed_url(self):
33
  """Test extracting video ID from embed URL."""
34
  url = "https://www.youtube.com/embed/dQw4w9WgXcQ"
35
  assert extract_video_id(url) == "dQw4w9WgXcQ"
36
+
37
  def test_clean_subtitle_text(self):
38
  """Test cleaning subtitle text."""
39
  raw_lines = [
 
43
  "00:00:00.000 --> 00:00:03.000",
44
  "Never gonna give you up",
45
  "",
46
+ "2",
47
  "00:00:03.000 --> 00:00:06.000",
48
  "Never gonna let you down",
49
+ "Never gonna give you up",
50
  ""
51
  ]
52
+
53
  cleaned = clean_subtitle_text(raw_lines)
54
  assert "Never gonna give you up" in cleaned
55
  assert "Never gonna let you down" in cleaned
56
  assert "WEBVTT" not in cleaned
57
  assert "00:00:00.000 --> 00:00:03.000" not in cleaned
58
  assert len([line for line in cleaned if line == "Never gonna give you up"]) == 1
59
+
60
  def test_convert_vtt_to_text(self, sample_vtt_content):
61
  """Test converting VTT content to clean text."""
62
  result = convert_vtt_to_text(sample_vtt_content)
63
+
64
  assert "Never gonna give you up" in result
65
  assert "Never gonna let you down" in result
66
  assert "WEBVTT" not in result
 
69
 
70
class TestSubtitleService:
    """Exercise SubtitleService.extract_subtitles with the download step mocked."""

    @pytest.fixture(autouse=True)
    def clear_cache(self):
        """Empty the shared subtitle cache before every test."""
        SUBTITLE_CACHE.clear()

    @pytest.fixture
    def service(self):
        """Provide a fresh SubtitleService instance."""
        return SubtitleService()

    @pytest.mark.asyncio
    async def test_extract_subtitles_success(self, service, sample_youtube_url, sample_vtt_content):
        """Downloaded VTT content is returned as a video id plus subtitle lines."""
        with patch.object(service, '_download_subtitles', return_value=sample_vtt_content):
            extracted = await service.extract_subtitles(sample_youtube_url, "en")

        video_id, subtitles = extracted
        assert video_id == "dQw4w9WgXcQ"
        assert len(subtitles) > 0
        assert "Never gonna give you up" in subtitles

    @pytest.mark.asyncio
    async def test_extract_subtitles_not_found(self, service, sample_youtube_url):
        """SubtitlesNotFoundError is raised when no download yields content."""
        download_patch = patch.object(service, '_download_subtitles', return_value=None)
        fallback_patch = patch.object(service, '_try_alternative_languages', return_value=None)

        with download_patch, fallback_patch:
            with pytest.raises(SubtitlesNotFoundError):
                await service.extract_subtitles(sample_youtube_url, "en")

    @pytest.mark.asyncio
    async def test_extract_subtitles_uses_cache(self, service, sample_youtube_url, sample_vtt_content):
        """A repeated request is answered without a second download."""
        with patch.object(service, '_download_subtitles', return_value=sample_vtt_content) as mock_download:
            first = await service.extract_subtitles(sample_youtube_url, "en")
            second = await service.extract_subtitles(sample_youtube_url, "en")

            assert first == second
            assert mock_download.call_count == 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  class TestSubtitleAPI:
120
  """Test subtitle API endpoints."""
121
+
122
  def test_extract_subtitles_endpoint_success(self, client, api_key, sample_vtt_content):
123
  """Test successful subtitle extraction via API."""
124
  with patch('app.apis.subtitles.service.subtitle_service.extract_subtitles') as mock_extract:
125
  mock_extract.return_value = ("dQw4w9WgXcQ", ["Never gonna give you up", "Never gonna let you down"])
126
+
127
  response = client.post(
128
  "/api/v1/subtitles/extract",
129
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
130
  headers={"x-api-key": api_key}
131
  )
132
+
133
  assert response.status_code == 200
134
  data = response.json()
135
  assert data["status"] == "success"
136
  assert data["video_id"] == "dQw4w9WgXcQ"
137
  assert "Never gonna give you up" in data["subtitles"]
138
+
139
  def test_extract_subtitles_endpoint_invalid_api_key(self, client, invalid_api_key):
140
  """Test API endpoint with invalid API key."""
141
  response = client.post(
 
143
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"},
144
  headers={"x-api-key": invalid_api_key}
145
  )
146
+
147
  assert response.status_code == 401
148
+
149
  def test_extract_subtitles_endpoint_missing_api_key(self, client):
150
  """Test API endpoint with missing API key."""
151
  response = client.post(
152
  "/api/v1/subtitles/extract",
153
  json={"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "lang": "en"}
154
  )
155
+
156
  assert response.status_code == 401
157
+
158
  def test_extract_subtitles_endpoint_invalid_url(self, client, api_key):
159
  """Test API endpoint with invalid URL."""
160
  response = client.post(
 
162
  json={"url": "https://example.com/not-youtube", "lang": "en"},
163
  headers={"x-api-key": api_key}
164
  )
165
+
166
+ assert response.status_code == 422
167
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  def test_subtitles_health_endpoint(self, client):
169
  """Test subtitles health check endpoint."""
170
  response = client.get("/api/v1/subtitles/health")
171
+
172
  assert response.status_code == 200
173
  data = response.json()
174
  assert data["status"] == "healthy"