Spaces: Sleeping
Tadeas Kosek committed · Commit 37e59a0 · 1 Parent(s): d369cf2
add r2 file repository
- app.py +1 -0
- application/use_cases/extract_audio_async.py +43 -0
- application/use_cases/process_job.py +27 -46
- infrastructure/config/settings.py +38 -3
- infrastructure/providers/__init__.py +0 -0
- infrastructure/providers/file_storage_provider.py +167 -0
- infrastructure/repositories/file_repository.py +41 -18
- infrastructure/repositories/r2_file_repository.py +411 -0
- infrastructure/services/container.py +22 -2
- infrastructure/services/local_file_processor.py +112 -0
- interfaces/api/routes/extraction_routes.py +21 -11
- requirements.txt +4 -0
app.py CHANGED
@@ -34,6 +34,7 @@ async def lifespan(app: FastAPI):
 
     # Startup
     logger.info(f"Starting {settings.app_name} v{settings.app_version}")
+    logger.info(f"Storage type: {settings.storage_type}")
 
     # Initialize containers
     service_container = ServiceContainer.get_instance()
application/use_cases/extract_audio_async.py CHANGED
@@ -91,6 +91,49 @@ class ExtractAudioAsyncUseCase:
             file_size_mb=job.file_size.megabytes
         )
 
+    async def execute_with_job(self, job: Job, request: ExtractionRequestDTO,
+                               background_tasks: BackgroundTaskRunner) -> JobCreationDTO:
+        """Execute with a pre-created job."""
+
+        # Create domain objects for validation
+        video = Video(
+            filename=request.video_filename,
+            file_path=request.video_file_path,
+            size=FileSize(request.video_file_size),
+            content_type=request.content_type
+        )
+
+        # Validate request
+        self.validation_service.validate_extraction_request(
+            video, request.output_format, request.quality
+        )
+
+        # Save job to repository (job already created)
+        await self.job_repository.create(
+            job_id=job.id,
+            filename=job.video_filename,
+            file_size_mb=job.file_size.megabytes,
+            output_format=job.output_format.value,
+            quality=job.quality.value
+        )
+
+        # Queue background processing
+        background_tasks.add_task(
+            self._process_job_background,
+            job.id,
+            request
+        )
+
+        logger.info(f"Created async job {job.id} for {video.filename}")
+
+        return JobCreationDTO(
+            job_id=job.id,
+            status=job.status.value,
+            message=f"Processing large file ({job.file_size.megabytes:.1f} MB)",
+            check_url=f"/api/v1/jobs/{job.id}",
+            file_size_mb=job.file_size.megabytes
+        )
+
     async def _process_job_background(self, job_id: str, request: ExtractionRequestDTO):
         """Process job in background."""
         try:
application/use_cases/process_job.py CHANGED
@@ -8,6 +8,7 @@ from domain.value_objects.audio_quality import AudioQuality
 from domain.value_objects.file_size import FileSize
 from domain.entities.video import Video
 from domain.entities.audio import Audio
+from infrastructure.services.local_file_processor import LocalFileProcessor
 
 from ..dto.extraction_request import ExtractionRequestDTO
 
@@ -38,76 +39,56 @@ class FileRepository(Protocol):
 class ProcessJobUseCase:
     """Use case for processing a queued extraction job."""
 
-    def __init__(self,
-                 job_repository: JobRepository,
-                 ffmpeg_service: FFmpegService,
-                 file_repository: FileRepository):
-        self.job_repository = job_repository
-        self.ffmpeg_service = ffmpeg_service
+    def __init__(self, file_repository, ffmpeg_service, job_repository):
         self.file_repository = file_repository
+        self.ffmpeg_service = ffmpeg_service
+        self.job_repository = job_repository
+        self.file_processor = LocalFileProcessor(file_repository)
 
     async def execute(self, job_id: str, request: ExtractionRequestDTO):
-        """…
+        """Execute job processing with local file handling."""
        start_time = time.time()
-        output_path = None
 
         try:
             # Update job status
             await self.job_repository.update_status(job_id, "processing")
 
-            # Create domain objects
-            video = Video(
-                filename=request.video_filename,
-                file_path=request.video_file_path,
-                size=FileSize(request.video_file_size),
-                content_type=request.content_type
-            )
-
-            audio_format = AudioFormat(request.output_format)
-            audio_quality = AudioQuality(request.quality)
-
             # Create output path
-            …
-                job_id,
+            output_key = await self.file_repository.create_output_path(
+                job_id,
+                request.output_format
             )
 
-            # …
-            …
-                audio_quality.value
+            # Process with local files
+            await self.file_processor.process_with_ffmpeg(
+                input_storage_key=request.video_file_path,
+                output_storage_key=output_key,
+                ffmpeg_func=self.ffmpeg_service.extract_audio,  # Remove _async
+                format=request.output_format,  # Change from output_format to format
+                quality=request.quality
             )
 
-            if not result.success:
-                raise Exception(f"FFmpeg extraction failed: {result.error}")
-
             # Calculate processing time
             processing_time = time.time() - start_time
 
             # Update job as completed
             await self.job_repository.update_status(
-                job_id
-                …
-                output_path=…
+                job_id,
+                "completed",
+                output_path=output_key,
                 processing_time=processing_time
             )
 
-            logger.info(f"Job {job_id} completed in {processing_time:.2f}…
+            logger.info(f"Job {job_id} completed in {processing_time:.2f} seconds")
 
         except Exception as e:
-            …
-            await self.file_repository.delete_file(output_path)
+            processing_time = time.time() - start_time
+            logger.error(f"Job {job_id} failed after {processing_time:.2f} seconds: {str(e)}")
 
-            # Update job as failed
             await self.job_repository.update_status(
-                job_id
-                …
-                error=str(e)
+                job_id,
+                "failed",
+                error=str(e),
+                processing_time=processing_time
             )
-
-            logger.error(f"Job {job_id} failed: {str(e)}")
             raise
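
For orientation, a minimal sketch of how the reworked use case would be wired from the service container and invoked by a background worker. The constructor order and execute() signature mirror the diff above; the ExtractionRequestDTO field values and the quality label are placeholders, not values from this repository.

import asyncio

from infrastructure.services.container import ServiceContainer
from application.use_cases.process_job import ProcessJobUseCase
from application.dto.extraction_request import ExtractionRequestDTO

async def run_job(job_id: str) -> None:
    container = ServiceContainer.get_instance()

    # Note the new argument order: file_repository, ffmpeg_service, job_repository.
    use_case = ProcessJobUseCase(
        container.file_repository,
        container.ffmpeg_service,
        container.job_repository,
    )

    # Placeholder request values for illustration only.
    request = ExtractionRequestDTO(
        video_filename="clip.mp4",
        video_file_path=f"jobs/{job_id}/input.mp4",  # a storage key when STORAGE_TYPE=r2
        video_file_size=25_000_000,
        output_format="mp3",
        quality="high",  # assumed quality label
        content_type="video/mp4",
    )

    # Marks the job "processing", resolves local paths via LocalFileProcessor,
    # runs FFmpeg, uploads the output, then marks the job "completed" or "failed".
    await use_case.execute(job_id, request)

asyncio.run(run_job("example-job-id"))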
infrastructure/config/settings.py CHANGED
@@ -2,7 +2,7 @@
 from pydantic_settings import BaseSettings
 from pydantic import Field
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional
 import os
 
 class Settings(BaseSettings):
@@ -13,12 +13,21 @@ class Settings(BaseSettings):
     app_version: str = "1.0.0"
     debug: bool = Field(default=False, env="DEBUG")
 
+    # Storage configuration
+    storage_type: str = Field(default="filesystem", env="STORAGE_TYPE")  # "filesystem" or "r2"
+
     # File processing settings
     temp_dir: Path = Field(default=Path("/tmp/audio_extractor"), env="TEMP_DIR")
     max_direct_file_size_mb: float = Field(default=10.0, env="MAX_DIRECT_FILE_SIZE_MB")
     cleanup_interval_seconds: int = Field(default=3600, env="CLEANUP_INTERVAL_SECONDS")
     file_retention_hours: int = Field(default=2, env="FILE_RETENTION_HOURS")
 
+    # Cloudflare R2 storage settings (optional, only needed if storage_type == "r2")
+    cloudflare_r2_account_id: Optional[str] = Field(default=None, env="CLOUDFLARE_R2_ACCOUNT_ID")
+    cloudflare_r2_access_key_id: Optional[str] = Field(default=None, env="CLOUDFLARE_R2_ACCESS_KEY_ID")
+    cloudflare_r2_secret_access_key: Optional[str] = Field(default=None, env="CLOUDFLARE_R2_SECRET_ACCESS_KEY")
+    cloudflare_r2_bucket_name: Optional[str] = Field(default=None, env="CLOUDFLARE_R2_BUCKET_NAME")
+
     # FFmpeg settings
     ffmpeg_path: str = Field(default="/usr/bin/ffmpeg", env="FFMPEG_PATH")
     ffmpeg_timeout_seconds: int = Field(default=1800, env="FFMPEG_TIMEOUT_SECONDS")  # 30 minutes
@@ -84,8 +93,34 @@ class Settings(BaseSettings):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        # Ensure temp directory exists
-        self.…
+        # Ensure temp directory exists (only for filesystem storage)
+        if self.storage_type.lower() == "filesystem":
+            self.temp_dir.mkdir(parents=True, exist_ok=True)
+
+        # Validate R2 configuration if R2 storage is selected
+        if self.storage_type.lower() == "r2":
+            self._validate_r2_config()
+
+    def _validate_r2_config(self):
+        """Validate R2 configuration when R2 storage is selected."""
+        required_fields = [
+            'cloudflare_r2_account_id',
+            'cloudflare_r2_access_key_id',
+            'cloudflare_r2_secret_access_key',
+            'cloudflare_r2_bucket_name'
+        ]
+
+        missing_fields = []
+        for field in required_fields:
+            value = getattr(self, field)
+            if not value:
+                missing_fields.append(field.upper())
+
+        if missing_fields:
+            raise ValueError(
+                f"R2 storage selected but missing required environment variables: "
+                f"{', '.join(missing_fields)}"
+            )
 
 # Singleton instance
 settings = Settings()
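
Taken together, the new settings mean the storage backend is selected purely through environment variables. A minimal sketch of the two configurations follows; the credential values are placeholders, not real secrets.

import os

# Filesystem storage (the default): only the temp dir matters.
os.environ["STORAGE_TYPE"] = "filesystem"
os.environ["TEMP_DIR"] = "/tmp/audio_extractor"

# Cloudflare R2 storage: all four R2 variables must be set, otherwise
# Settings.__init__ raises ValueError via _validate_r2_config().
os.environ["STORAGE_TYPE"] = "r2"
os.environ["CLOUDFLARE_R2_ACCOUNT_ID"] = "your-account-id"         # placeholder
os.environ["CLOUDFLARE_R2_ACCESS_KEY_ID"] = "your-access-key"      # placeholder
os.environ["CLOUDFLARE_R2_SECRET_ACCESS_KEY"] = "your-secret-key"  # placeholder
os.environ["CLOUDFLARE_R2_BUCKET_NAME"] = "your-bucket"            # placeholder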
infrastructure/providers/__init__.py ADDED
File without changes
infrastructure/providers/file_storage_provider.py ADDED
@@ -0,0 +1,167 @@
+"""File storage provider factory."""
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Protocol, Union
+from enum import Enum
+
+from ..repositories.file_repository import FileSystemRepository
+from ..repositories.r2_file_repository import R2Repository
+
+
+class StorageType(Enum):
+    """Available storage types."""
+    FILESYSTEM = "filesystem"
+    R2 = "r2"
+
+
+class FileStorageRepository(Protocol):
+    """Protocol defining the file storage interface."""
+
+    async def save_uploaded_file(self, content: bytes, original_filename: str, job_id: str = None) -> str:
+        ...
+
+    async def save_stream(self, stream, original_filename: str, job_id: str = None, chunk_size: int = 1024 * 1024) -> str:
+        ...
+
+    async def create_output_path(self, job_id: str, format: str) -> str:
+        ...
+
+    async def create_temp_path(self, prefix: str, extension: str) -> str:
+        ...
+
+    async def create_deterministic_temp_path(self, job_id: str,
+                                             start_seconds: float = None,
+                                             end_seconds: float = None,
+                                             extension: str = None) -> str:
+        ...
+
+    async def read_file(self, file_path: str) -> bytes:
+        ...
+
+    async def file_exists(self, file_path: str) -> bool:
+        ...
+
+    async def get_file_size(self, file_path: str) -> int:
+        ...
+
+    async def delete_file(self, file_path: str) -> bool:
+        ...
+
+    async def cleanup_old_files(self, older_than_hours: int) -> int:
+        ...
+
+    # New methods for local file access
+    async def get_local_path(self, file_key: str) -> str:
+        """Get a local file path for processing. Downloads if remote storage."""
+        ...
+
+    async def save_local_file_to_storage(self, local_path: str, storage_key: str) -> str:
+        """Save a local file to storage and return the storage key."""
+        ...
+
+    async def cleanup_local_path(self, local_path: str, storage_key: str) -> bool:
+        """Clean up local temp file. No-op for filesystem storage."""
+        ...
+
+
+class FileStorageConfig(ABC):
+    """Base configuration for file storage."""
+    pass
+
+
+class FileSystemConfig(FileStorageConfig):
+    """Configuration for filesystem storage."""
+
+    def __init__(self, base_path: Union[str, Path]):
+        self.base_path = Path(base_path)
+
+
+class R2Config(FileStorageConfig):
+    """Configuration for Cloudflare R2 storage."""
+
+    def __init__(self, account_id: str, access_key_id: str, secret_access_key: str, bucket_name: str):
+        self.account_id = account_id
+        self.access_key_id = access_key_id
+        self.secret_access_key = secret_access_key
+        self.bucket_name = bucket_name
+
+        # Validate required fields
+        if not all([account_id, access_key_id, secret_access_key, bucket_name]):
+            raise ValueError("All R2 configuration fields are required")
+
+
+class FileStorageProvider:
+    """Factory for creating file storage repositories."""
+
+    @staticmethod
+    def create_filesystem_storage(config: FileSystemConfig) -> FileSystemRepository:
+        """Create a filesystem storage repository."""
+        return FileSystemRepository(base_path=config.base_path)
+
+    @staticmethod
+    def create_r2_storage(config: R2Config) -> R2Repository:
+        """Create an R2 storage repository."""
+        return R2Repository(
+            account_id=config.account_id,
+            access_key_id=config.access_key_id,
+            secret_access_key=config.secret_access_key,
+            bucket_name=config.bucket_name
+        )
+
+    @staticmethod
+    def create_storage(storage_type: StorageType, config: FileStorageConfig) -> FileStorageRepository:
+        """Create a file storage repository based on type and configuration."""
+        if storage_type == StorageType.FILESYSTEM:
+            if not isinstance(config, FileSystemConfig):
+                raise ValueError("FileSystemConfig required for filesystem storage")
+            return FileStorageProvider.create_filesystem_storage(config)
+
+        elif storage_type == StorageType.R2:
+            if not isinstance(config, R2Config):
+                raise ValueError("R2Config required for R2 storage")
+            return FileStorageProvider.create_r2_storage(config)
+
+        else:
+            raise ValueError(f"Unsupported storage type: {storage_type}")
+
+
+# Convenience factory functions
+def create_filesystem_storage(base_path: Union[str, Path]) -> FileSystemRepository:
+    """Convenience function to create filesystem storage."""
+    config = FileSystemConfig(base_path)
+    return FileStorageProvider.create_filesystem_storage(config)
+
+
+def create_r2_storage(account_id: str, access_key_id: str,
+                      secret_access_key: str, bucket_name: str) -> R2Repository:
+    """Convenience function to create R2 storage."""
+    config = R2Config(account_id, access_key_id, secret_access_key, bucket_name)
+    return FileStorageProvider.create_r2_storage(config)
+
+
+def create_storage_from_settings(storage_type: str, **kwargs) -> FileStorageRepository:
+    """Create storage from string type and keyword arguments."""
+    try:
+        storage_enum = StorageType(storage_type.lower())
+    except ValueError:
+        raise ValueError(f"Invalid storage type: {storage_type}. Must be one of: {[t.value for t in StorageType]}")
+
+    if storage_enum == StorageType.FILESYSTEM:
+        if 'base_path' not in kwargs:
+            raise ValueError("base_path required for filesystem storage")
+        config = FileSystemConfig(kwargs['base_path'])
+
+    elif storage_enum == StorageType.R2:
+        required_fields = ['account_id', 'access_key_id', 'secret_access_key', 'bucket_name']
+        missing_fields = [field for field in required_fields if field not in kwargs]
+        if missing_fields:
+            raise ValueError(f"Missing required R2 configuration fields: {missing_fields}")
+
+        config = R2Config(
+            account_id=kwargs['account_id'],
+            access_key_id=kwargs['access_key_id'],
+            secret_access_key=kwargs['secret_access_key'],
+            bucket_name=kwargs['bucket_name']
+        )
+
+    return FileStorageProvider.create_storage(storage_enum, config)
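
A brief usage sketch of the factory, assuming the project package is on the import path; the keyword arguments mirror what ServiceContainer passes in further down, and the credential and bucket values are placeholders.

from infrastructure.providers.file_storage_provider import create_storage_from_settings

# Local filesystem storage
fs_repo = create_storage_from_settings(
    storage_type="filesystem",
    base_path="/tmp/audio_extractor",
)

# Cloudflare R2 storage (placeholder credentials)
r2_repo = create_storage_from_settings(
    storage_type="r2",
    account_id="your-account-id",
    access_key_id="your-access-key",
    secret_access_key="your-secret-key",
    bucket_name="audio-extractor",
)

# Both objects satisfy the FileStorageRepository protocol, so callers such as
# LocalFileProcessor can use either one without knowing which backend is active.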
infrastructure/repositories/file_repository.py CHANGED
@@ -1,4 +1,4 @@
-"""File system repository implementation."""
+"""File system repository implementation with local path support."""
 from pathlib import Path
 from typing import Optional, List, Tuple
 import aiofiles
@@ -12,18 +12,19 @@ import asyncio
 logger = logging.getLogger(__name__)
 
 class FileSystemRepository:
-    """Repository for managing temporary files."""
+    """Repository for managing temporary files with local path support."""
 
     def __init__(self, base_path: Path):
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)
 
-    async def save_uploaded_file(self, content: bytes, original_filename: str) -> str:
+    async def save_uploaded_file(self, content: bytes, original_filename: str, job_id: str = None) -> str:
         """Save uploaded file and return the path."""
         # Generate unique filename
-        …
+        if job_id is None:
+            job_id = str(uuid.uuid4())
         extension = Path(original_filename).suffix
-        filename = f"{…
+        filename = f"{job_id}_input{extension}"
         file_path = self.base_path / filename
 
         async with aiofiles.open(file_path, 'wb') as f:
@@ -32,11 +33,12 @@ class FileSystemRepository:
         logger.info(f"Saved uploaded file: {file_path}")
         return str(file_path)
 
-    async def save_stream(self, stream, original_filename: str, chunk_size: int = 1024 * 1024) -> str:
+    async def save_stream(self, stream, original_filename: str, job_id: str = None, chunk_size: int = 1024 * 1024) -> str:
         """Save file from stream, handling both async and sync streams."""
-        …
+        if job_id is None:
+            job_id = str(uuid.uuid4())
         extension = Path(original_filename).suffix
-        filename = f"{…
+        filename = f"{job_id}_input{extension}"
         file_path = self.base_path / filename
 
         async with aiofiles.open(file_path, 'wb') as f:
@@ -94,15 +96,6 @@ class FileSystemRepository:
 
         This ensures that the same trimming parameters always result in the same filename,
         allowing for efficient reuse of previously trimmed files.
-
-        Args:
-            job_id: The job ID
-            start_seconds: Start time in seconds (None for beginning)
-            end_seconds: End time in seconds (None for end)
-            extension: File extension (e.g., 'mp3')
-
-        Returns:
-            str: Deterministic file path
         """
         import hashlib
 
@@ -175,4 +168,34 @@ class FileSystemRepository:
         if deleted_count > 0:
             logger.info(f"Cleaned up {deleted_count} old files")
 
-        return deleted_count
+        return deleted_count
+
+    # Local file access methods (no-op for filesystem storage)
+    async def get_local_path(self, file_path: str) -> str:
+        """Return the file path directly since it's already local."""
+        if not await self.file_exists(file_path):
+            raise FileNotFoundError(f"File not found: {file_path}")
+        return file_path
+
+    async def save_local_file_to_storage(self, local_path: str, storage_path: str) -> str:
+        """Copy local file to storage path (filesystem to filesystem)."""
+        try:
+            # Read source file
+            content = await self.read_file(local_path)
+
+            # Write to destination
+            async with aiofiles.open(storage_path, 'wb') as f:
+                await f.write(content)
+
+            logger.debug(f"Copied local file {local_path} to {storage_path}")
+            return storage_path
+
+        except Exception as e:
+            logger.error(f"Failed to copy local file {local_path} to {storage_path}: {e}")
+            raise
+
+    async def cleanup_local_path(self, local_path: str, storage_path: str) -> bool:
+        """No-op for filesystem storage since local_path == storage_path."""
+        # For filesystem storage, the local path and storage path are the same,
+        # so we don't need to clean up anything extra
+        return True
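
A short sketch of the filesystem fast path, assuming a writable temp directory and that file_exists behaves as the protocol describes; it shows why LocalFileProcessor can treat both backends the same way.

import asyncio
from pathlib import Path

from infrastructure.repositories.file_repository import FileSystemRepository

async def demo() -> None:
    repo = FileSystemRepository(base_path=Path("/tmp/audio_extractor"))

    # Saving an upload with an explicit job ID yields "<job_id>_input.mp4".
    path = await repo.save_uploaded_file(b"fake video bytes", "clip.mp4", job_id="job-123")

    # For filesystem storage, "local path" and "storage path" are the same thing ...
    local = await repo.get_local_path(path)
    assert local == path

    # ... and cleanup of the "local copy" is deliberately a no-op.
    assert await repo.cleanup_local_path(local, path) is True

asyncio.run(demo())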
infrastructure/repositories/r2_file_repository.py ADDED
@@ -0,0 +1,411 @@
+"""Cloudflare R2 repository implementation."""
+import asyncio
+import hashlib
+import inspect
+import json
+import logging
+import os
+import tempfile
+import uuid
+from datetime import datetime, timedelta
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import aioboto3
+from botocore.config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class R2Repository:
+    """Repository for managing files in Cloudflare R2."""
+
+    def __init__(self, account_id: str, access_key_id: str, secret_access_key: str, bucket_name: str):
+        self.account_id = account_id
+        self.access_key_id = access_key_id
+        self.secret_access_key = secret_access_key
+        self.bucket_name = bucket_name
+        self.endpoint_url = f"https://{account_id}.r2.cloudflarestorage.com"
+
+        # Create session configuration
+        self.config = Config(
+            signature_version='s3v4',
+            retries={'max_attempts': 3, 'mode': 'adaptive'}
+        )
+
+    def _get_client(self):
+        """Get R2 client using aioboto3."""
+        session = aioboto3.Session()
+        return session.client(
+            's3',
+            endpoint_url=self.endpoint_url,
+            aws_access_key_id=self.access_key_id,
+            aws_secret_access_key=self.secret_access_key,
+            config=self.config,
+            region_name='auto'
+        )
+
+    def _generate_job_key(self, job_id: str, filename: str) -> str:
+        """Generate R2 object key for job-specific file."""
+        return f"jobs/{job_id}/{filename}"
+
+    async def save_uploaded_file(self, content: bytes, original_filename: str, job_id: str = None) -> str:
+        """Save uploaded file and return the R2 object key."""
+        # Generate unique job ID if not provided
+        if job_id is None:
+            job_id = str(uuid.uuid4())
+        extension = Path(original_filename).suffix
+        filename = f"input{extension}"
+        object_key = self._generate_job_key(job_id, filename)
+
+        async with self._get_client() as s3:
+            await s3.put_object(
+                Bucket=self.bucket_name,
+                Key=object_key,
+                Body=content,
+                Metadata={
+                    'original_filename': original_filename,
+                    'upload_time': datetime.utcnow().isoformat(),
+                    'job_id': job_id
+                }
+            )
+
+        logger.info(f"Saved uploaded file to R2: {object_key}")
+        return object_key
+
+    async def save_stream(self, stream, original_filename: str, job_id: str = None, chunk_size: int = 1024 * 1024) -> str:
+        """Save file from stream to R2."""
+        if job_id is None:
+            job_id = str(uuid.uuid4())
+        extension = Path(original_filename).suffix
+        filename = f"input{extension}"
+        object_key = self._generate_job_key(job_id, filename)
+
+        # Buffer the stream content
+        buffer = BytesIO()
+        read_method = getattr(stream, 'read', None)
+
+        if read_method is None:
+            raise ValueError("Provided stream does not have a .read() method")
+
+        is_async = inspect.iscoroutinefunction(read_method)
+
+        # Read stream into buffer
+        while True:
+            if is_async:
+                chunk = await read_method(chunk_size)
+            else:
+                chunk = await asyncio.to_thread(read_method, chunk_size)
+
+            if not chunk:
+                break
+
+            buffer.write(chunk)
+
+        # Upload to R2
+        buffer.seek(0)
+        async with self._get_client() as s3:
+            await s3.put_object(
+                Bucket=self.bucket_name,
+                Key=object_key,
+                Body=buffer.getvalue(),
+                Metadata={
+                    'original_filename': original_filename,
+                    'upload_time': datetime.utcnow().isoformat(),
+                    'job_id': job_id
+                }
+            )
+
+        logger.info(f"Saved streamed file to R2: {object_key}")
+        return object_key
+
+    async def create_output_path(self, job_id: str, format: str) -> str:
+        """Create an R2 object key for output file."""
+        filename = f"output.{format}"
+        return self._generate_job_key(job_id, filename)
+
+    async def create_temp_path(self, prefix: str, extension: str) -> str:
+        """Create a unique temporary R2 object key."""
+        # Generate unique identifier
+        temp_id = str(uuid.uuid4())[:8]
+        job_id = str(uuid.uuid4())
+
+        # Clean the prefix to be filename-safe
+        safe_prefix = "".join(c for c in prefix if c.isalnum() or c in ('-', '_'))
+
+        # Ensure extension doesn't start with a dot
+        clean_extension = extension.lstrip('.')
+
+        # Create filename
+        filename = f"{safe_prefix}_{temp_id}.{clean_extension}"
+        return self._generate_job_key(job_id, filename)
+
+    async def create_deterministic_temp_path(self, job_id: str,
+                                             start_seconds: Optional[float],
+                                             end_seconds: Optional[float],
+                                             extension: str) -> str:
+        """Create a deterministic R2 object key based on parameters."""
+        # Create a unique string from the parameters
+        start_str = str(start_seconds) if start_seconds is not None else "start"
+        end_str = str(end_seconds) if end_seconds is not None else "end"
+
+        # Create hash input
+        hash_input = f"{job_id}_{start_str}_{end_str}"
+
+        # Generate MD5 hash
+        hash_suffix = hashlib.md5(hash_input.encode()).hexdigest()[:8]
+
+        # Clean extension
+        clean_extension = extension.lstrip('.')
+
+        # Create deterministic filename
+        filename = f"trim_{hash_suffix}.{clean_extension}"
+        object_key = self._generate_job_key(job_id, filename)
+
+        logger.debug(f"Created deterministic R2 key: {object_key} for params: {hash_input}")
+        return object_key
+
+    async def read_file(self, object_key: str) -> bytes:
+        """Read file content from R2."""
+        async with self._get_client() as s3:
+            try:
+                response = await s3.get_object(Bucket=self.bucket_name, Key=object_key)
+                content = await response['Body'].read()
+                return content
+            except Exception as e:
+                logger.error(f"Failed to read file from R2: {object_key}, error: {e}")
+                raise
+
+    async def file_exists(self, object_key: str) -> bool:
+        """Check if object exists in R2."""
+        async with self._get_client() as s3:
+            try:
+                await s3.head_object(Bucket=self.bucket_name, Key=object_key)
+                return True
+            except s3.exceptions.NoSuchKey:
+                return False
+            except Exception as e:
+                logger.error(f"Error checking if file exists: {object_key}, error: {e}")
+                return False
+
+    async def get_file_size(self, object_key: str) -> int:
+        """Get file size in bytes from R2."""
+        async with self._get_client() as s3:
+            try:
+                response = await s3.head_object(Bucket=self.bucket_name, Key=object_key)
+                return response['ContentLength']
+            except Exception as e:
+                logger.error(f"Failed to get file size: {object_key}, error: {e}")
+                raise
+
+    async def delete_file(self, object_key: str) -> bool:
+        """Delete an object from R2."""
+        async with self._get_client() as s3:
+            try:
+                await s3.delete_object(Bucket=self.bucket_name, Key=object_key)
+                logger.info(f"Deleted file from R2: {object_key}")
+                return True
+            except Exception as e:
+                logger.error(f"Failed to delete file from R2: {object_key}, error: {e}")
+                return False
+
+    async def delete_job_files(self, job_id: str) -> int:
+        """Delete all files for a specific job."""
+        prefix = f"jobs/{job_id}/"
+        deleted_count = 0
+
+        async with self._get_client() as s3:
+            try:
+                # List all objects with the job prefix
+                paginator = s3.get_paginator('list_objects_v2')
+                async for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix):
+                    if 'Contents' in page:
+                        # Delete objects in batches
+                        objects_to_delete = [{'Key': obj['Key']} for obj in page['Contents']]
+
+                        if objects_to_delete:
+                            delete_response = await s3.delete_objects(
+                                Bucket=self.bucket_name,
+                                Delete={'Objects': objects_to_delete}
+                            )
+                            deleted_count += len(delete_response.get('Deleted', []))
+
+                if deleted_count > 0:
+                    logger.info(f"Deleted {deleted_count} files for job {job_id}")
+
+            except Exception as e:
+                logger.error(f"Failed to delete job files for {job_id}: {e}")
+
+        return deleted_count
+
+    async def save_job_metadata(self, job_id: str, metadata: dict) -> str:
+        """Save job metadata as JSON."""
+        metadata_key = self._generate_job_key(job_id, "metadata.json")
+        metadata_with_timestamp = {
+            **metadata,
+            'created_at': datetime.utcnow().isoformat(),
+            'job_id': job_id
+        }
+
+        async with self._get_client() as s3:
+            await s3.put_object(
+                Bucket=self.bucket_name,
+                Key=metadata_key,
+                Body=json.dumps(metadata_with_timestamp, indent=2),
+                ContentType='application/json'
+            )
+
+        logger.debug(f"Saved metadata for job {job_id}")
+        return metadata_key
+
+    async def get_job_metadata(self, job_id: str) -> Optional[dict]:
+        """Get job metadata."""
+        metadata_key = self._generate_job_key(job_id, "metadata.json")
+
+        try:
+            content = await self.read_file(metadata_key)
+            return json.loads(content.decode('utf-8'))
+        except Exception as e:
+            logger.debug(f"No metadata found for job {job_id}: {e}")
+            return None
+
+    async def list_old_files(self, older_than_hours: int) -> List[Tuple[str, datetime]]:
+        """List jobs older than specified hours by checking metadata."""
+        cutoff_time = datetime.utcnow() - timedelta(hours=older_than_hours)
+        old_jobs = []
+
+        async with self._get_client() as s3:
+            try:
+                # List all metadata files
+                paginator = s3.get_paginator('list_objects_v2')
+                async for page in paginator.paginate(
+                    Bucket=self.bucket_name,
+                    Prefix="jobs/",
+                    Delimiter="/"
+                ):
+                    # Get job directories
+                    if 'CommonPrefixes' in page:
+                        for prefix_info in page['CommonPrefixes']:
+                            job_prefix = prefix_info['Prefix']
+                            # Extract job_id from prefix like "jobs/uuid/"
+                            job_id = job_prefix.split('/')[1]
+
+                            # Check if this job has metadata and if it's old
+                            metadata = await self.get_job_metadata(job_id)
+                            if metadata and 'created_at' in metadata:
+                                created_time = datetime.fromisoformat(metadata['created_at'])
+                                if created_time < cutoff_time:
+                                    old_jobs.append((job_id, created_time))
+
+            except Exception as e:
+                logger.error(f"Failed to list old files: {e}")
+
+        return old_jobs
+
+    async def cleanup_old_files(self, older_than_hours: int) -> int:
+        """Clean up jobs older than specified hours."""
+        old_jobs = await self.list_old_files(older_than_hours)
+        total_deleted = 0
+
+        for job_id, created_time in old_jobs:
+            deleted_count = await self.delete_job_files(job_id)
+            total_deleted += deleted_count
+
+        if total_deleted > 0:
+            logger.info(f"Cleaned up {total_deleted} files from {len(old_jobs)} old jobs")
+
+        return total_deleted
+
+    # Additional utility methods for job management
+    async def list_job_files(self, job_id: str) -> List[str]:
+        """List all files for a specific job."""
+        prefix = f"jobs/{job_id}/"
+        files = []
+
+        async with self._get_client() as s3:
+            try:
+                paginator = s3.get_paginator('list_objects_v2')
+                async for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix):
+                    if 'Contents' in page:
+                        files.extend([obj['Key'] for obj in page['Contents']])
+            except Exception as e:
+                logger.error(f"Failed to list job files for {job_id}: {e}")
+
+        return files
+
+    async def get_job_id_from_key(self, object_key: str) -> Optional[str]:
+        """Extract job ID from R2 object key."""
+        try:
+            # Expecting format: jobs/{job_id}/filename
+            parts = object_key.split('/')
+            if len(parts) >= 3 and parts[0] == 'jobs':
+                return parts[1]
+        except Exception:
+            pass
+        return None
+
+    # Local file access methods for FFmpeg processing
+    async def get_local_path(self, storage_key: str) -> str:
+        """Download file from R2 to local temp file and return local path."""
+        # Create temp file with appropriate extension
+        _, ext = os.path.splitext(storage_key.split('/')[-1])
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
+            temp_path = temp_file.name
+
+        try:
+            # Download content from R2
+            content = await self.read_file(storage_key)
+
+            # Write to temp file
+            with open(temp_path, 'wb') as f:
+                f.write(content)
+
+            logger.debug(f"Downloaded {storage_key} to local temp file: {temp_path}")
+            return temp_path
+
+        except Exception as e:
+            # Clean up temp file if download failed
+            if os.path.exists(temp_path):
+                os.unlink(temp_path)
+            logger.error(f"Failed to download {storage_key} to local temp: {e}")
+            raise
+
+    async def save_local_file_to_storage(self, local_path: str, storage_key: str) -> str:
+        """Upload local file to R2 storage."""
+        try:
+            # Read local file
+            with open(local_path, 'rb') as f:
+                content = f.read()
+
+            # Upload to R2
+            async with self._get_client() as s3:
+                await s3.put_object(
+                    Bucket=self.bucket_name,
+                    Key=storage_key,
+                    Body=content,
+                    Metadata={
+                        'upload_time': datetime.utcnow().isoformat(),
+                        'source': 'local_file'
+                    }
+                )
+
+            logger.debug(f"Uploaded local file {local_path} to R2: {storage_key}")
+            return storage_key
+
+        except Exception as e:
+            logger.error(f"Failed to upload local file {local_path} to R2: {e}")
+            raise
+
+    async def cleanup_local_path(self, local_path: str, storage_key: str) -> bool:
+        """Clean up local temp file."""
+        try:
+            if os.path.exists(local_path):
+                os.unlink(local_path)
+                logger.debug(f"Cleaned up local temp file: {local_path}")
+                return True
+            return False
+        except Exception as e:
+            logger.error(f"Failed to clean up local temp file {local_path}: {e}")
+            return False
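
A small usage sketch of the repository round trip. It assumes real R2 credentials, network access, and an existing bucket; the credential strings below are placeholders.

import asyncio

from infrastructure.repositories.r2_file_repository import R2Repository

async def demo() -> None:
    repo = R2Repository(
        account_id="your-account-id",          # placeholder
        access_key_id="your-access-key",       # placeholder
        secret_access_key="your-secret-key",   # placeholder
        bucket_name="audio-extractor",         # placeholder
    )

    # Upload bytes; the returned key follows the jobs/{job_id}/input.<ext> layout.
    key = await repo.save_uploaded_file(b"fake video bytes", "clip.mp4", job_id="job-123")
    assert key == "jobs/job-123/input.mp4"

    # Download the object to a local temp file (e.g. for FFmpeg), then drop the copy.
    local_path = await repo.get_local_path(key)
    await repo.cleanup_local_path(local_path, key)

    # Remove everything stored under the job prefix.
    await repo.delete_job_files("job-123")

asyncio.run(demo())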
infrastructure/services/container.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 
 from ..config.settings import settings
 from ..repositories.job_repository import InMemoryJobRepository
-from ..…
+from ..providers.file_storage_provider import create_storage_from_settings
 from .ffmpeg_service import FFmpegService
 from .file_cleanup_service import FileCleanupService
 
@@ -16,7 +16,9 @@ class ServiceContainer:
     def __init__(self):
         # Repositories
         self.job_repository = InMemoryJobRepository()
-        …
+
+        # Create file repository based on settings
+        self.file_repository = self._create_file_repository()
 
         # Services
         self.ffmpeg_service = FFmpegService(
@@ -32,6 +34,24 @@ class ServiceContainer:
             retention_hours=settings.file_retention_hours
         )
 
+    def _create_file_repository(self):
+        """Create file repository based on settings."""
+        if settings.storage_type.lower() == "filesystem":
+            return create_storage_from_settings(
+                storage_type="filesystem",
+                base_path=settings.temp_dir
+            )
+        elif settings.storage_type.lower() == "r2":
+            return create_storage_from_settings(
+                storage_type="r2",
+                account_id=settings.cloudflare_r2_account_id,
+                access_key_id=settings.cloudflare_r2_access_key_id,
+                secret_access_key=settings.cloudflare_r2_secret_access_key,
+                bucket_name=settings.cloudflare_r2_bucket_name
+            )
+        else:
+            raise ValueError(f"Unsupported storage type: {settings.storage_type}")
+
     @classmethod
     def get_instance(cls) -> 'ServiceContainer':
         """Get singleton instance of service container."""
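
With this wiring, callers never construct a storage backend directly; they ask the container, which resolves it once from settings. A minimal sketch, assuming STORAGE_TYPE (and, for R2, the credential variables) is already set in the environment:

from infrastructure.services.container import ServiceContainer

container = ServiceContainer.get_instance()

# Either a FileSystemRepository or an R2Repository, depending on STORAGE_TYPE;
# downstream code relies only on the shared FileStorageRepository protocol.
file_repository = container.file_repository
print(type(file_repository).__name__)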
infrastructure/services/local_file_processor.py ADDED
@@ -0,0 +1,112 @@
+"""Local file processor for handling FFmpeg operations with remote storage."""
+import logging
+import os
+from contextlib import asynccontextmanager
+from typing import AsyncGenerator, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class LocalFileProcessor:
+    """Handles local file operations for FFmpeg processing with remote storage."""
+
+    def __init__(self, file_repository):
+        self.file_repository = file_repository
+
+    @asynccontextmanager
+    async def get_local_files(self, input_storage_key: str, output_storage_key: str) -> AsyncGenerator[Tuple[str, str], None]:
+        """
+        Context manager that provides local file paths for input and output.
+
+        For filesystem storage: Returns paths directly
+        For R2 storage: Downloads input to temp, provides temp output path
+
+        Usage:
+            async with processor.get_local_files(input_key, output_key) as (local_input, local_output):
+                # Run FFmpeg with local_input and local_output
+                ffmpeg_command(local_input, local_output)
+            # Files are automatically uploaded and cleaned up
+
+        Args:
+            input_storage_key: Storage key/path for input file
+            output_storage_key: Storage key/path for output file
+
+        Yields:
+            Tuple[str, str]: (local_input_path, local_output_path)
+        """
+        local_input_path = None
+        local_output_path = None
+
+        try:
+            # Get local input path (downloads if remote)
+            local_input_path = await self.file_repository.get_local_path(input_storage_key)
+            logger.debug(f"Got local input path: {local_input_path}")
+
+            # Create local output path
+            if hasattr(self.file_repository, 'base_path'):
+                # Filesystem storage - create output path directly
+                local_output_path = output_storage_key
+            else:
+                # Remote storage - create temp file for output
+                import tempfile
+                _, ext = os.path.splitext(output_storage_key.split('/')[-1])
+                with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
+                    local_output_path = temp_file.name
+
+            logger.debug(f"Created local output path: {local_output_path}")
+
+            # Yield the local paths for processing
+            yield local_input_path, local_output_path
+
+            # Upload output file to storage if it was created
+            if os.path.exists(local_output_path) and os.path.getsize(local_output_path) > 0:
+                await self.file_repository.save_local_file_to_storage(
+                    local_output_path,
+                    output_storage_key
+                )
+                logger.debug(f"Uploaded output file to storage: {output_storage_key}")
+            else:
+                logger.warning(f"Output file was not created or is empty: {local_output_path}")
+
+        except Exception as e:
+            logger.error(f"Error in local file processing: {e}")
+            raise
+
+        finally:
+            # Clean up local files
+            cleanup_tasks = []
+
+            if local_input_path:
+                cleanup_tasks.append(
+                    self.file_repository.cleanup_local_path(local_input_path, input_storage_key)
+                )
+
+            if local_output_path and hasattr(self.file_repository, '_get_client'):
+                # Only clean up output file for remote storage (R2)
+                cleanup_tasks.append(
+                    self.file_repository.cleanup_local_path(local_output_path, output_storage_key)
+                )
+
+            # Execute cleanup tasks
+            for task in cleanup_tasks:
+                try:
+                    await task
+                except Exception as e:
+                    logger.error(f"Error during cleanup: {e}")
+
+    async def process_with_ffmpeg(self, input_storage_key: str, output_storage_key: str, ffmpeg_func, *args, **kwargs):
+        """
+        Helper method to process files with FFmpeg.
+
+        Args:
+            input_storage_key: Storage key for input file
+            output_storage_key: Storage key for output file
+            ffmpeg_func: Async function that takes (input_path, output_path, *args, **kwargs)
+            *args, **kwargs: Additional arguments for ffmpeg_func
+
+        Returns:
+            Result from ffmpeg_func
+        """
+        async with self.get_local_files(input_storage_key, output_storage_key) as (local_input, local_output):
+            logger.info(f"Processing {input_storage_key} -> {output_storage_key}")
+            return await ffmpeg_func(local_input, local_output, *args, **kwargs)
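
A short sketch of process_with_ffmpeg with a stand-in processing function (the real code passes ffmpeg_service.extract_audio). It assumes filesystem storage and that create_output_path returns a writable path; the format and quality values are placeholders. The same call works unchanged against an R2Repository, where the input is downloaded first and the output uploaded afterwards.

import asyncio
import shutil

from infrastructure.repositories.file_repository import FileSystemRepository
from infrastructure.services.local_file_processor import LocalFileProcessor

async def fake_extract(input_path: str, output_path: str, format: str, quality: str):
    # Stand-in for the real FFmpeg call: just copy the input to the output path.
    shutil.copyfile(input_path, output_path)
    return output_path

async def demo() -> None:
    repo = FileSystemRepository(base_path="/tmp/audio_extractor")
    processor = LocalFileProcessor(repo)

    input_key = await repo.save_uploaded_file(b"fake video bytes", "clip.mp4", job_id="job-123")
    output_key = await repo.create_output_path("job-123", "mp3")

    # For filesystem storage the download/upload steps are no-ops; with R2 the
    # processor pulls the input down, runs fake_extract locally, and pushes the
    # result back under the same storage key.
    await processor.process_with_ffmpeg(
        input_key, output_key, fake_extract, format="mp3", quality="high"
    )

asyncio.run(demo())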
interfaces/api/routes/extraction_routes.py CHANGED
@@ -6,6 +6,7 @@ from dataclasses import asdict
 from ..dependencies import ValidatedVideo, ExtractionParams, UseCases, Services
 from ..responses import JobCreatedResponse
 from application.dto.extraction_request import ExtractionRequestDTO
+from domain.entities.job import Job
 
 router = APIRouter()
 
@@ -44,33 +45,42 @@ async def extract_audio(
     # Get file size
     file_size = _get_file_size(video)
 
+    # Create job first to get the job ID
+    job = Job.create_new(
+        video_filename=video.filename,
+        file_size_bytes=file_size,
+        output_format=params.output_format,
+        quality=params.quality
+    )
+
+    # Save uploaded file with the job ID
+    file_path = await services.file_repository.save_stream(
+        video,
+        video.filename,
+        job_id=job.id  # Pass the job ID here
+    )
+
     # Create DTO
     extraction_dto = ExtractionRequestDTO(
         video_filename=video.filename,
-        video_file_path=…
+        video_file_path=file_path,
         video_file_size=file_size,
         output_format=params.output_format,
         quality=params.quality,
         content_type=video.content_type
     )
 
-    # …
-    file_path = await services.file_repository.save_stream(
-        video,
-        video.filename
-    )
-    extraction_dto.video_file_path = file_path
-
-    # Always use async processing
+    # Execute use case with pre-created job
     try:
-        result = await use_cases.extract_audio_async.…
+        result = await use_cases.extract_audio_async.execute_with_job(
+            job,  # Pass the job object
             extraction_dto,
             background_tasks
         )
 
         return JSONResponse(
             content=asdict(result),
             status_code=202
         )
     except Exception as e:
         # Clean up input file on error
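
For reference, a hedged sketch of how a client might consume the 202 response. The response field names come from JobCreationDTO above; the endpoint path, host, form field names, and the use of httpx are assumptions about the deployment, not part of this commit.

import httpx

# Hypothetical endpoint path and form field names; adjust to the deployed route.
with open("clip.mp4", "rb") as f:
    response = httpx.post(
        "http://localhost:8000/api/v1/extract",
        files={"video": ("clip.mp4", f, "video/mp4")},
        data={"output_format": "mp3", "quality": "high"},
    )

assert response.status_code == 202
body = response.json()
# JobCreationDTO fields: job_id, status, message, check_url, file_size_mb
status = httpx.get(f"http://localhost:8000{body['check_url']}").json()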
requirements.txt CHANGED
@@ -10,5 +10,9 @@ pydantic-settings==2.1.0
 aiofiles==23.2.1
 ffmpeg-python==0.2.0
 
+# Cloud Storage (R2/S3 compatible)
+aioboto3==12.3.0
+botocore==1.34.34
+
 # Utilities
 python-dotenv==1.0.0