calebhan committed on
Commit
a5359f9
·
1 Parent(s): b5fec2f

file upload pipeline

Browse files
.gitignore CHANGED
@@ -244,6 +244,7 @@ storage/youtube_cookies*
244
  !storage/README.txt
245
  storage/outputs/*
246
  storage/temp/*
 
247
 
248
  # Temp files
249
  /tmp/
 
244
  !storage/README.txt
245
  storage/outputs/*
246
  storage/temp/*
247
+ storage/uploads/*
248
 
249
  # Temp files
250
  /tmp/
backend/celery_app.py CHANGED
@@ -1,4 +1,12 @@
1
  """Celery application configuration."""
 
 
 
 
 
 
 
 
2
  from celery import Celery
3
  from kombu import Exchange, Queue
4
  from app_config import settings
 
1
  """Celery application configuration."""
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Ensure backend directory is in Python path for imports
6
+ backend_dir = Path(__file__).parent.resolve()
7
+ if str(backend_dir) not in sys.path:
8
+ sys.path.insert(0, str(backend_dir))
9
+
10
  from celery import Celery
11
  from kombu import Exchange, Queue
12
  from app_config import settings
backend/main.py CHANGED
@@ -1,5 +1,5 @@
1
  """FastAPI application for Rescored backend."""
2
- from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect, Request, File, UploadFile
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import FileResponse
5
  from pydantic import BaseModel, HttpUrl
@@ -157,6 +157,11 @@ class TranscribeRequest(BaseModel):
157
  options: dict = {"instruments": ["piano"]}
158
 
159
 
 
 
 
 
 
160
  class TranscribeResponse(BaseModel):
161
  """Response model for transcription submission."""
162
  job_id: str
@@ -288,6 +293,93 @@ async def submit_transcription(request: TranscribeRequest):
288
  )
289
 
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  @app.get("/api/v1/jobs/{job_id}", response_model=JobStatusResponse)
292
  async def get_job_status(job_id: str):
293
  """
 
1
  """FastAPI application for Rescored backend."""
2
+ from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect, Request, File, UploadFile, Form
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import FileResponse
5
  from pydantic import BaseModel, HttpUrl
 
157
  options: dict = {"instruments": ["piano"]}
158
 
159
 
160
+ class FileUploadTranscribeRequest(BaseModel):
161
+ """Request model for file upload transcription."""
162
+ options: dict = {"instruments": ["piano"]}
163
+
164
+
165
  class TranscribeResponse(BaseModel):
166
  """Response model for transcription submission."""
167
  job_id: str
 
293
  )
294
 
295
 
296
@app.post("/api/v1/transcribe/upload", response_model=TranscribeResponse, status_code=201)
async def submit_file_transcription(
    file: UploadFile = File(...),
    instruments: str = Form('["piano"]'),
    vocal_instrument: int = Form(40)  # Default to violin (program 40)
):
    """
    Submit an audio file for transcription.

    Args:
        file: Audio file (WAV, MP3, FLAC, etc.)
        instruments: JSON array of instruments (default: ["piano"])
        vocal_instrument: MIDI program number for vocals (default: 40 = violin)

    Returns:
        Job information including job ID and WebSocket URL

    Raises:
        HTTPException: 400 if the file extension is unsupported or the
            file exceeds the 100MB size limit.
    """
    print(f"[DEBUG] FastAPI received instruments parameter: {instruments!r}")
    print(f"[DEBUG] FastAPI received vocal_instrument parameter: {vocal_instrument}")

    # Validate file type by extension only (file content is not sniffed).
    allowed_extensions = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac'}
    file_ext = Path(file.filename or '').suffix.lower()

    if file_ext not in allowed_extensions:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}"
        )

    # Validate file size (max 100MB). The whole upload is read into memory
    # here, which is acceptable under this size cap.
    max_size = 100 * 1024 * 1024  # 100MB
    content = await file.read()
    if len(content) > max_size:
        raise HTTPException(
            status_code=400,
            detail="File too large. Maximum size: 100MB"
        )

    # Parse instruments option; fall back to piano on malformed JSON.
    # Uses the module-level `json` import (the previous local
    # `import json as json_module` alias was redundant).
    try:
        instruments_list = json.loads(instruments)
        print(f"[DEBUG] Parsed instruments list: {instruments_list}")
    except ValueError as e:
        print(f"[DEBUG] Failed to parse instruments, using default ['piano']. Error: {e}")
        instruments_list = ["piano"]

    # Create job
    job_id = str(uuid4())

    # Save uploaded file to storage so the Celery worker can read it later.
    upload_dir = settings.storage_path / "uploads"
    upload_dir.mkdir(parents=True, exist_ok=True)
    upload_path = upload_dir / f"{job_id}{file_ext}"
    upload_path.write_bytes(content)

    job_data = {
        "job_id": job_id,
        "status": "queued",
        "upload_path": str(upload_path),
        "original_filename": file.filename or "unknown",
        "options": json.dumps({"instruments": instruments_list, "vocal_instrument": vocal_instrument}),
        "created_at": datetime.utcnow().isoformat(),
        "progress": 0,
        "current_stage": "queued",
        "status_message": "Job queued for processing",
    }

    # Store job metadata in Redis so status polling / WebSocket can find it.
    redis_client.hset(f"job:{job_id}", mapping=job_data)

    # Queue Celery task
    process_transcription_task.delay(job_id)

    return TranscribeResponse(
        job_id=job_id,
        status="queued",
        created_at=datetime.utcnow(),
        estimated_duration_seconds=120,
        websocket_url=f"ws://localhost:{settings.api_port}/api/v1/jobs/{job_id}/stream"
    )
381
+
382
+
383
  @app.get("/api/v1/jobs/{job_id}", response_model=JobStatusResponse)
384
  async def get_job_status(job_id: str):
385
  """
backend/pipeline.py CHANGED
@@ -33,18 +33,52 @@ except ImportError as e:
33
  print(f"WARNING: madmom not available. Falling back to librosa for tempo/beat detection.")
34
  print(f" Error: {e}")
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
 
38
  class TranscriptionPipeline:
39
  """Handles the complete transcription workflow."""
40
 
41
- def __init__(self, job_id: str, youtube_url: str, storage_path: Path, config=None):
42
  self.job_id = job_id
43
  self.youtube_url = youtube_url
44
  self.storage_path = storage_path
45
  self.temp_dir = storage_path / "temp" / job_id
46
  self.temp_dir.mkdir(parents=True, exist_ok=True)
47
  self.progress_callback = None
 
48
 
49
  # Load configuration
50
  if config is None:
@@ -117,6 +151,9 @@ class TranscriptionPipeline:
117
  else:
118
  midi_path = piano_midi
119
 
 
 
 
120
  # Apply post-processing filters (Phase 4)
121
  midi_path = self.apply_post_processing_filters(midi_path)
122
 
@@ -164,6 +201,16 @@ class TranscriptionPipeline:
164
  # Log the full error for debugging
165
  print(f"yt-dlp stderr: {result.stderr}")
166
  print(f"yt-dlp stdout: {result.stdout}")
 
 
 
 
 
 
 
 
 
 
167
  raise RuntimeError(f"yt-dlp failed: {result.stderr}")
168
 
169
  if not output_path.exists():
@@ -231,7 +278,9 @@ class TranscriptionPipeline:
231
  # 2. Demucs separates clean instrumental into piano/guitar/drums/bass/other
232
  print(" Using two-stage separation (BS-RoFormer + Demucs)")
233
 
234
- from audio_separator_wrapper import AudioSeparator
 
 
235
  separator = AudioSeparator()
236
 
237
  separation_dir = self.temp_dir / "separation"
@@ -254,7 +303,9 @@ class TranscriptionPipeline:
254
  # Direct Demucs 6-stem separation (no vocal pre-removal)
255
  print(" Using Demucs 6-stem separation")
256
 
257
- from audio_separator_wrapper import AudioSeparator
 
 
258
  separator = AudioSeparator()
259
 
260
  instrument_dir = self.temp_dir / "instruments"
@@ -298,6 +349,52 @@ class TranscriptionPipeline:
298
 
299
  return stems
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  def transcribe_to_midi(
302
  self,
303
  audio_path: Path,
@@ -407,16 +504,8 @@ class TranscriptionPipeline:
407
  Raises:
408
  RuntimeError: If transcription fails
409
  """
410
- try:
411
- from yourmt3_wrapper import YourMT3Transcriber
412
- except ImportError:
413
- # Try adding backend directory to path
414
- import sys
415
- from pathlib import Path as PathLib
416
- backend_dir = PathLib(__file__).parent
417
- if str(backend_dir) not in sys.path:
418
- sys.path.insert(0, str(backend_dir))
419
- from yourmt3_wrapper import YourMT3Transcriber
420
 
421
  print(f" Transcribing with YourMT3+ (direct call, device: {self.config.yourmt3_device})...")
422
 
@@ -459,20 +548,12 @@ class TranscriptionPipeline:
459
  Raises:
460
  RuntimeError: If transcription fails
461
  """
462
- try:
463
- from yourmt3_wrapper import YourMT3Transcriber
464
- from bytedance_wrapper import ByteDanceTranscriber
465
- from ensemble_transcriber import EnsembleTranscriber
466
- except ImportError:
467
- # Try adding backend directory to path
468
- import sys
469
- from pathlib import Path as PathLib
470
- backend_dir = PathLib(__file__).parent
471
- if str(backend_dir) not in sys.path:
472
- sys.path.insert(0, str(backend_dir))
473
- from yourmt3_wrapper import YourMT3Transcriber
474
- from bytedance_wrapper import ByteDanceTranscriber
475
- from ensemble_transcriber import EnsembleTranscriber
476
 
477
  try:
478
  # Initialize transcribers
@@ -527,15 +608,8 @@ class TranscriptionPipeline:
527
 
528
  # Use YourMT3+ for vocal transcription
529
  # (Could use dedicated melody transcription model in future)
530
- try:
531
- from yourmt3_wrapper import YourMT3Transcriber
532
- except ImportError:
533
- import sys
534
- from pathlib import Path as PathLib
535
- backend_dir = PathLib(__file__).parent
536
- if str(backend_dir) not in sys.path:
537
- sys.path.insert(0, str(backend_dir))
538
- from yourmt3_wrapper import YourMT3Transcriber
539
 
540
  transcriber = YourMT3Transcriber(
541
  model_name="YPTF.MoE+Multi (noPS)",
@@ -648,6 +722,103 @@ class TranscriptionPipeline:
648
 
649
  return merged_path
650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  def apply_post_processing_filters(self, midi_path: Path) -> Path:
652
  """
653
  Apply post-processing filters to improve transcription quality.
 
33
  print(f"WARNING: madmom not available. Falling back to librosa for tempo/beat detection.")
34
  print(f" Error: {e}")
35
 
36
+ # Import wrapper modules at top level
37
+ try:
38
+ from audio_separator_wrapper import AudioSeparator
39
+ AUDIO_SEPARATOR_AVAILABLE = True
40
+ except ImportError as e:
41
+ AUDIO_SEPARATOR_AVAILABLE = False
42
+ AudioSeparator = None
43
+ print(f"WARNING: audio_separator_wrapper not available: {e}")
44
+
45
+ try:
46
+ from yourmt3_wrapper import YourMT3Transcriber
47
+ YOURMT3_AVAILABLE = True
48
+ except ImportError as e:
49
+ YOURMT3_AVAILABLE = False
50
+ YourMT3Transcriber = None
51
+ print(f"WARNING: yourmt3_wrapper not available: {e}")
52
+
53
+ try:
54
+ from bytedance_wrapper import ByteDanceTranscriber
55
+ BYTEDANCE_AVAILABLE = True
56
+ except ImportError as e:
57
+ BYTEDANCE_AVAILABLE = False
58
+ ByteDanceTranscriber = None
59
+ print(f"WARNING: bytedance_wrapper not available: {e}")
60
+
61
+ try:
62
+ from ensemble_transcriber import EnsembleTranscriber
63
+ ENSEMBLE_AVAILABLE = True
64
+ except ImportError as e:
65
+ ENSEMBLE_AVAILABLE = False
66
+ EnsembleTranscriber = None
67
+ print(f"WARNING: ensemble_transcriber not available: {e}")
68
+
69
 
70
 
71
  class TranscriptionPipeline:
72
  """Handles the complete transcription workflow."""
73
 
74
+ def __init__(self, job_id: str, youtube_url: str, storage_path: Path, config=None, instruments: list = None):
75
  self.job_id = job_id
76
  self.youtube_url = youtube_url
77
  self.storage_path = storage_path
78
  self.temp_dir = storage_path / "temp" / job_id
79
  self.temp_dir.mkdir(parents=True, exist_ok=True)
80
  self.progress_callback = None
81
+ self.instruments = instruments if instruments else ['piano']
82
 
83
  # Load configuration
84
  if config is None:
 
151
  else:
152
  midi_path = piano_midi
153
 
154
+ # Filter MIDI to only include selected instruments
155
+ midi_path = self.filter_midi_by_instruments(midi_path)
156
+
157
  # Apply post-processing filters (Phase 4)
158
  midi_path = self.apply_post_processing_filters(midi_path)
159
 
 
201
  # Log the full error for debugging
202
  print(f"yt-dlp stderr: {result.stderr}")
203
  print(f"yt-dlp stdout: {result.stdout}")
204
+
205
+ # Check for DNS resolution errors
206
+ stderr_lower = result.stderr.lower()
207
+ if ("failed to resolve" in stderr_lower or
208
+ "no address associated with hostname" in stderr_lower or
209
+ "unable to download api page" in stderr_lower):
210
+ raise RuntimeError(
211
+ "Unable to connect to YouTube. For this demo version, please upload your audio file directly using the file upload option."
212
+ )
213
+
214
  raise RuntimeError(f"yt-dlp failed: {result.stderr}")
215
 
216
  if not output_path.exists():
 
278
  # 2. Demucs separates clean instrumental into piano/guitar/drums/bass/other
279
  print(" Using two-stage separation (BS-RoFormer + Demucs)")
280
 
281
+ if not AUDIO_SEPARATOR_AVAILABLE or AudioSeparator is None:
282
+ raise RuntimeError("audio_separator_wrapper is not available")
283
+
284
  separator = AudioSeparator()
285
 
286
  separation_dir = self.temp_dir / "separation"
 
303
  # Direct Demucs 6-stem separation (no vocal pre-removal)
304
  print(" Using Demucs 6-stem separation")
305
 
306
+ if not AUDIO_SEPARATOR_AVAILABLE or AudioSeparator is None:
307
+ raise RuntimeError("audio_separator_wrapper is not available")
308
+
309
  separator = AudioSeparator()
310
 
311
  instrument_dir = self.temp_dir / "instruments"
 
349
 
350
  return stems
351
 
352
def transcribe_multiple_stems(self, stems: dict) -> Path:
    """
    Transcribe multiple instrument stems and combine into single MIDI.

    Args:
        stems: Dict mapping stem names to file paths (e.g., {'piano': Path, 'vocals': Path})

    Returns:
        Path to combined MIDI file
    """
    import pretty_midi

    print(f" Transcribing {len(stems)} stems: {list(stems.keys())}")

    # Run each stem through the appropriate transcriber.
    per_stem_midi = {}
    for name, stem_file in stems.items():
        print(f" [Stem {name}] Transcribing {stem_file.name}...")

        # Piano may use the ensemble path when configured; everything
        # else goes through YourMT3+.
        wants_ensemble = name == 'piano' and self.config.use_ensemble_transcription
        transcribe = (
            self.transcribe_with_ensemble if wants_ensemble
            else self.transcribe_with_yourmt3
        )
        per_stem_midi[name] = transcribe(stem_file)

        print(f" [Stem {name}] ✓ Complete")

    # Merge every per-stem MIDI into one multi-track file.
    print(f" Combining {len(per_stem_midi)} MIDI files...")
    merged = pretty_midi.PrettyMIDI()
    for midi_file in per_stem_midi.values():
        merged.instruments.extend(pretty_midi.PrettyMIDI(str(midi_file)).instruments)

    combined_path = self.temp_dir / "combined_stems.mid"
    merged.write(str(combined_path))

    print(f" ✓ Combined {len(per_stem_midi)} stems into {len(merged.instruments)} MIDI tracks")

    return combined_path
397
+
398
  def transcribe_to_midi(
399
  self,
400
  audio_path: Path,
 
504
  Raises:
505
  RuntimeError: If transcription fails
506
  """
507
+ if not YOURMT3_AVAILABLE or YourMT3Transcriber is None:
508
+ raise RuntimeError("yourmt3_wrapper is not available")
 
 
 
 
 
 
 
 
509
 
510
  print(f" Transcribing with YourMT3+ (direct call, device: {self.config.yourmt3_device})...")
511
 
 
548
  Raises:
549
  RuntimeError: If transcription fails
550
  """
551
+ if not YOURMT3_AVAILABLE or YourMT3Transcriber is None:
552
+ raise RuntimeError("yourmt3_wrapper is not available")
553
+ if not BYTEDANCE_AVAILABLE or ByteDanceTranscriber is None:
554
+ raise RuntimeError("bytedance_wrapper is not available")
555
+ if not ENSEMBLE_AVAILABLE or EnsembleTranscriber is None:
556
+ raise RuntimeError("ensemble_transcriber is not available")
 
 
 
 
 
 
 
 
557
 
558
  try:
559
  # Initialize transcribers
 
608
 
609
  # Use YourMT3+ for vocal transcription
610
  # (Could use dedicated melody transcription model in future)
611
+ if not YOURMT3_AVAILABLE or YourMT3Transcriber is None:
612
+ raise RuntimeError("yourmt3_wrapper is not available")
 
 
 
 
 
 
 
613
 
614
  transcriber = YourMT3Transcriber(
615
  model_name="YPTF.MoE+Multi (noPS)",
 
722
 
723
  return merged_path
724
 
725
+ def filter_midi_by_instruments(self, midi_path: Path) -> Path:
726
+ """
727
+ Filter MIDI file to only include tracks for selected instruments.
728
+
729
+ YourMT3+ transcribes all instruments it detects. This function filters
730
+ the output to only keep tracks matching the user's selection.
731
+
732
+ Args:
733
+ midi_path: Input MIDI file (may contain multiple instrument tracks)
734
+
735
+ Returns:
736
+ Path to filtered MIDI file containing only selected instruments
737
+ """
738
+ import pretty_midi
739
+
740
+ # Map instrument IDs to MIDI program ranges
741
+ # YourMT3+ uses General MIDI program numbers
742
+ INSTRUMENT_PROGRAMS = {
743
+ 'piano': list(range(0, 8)), # Acoustic Grand Piano to Celesta
744
+ 'guitar': list(range(24, 32)), # Acoustic Guitar to Guitar Harmonics
745
+ 'bass': list(range(32, 40)), # Acoustic Bass to Synth Bass 2
746
+ 'drums': [128], # Drum channel (special case)
747
+ 'vocals': list(range(52, 56)) + [65, 85], # Choir Aahs, Voice Oohs, Synth Voice, Lead Voice, YourMT3+ "Singing Voice" (65)
748
+ 'other': list(range(8, 24)) + list(range(40, 52)) + list(range(56, 65)) + list(range(66, 85)) + list(range(86, 128)) # Everything else (excluding vocals programs)
749
+ }
750
+
751
+ # Load MIDI file
752
+ pm = pretty_midi.PrettyMIDI(str(midi_path))
753
+
754
+ # Debug: Show what's in the MIDI before filtering
755
+ print(f" [DEBUG] MIDI contains {len(pm.instruments)} tracks before filtering:")
756
+ for i, inst in enumerate(pm.instruments):
757
+ print(f" Track {i}: {inst.name} (program={inst.program}, is_drum={inst.is_drum}, notes={len(inst.notes)})")
758
+
759
+ # Determine which programs to keep
760
+ programs_to_keep = set()
761
+ for instrument in self.instruments:
762
+ if instrument in INSTRUMENT_PROGRAMS:
763
+ programs_to_keep.update(INSTRUMENT_PROGRAMS[instrument])
764
+
765
+ print(f" [DEBUG] Looking for programs: {sorted(programs_to_keep)[:20]}... (selected instruments: {self.instruments})")
766
+
767
+ # Group instruments by category to handle YourMT3+ outputting multiple tracks per instrument
768
+ # (e.g., both "Acoustic Piano" and "Electric Piano" for piano)
769
+ instrument_groups = {}
770
+ for inst in pm.instruments:
771
+ # Determine which category this instrument belongs to
772
+ matched_category = None
773
+ if inst.is_drum and 128 in programs_to_keep:
774
+ matched_category = 'drums'
775
+ elif not inst.is_drum and inst.program in programs_to_keep:
776
+ # Find which instrument category this program belongs to
777
+ for instr_name, programs in INSTRUMENT_PROGRAMS.items():
778
+ if inst.program in programs and instr_name in self.instruments:
779
+ matched_category = instr_name
780
+ break
781
+
782
+ if matched_category:
783
+ if matched_category not in instrument_groups:
784
+ instrument_groups[matched_category] = []
785
+ instrument_groups[matched_category].append(inst)
786
+ print(f" [DEBUG] Track '{inst.name}' (program={inst.program}) matched category: {matched_category}")
787
+
788
+ # For each category, keep only the track with the most notes
789
+ # (YourMT3+ sometimes outputs spurious tracks with very few notes)
790
+ filtered_instruments = []
791
+ for category, tracks in instrument_groups.items():
792
+ if len(tracks) == 1:
793
+ filtered_instruments.append(tracks[0])
794
+ else:
795
+ # Keep the track with the most notes
796
+ best_track = max(tracks, key=lambda t: len(t.notes))
797
+ filtered_instruments.append(best_track)
798
+
799
+ # Log which tracks were filtered out
800
+ for track in tracks:
801
+ if track != best_track:
802
+ track_name = track.name or f"Program {track.program}"
803
+ best_name = best_track.name or f"Program {best_track.program}"
804
+ print(f" Filtered out spurious track: {track_name} ({len(track.notes)} notes) - kept {best_name} ({len(best_track.notes)} notes)")
805
+
806
+ # Create new MIDI with only selected instruments
807
+ filtered_pm = pretty_midi.PrettyMIDI()
808
+ filtered_pm.instruments = filtered_instruments
809
+
810
+ # Save filtered MIDI
811
+ filtered_path = midi_path.parent / f"{midi_path.stem}_filtered.mid"
812
+ filtered_pm.write(str(filtered_path))
813
+
814
+ # Log filtering results
815
+ original_count = len(pm.instruments)
816
+ filtered_count = len(filtered_instruments)
817
+ print(f" Filtered MIDI: {original_count} tracks → {filtered_count} tracks (1 per category)")
818
+ print(f" Kept instruments: {self.instruments}")
819
+
820
+ return filtered_path
821
+
822
  def apply_post_processing_filters(self, midi_path: Path) -> Path:
823
  """
824
  Apply post-processing filters to improve transcription quality.
backend/tasks.py CHANGED
@@ -1,4 +1,12 @@
1
  """Celery tasks for background job processing."""
 
 
 
 
 
 
 
 
2
  from celery import Task
3
  from celery_app import celery_app
4
  from pipeline import TranscriptionPipeline, run_transcription_pipeline
@@ -6,7 +14,6 @@ import redis
6
  import json
7
  import os
8
  from datetime import datetime
9
- from pathlib import Path
10
  from app_config import settings
11
  import shutil
12
 
@@ -76,24 +83,126 @@ def process_transcription_task(self, job_id: str):
76
 
77
  # Get job data
78
  job_data = redis_client.hgetall(f"job:{job_id}")
79
-
80
  if not job_data:
81
  raise ValueError(f"Job not found: {job_id}")
82
-
 
 
83
  youtube_url = job_data.get('youtube_url')
84
- if not youtube_url:
85
- raise ValueError(f"Job missing youtube_url: {job_id}")
86
 
87
- # Initialize pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  pipeline = TranscriptionPipeline(
89
  job_id=job_id,
90
- youtube_url=youtube_url,
91
- storage_path=settings.storage_path
 
92
  )
93
  pipeline.set_progress_callback(lambda p, s, m: self.update_progress(job_id, p, s, m))
94
 
95
- # Run pipeline
96
- temp_output_path = pipeline.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  # Output is already in the temp directory, move to persistent storage
99
  output_path = settings.outputs_path / f"{job_id}.musicxml"
 
1
  """Celery tasks for background job processing."""
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Ensure backend directory is in Python path for imports
6
+ backend_dir = Path(__file__).parent.resolve()
7
+ if str(backend_dir) not in sys.path:
8
+ sys.path.insert(0, str(backend_dir))
9
+
10
  from celery import Task
11
  from celery_app import celery_app
12
  from pipeline import TranscriptionPipeline, run_transcription_pipeline
 
14
  import json
15
  import os
16
  from datetime import datetime
 
17
  from app_config import settings
18
  import shutil
19
 
 
83
 
84
  # Get job data
85
  job_data = redis_client.hgetall(f"job:{job_id}")
86
+
87
  if not job_data:
88
  raise ValueError(f"Job not found: {job_id}")
89
+
90
+ # Check if this is a file upload or YouTube URL job
91
+ upload_path = job_data.get('upload_path')
92
  youtube_url = job_data.get('youtube_url')
 
 
93
 
94
+ # Parse instruments option (defaults to piano only)
95
+ instruments = ['piano']
96
+ vocal_instrument_program = 40 # Default to violin
97
+ if 'options' in job_data:
98
+ try:
99
+ options = json.loads(job_data['options'])
100
+ instruments = options.get('instruments', ['piano'])
101
+ vocal_instrument_program = options.get('vocal_instrument', 40)
102
+ except (json.JSONDecodeError, KeyError):
103
+ instruments = ['piano']
104
+ vocal_instrument_program = 40
105
+
106
+ # Import shutil and subprocess
107
+ import shutil
108
+ import subprocess
109
+
110
+ # Create pipeline
111
  pipeline = TranscriptionPipeline(
112
  job_id=job_id,
113
+ youtube_url=youtube_url or "file://uploaded", # Dummy URL for file uploads
114
+ storage_path=settings.storage_path,
115
+ instruments=instruments
116
  )
117
  pipeline.set_progress_callback(lambda p, s, m: self.update_progress(job_id, p, s, m))
118
 
119
+ # Get audio.wav - either from upload or YouTube download
120
+ audio_path = pipeline.temp_dir / "audio.wav"
121
+
122
+ if upload_path:
123
+ # File upload - convert to WAV if needed
124
+ upload_file = Path(upload_path)
125
+ if upload_file.suffix.lower() == '.wav':
126
+ shutil.copy(str(upload_file), str(audio_path))
127
+ else:
128
+ # Convert to WAV using ffmpeg
129
+ result = subprocess.run([
130
+ 'ffmpeg', '-i', str(upload_file),
131
+ '-ar', '44100', '-ac', '2',
132
+ str(audio_path)
133
+ ], capture_output=True, text=True)
134
+ if result.returncode != 0:
135
+ raise RuntimeError(f"Audio conversion failed: {result.stderr}")
136
+ elif youtube_url:
137
+ # YouTube download
138
+ pipeline.progress(0, "download", "Starting audio download")
139
+ audio_path = pipeline.download_audio()
140
+ else:
141
+ raise ValueError(f"Job missing both youtube_url and upload_path: {job_id}")
142
+
143
+ # From here, both paths converge - process audio.wav the same way
144
+ # Preprocess audio if enabled
145
+ if pipeline.config.enable_audio_preprocessing:
146
+ pipeline.progress(10, "preprocess", "Preprocessing audio")
147
+ audio_path = pipeline.preprocess_audio(audio_path)
148
+
149
+ # Source separation
150
+ pipeline.progress(20, "separate", "Starting source separation")
151
+ all_stems = pipeline.separate_sources(audio_path)
152
+
153
+ # Select stems to transcribe based on user selection
154
+ stems_to_transcribe = {}
155
+ for instrument in instruments:
156
+ if instrument in all_stems:
157
+ stems_to_transcribe[instrument] = all_stems[instrument]
158
+ print(f" [DEBUG] Will transcribe {instrument} stem")
159
+ else:
160
+ print(f" [WARNING] {instrument} stem not found in separated audio")
161
+
162
+ # If no selected stems available, fall back to piano
163
+ if not stems_to_transcribe:
164
+ print(f" [WARNING] No selected stems found, falling back to piano")
165
+ if 'piano' in all_stems:
166
+ stems_to_transcribe['piano'] = all_stems['piano']
167
+ else:
168
+ stems_to_transcribe['other'] = all_stems['other']
169
+
170
+ pipeline.progress(50, "transcribe", f"Transcribing {len(stems_to_transcribe)} instrument(s)")
171
+
172
+ # Transcribe stems
173
+ if len(stems_to_transcribe) == 1:
174
+ # Single stem - use original method
175
+ stem_path = list(stems_to_transcribe.values())[0]
176
+ combined_midi = pipeline.transcribe_to_midi(stem_path)
177
+ else:
178
+ # Multiple stems - use new multi-stem method
179
+ combined_midi = pipeline.transcribe_multiple_stems(stems_to_transcribe)
180
+
181
+ # Filter MIDI to only include selected instruments
182
+ filtered_midi = pipeline.filter_midi_by_instruments(combined_midi)
183
+
184
+ # Remap vocals MIDI program if vocals were selected
185
+ if 'vocals' in instruments and vocal_instrument_program != 65:
186
+ print(f" [DEBUG] Remapping vocals MIDI program from 65 to {vocal_instrument_program}")
187
+ import pretty_midi
188
+ pm = pretty_midi.PrettyMIDI(str(filtered_midi))
189
+ for inst in pm.instruments:
190
+ if inst.program == 65 and not inst.is_drum: # Singing Voice
191
+ inst.program = vocal_instrument_program
192
+ print(f" [DEBUG] Changed track '{inst.name}' program to {vocal_instrument_program}")
193
+ # Save remapped MIDI
194
+ pm.write(str(filtered_midi))
195
+
196
+ # Apply post-processing
197
+ midi_path = pipeline.apply_post_processing_filters(filtered_midi)
198
+ pipeline.final_midi_path = midi_path
199
+
200
+ # Get audio stem for MusicXML generation (use piano if available, otherwise first available stem)
201
+ audio_stem = stems_to_transcribe.get('piano') or list(stems_to_transcribe.values())[0]
202
+
203
+ pipeline.progress(90, "musicxml", "Generating MusicXML")
204
+ temp_output_path = pipeline.generate_musicxml_minimal(midi_path, audio_stem)
205
+ pipeline.progress(100, "complete", "Transcription complete")
206
 
207
  # Output is already in the temp directory, move to persistent storage
208
  output_path = settings.outputs_path / f"{job_id}.musicxml"
frontend/src/api/client.ts CHANGED
@@ -49,7 +49,7 @@ export class RescoredAPI {
49
  private baseURL = API_BASE_URL;
50
  private wsBaseURL = WS_BASE_URL;
51
 
52
- async submitJob(youtubeURL: string, options?: { instruments?: string[] }): Promise<TranscribeResponse> {
53
  const response = await fetch(`${this.baseURL}/api/v1/transcribe`, {
54
  method: 'POST',
55
  headers: {
@@ -69,6 +69,27 @@ export class RescoredAPI {
69
  return response.json();
70
  }
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  async getJobStatus(jobId: string): Promise<JobStatus> {
73
  const response = await fetch(`${this.baseURL}/api/v1/jobs/${jobId}`);
74
 
 
49
  private baseURL = API_BASE_URL;
50
  private wsBaseURL = WS_BASE_URL;
51
 
52
+ async submitJob(youtubeURL: string, options?: { instruments?: string[]; vocalInstrument?: number }): Promise<TranscribeResponse> {
53
  const response = await fetch(`${this.baseURL}/api/v1/transcribe`, {
54
  method: 'POST',
55
  headers: {
 
69
  return response.json();
70
  }
71
 
72
+ async submitFileJob(file: File, options?: { instruments?: string[]; vocalInstrument?: number }): Promise<TranscribeResponse> {
73
+ const formData = new FormData();
74
+ formData.append('file', file);
75
+ formData.append('instruments', JSON.stringify(options?.instruments ?? ['piano']));
76
+ if (options?.vocalInstrument !== undefined) {
77
+ formData.append('vocal_instrument', options.vocalInstrument.toString());
78
+ }
79
+
80
+ const response = await fetch(`${this.baseURL}/api/v1/transcribe/upload`, {
81
+ method: 'POST',
82
+ body: formData,
83
+ });
84
+
85
+ if (!response.ok) {
86
+ const error = await response.json();
87
+ throw new Error(error.detail || 'Failed to submit file');
88
+ }
89
+
90
+ return response.json();
91
+ }
92
+
93
  async getJobStatus(jobId: string): Promise<JobStatus> {
94
  const response = await fetch(`${this.baseURL}/api/v1/jobs/${jobId}`);
95
 
frontend/src/components/InstrumentSelector.css CHANGED
@@ -69,6 +69,42 @@
69
  font-style: italic;
70
  }
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  /* Responsive adjustments */
73
  @media (max-width: 600px) {
74
  .instrument-grid {
 
69
  font-style: italic;
70
  }
71
 
72
+ .vocal-instrument-selector {
73
+ margin: 1.5rem 0;
74
+ padding: 1rem;
75
+ background-color: #f8f9fa;
76
+ border-radius: 8px;
77
+ border: 1px solid #dee2e6;
78
+ }
79
+
80
+ .vocal-instrument-selector label {
81
+ display: block;
82
+ margin-bottom: 0.5rem;
83
+ font-weight: 500;
84
+ color: #495057;
85
+ }
86
+
87
+ .vocal-instrument-selector select {
88
+ width: 100%;
89
+ padding: 0.5rem;
90
+ font-size: 1rem;
91
+ border: 1px solid #ced4da;
92
+ border-radius: 4px;
93
+ background-color: white;
94
+ cursor: pointer;
95
+ transition: border-color 0.2s ease;
96
+ }
97
+
98
+ .vocal-instrument-selector select:hover {
99
+ border-color: #007bff;
100
+ }
101
+
102
+ .vocal-instrument-selector select:focus {
103
+ outline: none;
104
+ border-color: #007bff;
105
+ box-shadow: 0 0 0 3px rgba(0, 123, 255, 0.1);
106
+ }
107
+
108
  /* Responsive adjustments */
109
  @media (max-width: 600px) {
110
  .instrument-grid {
frontend/src/components/InstrumentSelector.tsx CHANGED
@@ -12,19 +12,35 @@ export interface Instrument {
12
 
13
  const INSTRUMENTS: Instrument[] = [
14
  { id: 'piano', label: 'Piano', icon: '🎹' },
15
- { id: 'vocals', label: 'Vocals (Violin)', icon: '🎤' },
16
  { id: 'drums', label: 'Drums', icon: '🥁' },
17
  { id: 'bass', label: 'Bass', icon: '🎸' },
18
  { id: 'guitar', label: 'Guitar', icon: '🎸' },
19
  { id: 'other', label: 'Other Instruments', icon: '🎵' }
20
  ];
21
 
 
 
 
 
 
 
 
 
 
22
  interface InstrumentSelectorProps {
23
  selectedInstruments: string[];
24
  onChange: (instruments: string[]) => void;
 
 
25
  }
26
 
27
- export function InstrumentSelector({ selectedInstruments, onChange }: InstrumentSelectorProps) {
 
 
 
 
 
28
  const handleToggle = (instrumentId: string) => {
29
  const isSelected = selectedInstruments.includes(instrumentId);
30
 
@@ -33,12 +49,18 @@ export function InstrumentSelector({ selectedInstruments, onChange }: Instrument
33
  if (selectedInstruments.length === 1) {
34
  return;
35
  }
36
- onChange(selectedInstruments.filter(id => id !== instrumentId));
 
 
37
  } else {
38
- onChange([...selectedInstruments, instrumentId]);
 
 
39
  }
40
  };
41
 
 
 
42
  return (
43
  <div className="instrument-selector">
44
  <label className="selector-label">Select Instruments:</label>
@@ -56,6 +78,24 @@ export function InstrumentSelector({ selectedInstruments, onChange }: Instrument
56
  </button>
57
  ))}
58
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  <p className="selector-hint">
60
  Select at least one instrument to transcribe
61
  </p>
 
12
 
13
  const INSTRUMENTS: Instrument[] = [
14
  { id: 'piano', label: 'Piano', icon: '🎹' },
15
+ { id: 'vocals', label: 'Vocals', icon: '🎤' },
16
  { id: 'drums', label: 'Drums', icon: '🥁' },
17
  { id: 'bass', label: 'Bass', icon: '🎸' },
18
  { id: 'guitar', label: 'Guitar', icon: '🎸' },
19
  { id: 'other', label: 'Other Instruments', icon: '🎵' }
20
  ];
21
 
22
+ export const VOCAL_INSTRUMENTS = [
23
+ { id: 'violin', label: 'Violin', program: 40 },
24
+ { id: 'flute', label: 'Flute', program: 73 },
25
+ { id: 'clarinet', label: 'Clarinet', program: 71 },
26
+ { id: 'saxophone', label: 'Saxophone', program: 64 },
27
+ { id: 'trumpet', label: 'Trumpet', program: 56 },
28
+ { id: 'voice', label: 'Singing Voice', program: 65 },
29
+ ];
30
+
31
interface InstrumentSelectorProps {
  // Ids of the currently-selected instruments (e.g. 'piano', 'vocals').
  selectedInstruments: string[];
  // Called with the complete updated selection whenever it changes.
  onChange: (instruments: string[]) => void;
  // Id of the instrument the vocal stem is transcribed as; defaults to 'violin'.
  vocalInstrument?: string;
  // If omitted, the vocal-instrument dropdown is not rendered at all.
  onVocalInstrumentChange?: (instrument: string) => void;
}
37
 
38
+ export function InstrumentSelector({
39
+ selectedInstruments,
40
+ onChange,
41
+ vocalInstrument = 'violin',
42
+ onVocalInstrumentChange
43
+ }: InstrumentSelectorProps) {
44
  const handleToggle = (instrumentId: string) => {
45
  const isSelected = selectedInstruments.includes(instrumentId);
46
 
 
49
  if (selectedInstruments.length === 1) {
50
  return;
51
  }
52
+ const newInstruments = selectedInstruments.filter(id => id !== instrumentId);
53
+ console.log('[DEBUG] InstrumentSelector: Removing', instrumentId, '-> New list:', newInstruments);
54
+ onChange(newInstruments);
55
  } else {
56
+ const newInstruments = [...selectedInstruments, instrumentId];
57
+ console.log('[DEBUG] InstrumentSelector: Adding', instrumentId, '-> New list:', newInstruments);
58
+ onChange(newInstruments);
59
  }
60
  };
61
 
62
+ const vocalsSelected = selectedInstruments.includes('vocals');
63
+
64
  return (
65
  <div className="instrument-selector">
66
  <label className="selector-label">Select Instruments:</label>
 
78
  </button>
79
  ))}
80
  </div>
81
+
82
+ {vocalsSelected && onVocalInstrumentChange && (
83
+ <div className="vocal-instrument-selector">
84
+ <label htmlFor="vocal-instrument">Transcribe vocals as:</label>
85
+ <select
86
+ id="vocal-instrument"
87
+ value={vocalInstrument}
88
+ onChange={(e) => onVocalInstrumentChange(e.target.value)}
89
+ >
90
+ {VOCAL_INSTRUMENTS.map(inst => (
91
+ <option key={inst.id} value={inst.id}>
92
+ {inst.label}
93
+ </option>
94
+ ))}
95
+ </select>
96
+ </div>
97
+ )}
98
+
99
  <p className="selector-hint">
100
  Select at least one instrument to transcribe
101
  </p>
frontend/src/components/JobSubmission.css CHANGED
@@ -111,3 +111,38 @@ button:hover {
111
  margin-top: 1rem;
112
  border: 1px solid #f5c6cb;
113
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  margin-top: 1rem;
112
  border: 1px solid #f5c6cb;
113
  }
114
+
115
/* Segmented toggle between "YouTube URL" and "Upload Audio File" input modes. */
.upload-mode-selector {
  display: flex;
  gap: 0.5rem;
  margin-top: 0.5rem;
}

/* Both mode buttons share equal width via flex: 1. */
.upload-mode-selector button {
  flex: 1;
  padding: 0.5rem 1rem;
  background-color: #f0f0f0;
  color: #333;
  border: 2px solid #ddd;
  border-radius: 4px;
  font-size: 0.9rem;
  cursor: pointer;
  transition: all 0.2s ease;
}

.upload-mode-selector button:hover {
  background-color: #e0e0e0;
  border-color: #bbb;
}

/* Highlight for the currently-active input mode. */
.upload-mode-selector button.active {
  background-color: #007bff;
  color: white;
  border-color: #007bff;
}

/* "Selected: name (size MB)" line shown once a file is chosen. */
.file-info {
  margin-top: 0.5rem;
  font-size: 0.9rem;
  color: #666;
}
frontend/src/components/JobSubmission.tsx CHANGED
@@ -4,7 +4,7 @@
4
  import { useState, useRef, useEffect } from 'react';
5
  import { api } from '../api/client';
6
  import type { ProgressUpdate } from '../api/client';
7
- import { InstrumentSelector } from './InstrumentSelector';
8
  import './JobSubmission.css';
9
 
10
  interface JobSubmissionProps {
@@ -14,7 +14,10 @@ interface JobSubmissionProps {
14
 
15
  export function JobSubmission({ onComplete, onJobSubmitted }: JobSubmissionProps) {
16
  const [youtubeUrl, setYoutubeUrl] = useState('');
 
 
17
  const [selectedInstruments, setSelectedInstruments] = useState<string[]>(['piano']);
 
18
  const [status, setStatus] = useState<'idle' | 'submitting' | 'processing' | 'failed'>('idle');
19
  const [error, setError] = useState<string | null>(null);
20
  const [progress, setProgress] = useState(0);
@@ -46,11 +49,18 @@ export function JobSubmission({ onComplete, onJobSubmitted }: JobSubmissionProps
46
  e.preventDefault();
47
  setError(null);
48
 
49
- // Validate URL
50
- const validation = validateUrl(youtubeUrl);
51
- if (validation) {
52
- setError(validation);
53
- return;
 
 
 
 
 
 
 
54
  }
55
 
56
  // Validate at least one instrument is selected
@@ -61,9 +71,18 @@ export function JobSubmission({ onComplete, onJobSubmitted }: JobSubmissionProps
61
 
62
  setStatus('submitting');
63
 
 
 
 
 
 
64
  try {
65
- const response = await api.submitJob(youtubeUrl, { instruments: selectedInstruments });
 
 
 
66
  setYoutubeUrl('');
 
67
  if (onJobSubmitted) onJobSubmitted(response);
68
 
69
  // Switch to processing status and connect WebSocket
@@ -164,23 +183,82 @@ export function JobSubmission({ onComplete, onJobSubmitted }: JobSubmissionProps
164
  <InstrumentSelector
165
  selectedInstruments={selectedInstruments}
166
  onChange={setSelectedInstruments}
 
 
167
  />
168
 
169
  <div className="form-group">
170
- <label htmlFor="youtube-url">YouTube URL:</label>
171
- <input
172
- id="youtube-url"
173
- type="text"
174
- value={youtubeUrl}
175
- onChange={(e) => setYoutubeUrl(e.target.value)}
176
- placeholder="https://www.youtube.com/watch?v=..."
177
- required
178
- onBlur={() => {
179
- const validation = validateUrl(youtubeUrl);
180
- if (validation) setError(validation);
181
- }}
182
- />
 
 
 
 
 
 
 
 
 
 
183
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  <button type="submit" disabled={status === 'submitting'}>Transcribe</button>
185
  {status === 'submitting' && <div>Submitting...</div>}
186
  {error && <div role="alert" className="error-alert">{error}</div>}
 
4
  import { useState, useRef, useEffect } from 'react';
5
  import { api } from '../api/client';
6
  import type { ProgressUpdate } from '../api/client';
7
+ import { InstrumentSelector, VOCAL_INSTRUMENTS } from './InstrumentSelector';
8
  import './JobSubmission.css';
9
 
10
  interface JobSubmissionProps {
 
14
 
15
  export function JobSubmission({ onComplete, onJobSubmitted }: JobSubmissionProps) {
16
  const [youtubeUrl, setYoutubeUrl] = useState('');
17
+ const [uploadMode, setUploadMode] = useState<'url' | 'file'>('url');
18
+ const [selectedFile, setSelectedFile] = useState<File | null>(null);
19
  const [selectedInstruments, setSelectedInstruments] = useState<string[]>(['piano']);
20
+ const [vocalInstrument, setVocalInstrument] = useState('violin');
21
  const [status, setStatus] = useState<'idle' | 'submitting' | 'processing' | 'failed'>('idle');
22
  const [error, setError] = useState<string | null>(null);
23
  const [progress, setProgress] = useState(0);
 
49
  e.preventDefault();
50
  setError(null);
51
 
52
+ // Validate based on mode
53
+ if (uploadMode === 'url') {
54
+ const validation = validateUrl(youtubeUrl);
55
+ if (validation) {
56
+ setError(validation);
57
+ return;
58
+ }
59
+ } else {
60
+ if (!selectedFile) {
61
+ setError('Please select an audio file');
62
+ return;
63
+ }
64
  }
65
 
66
  // Validate at least one instrument is selected
 
71
 
72
  setStatus('submitting');
73
 
74
+ console.log('[DEBUG] About to submit job with instruments:', selectedInstruments);
75
+
76
+ // Get the MIDI program number for the selected vocal instrument
77
+ const vocalProgram = VOCAL_INSTRUMENTS.find(v => v.id === vocalInstrument)?.program || 40;
78
+
79
  try {
80
+ const response = uploadMode === 'url'
81
+ ? await api.submitJob(youtubeUrl, { instruments: selectedInstruments, vocalInstrument: vocalProgram })
82
+ : await api.submitFileJob(selectedFile!, { instruments: selectedInstruments, vocalInstrument: vocalProgram });
83
+
84
  setYoutubeUrl('');
85
+ setSelectedFile(null);
86
  if (onJobSubmitted) onJobSubmitted(response);
87
 
88
  // Switch to processing status and connect WebSocket
 
183
  <InstrumentSelector
184
  selectedInstruments={selectedInstruments}
185
  onChange={setSelectedInstruments}
186
+ vocalInstrument={vocalInstrument}
187
+ onVocalInstrumentChange={setVocalInstrument}
188
  />
189
 
190
  <div className="form-group">
191
+ <label>Input Method:</label>
192
+ <div className="upload-mode-selector">
193
+ <button
194
+ type="button"
195
+ className={uploadMode === 'url' ? 'active' : ''}
196
+ onClick={() => {
197
+ setUploadMode('url');
198
+ setError(null);
199
+ }}
200
+ >
201
+ YouTube URL
202
+ </button>
203
+ <button
204
+ type="button"
205
+ className={uploadMode === 'file' ? 'active' : ''}
206
+ onClick={() => {
207
+ setUploadMode('file');
208
+ setError(null);
209
+ }}
210
+ >
211
+ Upload Audio File
212
+ </button>
213
+ </div>
214
  </div>
215
+
216
+ {uploadMode === 'url' ? (
217
+ <div className="form-group">
218
+ <label htmlFor="youtube-url">YouTube URL:</label>
219
+ <input
220
+ id="youtube-url"
221
+ type="text"
222
+ value={youtubeUrl}
223
+ onChange={(e) => setYoutubeUrl(e.target.value)}
224
+ placeholder="https://www.youtube.com/watch?v=..."
225
+ required
226
+ onBlur={() => {
227
+ const validation = validateUrl(youtubeUrl);
228
+ if (validation) setError(validation);
229
+ }}
230
+ />
231
+ </div>
232
+ ) : (
233
+ <div className="form-group">
234
+ <label htmlFor="audio-file">Audio File (WAV, MP3, FLAC, etc.):</label>
235
+ <input
236
+ id="audio-file"
237
+ type="file"
238
+ accept=".wav,.mp3,.flac,.ogg,.m4a,.aac"
239
+ onChange={(e) => {
240
+ const file = e.target.files?.[0];
241
+ if (file) {
242
+ const maxSize = 100 * 1024 * 1024; // 100MB
243
+ if (file.size > maxSize) {
244
+ setError('File too large. Maximum size: 100MB');
245
+ setSelectedFile(null);
246
+ } else {
247
+ setSelectedFile(file);
248
+ setError(null);
249
+ }
250
+ }
251
+ }}
252
+ required
253
+ />
254
+ {selectedFile && (
255
+ <p className="file-info">
256
+ Selected: {selectedFile.name} ({(selectedFile.size / 1024 / 1024).toFixed(2)} MB)
257
+ </p>
258
+ )}
259
+ </div>
260
+ )}
261
+
262
  <button type="submit" disabled={status === 'submitting'}>Transcribe</button>
263
  {status === 'submitting' && <div>Submitting...</div>}
264
  {error && <div role="alert" className="error-alert">{error}</div>}