Spaces:

bharatverse11
/

AutomixBackend

Sleeping

File size: 50,624 Bytes

"""
AutoMixAI – HuggingFace Space Backend (All-in-One)

Consolidated FastAPI backend hosting ALL services:
  - /upload        Upload audio files (stored temporarily)
  - /analyze       BPM + beat detection + energy analysis
  - /mix           Advanced DJ mixing with EQ crossfade
  - /generate      Procedural drum beat generation
  - /output/{id}   Download generated files
  - /recognize     Song recognition via Shazam API
  - /health        Health check

Advanced DJ Mixing Features:
  - LUFS loudness normalization (-24 LUFS pre-mix, -14 LUFS final)
  - High-pass filtering (40Hz rumble removal)
  - Beat detection + BPM estimation
  - Pitch-preserving time-stretching to common BPM
  - Beat-aligned trimming
  - EQ-based crossfade (DJ-style bass swap transition)
  - Equal-power S-curve crossfade
  - Per-track EQ: bass boost, brightness, vocal boost
  - Stereo panning
  - Final LUFS mastering to streaming standard
"""

import os
import sys
import time
import uuid
import tempfile
import subprocess
import re
import math
from pathlib import Path
from typing import Optional, Dict, List, Tuple, Any
from dataclasses import dataclass
from enum import Enum

import numpy as np
import librosa
import soundfile as sf
import scipy.signal
import pyloudnorm as pyln
import torch
from transformers import pipeline as hf_pipeline, AutoFeatureExtractor, AutoModelForAudioClassification

from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from pydantic import BaseModel, Field
import httpx

# ═══════════════════════════════════════════════════════════════════════════════
# CONFIG
# ═══════════════════════════════════════════════════════════════════════════════

SR = 22050               # librosa default for analysis
SR_MIX = 44100           # output sample rate for mixing
HOP_LENGTH = 512
TARGET_LOUDNESS = -24.0  # pre-mix normalization (LUFS)
FINAL_LOUDNESS = -14.0   # streaming-standard final loudness (LUFS)
UPLOAD_DIR = Path(tempfile.gettempdir()) / "automixai_uploads"
OUTPUT_DIR = Path(tempfile.gettempdir()) / "automixai_outputs"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "")
SHAZAM_URL = "https://shazam-core.p.rapidapi.com/v1/tracks/recognize"
SHAZAM_HOST = "shazam-core.p.rapidapi.com"


# ═══════════════════════════════════════════════════════════════════════════════
# FASTAPI APP
# ═══════════════════════════════════════════════════════════════════════════════

app = FastAPI(
    title="AutoMixAI Backend",
    description="AI-powered DJ mixing, beat generation, audio analysis, and song recognition.",
    version="2.0.0",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# ═══════════════════════════════════════════════════════════════════════════════
# SCHEMAS
# ═══════════════════════════════════════════════════════════════════════════════

class UploadResponse(BaseModel):
    file_id: str
    filename: str
    duration: float
    message: str = "File uploaded successfully."

class AnalyzeRequest(BaseModel):
    file_id: str

class AnalysisResponse(BaseModel):
    file_id: str
    bpm: float
    beat_times: list[float]
    duration: float
    sample_rate: int
    genre: str = "unknown"
    genre_confidence: float = 0.0
    genre_top3: list = []
    tags: list[str] = []
    tag_scores: list = []
    mood: str = "neutral"
    has_vocals: bool = False
    dominant_instrument: str = "unknown"
    instrument_confidence: float = 0.0
    instruments_top3: list = []
    energy: str = "medium"
    message: str = "Analysis complete."

class MixRequest(BaseModel):
    file_id_a: str
    file_id_b: str
    crossfade_duration: float = 8.0
    bass_boost: float = 0.0
    brightness: float = 0.0
    vocal_boost: float = 0.0
    pan_a: float = 0.0
    pan_b: float = 0.0
    eq_transition: bool = True  # Use EQ-based DJ crossfade

class MixResponse(BaseModel):
    output_file_id: str
    duration: float
    bpm_a: float
    bpm_b: float
    target_bpm: float
    message: str = "Mix generated successfully."

class GenerateBeatRequest(BaseModel):
    prompt: str = Field(..., min_length=3, max_length=500)
    bars: int = Field(default=4, ge=1, le=32)

class PatternInfo(BaseModel):
    kick: list[int]
    snare: list[int]
    hihat_c: list[int]
    hihat_o: list[int]
    clap: list[int]

class GenerateBeatResponse(BaseModel):
    output_file_id: str
    genre: str
    bpm: float
    bars: int
    complexity: str
    description: str
    duration: float
    pattern: PatternInfo
    sample_rate: int = 44100
    message: str = "Beat generated successfully."


# ═══════════════════════════════════════════════════════════════════════════════
# HELPERS
# ═══════════════════════════════════════════════════════════════════════════════

def generate_file_id() -> str:
    return uuid.uuid4().hex

def find_upload(file_id: str) -> Path:
    for path in UPLOAD_DIR.iterdir():
        if path.stem == file_id:
            return path
    raise FileNotFoundError(f"No file found for ID '{file_id}'")


# ═══════════════════════════════════════════════════════════════════════════════
# AUDIO LOADING
# ═══════════════════════════════════════════════════════════════════════════════

def load_audio(path: str, sr: int = None, mono: bool = True):
    sr = sr or SR
    y, loaded_sr = librosa.load(path, sr=sr, mono=mono)
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak
    return y, loaded_sr

def load_audio_full_rate(path: str, mono: bool = True):
    """Load audio at native sample rate (for mixing quality)."""
    y, sr = librosa.load(path, sr=SR_MIX, mono=mono)
    return y, sr

def get_audio_info(path: str) -> dict:
    try:
        info = sf.info(path)
        return {"duration": info.duration, "sample_rate": info.samplerate, "channels": info.channels}
    except Exception:
        y, sr = librosa.load(path, sr=None, mono=False)
        channels = 1 if y.ndim == 1 else y.shape[0]
        duration = (len(y) if y.ndim == 1 else y.shape[1]) / sr
        return {"duration": float(duration), "sample_rate": int(sr), "channels": int(channels)}

def save_audio(path: str, audio: np.ndarray, sr: int):
    sf.write(path, audio, sr, subtype="PCM_16")


# ═══════════════════════════════════════════════════════════════════════════════
# BEAT DETECTION + BPM
# ═══════════════════════════════════════════════════════════════════════════════

def detect_beats(y: np.ndarray, sr: int) -> list[float]:
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=HOP_LENGTH)
    beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=HOP_LENGTH).tolist()
    return beat_times

def estimate_bpm_from_beats(beat_times: list[float]) -> float:
    if len(beat_times) < 2:
        return 120.0
    ibis = np.diff(beat_times)
    median_ibi = float(np.median(ibis))
    if median_ibi <= 0:
        return 120.0
    return round(60.0 / median_ibi, 2)

def estimate_bpm_librosa(y: np.ndarray, sr: int) -> float:
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=HOP_LENGTH)
    return round(float(np.asarray(tempo).flat[0]), 2)


# ═══════════════════════════════════════════════════════════════════════════════
# ADVANCED DJ MIXING ENGINE
# ═══════════════════════════════════════════════════════════════════════════════

def normalize_loudness(audio: np.ndarray, sr: int, target_lufs: float) -> np.ndarray:
    """LUFS loudness normalization (EBU R128)."""
    try:
        meter = pyln.Meter(sr)
        current_loudness = meter.integrated_loudness(audio)
        if np.isinf(current_loudness) or np.isnan(current_loudness):
            return audio
        return pyln.normalize.loudness(audio, current_loudness, target_lufs)
    except Exception:
        return audio

def highpass_filter(audio: np.ndarray, sr: int, cutoff: float = 40.0) -> np.ndarray:
    """Remove sub-bass rumble below cutoff frequency."""
    sos = scipy.signal.butter(2, cutoff, btype='highpass', fs=sr, output='sos')
    return scipy.signal.sosfilt(sos, audio).astype(np.float32)

def lowpass_filter(audio: np.ndarray, sr: int, cutoff: float = 200.0) -> np.ndarray:
    """Extract bass frequencies."""
    sos = scipy.signal.butter(2, cutoff, btype='low', fs=sr, output='sos')
    return scipy.signal.sosfilt(sos, audio).astype(np.float32)

def highpass_extract(audio: np.ndarray, sr: int, cutoff: float = 200.0) -> np.ndarray:
    """Extract mid+high frequencies (above cutoff)."""
    sos = scipy.signal.butter(2, cutoff, btype='high', fs=sr, output='sos')
    return scipy.signal.sosfilt(sos, audio).astype(np.float32)

def apply_bass_boost(audio: np.ndarray, sr: int, amount: float) -> np.ndarray:
    """Boost low frequencies (below 150Hz)."""
    if amount <= 0:
        return audio
    sos = scipy.signal.butter(2, 150, btype='low', fs=sr, output='sos')
    bass = scipy.signal.sosfilt(sos, audio)
    return (audio + amount * bass).astype(np.float32)

def apply_brightness(audio: np.ndarray, sr: int, amount: float) -> np.ndarray:
    """Boost high frequencies (above 4kHz)."""
    if amount <= 0:
        return audio
    sos = scipy.signal.butter(2, 4000, btype='high', fs=sr, output='sos')
    highs = scipy.signal.sosfilt(sos, audio)
    return (audio + amount * highs).astype(np.float32)

def pan_stereo(mono_audio: np.ndarray, pan: float) -> np.ndarray:
    """Pan mono audio to stereo. pan: -1 (left) to 1 (right), 0 = center."""
    left_gain = np.cos(max(0, pan) * np.pi / 2)
    right_gain = np.cos(max(0, -pan) * np.pi / 2)
    return np.vstack((mono_audio * left_gain, mono_audio * right_gain))

def time_stretch_to_bpm(y: np.ndarray, sr: int, original_bpm: float, target_bpm: float) -> np.ndarray:
    """Pitch-preserving time-stretch to match target BPM."""
    rate = target_bpm / original_bpm
    if abs(rate - 1.0) < 0.005:
        return y
    return librosa.effects.time_stretch(y, rate=rate)

def align_to_beat(y: np.ndarray, sr: int, beat_times: list[float]) -> np.ndarray:
    """Trim audio to start on the first beat."""
    if not beat_times:
        return y
    start_sample = int(beat_times[0] * sr)
    return y[start_sample:]

def equal_power_crossfade(y1: np.ndarray, y2: np.ndarray, sr: int, duration: float) -> np.ndarray:
    """
    Equal-power (S-curve) crossfade — psychoacoustically smooth.
    Uses sin/cos curves instead of linear, preventing the volume dip
    that occurs with linear crossfades.
    """
    fade_samples = int(duration * sr)
    fade_samples = min(fade_samples, len(y1), len(y2))
    if fade_samples < 1:
        return np.concatenate([y1, y2])

    # Equal-power curves: cos² + sin² = 1 (constant power)
    t = np.linspace(0, np.pi / 2, fade_samples)
    fade_out = np.cos(t) ** 2  # smooth fade out for track A
    fade_in = np.sin(t) ** 2   # smooth fade in for track B

    tail = y1[-fade_samples:] * fade_out
    head = y2[:fade_samples] * fade_in
    mixed_region = tail + head

    return np.concatenate([y1[:-fade_samples], mixed_region, y2[fade_samples:]])

def eq_crossfade(y1: np.ndarray, y2: np.ndarray, sr: int, duration: float) -> np.ndarray:
    """
    Professional DJ-style EQ crossfade transition.

    Instead of simply fading volumes, this mimics what real DJs do:
    1. Fade out Track A's BASS while fading in Track B's BASS
    2. Crossfade mid+high frequencies with an S-curve
    3. Prevents muddiness and bass clashing during transitions

    This is the core technique used in professional DJ sets.
    """
    fade_samples = int(duration * sr)
    fade_samples = min(fade_samples, len(y1), len(y2))
    if fade_samples < 1:
        return np.concatenate([y1, y2])

    # Split both tracks into bass (< 200Hz) and mid+high (> 200Hz)
    y1_bass = lowpass_filter(y1[-fade_samples:], sr, cutoff=200)
    y1_mid_high = highpass_extract(y1[-fade_samples:], sr, cutoff=200)
    y2_bass = lowpass_filter(y2[:fade_samples], sr, cutoff=200)
    y2_mid_high = highpass_extract(y2[:fade_samples], sr, cutoff=200)

    # Equal-power curves
    t = np.linspace(0, np.pi / 2, fade_samples)
    fade_out = np.cos(t) ** 2
    fade_in = np.sin(t) ** 2

    # Bass: sharper transition (prevent bass clash)
    # The bass swaps over more aggressively in the middle
    bass_t = np.linspace(0, np.pi / 2, fade_samples)
    bass_fade_out = np.cos(bass_t) ** 3  # sharper cut
    bass_fade_in = np.sin(bass_t) ** 3   # sharper rise

    # Mix the bass swap + mid/high crossfade
    bass_region = y1_bass * bass_fade_out + y2_bass * bass_fade_in
    mid_high_region = y1_mid_high * fade_out + y2_mid_high * fade_in
    mixed_region = bass_region + mid_high_region

    return np.concatenate([y1[:-fade_samples], mixed_region, y2[fade_samples:]])

def create_advanced_mix(
    path_a: str,
    path_b: str,
    output_path: str,
    crossfade_duration: float = 8.0,
    bass_boost: float = 0.0,
    brightness: float = 0.0,
    vocal_boost: float = 0.0,
    pan_a: float = 0.0,
    pan_b: float = 0.0,
    eq_transition: bool = True,
) -> dict:
    """
    Advanced DJ mixing pipeline.

    Steps:
      1. Load both tracks at 44.1kHz
      2. LUFS normalize to -24 LUFS
      3. High-pass filter at 40Hz (remove rumble)
      4. Detect beats + estimate BPM
      5. Time-stretch both to a common BPM (average)
      6. Align to first beat boundary
      7. Apply per-track EQ (bass boost, brightness)
      8. EQ-based crossfade or equal-power crossfade
      9. Final LUFS mastering to -14 LUFS
    """
    print(f"=== Creating advanced DJ mix ===")
    print(f"Track A: {path_a}")
    print(f"Track B: {path_b}")

    # 1. Load at high quality
    y_a, sr = load_audio_full_rate(path_a)
    y_b, _ = load_audio_full_rate(path_b)
    print(f"Loaded: A={len(y_a)/sr:.1f}s, B={len(y_b)/sr:.1f}s at {sr}Hz")

    # 2. LUFS normalize
    y_a = normalize_loudness(y_a, sr, TARGET_LOUDNESS)
    y_b = normalize_loudness(y_b, sr, TARGET_LOUDNESS)
    print("LUFS normalized to -24 LUFS")

    # 3. High-pass filter
    y_a = highpass_filter(y_a, sr, cutoff=40)
    y_b = highpass_filter(y_b, sr, cutoff=40)

    # 4. Beat detection + BPM (use lower SR for speed, then apply to full audio)
    y_a_analysis = librosa.resample(y_a, orig_sr=sr, target_sr=SR)
    y_b_analysis = librosa.resample(y_b, orig_sr=sr, target_sr=SR)

    beats_a = detect_beats(y_a_analysis, SR)
    beats_b = detect_beats(y_b_analysis, SR)

    bpm_a = estimate_bpm_from_beats(beats_a) if len(beats_a) >= 2 else estimate_bpm_librosa(y_a_analysis, SR)
    bpm_b = estimate_bpm_from_beats(beats_b) if len(beats_b) >= 2 else estimate_bpm_librosa(y_b_analysis, SR)

    target_bpm = round((bpm_a + bpm_b) / 2, 2)
    print(f"BPMs — A: {bpm_a:.1f}, B: {bpm_b:.1f} → target: {target_bpm:.1f}")

    # 5. Time-stretch to common BPM
    y_a = time_stretch_to_bpm(y_a, sr, bpm_a, target_bpm)
    y_b = time_stretch_to_bpm(y_b, sr, bpm_b, target_bpm)
    print("Time-stretched to common BPM")

    # Re-detect beats after stretch for alignment
    y_a_stretched_analysis = librosa.resample(y_a, orig_sr=sr, target_sr=SR)
    y_b_stretched_analysis = librosa.resample(y_b, orig_sr=sr, target_sr=SR)
    beats_a_new = detect_beats(y_a_stretched_analysis, SR)
    beats_b_new = detect_beats(y_b_stretched_analysis, SR)

    # 6. Align to beat boundaries
    y_a = align_to_beat(y_a, sr, beats_a_new)
    y_b = align_to_beat(y_b, sr, beats_b_new)

    # 7. Apply per-track EQ
    y_a = apply_bass_boost(y_a, sr, bass_boost)
    y_b = apply_bass_boost(y_b, sr, bass_boost)
    y_a = apply_brightness(y_a, sr, brightness)
    y_b = apply_brightness(y_b, sr, brightness)

    # Vocal boost (boost mid frequencies 1-4kHz)
    if vocal_boost > 0:
        sos = scipy.signal.butter(2, [1000, 4000], btype='band', fs=sr, output='sos')
        y_a_vocal = scipy.signal.sosfilt(sos, y_a)
        y_b_vocal = scipy.signal.sosfilt(sos, y_b)
        y_a = y_a + vocal_boost * y_a_vocal
        y_b = y_b + vocal_boost * y_b_vocal

    # 8. Crossfade
    if eq_transition:
        mixed = eq_crossfade(y_a, y_b, sr, duration=crossfade_duration)
        print(f"EQ crossfade applied: {crossfade_duration:.1f}s")
    else:
        mixed = equal_power_crossfade(y_a, y_b, sr, duration=crossfade_duration)
        print(f"Equal-power crossfade applied: {crossfade_duration:.1f}s")

    # 9. Final LUFS mastering
    mixed = normalize_loudness(mixed, sr, FINAL_LOUDNESS)
    print("Final mastering to -14 LUFS")

    # Soft limiter to prevent clipping
    peak = np.max(np.abs(mixed))
    if peak > 0.99:
        mixed = mixed * (0.99 / peak)

    # Save
    save_audio(output_path, mixed, sr)

    duration = float(len(mixed) / sr)
    print(f"Mix complete: {duration:.1f}s saved to {output_path}")

    return {
        "bpm_a": bpm_a,
        "bpm_b": bpm_b,
        "target_bpm": target_bpm,
        "duration": duration,
    }


# ═══════════════════════════════════════════════════════════════════════════════
# ML-POWERED AUDIO CLASSIFICATION (Genre, Mood, Vocals)
# Uses HuggingFace transformer models for accurate classification
# ═══════════════════════════════════════════════════════════════════════════════

# Model IDs
GENRE_MODEL_ID = "dima806/music_genres_classification"      # wav2vec2, GTZAN 10 genres
MOOD_MODEL_ID = "StanislavKo28/music_moods_classification"  # wav2vec2, 14 moods

# Lazy-loaded classifiers (loaded on first use to speed up startup)
_genre_classifier = None
_mood_classifier = None

def _get_genre_classifier():
    """Lazy-load the genre classification pipeline."""
    global _genre_classifier
    if _genre_classifier is None:
        print(f"Loading genre model: {GENRE_MODEL_ID}")
        _genre_classifier = hf_pipeline(
            "audio-classification",
            model=GENRE_MODEL_ID,
            device=-1,  # CPU
        )
        print("Genre model loaded.")
    return _genre_classifier

def _get_mood_classifier():
    """Lazy-load the mood classification pipeline."""
    global _mood_classifier
    if _mood_classifier is None:
        print(f"Loading mood model: {MOOD_MODEL_ID}")
        _mood_classifier = hf_pipeline(
            "audio-classification",
            model=MOOD_MODEL_ID,
            device=-1,  # CPU
        )
        print("Mood model loaded.")
    return _mood_classifier


def classify_genre(file_path: str) -> dict:
    """
    Classify music genre using dima806/music_genres_classification.
    Returns top genre + confidence + top 3 predictions.
    Genres: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock
    """
    try:
        classifier = _get_genre_classifier()
        results = classifier(file_path, top_k=5)
        top = results[0]
        top3 = [
            {"genre": r["label"], "confidence": round(r["score"], 4)}
            for r in results[:3]
        ]
        return {
            "genre": top["label"],
            "confidence": round(top["score"], 4),
            "top3": top3,
        }
    except Exception as e:
        print(f"Genre classification error: {e}")
        return {"genre": "unknown", "confidence": 0.0, "top3": []}


def classify_mood(file_path: str) -> dict:
    """
    Classify music mood using StanislavKo28/music_moods_classification.
    Returns top mood + confidence.
    Moods: angry, dark, energetic, epic, euphoric, happy, mysterious,
           relaxing, romantic, sad, scary, glamorous, uplifting, sentimental
    """
    try:
        classifier = _get_mood_classifier()
        results = classifier(file_path, top_k=5)
        top = results[0]
        top3 = [
            {"mood": r["label"], "confidence": round(r["score"], 4)}
            for r in results[:3]
        ]
        return {
            "mood": top["label"],
            "confidence": round(top["score"], 4),
            "top3": top3,
        }
    except Exception as e:
        print(f"Mood classification error: {e}")
        return {"mood": "neutral", "confidence": 0.0, "top3": []}


def detect_vocals(y: np.ndarray, sr: int) -> dict:
    """
    Detect whether the audio has vocals using harmonic/percussive separation.
    If the harmonic component has strong mid-frequency energy (1-4kHz vocal range),
    the track likely has vocals.
    """
    try:
        # Separate harmonic and percussive components
        y_harmonic, y_percussive = librosa.effects.hpss(y)

        # Compute spectral centroid of harmonic part
        harmonic_centroid = float(np.mean(librosa.feature.spectral_centroid(y=y_harmonic, sr=sr)))

        # Compute energy in vocal frequency range (1-4kHz)
        S = np.abs(librosa.stft(y_harmonic))
        freqs = librosa.fft_frequencies(sr=sr)
        vocal_mask = (freqs >= 1000) & (freqs <= 4000)
        full_mask = freqs >= 80

        vocal_energy = float(np.mean(S[vocal_mask, :])) if np.any(vocal_mask) else 0
        total_energy = float(np.mean(S[full_mask, :])) if np.any(full_mask) else 1

        vocal_ratio = vocal_energy / max(total_energy, 1e-8)

        # Harmonic-to-percussive ratio
        h_energy = float(np.mean(y_harmonic ** 2))
        p_energy = float(np.mean(y_percussive ** 2))
        hp_ratio = h_energy / max(p_energy, 1e-8)

        # Decision: high vocal ratio + high harmonic content = vocals
        has_vocals = vocal_ratio > 0.8 and hp_ratio > 1.2

        return {
            "has_vocals": has_vocals,
            "vocal_ratio": round(vocal_ratio, 4),
            "harmonic_percussive_ratio": round(hp_ratio, 4),
            "label": "Vocal" if has_vocals else "Instrumental",
        }
    except Exception as e:
        print(f"Vocal detection error: {e}")
        return {"has_vocals": False, "vocal_ratio": 0.0, "harmonic_percussive_ratio": 0.0, "label": "Unknown"}


# ═══════════════════════════════════════════════════════════════════════════════
# BEAT GENERATOR (Procedural Drum Synthesis)
# (Condensed version of the full beat_generator.py)
# ═══════════════════════════════════════════════════════════════════════════════

BEAT_SR = 44100

class Complexity(Enum):
    MINIMAL = "minimal"
    SIMPLE = "simple"
    MEDIUM = "medium"
    COMPLEX = "complex"
    INTRICATE = "intricate"

class Energy(Enum):
    SOFT = "soft"
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    INTENSE = "intense"

class Mood(Enum):
    DARK = "dark"
    MELANCHOLIC = "melancholic"
    NEUTRAL = "neutral"
    UPLIFTING = "uplifting"
    AGGRESSIVE = "aggressive"

@dataclass
class BeatParams:
    genre: str
    sub_genre: Optional[str]
    bpm: float
    bars: int
    complexity: Complexity
    energy: Energy
    mood: Mood
    time_signature: Tuple[int, int]
    swing: float
    humanize: float
    include_fills: bool
    instruments: List[str]
    description: str

# Genre defaults
_GENRE_DEFAULTS = {
    "hiphop": {"bpm": 90, "range": (70, 110)},
    "trap": {"bpm": 140, "range": (120, 160)},
    "edm": {"bpm": 128, "range": (118, 150)},
    "house": {"bpm": 124, "range": (118, 130)},
    "techno": {"bpm": 130, "range": (120, 145)},
    "dnb": {"bpm": 174, "range": (160, 180)},
    "rock": {"bpm": 120, "range": (100, 150)},
    "metal": {"bpm": 160, "range": (100, 220)},
    "pop": {"bpm": 120, "range": (95, 135)},
    "jazz": {"bpm": 120, "range": (60, 200)},
    "reggae": {"bpm": 80, "range": (65, 100)},
    "funk": {"bpm": 105, "range": (85, 125)},
    "ambient": {"bpm": 75, "range": (50, 100)},
}

_GENRE_KEYWORDS = {
    "hiphop": ["hip hop", "hip-hop", "hiphop", "rap", "boom bap", "lofi", "lo-fi"],
    "trap": ["trap", "atlanta trap", "drill", "phonk"],
    "edm": ["edm", "electronic dance", "electronic"],
    "house": ["house", "deep house", "tech house"],
    "techno": ["techno", "industrial", "acid"],
    "dnb": ["drum and bass", "dnb", "d&b", "jungle"],
    "rock": ["rock", "indie", "punk", "grunge"],
    "metal": ["metal", "heavy metal", "thrash", "death metal"],
    "pop": ["pop", "mainstream"],
    "jazz": ["jazz", "swing", "bebop"],
    "reggae": ["reggae", "dub", "ska", "dancehall"],
    "funk": ["funk", "funky", "groove", "disco", "soul"],
    "ambient": ["ambient", "chill", "chillout", "downtempo"],
}

def _kw_in(kw: str, text: str) -> bool:
    pattern = r"(?<![a-z])" + re.escape(kw) + r"(?![a-z])"
    return bool(re.search(pattern, text, re.IGNORECASE))

def parse_prompt(prompt: str) -> BeatParams:
    text = prompt.lower().strip()

    # Find genre
    genre = "hiphop"
    for g, keywords in _GENRE_KEYWORDS.items():
        if any(_kw_in(kw, text) for kw in keywords):
            genre = g
            break

    # Find BPM
    bpm_match = re.search(r"(\d{2,3})\s*(?:bpm|tempo)", text, re.IGNORECASE)
    if bpm_match:
        bpm = max(40.0, min(250.0, float(bpm_match.group(1))))
    else:
        bpm = _GENRE_DEFAULTS.get(genre, {}).get("bpm", 120)

    # Find bars
    bars_match = re.search(r"(\d+)\s*(?:bar|bars|measure)", text, re.IGNORECASE)
    bars = int(bars_match.group(1)) if bars_match else 4
    bars = max(1, min(32, bars))

    # Complexity
    if any(w in text for w in ["complex", "intricate", "busy"]):
        complexity = Complexity.COMPLEX
    elif any(w in text for w in ["simple", "basic", "minimal"]):
        complexity = Complexity.SIMPLE
    else:
        complexity = Complexity.MEDIUM

    # Energy
    if any(w in text for w in ["intense", "aggressive", "hard", "heavy"]):
        energy = Energy.INTENSE
    elif any(w in text for w in ["energetic", "powerful", "driving"]):
        energy = Energy.HIGH
    elif any(w in text for w in ["soft", "gentle", "quiet"]):
        energy = Energy.SOFT
    elif any(w in text for w in ["chill", "relaxed", "laid-back"]):
        energy = Energy.LOW
    else:
        energy = Energy.MEDIUM

    mood = Mood.NEUTRAL
    if any(w in text for w in ["dark", "sinister"]):
        mood = Mood.DARK
    elif any(w in text for w in ["happy", "uplifting", "bright"]):
        mood = Mood.UPLIFTING

    instruments = ["kick", "snare", "hihat_c", "hihat_o", "clap"]

    desc = f"{mood.value.title()} {energy.value} {genre.title()} beat at {bpm:.0f} BPM, {bars} bars"

    return BeatParams(
        genre=genre, sub_genre=None, bpm=bpm, bars=bars,
        complexity=complexity, energy=energy, mood=mood,
        time_signature=(4, 4), swing=0.0, humanize=0.3,
        include_fills=False, instruments=instruments, description=desc,
    )


# Drum synthesis functions
def _env(length: int, attack: float = 0.002, decay: float = 0.15):
    t = np.linspace(0, 1, length)
    env = np.exp(-t / max(decay, 1e-6))
    atk = int(attack * BEAT_SR)
    if 0 < atk < length:
        env[:atk] *= np.linspace(0, 1, atk)
    return env

def synth_kick(dur=0.4):
    n = int(dur * BEAT_SR)
    t = np.linspace(0, dur, n)
    freq = np.linspace(200, 55, n)
    phase = 2 * np.pi * np.cumsum(freq) / BEAT_SR
    sine = np.sin(phase) * _env(n, 0.001, 0.25) * 0.8
    click_n = int(0.008 * BEAT_SR)
    click = np.zeros(n)
    click[:click_n] = (np.random.rand(click_n) * 2 - 1) * np.linspace(1, 0, click_n)
    return (sine + click * 0.2).astype(np.float32)

def synth_snare(dur=0.25):
    n = int(dur * BEAT_SR)
    t = np.linspace(0, dur, n)
    noise = np.random.randn(n) * _env(n, 0.001, 0.08) * 0.6
    body = np.sin(2 * np.pi * 200 * t) * _env(n, 0.001, 0.05) * 0.5
    return (noise + body).astype(np.float32)

def synth_hihat_c(dur=0.05):
    n = int(dur * BEAT_SR)
    noise = np.random.randn(n)
    filtered = np.diff(noise, prepend=noise[0])
    return (filtered * _env(n, 0.0005, 0.02) * 0.5).astype(np.float32)

def synth_hihat_o(dur=0.3):
    n = int(dur * BEAT_SR)
    noise = np.random.randn(n)
    filtered = np.diff(noise, prepend=noise[0])
    ring = np.sin(2 * np.pi * 6000 * np.linspace(0, dur, n))
    return (filtered * _env(n, 0.001, 0.2) * 0.4 + ring * _env(n, 0.001, 0.2) * 0.1).astype(np.float32)

def synth_clap(dur=0.15):
    n = int(dur * BEAT_SR)
    noise = np.random.randn(n)
    env = np.zeros(n)
    for offset_ms in [0, 10, 20, 30]:
        offset = int(offset_ms * BEAT_SR / 1000)
        if offset < n:
            env[offset:] += _env(n - offset, 0.001, 0.04) * (1 - offset_ms / 50)
    return (noise * env * 0.5).astype(np.float32)

# Sound bank
_SOUNDS = {
    "kick": synth_kick(),
    "snare": synth_snare(),
    "hihat_c": synth_hihat_c(),
    "hihat_o": synth_hihat_o(),
    "clap": synth_clap(),
}

def generate_pattern(params: BeatParams) -> Dict[str, List[int]]:
    steps = 16
    pattern = {}
    genre = params.genre
    cx = params.complexity

    # Kick
    kick = [0] * steps
    if genre in ["edm", "house", "techno"]:
        for i in range(0, steps, 4): kick[i] = 1
    elif genre == "dnb":
        kick[0] = 1; kick[10] = 1
    elif genre in ["hiphop", "trap"]:
        kick[0] = 1; kick[6] = 1
        if cx in [Complexity.COMPLEX, Complexity.INTRICATE]: kick[10] = 1
    else:
        kick[0] = 1; kick[8] = 1
    pattern["kick"] = kick

    # Snare
    snare = [0] * steps
    if genre == "reggae":
        snare[8] = 1
    elif genre == "trap":
        snare[12] = 1
    else:
        snare[4] = 1; snare[12] = 1
    pattern["snare"] = snare

    # Hi-hats
    hh_c = [0] * steps
    if cx == Complexity.MINIMAL:
        for i in range(0, steps, 4): hh_c[i] = 1
    elif cx == Complexity.SIMPLE:
        for i in range(0, steps, 2): hh_c[i] = 1
    else:
        for i in range(steps): hh_c[i] = 1
    pattern["hihat_c"] = hh_c

    hh_o = [0] * steps
    if cx != Complexity.MINIMAL:
        hh_o[7] = 1; hh_o[15] = 1
    pattern["hihat_o"] = hh_o

    # Clap
    clap = [0] * steps
    if genre in ["trap", "edm", "house"]:
        clap[4] = 1; clap[12] = 1
    pattern["clap"] = clap

    return pattern


def render_beat(pattern: Dict[str, List[int]], params: BeatParams) -> np.ndarray:
    steps_per_bar = 16
    beat_dur = 60.0 / params.bpm
    step_dur = beat_dur / 4.0
    bar_samples = int(steps_per_bar * step_dur * BEAT_SR)
    total_samples = bar_samples * params.bars

    audio = np.zeros(total_samples, dtype=np.float32)

    gain_map = {"kick": 0.95, "snare": 0.80, "hihat_c": 0.50, "hihat_o": 0.55, "clap": 0.70}

    for instrument, steps in pattern.items():
        sound = _SOUNDS.get(instrument)
        if sound is None:
            continue
        base_gain = gain_map.get(instrument, 0.6)
        for bar in range(params.bars):
            for step, hit in enumerate(steps):
                if not hit:
                    continue
                sample_pos = bar * bar_samples + int(step * step_dur * BEAT_SR)
                # Humanize
                if params.humanize > 0:
                    sample_pos += int(np.random.normal(0, params.humanize * 0.01 * BEAT_SR))
                sample_pos = max(0, min(sample_pos, total_samples - 1))
                end = min(sample_pos + len(sound), total_samples)
                write_len = end - sample_pos
                if write_len > 0:
                    audio[sample_pos:end] += sound[:write_len] * base_gain

    peak = np.abs(audio).max()
    if peak > 0.95:
        audio = audio * (0.95 / peak)

    return audio


def generate_beat_full(prompt: str, output_path: str) -> dict:
    params = parse_prompt(prompt)
    pattern = generate_pattern(params)
    audio = render_beat(pattern, params)
    duration = len(audio) / BEAT_SR
    sf.write(output_path, audio, BEAT_SR, subtype="PCM_16")

    return {
        "genre": params.genre,
        "bpm": params.bpm,
        "bars": params.bars,
        "complexity": params.complexity.value,
        "description": params.description,
        "duration": round(duration, 3),
        "pattern": pattern,
        "sample_rate": BEAT_SR,
    }


# ═══════════════════════════════════════════════════════════════════════════════
# SHAZAM RECOGNITION
# ═══════════════════════════════════════════════════════════════════════════════

def convert_to_wav(input_path):
    output_path = input_path.rsplit('.', 1)[0] + '_converted.wav'
    try:
        cmd = ['ffmpeg', '-y', '-i', input_path, '-ar', '44100', '-ac', '1', '-sample_fmt', 's16', '-f', 'wav', output_path]
        result = subprocess.run(cmd, capture_output=True, timeout=30)
        if result.returncode == 0 and os.path.exists(output_path):
            return output_path
    except Exception as e:
        print(f"Conversion error: {e}")
    return None

async def recognize_shazam(audio_path):
    if not RAPIDAPI_KEY:
        return None
    headers = {"X-RapidAPI-Key": RAPIDAPI_KEY, "X-RapidAPI-Host": SHAZAM_HOST}
    try:
        with open(audio_path, 'rb') as f:
            audio_bytes = f.read()
        async with httpx.AsyncClient(timeout=25) as client:
            response = await client.post(SHAZAM_URL, headers=headers, files={"file": ("audio.wav", audio_bytes, "audio/wav")})
        if response.status_code != 200:
            return None
        data = response.json()
        track = data.get("track") or data
        if not track.get("title") and not track.get("heading"):
            return None
        title = track.get("title", "Unknown")
        artist = track.get("subtitle", "Unknown Artist")
        album = cover = year = spotify_url = apple_music_url = None
        shazam_url = track.get("url")
        sections = track.get("sections", [])
        for section in sections:
            if section.get("type") == "SONG":
                for meta in section.get("metadata", []):
                    if meta.get("title") == "Album": album = meta.get("text")
                    elif meta.get("title") == "Released": year = meta.get("text")
        images = track.get("images", {})
        cover = images.get("coverarthq") or images.get("coverart")
        hub = track.get("hub", {})
        for provider in hub.get("providers", []):
            ptype = provider.get("type", "").upper()
            for action in provider.get("actions", []):
                uri = action.get("uri", "")
                if ptype == "SPOTIFY" and uri: spotify_url = uri
                elif ptype == "APPLE" and uri: apple_music_url = uri
        score = len(data.get("matches", []))
        return {"title": title, "artist": artist, "album": album, "cover": cover, "year": year,
                "spotify": spotify_url, "apple_music": apple_music_url, "shazam_url": shazam_url,
                "score": max(score, 1), "source": "shazam"}
    except Exception as e:
        print(f"Shazam error: {e}")
        return None


# ═══════════════════════════════════════════════════════════════════════════════
# API ROUTES
# ═══════════════════════════════════════════════════════════════════════════════

@app.get("/")
def root():
    return {"status": "ok", "service": "AutoMixAI Backend v2.0", "features": [
        "upload", "analyze", "mix (advanced DJ)", "generate", "recognize"
    ]}

@app.get("/health")
def health():
    return {"status": "healthy"}


# ── Upload ───────────────────────────────────────────────────────────────────

ALLOWED_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg", ".m4a", ".aac"}

@app.post("/upload", response_model=UploadResponse)
async def upload_audio(file: UploadFile = File(...)):
    original_name = file.filename or "unknown"
    ext = Path(original_name).suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(status_code=400, detail=f"Unsupported file type '{ext}'.")

    file_id = generate_file_id()
    save_path = UPLOAD_DIR / f"{file_id}{ext}"

    try:
        contents = await file.read()
        save_path.write_bytes(contents)
    except Exception as exc:
        raise HTTPException(status_code=500, detail="Failed to save file.") from exc

    try:
        info = get_audio_info(str(save_path))
    except Exception as exc:
        save_path.unlink(missing_ok=True)
        raise HTTPException(status_code=400, detail="Not a valid audio file.") from exc

    return UploadResponse(file_id=file_id, filename=original_name, duration=round(info["duration"], 2))


# ── Analyze ──────────────────────────────────────────────────────────────────

@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_audio(request: AnalyzeRequest):
    try:
        file_path = find_upload(request.file_id)
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"File '{request.file_id}' not found.")

    file_path_str = str(file_path)

    # Load for librosa analysis
    y, sr = load_audio(file_path_str)
    beat_times = detect_beats(y, sr)
    bpm = estimate_bpm_from_beats(beat_times) if len(beat_times) >= 2 else estimate_bpm_librosa(y, sr)

    # ML Genre classification (transformer model)
    genre_result = classify_genre(file_path_str)

    # ML Mood classification (transformer model)
    mood_result = classify_mood(file_path_str)

    # Vocal detection (spectral analysis)
    vocal_result = detect_vocals(y, sr)

    # Energy (RMS-based)
    rms = float(np.mean(librosa.feature.rms(y=y)))
    if rms > 0.12: energy = "high"
    elif rms > 0.06: energy = "medium"
    else: energy = "low"

    return AnalysisResponse(
        file_id=request.file_id,
        bpm=bpm,
        beat_times=beat_times,
        duration=round(len(y) / sr, 2),
        sample_rate=sr,
        genre=genre_result["genre"],
        genre_confidence=genre_result["confidence"],
        genre_top3=genre_result.get("top3", []),
        mood=mood_result["mood"],
        has_vocals=vocal_result["has_vocals"],
        energy=energy,
    )


# ── Mix ──────────────────────────────────────────────────────────────────────

@app.post("/mix", response_model=MixResponse)
async def mix_tracks(request: MixRequest):
    try:
        path_a = find_upload(request.file_id_a)
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"Track A '{request.file_id_a}' not found.")
    try:
        path_b = find_upload(request.file_id_b)
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"Track B '{request.file_id_b}' not found.")

    output_id = generate_file_id()
    output_path = OUTPUT_DIR / f"{output_id}.wav"

    try:
        result = create_advanced_mix(
            path_a=str(path_a),
            path_b=str(path_b),
            output_path=str(output_path),
            crossfade_duration=request.crossfade_duration,
            bass_boost=request.bass_boost,
            brightness=request.brightness,
            vocal_boost=request.vocal_boost,
            pan_a=request.pan_a,
            pan_b=request.pan_b,
            eq_transition=request.eq_transition,
        )
    except Exception as exc:
        print(f"Mix error: {exc}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Mixing failed: {str(exc)}") from exc

    return MixResponse(
        output_file_id=output_id,
        duration=result["duration"],
        bpm_a=result["bpm_a"],
        bpm_b=result["bpm_b"],
        target_bpm=result["target_bpm"],
    )


# ── Generate (Procedural Synth) ──────────────────────────────────────────────

@app.post("/generate", response_model=GenerateBeatResponse)
async def generate_beat_route(request: GenerateBeatRequest):
    output_id = generate_file_id()
    output_path = OUTPUT_DIR / f"{output_id}.wav"

    effective_prompt = request.prompt
    if request.bars:
        effective_prompt = f"{request.prompt} {request.bars} bars"

    try:
        result = generate_beat_full(effective_prompt, str(output_path))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Beat generation failed: {exc}") from exc

    pattern_raw = result["pattern"]
    return GenerateBeatResponse(
        output_file_id=output_id,
        genre=result["genre"],
        bpm=result["bpm"],
        bars=result["bars"],
        complexity=result["complexity"],
        description=result["description"],
        duration=result["duration"],
        pattern=PatternInfo(
            kick=pattern_raw.get("kick", [0]*16),
            snare=pattern_raw.get("snare", [0]*16),
            hihat_c=pattern_raw.get("hihat_c", [0]*16),
            hihat_o=pattern_raw.get("hihat_o", [0]*16),
            clap=pattern_raw.get("clap", [0]*16),
        ),
    )


# ── Generate AI (MusicGen via HF Inference API) ─────────────────────────────

HF_TOKEN = os.environ.get("HF_TOKEN", "")
MUSICGEN_API_URL = "https://router.huggingface.co/hf-inference/models/facebook/musicgen-small"

class GenerateAIRequest(BaseModel):
    prompt: str = Field(..., min_length=3, max_length=500)
    duration: int = Field(default=10, ge=3, le=30)

class GenerateAIResponse(BaseModel):
    output_file_id: str
    prompt: str
    duration: float
    model: str = "facebook/musicgen-small"
    sample_rate: int = 32000
    message: str = "AI beat generated successfully."

@app.post("/generate-ai", response_model=GenerateAIResponse)
async def generate_beat_ai(request: GenerateAIRequest):
    """Generate a beat using Meta's MusicGen via HuggingFace Inference API (free GPU)."""
    output_id = generate_file_id()
    output_path = OUTPUT_DIR / f"{output_id}.wav"

    headers = {
        "Content-Type": "application/json",
        "x-wait-for-model": "true",  # Wait for model to load instead of 503
    }
    if HF_TOKEN:
        headers["Authorization"] = f"Bearer {HF_TOKEN}"

    payload = {
        "inputs": request.prompt,
    }

    try:
        print(f"MusicGen AI generating: '{request.prompt}' ({request.duration}s)")
        async with httpx.AsyncClient(timeout=300) as client:
            response = await client.post(
                MUSICGEN_API_URL,
                headers=headers,
                json=payload,
            )

        print(f"HF API response: status={response.status_code}, content-type={response.headers.get('content-type', 'unknown')}, size={len(response.content)} bytes")

        if response.status_code == 503:
            error_data = response.json() if response.headers.get("content-type", "").startswith("application/json") else {}
            wait_time = error_data.get("estimated_time", 30)
            raise HTTPException(
                status_code=503,
                detail=f"MusicGen model is loading, please try again in ~{int(wait_time)} seconds."
            )
        if response.status_code != 200:
            error_msg = response.text[:500]
            print(f"HF API error: {error_msg}")
            raise HTTPException(
                status_code=502,
                detail=f"HF Inference API error ({response.status_code}): {error_msg}"
            )

        # Response is raw audio bytes (FLAC format)
        audio_bytes = response.content

        # Save the raw audio first
        temp_path = str(output_path).replace(".wav", "_raw.flac")
        with open(temp_path, "wb") as f:
            f.write(audio_bytes)

        # Convert to WAV using librosa
        y, sr = librosa.load(temp_path, sr=None, mono=True)
        sf.write(str(output_path), y, sr, subtype="PCM_16")

        # Clean up temp
        os.remove(temp_path)

        actual_duration = round(len(y) / sr, 2)
        print(f"MusicGen AI complete: {actual_duration}s")

    except HTTPException:
        raise
    except Exception as exc:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"AI generation failed: {str(exc)}") from exc

    return GenerateAIResponse(
        output_file_id=output_id,
        prompt=request.prompt,
        duration=actual_duration,
        sample_rate=int(sr),
    )


# ── Output ───────────────────────────────────────────────────────────────────

@app.get("/output/{file_id}")
async def download_output(file_id: str):
    output_path = OUTPUT_DIR / f"{file_id}.wav"
    if not output_path.exists():
        raise HTTPException(status_code=404, detail=f"Output file '{file_id}' not found.")
    return FileResponse(str(output_path), media_type="audio/wav", filename=f"automix_{file_id}.wav")


# ── Recognize ────────────────────────────────────────────────────────────────

@app.post("/recognize")
async def recognize(file: UploadFile = File(...)):
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded")

    suffix = os.path.splitext(file.filename or "audio.webm")[1] or ".webm"
    tmp_path = os.path.join(tempfile.gettempdir(), f"shazam_{uuid.uuid4().hex}{suffix}")
    converted_path = None

    try:
        content = await file.read()
        if len(content) < 500:
            return {"status": "error", "message": "Audio too small"}
        with open(tmp_path, 'wb') as f:
            f.write(content)

        converted_path = convert_to_wav(tmp_path)
        work_path = converted_path if converted_path else tmp_path

        result = await recognize_shazam(work_path)

        if result:
            return {
                "status": "found", "title": result["title"], "artist": result["artist"],
                "album": result.get("album"), "cover": result.get("cover"),
                "year": result.get("year"), "spotify": result.get("spotify"),
                "apple_music": result.get("apple_music"), "shazam_url": result.get("shazam_url"),
                "score": result.get("score", 0), "source": result.get("source", "unknown"),
                "match_quality": "high", "is_early": True,
            }
        else:
            return {"status": "not_found", "message": "No song matched.", "is_early": False}
    except Exception as e:
        return {"status": "error", "message": f"Recognition failed: {str(e)}"}
    finally:
        for path in [tmp_path, converted_path]:
            if path:
                try: os.unlink(path)
                except: pass


# ═══════════════════════════════════════════════════════════════════════════════
# ENTRYPOINT
# ═══════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)