Spaces:

heybaeheef
/

KU_SW_Academy

Paused

App Files Files Community

heybaeheef committed on Dec 9, 2025

Commit

3cc9d6f

verified ·

1 Parent(s): a7b8b45

Upload 9 files

Browse files

Files changed (9) hide show

Dockerfile +23 -0
README.md +24 -5
audio_processing/__init__.py +0 -0
audio_processing/effect_chain.py +255 -0
main.py +275 -0
models/__init__.py +0 -0
models/ai_effector.py +404 -0
models/audio_encoder.py +189 -0
requirements.txt +20 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

FROM python:3.10-slim

# Show print()/log output in the container logs immediately.
ENV PYTHONUNBUFFERED=1

# System packages: libsndfile1 (soundfile backend), ffmpeg (audio decoding), git.
# --no-install-recommends keeps the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libsndfile1 \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces runs the container as UID 1000. Create that user and
# work inside its home directory so pip, the model cache (~/.cache) and the
# app itself all live in a writable location.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Install Python dependencies first so this layer is cached across code changes.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code.
COPY --chown=user . .

# Hugging Face Spaces expects the app on port 7860.
EXPOSE 7860

# Start the server.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,29 @@
 ---
-title: KU SW Academy
-emoji: 🌖
-colorFrom: red
-colorTo: yellow
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: DiffVox AI Vocal Effects Server
+emoji: 🎤
+colorFrom: purple
+colorTo: pink
 sdk: docker
+app_port: 7860
 pinned: false
 ---
+# DiffVox AI Vocal Effects Server
+AI-powered vocal effect processing server using DiffVox LLM.
+## API Endpoints
+- `GET /` - Server info
+- `GET /health` - Health check
+- `POST /predict` - Predict effect parameters
+- `POST /process` - Process audio with AI-predicted parameters
+- `POST /process_with_params` - Process audio and return parameters + audio
+## Usage
+```bash
+curl -X POST "https://YOUR-SPACE.hf.space/process_with_params" \
+  -F "audio=@your_vocal.wav" \
+  -F "prompt=warm vintage sound"
+```

audio_processing/__init__.py ADDED Viewed

File without changes

audio_processing/effect_chain.py ADDED Viewed

	@@ -0,0 +1,255 @@

+"""
+Audio Effect Chain
+==================
+실제 오디오에 이펙트를 적용하는 처리 체인
+pedalboard 라이브러리 사용 (Spotify에서 만든 오디오 플러그인 라이브러리)
+- 고품질 VST 수준의 이펙트
+- Python에서 쉽게 사용 가능
+- 실시간 처리도 가능
+"""
+import numpy as np
+from pathlib import Path
+from typing import Dict, Any, List
+import soundfile as sf
+# pedalboard - 오디오 이펙트 라이브러리
+from pedalboard import (
+    Pedalboard,
+    Compressor,
+    Gain,
+    LowShelfFilter,
+    HighShelfFilter,
+    PeakFilter,
+    Delay,
+    Reverb,
+    Distortion,
+    Limiter,
+    HighpassFilter,
+    LowpassFilter
+)
+from pedalboard.io import AudioFile
class EffectChain:
    """Audio effect processing chain built on the ``pedalboard`` library.

    Effect settings arrive as a flat ``{name: number}`` dictionary. Because
    those numbers may come straight from a language model, every value is
    validated and clamped before it is handed to pedalboard; a missing,
    non-numeric or non-finite value falls back to that parameter's default.
    """

    AVAILABLE_EFFECTS = [
        "eq_lowshelf",
        "eq_highshelf",
        "eq_peak1",
        "eq_peak2",
        "compressor",
        "distortion",
        "delay",
        "reverb",
        "limiter"
    ]

    def __init__(self):
        """Create an effect chain; the object keeps no per-instance state."""
        pass

    def get_available_effects(self) -> List[str]:
        """Return a copy of the supported effect names."""
        return self.AVAILABLE_EFFECTS.copy()

    @staticmethod
    def _param(params, key, default, low, high):
        """Read ``params[key]`` as a float clamped to ``[low, high]``.

        Returns ``float(default)`` when the key is missing, ``None``,
        not convertible to float, or not finite (NaN / infinity).
        """
        raw = params.get(key)
        if raw is None:
            return float(default)
        try:
            number = float(raw)
        except (TypeError, ValueError):
            return float(default)
        if not np.isfinite(number):
            return float(default)
        return max(low, min(high, number))

    @staticmethod
    def _mix(dry, wet, wet_mix):
        """Blend dry and processed audio, then hard-clip to [-1.0, 1.0]."""
        return np.clip((1 - wet_mix) * dry + wet_mix * wet, -1.0, 1.0)

    def process(
        self,
        input_path: str,
        output_path: str,
        parameters: Dict[str, float]
    ) -> None:
        """Apply the effect chain to an audio file.

        Args:
            input_path: Path of the audio file to read.
            output_path: Path the processed audio is written to.
            parameters: Effect parameter dictionary.
        """
        audio, sample_rate = sf.read(input_path)
        final_audio = self.process_realtime(audio, sample_rate, parameters)
        sf.write(output_path, final_audio, sample_rate)
        print(f"[EffectChain] 처리 완료: {output_path}")

    def _build_pedalboard(
        self,
        params: Dict[str, float],
        sample_rate: int
    ) -> "Pedalboard":
        """Build the pedalboard effect chain described by ``params``.

        Order: EQ -> compressor (+ makeup gain) -> distortion (+ tone
        low-pass) -> delay -> reverb -> limiter. The limiter is always
        added; every other stage is added only when its controlling
        parameter enables it.
        """
        def value(key, default, low, high):
            return self._param(params, key, default, low, high)

        # Filter cutoffs have to stay below the Nyquist frequency.
        max_freq = max(20.0, 0.49 * sample_rate)
        effects = []

        # === EQ section ===
        shelves = (
            (LowShelfFilter, "eq_lowshelf", 200),
            (HighShelfFilter, "eq_highshelf", 8000),
        )
        for shelf_cls, prefix, default_freq in shelves:
            gain = value(f"{prefix}_gain", 0, -24.0, 24.0)
            if gain != 0:
                effects.append(
                    shelf_cls(
                        cutoff_frequency_hz=value(
                            f"{prefix}_freq", default_freq, 20.0, max_freq
                        ),
                        gain_db=gain,
                        q=0.707
                    )
                )
        for prefix, default_freq in (("eq_peak1", 1000), ("eq_peak2", 3000)):
            gain = value(f"{prefix}_gain", 0, -24.0, 24.0)
            if gain != 0:
                effects.append(
                    PeakFilter(
                        cutoff_frequency_hz=value(
                            f"{prefix}_freq", default_freq, 20.0, max_freq
                        ),
                        gain_db=gain,
                        q=value(f"{prefix}_q", 1.0, 0.1, 10.0)
                    )
                )

        # === Dynamics section ===
        ratio = value("compressor_ratio", 4.0, 1.0, 100.0)
        if ratio > 1.0:
            effects.append(
                Compressor(
                    threshold_db=value("compressor_threshold", -24, -100.0, 0.0),
                    ratio=ratio,
                    attack_ms=value("compressor_attack", 5, 0.0, 1000.0),
                    release_ms=value("compressor_release", 50, 0.0, 5000.0)
                )
            )
            makeup = value("compressor_makeup", 0, -24.0, 24.0)
            if makeup != 0:
                effects.append(Gain(gain_db=makeup))

        # === Distortion section ===
        distortion_amount = value("distortion_amount", 0, 0.0, 1.0)
        if distortion_amount > 0:
            # The 0-1 amount is scaled to a 0-40 dB drive.
            effects.append(Distortion(drive_db=distortion_amount * 40))
            # Tone control after the distortion: a low-pass at 2 kHz - 12 kHz.
            tone = value("distortion_tone", 0.5, 0.0, 1.0)
            lpf_freq = min(2000 + tone * 10000, max_freq)
            effects.append(LowpassFilter(cutoff_frequency_hz=lpf_freq))

        # === Time-based effects section ===
        delay_mix = value("delay_mix", 0, 0.0, 1.0)
        if delay_mix > 0:
            delay_time_ms = value("delay_time", 250, 0.0, 5000.0)
            effects.append(
                Delay(
                    delay_seconds=delay_time_ms / 1000,
                    feedback=value("delay_feedback", 0.3, 0.0, 1.0),
                    mix=delay_mix
                )
            )
        reverb_wet = value("reverb_wet_dry", 0, 0.0, 1.0)
        if reverb_wet > 0:
            effects.append(
                Reverb(
                    room_size=value("reverb_room_size", 0.5, 0.0, 1.0),
                    damping=value("reverb_damping", 0.5, 0.0, 1.0),
                    wet_level=reverb_wet,
                    dry_level=1 - reverb_wet,
                    width=1.0
                )
            )

        # === Output section ===
        # Final limiter as clipping protection.
        effects.append(
            Limiter(
                threshold_db=-1.0,
                release_ms=100
            )
        )
        return Pedalboard(effects)

    def process_realtime(
        self,
        audio_chunk: np.ndarray,
        sample_rate: int,
        parameters: Dict[str, float]
    ) -> np.ndarray:
        """Process one in-memory block of audio.

        NOTE(review): a new pedalboard is built on every call, so delay and
        reverb tails presumably do not carry over between consecutive chunks
        - confirm before using this for real streaming.

        Args:
            audio_chunk: Audio samples, 1-D (mono) or (samples, channels).
            sample_rate: Sample rate in Hz.
            parameters: Effect parameter dictionary.

        Returns:
            The processed audio as float32, clipped to [-1.0, 1.0]. Mono
            input is returned as two identical channels.
        """
        # Duplicate mono to two channels (some effects need stereo).
        if len(audio_chunk.shape) == 1:
            audio_chunk = np.column_stack([audio_chunk, audio_chunk])
        audio_chunk = audio_chunk.astype(np.float32)
        if audio_chunk.size == 0:
            # Nothing to process; avoid handing an empty buffer to pedalboard.
            return audio_chunk
        board = self._build_pedalboard(parameters, sample_rate)
        processed = board(audio_chunk, sample_rate)
        wet_mix = self._param(parameters, "final_wet_mix", 0.5, 0.0, 1.0)
        return self._mix(audio_chunk, processed, wet_mix)

main.py ADDED Viewed

	@@ -0,0 +1,275 @@

+"""
+MagicPath AI Vocal Effects Server - DiffVox LLM 통합 버전
+=========================================================
+Dry 보컬 파일을 받아서 학습된 AI가 이펙터 파라미터를 예측하고,
+실제로 이펙트를 적용한 오디오를 반환하는 서버
+"""
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse, JSONResponse
+import tempfile
+import os
+import uuid
+from pathlib import Path
+# 내부 모듈
+from models.ai_effector import AIEffector
+from audio_processing.effect_chain import EffectChain
# ============================================
# Configuration
# ============================================
# Trained model location (Hugging Face repo id or local path).
MODEL_PATH = os.environ.get("DIFFVOX_MODEL_PATH", "heybaeheef/KU_SW_Academy")
BASE_MODEL_NAME = os.environ.get("BASE_MODEL_NAME", "Qwen/Qwen3-8B")
AUDIO_FEATURE_DIM = int(os.environ.get("AUDIO_FEATURE_DIM", "64"))
USE_HUGGINGFACE = os.environ.get("USE_HUGGINGFACE", "true").lower() == "true"
# Comma-separated list of allowed CORS origins; "*" (the default) allows any.
ALLOWED_ORIGINS = [
    origin.strip()
    for origin in os.environ.get("ALLOWED_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]
# ============================================
# FastAPI app initialisation
# ============================================
app = FastAPI(
    title="MagicPath AI Vocal Effects",
    description="AI-powered vocal effect processing server (DiffVox LLM 통합)",
    version="2.0.0"
)
# CORS: credentials must not be combined with a wildcard origin, so they are
# only enabled once explicit origins have been configured.
app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    allow_credentials="*" not in ALLOWED_ORIGINS,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Global objects (created once at import time; the model load happens here).
print("=" * 60)
print("MagicPath AI Vocal Effects Server v2.0")
print("=" * 60)
print(f"Model Path: {MODEL_PATH}")
print(f"Base Model: {BASE_MODEL_NAME}")
print(f"Audio Feature Dim: {AUDIO_FEATURE_DIM}")
print(f"Use Hugging Face: {USE_HUGGINGFACE}")
print("=" * 60)
ai_effector = AIEffector(
    model_path=MODEL_PATH,
    base_model_name=BASE_MODEL_NAME,
    audio_feature_dim=AUDIO_FEATURE_DIM,
    use_huggingface=USE_HUGGINGFACE
)
effect_chain = EffectChain()
# Directory for temporary upload / output files.
TEMP_DIR = Path(tempfile.gettempdir()) / "magicpath"
TEMP_DIR.mkdir(parents=True, exist_ok=True)
+# ============================================
+# API 엔드포인트
+# ============================================
@app.get("/")
async def root():
    """Return basic server information and the list of available endpoints."""
    return {
        "status": "running",
        "message": "MagicPath AI Vocal Effects Server v2.0 (DiffVox LLM)",
        "ai_model_loaded": ai_effector.is_loaded(),
        "endpoints": {
            "POST /process": "오디오 파일 처리 후 반환",
            "POST /process_with_params": "오디오 처리 후 파라미터와 오디오(base64)를 JSON으로 반환",
            "POST /predict": "파라미터만 예측 (JSON)",
            "GET /health": "서버 상태 확인"
        }
    }
@app.get("/health")
async def health_check():
    """Report server status, model load state and the supported effects."""
    model_ready = ai_effector.is_loaded()
    effects = effect_chain.get_available_effects()
    report = {"status": "healthy"}
    report["ai_model_loaded"] = model_ready
    report["supported_effects"] = effects
    report["model_path"] = MODEL_PATH
    report["base_model"] = BASE_MODEL_NAME
    return report
@app.post("/predict")
async def predict_parameters(
    audio: UploadFile = File(..., description="Dry 보컬 오디오 파일"),
    prompt: str = Form("", description="텍스트 명령 (예: 'warm', 'bright')")
):
    """Predict effect parameters with the AI model, without processing audio.

    Args:
        audio: Uploaded audio file (wav, mp3, ...).
        prompt: Text description of the desired sound.

    Returns:
        JSON containing the predicted effect parameters.

    Raises:
        HTTPException: 500 with the error message if anything fails.
    """
    # Only the extension of the client-supplied name is reused, so the upload
    # cannot influence where the temporary file is written.
    suffix = Path(audio.filename or "").suffix
    input_path = TEMP_DIR / f"{uuid.uuid4()}{suffix}"
    try:
        # Save the upload to a temporary file.
        input_path.write_bytes(await audio.read())
        # Predict the parameters with the AI model.
        parameters = ai_effector.predict(
            audio_path=str(input_path),
            text_prompt=prompt
        )
        return JSONResponse(content={
            "status": "success",
            "prompt": prompt,
            "ai_model_used": ai_effector.is_loaded(),
            "parameters": parameters
        })
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Remove the temporary file on success and on failure alike.
        input_path.unlink(missing_ok=True)
@app.post("/process")
async def process_audio(
    audio: UploadFile = File(..., description="Dry 보컬 오디오 파일"),
    prompt: str = Form("", description="텍스트 명령 (예: 'warm', 'bright')")
):
    """Process the uploaded audio with AI-predicted effect parameters.

    Args:
        audio: Uploaded audio file (wav, mp3, ...).
        prompt: Text description of the desired sound.

    Returns:
        The processed audio as a WAV file download.

    Raises:
        HTTPException: 500 with the error message if anything fails.
    """
    import traceback
    from starlette.background import BackgroundTask

    # Use only the base name of the client-supplied filename; it never
    # becomes part of a filesystem path except for its extension.
    client_name = Path(audio.filename or "audio").name
    download_stem = Path(client_name).stem or "audio"
    file_id = str(uuid.uuid4())
    input_path = TEMP_DIR / f"{file_id}_input{Path(client_name).suffix}"
    output_path = TEMP_DIR / f"{file_id}_output.wav"
    try:
        # Save the upload.
        input_path.write_bytes(await audio.read())
        print(f"[Process] 입력 파일: {input_path}")
        print(f"[Process] 프롬프트: {prompt}")
        # Step 1: predict the parameters with the AI model.
        parameters = ai_effector.predict(
            audio_path=str(input_path),
            text_prompt=prompt
        )
        print(f"[Process] 예측된 파라미터: {len(parameters)}개")
        # Step 2: run the audio through the effect chain.
        effect_chain.process(
            input_path=str(input_path),
            output_path=str(output_path),
            parameters=parameters
        )
    except Exception as e:
        # No response will be sent, so the output file is removed here.
        output_path.unlink(missing_ok=True)
        print(f"[Process] ❌ 에러: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The input file is no longer needed once processing has finished.
        input_path.unlink(missing_ok=True)
    # The background task deletes the output file after the response has been
    # sent, so processed files do not accumulate in the temp directory.
    return FileResponse(
        path=str(output_path),
        media_type="audio/wav",
        filename=f"processed_{download_stem}.wav",
        background=BackgroundTask(output_path.unlink, missing_ok=True)
    )
@app.post("/process_with_params")
async def process_audio_with_params(
    audio: UploadFile = File(..., description="Dry 보컬 오디오 파일"),
    prompt: str = Form("", description="텍스트 명령")
):
    """Process the audio and return it together with the parameters used.

    Args:
        audio: Uploaded audio file.
        prompt: Text description of the desired sound.

    Returns:
        JSON with the predicted parameters and the processed WAV audio
        encoded as base64 in ``audio_base64``.

    Raises:
        HTTPException: 500 with the error message if anything fails.
    """
    import base64

    # Only the extension of the client-supplied name is reused, so the upload
    # cannot influence where the temporary files are written.
    suffix = Path(audio.filename or "").suffix
    file_id = str(uuid.uuid4())
    input_path = TEMP_DIR / f"{file_id}_input{suffix}"
    output_path = TEMP_DIR / f"{file_id}_output.wav"
    try:
        input_path.write_bytes(await audio.read())
        # Predict the parameters with the AI model.
        parameters = ai_effector.predict(
            audio_path=str(input_path),
            text_prompt=prompt
        )
        # Run the audio through the effect chain.
        effect_chain.process(
            input_path=str(input_path),
            output_path=str(output_path),
            parameters=parameters
        )
        # The audio is returned inline as base64 rather than as a URL.
        audio_base64 = base64.b64encode(output_path.read_bytes()).decode('utf-8')
        return JSONResponse(content={
            "status": "success",
            "prompt": prompt,
            "ai_model_used": ai_effector.is_loaded(),
            "parameters": parameters,
            "audio_base64": audio_base64,
            "audio_format": "wav"
        })
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Both temporary files are removed on success and on failure alike.
        input_path.unlink(missing_ok=True)
        output_path.unlink(missing_ok=True)
if __name__ == "__main__":
    # Local development entry point. The port can be overridden with the PORT
    # environment variable and defaults to 8000.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "8000")))

models/__init__.py ADDED Viewed

File without changes

models/ai_effector.py ADDED Viewed

	@@ -0,0 +1,404 @@

+"""
+AI Effector Model - DiffVox LLM 통합 버전
+==========================================
+CLAP 인코더 + 학습된 LLM을 사용하여 오디오에서 이펙터 파라미터를 예측
+DiffVox LLM 파라미터 → MagicPath 웹 파라미터 자동 변환
+"""
+import json
+import re
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional
+import torch
+# AI 모델 관련 import (설치 필요)
+try:
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from peft import PeftModel
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+    print("[AIEffector] transformers/peft 미설치 - 프리셋 모드로 동작")
+# CLAP 인코더 (별도 파일)
+try:
+    from models.audio_encoder import AudioEncoder
+    AUDIO_ENCODER_AVAILABLE = True
+except ImportError:
+    AUDIO_ENCODER_AVAILABLE = False
+    print("[AIEffector] AudioEncoder 미설치 - 프리셋 모드로 동작")
def _unit_scale(x):
    """Map a normalised value in [-1, 1] linearly onto [0, 1]."""
    return (x + 1) / 2


def _log_scale(low, high):
    """Return a function mapping [-1, 1] onto [low, high] on a log scale."""
    return lambda x: low * (high / low) ** ((x + 1) / 2)


class ParameterMapper:
    """Convert DiffVox LLM parameters to MagicPath web parameters.

    The LLM is expected to emit values normalised to [-1, 1]; each web
    parameter has a transform in ``VALUE_TRANSFORMS`` that turns the
    normalised value into its real unit (dB, Hz, ms, ratio).
    """

    # DiffVox LLM key -> MagicPath web key
    DIFFVOX_TO_WEB = {
        # EQ Low Shelf
        "eq_lowshelf.params.gain": "eq_lowshelf_gain",
        "eq_lowshelf.params.parametrizations.freq.original": "eq_lowshelf_freq",
        # EQ High Shelf
        "eq_highshelf.params.gain": "eq_highshelf_gain",
        "eq_highshelf.params.parametrizations.freq.original": "eq_highshelf_freq",
        # EQ Peak 1
        "eq_peak1.params.gain": "eq_peak1_gain",
        "eq_peak1.params.parametrizations.freq.original": "eq_peak1_freq",
        "eq_peak1.params.parametrizations.Q.original": "eq_peak1_q",
        # EQ Peak 2
        "eq_peak2.params.gain": "eq_peak2_gain",
        "eq_peak2.params.parametrizations.freq.original": "eq_peak2_freq",
        "eq_peak2.params.parametrizations.Q.original": "eq_peak2_q",
        # Delay
        "delay.delay_time": "delay_time",
        "delay.feedback": "delay_feedback",
        "delay.mix": "delay_mix",
        # Distortion
        "distortion_amount": "distortion_amount",
        # Master
        "final_wet_mix": "final_wet_mix",
    }
    # Reverse mapping
    WEB_TO_DIFFVOX = {v: k for k, v in DIFFVOX_TO_WEB.items()}
    # Value conversion rules (normalised value -> real value)
    VALUE_TRANSFORMS = {
        # EQ gain: -1~1 -> -12~12 dB
        "eq_lowshelf_gain": lambda x: x * 12,
        "eq_highshelf_gain": lambda x: x * 12,
        "eq_peak1_gain": lambda x: x * 12,
        "eq_peak2_gain": lambda x: x * 12,
        # EQ frequency: -1~1 -> 20~20000 Hz (log scale)
        "eq_lowshelf_freq": _log_scale(20, 20000),
        "eq_highshelf_freq": _log_scale(20, 20000),
        "eq_peak1_freq": _log_scale(20, 20000),
        "eq_peak2_freq": _log_scale(20, 20000),
        # Q: -1~1 -> 0.1~10 (log scale)
        "eq_peak1_q": _log_scale(0.1, 10),
        "eq_peak2_q": _log_scale(0.1, 10),
        # Delay time: -1~1 -> 0~1000 ms
        "delay_time": lambda x: _unit_scale(x) * 1000,
        # Delay feedback, delay mix, distortion, wet mix: -1~1 -> 0~1
        "delay_feedback": _unit_scale,
        "delay_mix": _unit_scale,
        "distortion_amount": _unit_scale,
        "final_wet_mix": _unit_scale,
    }

    @classmethod
    def diffvox_to_web(cls, diffvox_params: Dict[str, float]) -> Dict[str, float]:
        """Convert DiffVox LLM output to MagicPath web parameters.

        Keys without a mapping are skipped. Values that are not finite
        numbers are skipped as well, so the caller's defaults apply instead
        of garbage. Values are clamped to [-1, 1] before their transform so
        the result always stays inside the documented range.

        Args:
            diffvox_params: Parameter dictionary parsed from the LLM output.

        Returns:
            Dictionary keyed by web parameter name with real-unit values.
        """
        import math

        web_params = {}
        for diffvox_key, raw_value in diffvox_params.items():
            web_key = cls.DIFFVOX_TO_WEB.get(diffvox_key)
            if web_key is None:
                continue
            try:
                value = float(raw_value)
            except (TypeError, ValueError):
                continue
            if not math.isfinite(value):
                continue
            transform = cls.VALUE_TRANSFORMS.get(web_key)
            if transform is None:
                web_params[web_key] = value
            else:
                web_params[web_key] = transform(max(-1.0, min(1.0, value)))
        return web_params
class ParameterParser:
    """Extract a parameter dictionary from raw LLM output text."""

    # "key": <number> pairs, used when no complete JSON object can be decoded
    # (for example when the output was cut off mid-object).
    _NUMERIC_PAIR = re.compile(
        r'"([^"]+)"\s*:\s*(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
    )

    @staticmethod
    def parse(llm_output: str) -> Optional[Dict]:
        """Return the first non-empty JSON object found in ``llm_output``.

        The text is scanned for ``{`` and a real JSON decoder is started at
        each one, so nested objects and braces inside strings are handled.
        If no object decodes, ``"key": number`` pairs are collected instead.

        Args:
            llm_output: Text generated by the LLM.

        Returns:
            The parameter dictionary, or ``None`` if nothing could be parsed.
        """
        if not llm_output:
            return None

        # Method 1: decode a JSON object starting at each opening brace.
        decoder = json.JSONDecoder()
        start = llm_output.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(llm_output, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict) and candidate:
                return candidate
            start = llm_output.find("{", start + 1)

        # Method 2: collect "key": number pairs; the pattern only matches
        # well-formed numbers, so float() cannot fail here.
        pairs = {
            key: float(number)
            for key, number in ParameterParser._NUMERIC_PAIR.findall(llm_output)
        }
        return pairs or None
class AIEffector:
    """AI-based effect parameter predictor (DiffVox LLM integration).

    When the LoRA-adapted LLM and the audio encoder are both available,
    parameters are generated by the model; otherwise, and on any failure,
    a keyword-matched preset or the default parameters are returned.
    """

    # Default parameters
    DEFAULT_PARAMS = {
        "eq_lowshelf_gain": 0.0,
        "eq_lowshelf_freq": 200,
        "eq_highshelf_gain": 0.0,
        "eq_highshelf_freq": 8000,
        "eq_peak1_gain": 0.0,
        "eq_peak1_freq": 1000,
        "eq_peak1_q": 1.0,
        "eq_peak2_gain": 0.0,
        "eq_peak2_freq": 3000,
        "eq_peak2_q": 1.0,
        "compressor_threshold": -24,
        "compressor_ratio": 4.0,
        "compressor_attack": 5,
        "compressor_release": 50,
        "compressor_makeup": 0.0,
        "distortion_amount": 0.0,
        "distortion_tone": 0.5,
        "delay_time": 250,
        "delay_feedback": 0.3,
        "delay_mix": 0.0,
        "reverb_room_size": 0.5,
        "reverb_damping": 0.5,
        "reverb_wet_dry": 0.0,
        "final_wet_mix": 0.5
    }
    # Presets (used as fallback)
    PRESETS = {
        "warm": {
            "eq_lowshelf_gain": 5.5,
            "eq_lowshelf_freq": 200,
            "eq_highshelf_gain": -1.5,
            "eq_highshelf_freq": 8000,
            "eq_peak1_gain": 2.0,
            "eq_peak1_freq": 400,
            "eq_peak1_q": 1.0,
            "compressor_threshold": -18,
            "compressor_ratio": 3.0,
            "distortion_amount": 0.05,
            "reverb_room_size": 0.4,
            "reverb_wet_dry": 0.15,
            "final_wet_mix": 0.5
        },
        "bright": {
            "eq_lowshelf_gain": -2.0,
            "eq_lowshelf_freq": 150,
            "eq_highshelf_gain": 4.0,
            "eq_highshelf_freq": 6000,
            "eq_peak1_gain": 1.0,
            "eq_peak1_freq": 3000,
            "compressor_threshold": -20,
            "compressor_ratio": 6.0,
            "reverb_room_size": 0.3,
            "reverb_wet_dry": 0.1,
            "final_wet_mix": 0.5
        },
    }

    def __init__(
        self,
        model_path: Optional[str] = None,
        base_model_name: str = "Qwen/Qwen3-8B",
        audio_feature_dim: int = 64,
        use_huggingface: bool = True
    ):
        """Initialise the predictor and, if a path is given, load the model.

        Args:
            model_path: Trained LoRA adapter (local path or Hugging Face repo).
            base_model_name: Name of the base LLM.
            audio_feature_dim: Audio feature dimension passed to AudioEncoder.
            use_huggingface: If True, a non-local ``model_path`` is treated
                as a Hugging Face repo id.
        """
        self.model = None
        self.tokenizer = None
        self.audio_encoder = None
        self.model_loaded = False
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.base_model_name = base_model_name
        self.audio_feature_dim = audio_feature_dim
        self.use_huggingface = use_huggingface
        if model_path:
            self._load_model(model_path)

    def _load_model(self, model_path: str):
        """Load the tokenizer, base model, LoRA adapter and audio encoder.

        Failures are logged and leave ``model_loaded`` False so that
        ``predict`` falls back to presets instead of raising.
        """
        if not TRANSFORMERS_AVAILABLE:
            print("[AIEffector] transformers/peft 미설치")
            return
        # Decide whether model_path is a local path or a Hugging Face repo.
        is_local = os.path.exists(model_path)
        if not is_local and not self.use_huggingface:
            print(f"[AIEffector] 로컬 모델 경로 없음: {model_path}")
            return
        try:
            if self.use_huggingface and not is_local:
                print(f"[AIEffector] Hugging Face에서 모델 로딩: {model_path}")
            else:
                print(f"[AIEffector] 로컬 모델 로딩: {model_path}")
            # SECURITY NOTE: trust_remote_code=True executes Python code that
            # ships with the model repository; only use trusted repositories.
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.base_model_name,
                trust_remote_code=True
            )
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            # Base model
            base_model = AutoModelForCausalLM.from_pretrained(
                self.base_model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                trust_remote_code=True,
            )
            # Apply the LoRA adapter (Hugging Face repo id or local path).
            self.model = PeftModel.from_pretrained(
                base_model,
                model_path,
                is_trainable=False
            )
            self.model.eval()
            # Audio encoder
            if AUDIO_ENCODER_AVAILABLE:
                self.audio_encoder = AudioEncoder(
                    output_dim=self.audio_feature_dim,
                    reduction_method="pool"
                )
                print("[AIEffector] AudioEncoder 로드 완료")
            self.model_loaded = True
            print("[AIEffector] ✅ 모델 로드 완료")
        except Exception as e:
            print(f"[AIEffector] ❌ 모델 로드 실패: {e}")
            import traceback
            traceback.print_exc()
            self.model_loaded = False

    def is_loaded(self) -> bool:
        """Return whether the LLM was loaded successfully."""
        return self.model_loaded

    def predict(self, audio_path: str, text_prompt: str) -> Dict[str, float]:
        """Predict effect parameters from an audio file and a text prompt.

        Args:
            audio_path: Path of the input audio file.
            text_prompt: User's text command; ``None`` is treated as empty.

        Returns:
            Effect parameter dictionary in MagicPath web format.
        """
        text_prompt = text_prompt or ""
        if self.model_loaded and self.audio_encoder:
            return self._predict_with_model(audio_path, text_prompt)
        return self._predict_with_preset(text_prompt)

    def _predict_with_model(self, audio_path: str, text_prompt: str) -> Dict[str, float]:
        """Run inference with the trained DiffVox LLM.

        Falls back to ``_predict_with_preset`` when feature extraction,
        parsing or parameter mapping yields nothing, or on any exception.
        """
        try:
            # 1. Extract audio features.
            audio_features = self.audio_encoder.get_audio_features(audio_path)
            if not audio_features:
                print("[AIEffector] 오디오 특징 추출 실패, 프리셋 사용")
                return self._predict_with_preset(text_prompt)
            # 2. Build the prompt (same format as train_model.py).
            audio_state_str = json.dumps(audio_features)
            prompt = (
                "Task: Convert text to audio parameters.\n"
                f"Audio: {audio_state_str}\n"
                f"Text: {text_prompt}\n"
                "Parameters:"
            )
            # 3. LLM inference.
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1500
            ).to(self.device)
            with torch.no_grad():
                # Greedy decoding; no temperature is passed because it has no
                # effect when do_sample=False.
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=500,
                    do_sample=False,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            # Decode only the newly generated tokens, not the prompt.
            generated_text = self.tokenizer.decode(
                outputs[0][inputs['input_ids'].shape[1]:],
                skip_special_tokens=True
            ).strip()
            print(f"[AIEffector] LLM 출력: {generated_text[:200]}...")
            # 4. Parse the parameters.
            diffvox_params = ParameterParser.parse(generated_text)
            if not diffvox_params:
                print("[AIEffector] 파라미터 파싱 실패, 프리셋 사용")
                return self._predict_with_preset(text_prompt)
            # 5. Convert DiffVox -> web parameters.
            web_params = ParameterMapper.diffvox_to_web(diffvox_params)
            if not web_params:
                # The model answered, but with no key this server understands.
                print("[AIEffector] 매핑 가능한 파라미터 없음, 프리셋 사용")
                return self._predict_with_preset(text_prompt)
            # 6. Merge over the defaults.
            result = self.DEFAULT_PARAMS.copy()
            result.update(web_params)
            print(f"[AIEffector] ✅ AI 파라미터 생성 완료: {len(web_params)}개 파라미터")
            return result
        except Exception as e:
            print(f"[AIEffector] 추론 에러: {e}")
            import traceback
            traceback.print_exc()
            return self._predict_with_preset(text_prompt)

    def _predict_with_preset(self, text_prompt: str) -> Dict[str, float]:
        """Return preset-based parameters (fallback).

        The first preset whose name occurs in the lower-cased prompt is
        merged over the defaults; with no match the defaults are returned.
        """
        prompt_lower = (text_prompt or "").lower()
        for preset_name, preset_params in self.PRESETS.items():
            if preset_name in prompt_lower:
                print(f"[AIEffector] 프리셋 매칭: '{preset_name}'")
                result = self.DEFAULT_PARAMS.copy()
                result.update(preset_params)
                return result
        print("[AIEffector] 프리셋 매칭 실패, 기본값 반환")
        return self.DEFAULT_PARAMS.copy()

models/audio_encoder.py ADDED Viewed

	@@ -0,0 +1,189 @@

+"""
+Audio Encoder for MagicPath Server
+===================================
+CLAP 모델을 사용하여 오디오 파일에서 특징 벡터 추출
+DiffVox LLM과 동일한 인코더 사용
+"""
+import torch
+import numpy as np
+from typing import List, Optional
+import warnings
+warnings.filterwarnings("ignore")
+class AudioEncoder:
+    """CLAP 기반 오디오 인코더"""
+    def __init__(
+        self,
+        output_dim: int = 64,
+        reduction_method: str = "pool",
+        model_name: str = "laion/larger_clap_general"
+    ):
+        """
+        오디오 인코더 초기화
+        Args:
+            output_dim: 출력 특징 차원 (기본 64)
+            reduction_method: 차원 축소 방법 ("pool", "pca", "linear")
+            model_name: CLAP 모델 이름
+        """
+        self.output_dim = output_dim
+        self.reduction_method = reduction_method
+        self.model_name = model_name
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = None
+        self.processor = None
+        self.projection = None
+        self._load_model()
+    def _load_model(self):
+        """CLAP 모델 로드"""
+        try:
+            from transformers import ClapModel, ClapProcessor
+            print(f"[AudioEncoder] CLAP 모델 로딩 중: {self.model_name}")
+            self.processor = ClapProcessor.from_pretrained(self.model_name)
+            self.model = ClapModel.from_pretrained(self.model_name)
+            self.model = self.model.to(self.device)
+            self.model.eval()
+            # CLAP 출력 차원 확인 (보통 512)
+            clap_dim = self.model.config.projection_dim
+            print(f"[AudioEncoder] CLAP 출력 차원: {clap_dim}")
+            # 차원 축소를 위한 projection layer
+            if self.reduction_method == "linear" and clap_dim != self.output_dim:
+                self.projection = torch.nn.Linear(clap_dim, self.output_dim)
+                self.projection = self.projection.to(self.device)
+                print(f"[AudioEncoder] Linear projection: {clap_dim} → {self.output_dim}")
+            print("[AudioEncoder] ✅ 모델 로드 완료")
+        except ImportError:
+            print("[AudioEncoder] ❌ transformers 미설치")
+            print("   pip install transformers")
+        except Exception as e:
+            print(f"[AudioEncoder] ❌ 모델 로드 실패: {e}")
+    def get_audio_features(self, audio_path: str) -> List[float]:
+        """
+        오디오 파일에서 특징 벡터 추출
+        Args:
+            audio_path: 오디오 파일 경로
+        Returns:
+            특징 벡터 (output_dim 차원)
+        """
+        if self.model is None:
+            print("[AudioEncoder] 모델이 로드되지 않음")
+            return []
+        try:
+            import librosa
+            # 오디오 로드
+            audio, sr = librosa.load(audio_path, sr=48000, mono=True)
+            # CLAP 입력 준비
+            inputs = self.processor(
+                audios=audio,
+                sampling_rate=48000,
+                return_tensors="pt"
+            )
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            # 특징 추출
+            with torch.no_grad():
+                audio_features = self.model.get_audio_features(**inputs)
+            # CPU로 이동
+            features = audio_features.squeeze().cpu().numpy()
+            # 차원 축소
+            features = self._reduce_dimension(features)
+            return features.tolist()
+        except Exception as e:
+            print(f"[AudioEncoder] 특징 추출 실패: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+    def _reduce_dimension(self, features: np.ndarray) -> np.ndarray:
+        """특징 벡터 차원 축소"""
+        current_dim = len(features)
+        if current_dim == self.output_dim:
+            return features
+        if self.reduction_method == "pool":
+            # 평균 풀링으로 차원 축소
+            if current_dim > self.output_dim:
+                pool_size = current_dim // self.output_dim
+                remainder = current_dim % self.output_dim
+                pooled = []
+                idx = 0
+                for i in range(self.output_dim):
+                    size = pool_size + (1 if i < remainder else 0)
+                    pooled.append(np.mean(features[idx:idx+size]))
+                    idx += size
+                return np.array(pooled)
+            else:
+                # 차원이 작으면 zero-padding
+                padded = np.zeros(self.output_dim)
+                padded[:current_dim] = features
+                return padded
+        elif self.reduction_method == "linear" and self.projection is not None:
+            # Linear projection
+            with torch.no_grad():
+                features_tensor = torch.tensor(features, dtype=torch.float32).to(self.device)
+                projected = self.projection(features_tensor)
+                return projected.cpu().numpy()
+        else:
+            # 기본: 앞에서부터 자르기
+            return features[:self.output_dim]
+    def get_text_features(self, text: str) -> List[float]:
+        """
+        텍스트에서 특징 벡터 추출 (CLAP text encoder)
+        Args:
+            text: 입력 텍스트
+        Returns:
+            특징 벡터
+        """
+        if self.model is None:
+            return []
+        try:
+            inputs = self.processor(
+                text=text,
+                return_tensors="pt",
+                padding=True
+            )
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            with torch.no_grad():
+                text_features = self.model.get_text_features(**inputs)
+            features = text_features.squeeze().cpu().numpy()
+            features = self._reduce_dimension(features)
+            return features.tolist()
+        except Exception as e:
+            print(f"[AudioEncoder] 텍스트 특징 추출 실패: {e}")
+            return []

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+# MagicPath Server - DiffVox LLM integrated version
+# ==========================================
+# Web server
+fastapi>=0.104.0
+uvicorn>=0.24.0
+python-multipart>=0.0.6
+# Audio processing
+soundfile>=0.12.0
+pedalboard>=0.8.0
+librosa>=0.10.0
+numpy>=1.24.0
+# AI models
+torch>=2.2.0
+transformers>=4.36.0
+peft>=0.7.0
+huggingface_hub>=0.20.0
+accelerate>=0.25.0