Spaces:

heybaeheef
/

KU_SW_Academy

Running on A10G

App Files Files Community

heybaeheef committed 3 days ago

Commit

4336a6e

verified ·

1 Parent(s): 3bfa04b

Delete models/ai_effector.py

Browse files

Files changed (1) hide show

models/ai_effector.py +0 -482

models/ai_effector.py DELETED Viewed

@@ -1,482 +0,0 @@
-"""
-AI Effector - DiffVox LLM 기반 이펙트 파라미터 예측
-===================================================
-상세 로그 버전
-"""
-import os
-import json
-import re
-import torch
-import numpy as np
-from typing import Dict, List, Optional, Any
-from pathlib import Path
-from datetime import datetime
-import warnings
-warnings.filterwarnings("ignore")
# Default effect parameters. AIEffector copies this dict as the base for
# preset matching and returns a copy of it when LLM output cannot be parsed.
# Units, as printed by AIEffector._log_parameters: "freq" values are in Hz,
# "gain" values in dB and "delay.delay_time" in seconds; q, feedback,
# distortion and the mix values are logged without a unit.
DEFAULT_PARAMETERS = {
    "eq_peak1.params.freq": 1000.0,
    "eq_peak1.params.gain": 0.0,
    "eq_peak1.params.q": 1.0,
    "eq_peak2.params.freq": 4000.0,
    "eq_peak2.params.gain": 0.0,
    "eq_peak2.params.q": 1.0,
    "eq_lowshelf.params.freq": 200.0,
    "eq_lowshelf.params.gain": 0.0,
    "eq_lowshelf.params.q": 0.707,
    "eq_highshelf.params.freq": 8000.0,
    "eq_highshelf.params.gain": 0.0,
    "eq_highshelf.params.q": 0.707,
    "distortion_amount": 0.0,
    "delay.delay_time": 0.02,
    "delay.feedback": 0.3,
    "delay.mix": 0.2,
    "final_wet_mix": 0.5
}
# Style presets that work without the AI model. Each entry maps a keyword to
# the subset of DEFAULT_PARAMETERS keys it overrides. AIEffector._apply_preset
# applies every preset whose keyword occurs in the lower-cased prompt, in the
# order listed here, so a later preset wins when two of them set the same key.
STYLE_PRESETS = {
    "warm": {
        "eq_lowshelf.params.gain": 3.0,
        "eq_highshelf.params.gain": -1.0,
        "distortion_amount": 0.05,
    },
    "bright": {
        "eq_highshelf.params.gain": 4.0,
        "eq_peak2.params.gain": 2.0,
        "eq_lowshelf.params.gain": -1.0,
    },
    "vintage": {
        "eq_lowshelf.params.gain": 2.0,
        "eq_highshelf.params.gain": -2.0,
        "distortion_amount": 0.1,
        "delay.mix": 0.15,
    },
    "modern": {
        "eq_peak1.params.gain": 2.0,
        "eq_peak2.params.gain": 3.0,
        "eq_highshelf.params.gain": 2.0,
    },
    "spacious": {
        "delay.delay_time": 0.05,
        "delay.feedback": 0.4,
        "delay.mix": 0.35,
    },
    "dry": {
        "final_wet_mix": 0.2,
        "delay.mix": 0.0,
    },
    "saturated": {
        "distortion_amount": 0.15,
        "eq_lowshelf.params.gain": 1.0,
    }
}
class AudioEncoder:
    """Lightweight audio feature extractor (simplified stand-in for CLAP).

    Summarises the first five seconds of an audio file as a fixed-length
    vector (MFCC, spectral, RMS, zero-crossing and chroma statistics) plus a
    small dictionary of human-readable details.
    """

    def __init__(self, output_dim: int = 64):
        """Store the feature-vector length and the analysis sample rate.

        Args:
            output_dim: Length the feature vector is padded or truncated to.
        """
        self.output_dim = output_dim
        self.sr = 44100

    @staticmethod
    def _dominant_pitch(pitches, magnitudes) -> float:
        """Return the median of the strongest positive pitch per frame, or 0.0."""
        # For every frame (column) pick the pitch of the bin with the largest
        # magnitude, then keep only frames where that pitch is positive.
        strongest_bin = magnitudes.argmax(axis=0)
        per_frame = pitches[strongest_bin, np.arange(pitches.shape[1])]
        voiced = per_frame[per_frame > 0]
        return float(np.median(voiced)) if voiced.size else 0.0

    def get_audio_features(self, audio_path: str) -> Dict:
        """Extract a feature vector and descriptive details from an audio file.

        Args:
            audio_path: Path of the audio file; only the first 5 seconds are
                loaded, resampled to ``self.sr``.

        Returns:
            A dict with ``features`` (list of ``output_dim`` Python floats),
            ``details`` (summary statistics and labels), ``duration_sec`` and
            ``sample_rate``. If anything fails (librosa missing, unreadable
            file, ...) the error is printed and an all-zero vector is returned
            with ``details == {"error": <message>}``; this method is
            deliberately best-effort and does not raise.
        """
        try:
            import librosa

            y, sr = librosa.load(audio_path, sr=self.sr, duration=5.0)
            duration = len(y) / sr

            # Every statistic is converted to a plain Python float so the
            # returned structures never contain NumPy scalars (consistent
            # types, safe to serialise).
            mfcc_mean = np.mean(
                librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20), axis=1
            ).tolist()
            spectral_centroid = float(
                np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            )
            spectral_bandwidth = float(
                np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            )
            spectral_rolloff = float(
                np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            )
            rms = float(np.mean(librosa.feature.rms(y=y)))
            zcr = float(np.mean(librosa.feature.zero_crossing_rate(y)))
            chroma_mean = np.mean(
                librosa.feature.chroma_stft(y=y, sr=sr), axis=1
            ).tolist()
            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            median_pitch = self._dominant_pitch(pitches, magnitudes)

            # Vector layout: 20 MFCC means, 3 scaled spectral statistics,
            # RMS, zero-crossing rate, 12 chroma means (37 values in total).
            features = [
                *mfcc_mean,
                spectral_centroid / 10000,
                spectral_bandwidth / 10000,
                spectral_rolloff / 10000,
                rms,
                zcr,
                *chroma_mean,
            ]

            # Coarse timbre label derived from the spectral centroid.
            if spectral_centroid > 3000:
                brightness = "bright"
            elif spectral_centroid > 1500:
                brightness = "neutral"
            else:
                brightness = "dark"

            # Coarse loudness label derived from the mean RMS.
            if rms > 0.1:
                intensity = "powerful"
            elif rms > 0.03:
                intensity = "moderate"
            else:
                intensity = "soft"

            feature_details = {
                # Only the first five coefficients are kept for display.
                "mfcc_mean": [round(v, 4) for v in mfcc_mean[:5]],
                "spectral_centroid": round(spectral_centroid, 2),
                "spectral_bandwidth": round(spectral_bandwidth, 2),
                "spectral_rolloff": round(spectral_rolloff, 2),
                "rms_energy": round(rms, 4),
                "zero_crossing_rate": round(zcr, 4),
                "chroma_mean": [round(v, 4) for v in chroma_mean[:5]],
                "estimated_pitch_hz": round(median_pitch, 2),
                "brightness": brightness,
                "intensity": intensity,
            }

            # Zero-pad, then cut, so the vector always has output_dim entries.
            features = (features + [0.0] * self.output_dim)[:self.output_dim]

            return {
                "features": features,
                "details": feature_details,
                "duration_sec": round(duration, 2),
                "sample_rate": sr
            }
        except Exception as e:
            # Best-effort by design: callers get a neutral vector instead of
            # an exception.
            print(f"[AudioEncoder] ❌ 특징 추출 실패: {e}")
            return {
                "features": [0.0] * self.output_dim,
                "details": {"error": str(e)},
                "duration_sec": 0,
                "sample_rate": self.sr
            }
-class AIEffector:
-    """AI 기반 이펙터 파라미터 예측"""
-    def __init__(
-        self,
-        model_repo_id: str = "heybaeheef/KU_SW_Academy",
-        model_subfolder: str = "checkpoints",
-        base_model_name: str = "Qwen/Qwen3-8B",
-        audio_feature_dim: int = 64,
-        use_huggingface: bool = True
-    ):
-        self.model_repo_id = model_repo_id
-        self.model_subfolder = model_subfolder
-        self.base_model_name = base_model_name
-        self.audio_feature_dim = audio_feature_dim
-        self.use_huggingface = use_huggingface
-        self.model = None
-        self.tokenizer = None
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        # 오디오 인코더
-        self.audio_encoder = AudioEncoder(output_dim=audio_feature_dim)
-        # 요청 카운터
-        self.request_count = 0
-        # 모델 로드 시도
-        self._load_model()
-    def _load_model(self):
-        """모델 로드"""
-        try:
-            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-            from peft import PeftModel
-            print(f"[AIEffector] 모델 로딩 시작...")
-            print(f"  - Base Model: {self.base_model_name}")
-            print(f"  - Adapter Repo: {self.model_repo_id}")
-            print(f"  - Adapter Subfolder: {self.model_subfolder}")
-            # 토크나이저 로드
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.base_model_name,
-                trust_remote_code=True
-            )
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-            # 4bit 양자화 설정 (메모리 절약)
-            quantization_config = None
-            if torch.cuda.is_available():
-                try:
-                    quantization_config = BitsAndBytesConfig(
-                        load_in_4bit=True,
-                        bnb_4bit_compute_dtype=torch.float16,
-                        bnb_4bit_use_double_quant=True,
-                        bnb_4bit_quant_type="nf4"
-                    )
-                    print(f"  - 4bit 양자화 활성화")
-                except Exception as e:
-                    print(f"  - 4bit 양자화 실패, 기본 로드: {e}")
-            # 베이스 모델 로드
-            base_model = AutoModelForCausalLM.from_pretrained(
-                self.base_model_name,
-                quantization_config=quantization_config,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else None,
-                trust_remote_code=True,
-                low_cpu_mem_usage=True
-            )
-            # LoRA 어댑터 로드 (subfolder 파라미터 사용!)
-            if self.use_huggingface:
-                print(f"[AIEffector] Hugging Face에서 LoRA 어댑터 로딩...")
-                self.model = PeftModel.from_pretrained(
-                    base_model,
-                    self.model_repo_id,
-                    subfolder=self.model_subfolder,
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                )
-            else:
-                local_path = os.path.join(self.model_repo_id, self.model_subfolder)
-                print(f"[AIEffector] 로컬에서 LoRA 어댑터 로딩: {local_path}")
-                self.model = PeftModel.from_pretrained(
-                    base_model,
-                    local_path,
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                )
-            self.model.eval()
-            print(f"[AIEffector] ✅ 모델 로드 성공!")
-        except Exception as e:
-            print(f"[AIEffector] ❌ 모델 로드 실패: {e}")
-            print(f"[AIEffector] 폴백 모드로 전환 (프리셋 기반)")
-            self.model = None
-            self.tokenizer = None
-    def is_loaded(self) -> bool:
-        """모델 로드 여부"""
-        return self.model is not None
-    def _apply_preset(self, prompt: str) -> Dict[str, float]:
-        """프롬프트에서 프리셋 매칭"""
-        params = DEFAULT_PARAMETERS.copy()
-        prompt_lower = prompt.lower()
-        matched_presets = []
-        for style_name, style_params in STYLE_PRESETS.items():
-            if style_name in prompt_lower:
-                params.update(style_params)
-                matched_presets.append(style_name)
-        if matched_presets:
-            print(f"    [Preset] 매칭된 프리셋: {matched_presets}")
-        return params
-    def _format_prompt(self, text_prompt: str, audio_features: List[float]) -> str:
-        """LLM 입력 프롬프트 포맷팅"""
-        audio_summary = ", ".join([f"{v:.3f}" for v in audio_features[:8]])
-        prompt = f"""You are an audio effect parameter predictor.
-Input:
-- Text description: {text_prompt}
-- Audio features (first 8): [{audio_summary}]
-Output the effect parameters as JSON:
-```json
-{{
-    "eq_peak1.params.freq": <float>,
-    "eq_peak1.params.gain": <float>,
-    "eq_peak1.params.q": <float>,
-    "eq_peak2.params.freq": <float>,
-    "eq_peak2.params.gain": <float>,
-    "eq_peak2.params.q": <float>,
-    "eq_lowshelf.params.freq": <float>,
-    "eq_lowshelf.params.gain": <float>,
-    "eq_lowshelf.params.q": <float>,
-    "eq_highshelf.params.freq": <float>,
-    "eq_highshelf.params.gain": <float>,
-    "eq_highshelf.params.q": <float>,
-    "distortion_amount": <float>,
-    "delay.delay_time": <float>,
-    "delay.feedback": <float>,
-    "delay.mix": <float>,
-    "final_wet_mix": <float>
-}}
-```
-JSON output:"""
-        return prompt
-    def _parse_output(self, output_text: str) -> Dict[str, float]:
-        """LLM 출력에서 파라미터 추출"""
-        try:
-            json_match = re.search(r'\{[^{}]*\}', output_text, re.DOTALL)
-            if json_match:
-                params = json.loads(json_match.group())
-                result = DEFAULT_PARAMETERS.copy()
-                for key, value in params.items():
-                    if key in result and isinstance(value, (int, float)):
-                        result[key] = float(value)
-                return result
-        except Exception as e:
-            print(f"    [Parse] ❌ 출력 파싱 실패: {e}")
-        return DEFAULT_PARAMETERS.copy()
-    def predict(self, audio_path: str, text_prompt: str = "") -> Dict[str, float]:
-        """파라미터 예측 (상세 로그 포함)"""
-        self.request_count += 1
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        print(f"\n{'='*60}")
-        print(f"[AIEffector] 🎵 요청 #{self.request_count} - {timestamp}")
-        print(f"{'='*60}")
-        print(f"  📂 오디오 파일: {Path(audio_path).name}")
-        print(f"  💬 텍스트 프롬프트: '{text_prompt}'")
-        print(f"  🤖 모델 상태: {'AI 모드' if self.is_loaded() else '프리셋 모드'}")
-        # 모델이 없으면 프리셋 사용
-        if not self.is_loaded():
-            print(f"\n  ⚠️ AI 모델 미로드 - 프리셋 모드 사용")
-            params = self._apply_preset(text_prompt)
-            self._log_parameters(params)
-            return params
-        try:
-            # 1. 오디오 특징 추출
-            print(f"\n  📊 [Step 1] 오디오 특징 추출 중...")
-            audio_result = self.audio_encoder.get_audio_features(audio_path)
-            audio_features = audio_result["features"]
-            audio_details = audio_result["details"]
-            print(f"    - 오디오 길이: {audio_result['duration_sec']}초")
-            print(f"    - 샘플레이트: {audio_result['sample_rate']}Hz")
-            print(f"    - 추정 피치: {audio_details.get('estimated_pitch_hz', 'N/A')}Hz")
-            print(f"    - 밝기: {audio_details.get('brightness', 'N/A')}")
-            print(f"    - 강도: {audio_details.get('intensity', 'N/A')}")
-            print(f"    - Spectral Centroid: {audio_details.get('spectral_centroid', 'N/A')}")
-            print(f"    - RMS Energy: {audio_details.get('rms_energy', 'N/A')}")
-            print(f"    - 특징 벡터 (처음 8개): {[round(v, 3) for v in audio_features[:8]]}")
-            # 2. LLM 프롬프트 생성
-            print(f"\n  🔤 [Step 2] LLM 프롬프트 생성 중...")
-            prompt = self._format_prompt(text_prompt, audio_features)
-            print(f"    - 프롬프트 길이: {len(prompt)} 문자")
-            # 3. 토큰화
-            print(f"\n  🔢 [Step 3] 토큰화 중...")
-            inputs = self.tokenizer(
-                prompt,
-                return_tensors="pt",
-                truncation=True,
-                max_length=1024
-            ).to(self.device)
-            print(f"    - 입력 토큰 수: {inputs['input_ids'].shape[1]}")
-            # 4. LLM 생성
-            print(f"\n  🧠 [Step 4] LLM 추론 중...")
-            import time
-            start_time = time.time()
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=256,
-                    do_sample=False,
-                    temperature=0.1,
-                    pad_token_id=self.tokenizer.pad_token_id
-                )
-            inference_time = time.time() - start_time
-            print(f"    - 추론 시간: {inference_time:.2f}초")
-            print(f"    - 출력 토큰 수: {outputs.shape[1]}")
-            # 5. 디코딩
-            print(f"\n  📝 [Step 5] 출력 디코딩 중...")
-            output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # JSON 부분만 추출해서 로그
-            json_match = re.search(r'\{[^{}]*\}', output_text, re.DOTALL)
-            if json_match:
-                print(f"    - LLM 출력 JSON:\n{json_match.group()}")
-            # 6. 파싱
-            print(f"\n  🔧 [Step 6] 파라미터 파싱 중...")
-            params = self._parse_output(output_text)
-            # 7. 결과 로깅
-            self._log_parameters(params)
-            print(f"\n  ✅ AI 예측 완료!")
-            print(f"{'='*60}\n")
-            return params
-        except Exception as e:
-            print(f"\n  ❌ 예측 실패: {e}")
-            print(f"  ⚠️ 프리셋으로 폴백...")
-            params = self._apply_preset(text_prompt)
-            self._log_parameters(params)
-            return params
-    def _log_parameters(self, params: Dict[str, float]):
-        """예측된 파라미터 로깅"""
-        print(f"\n  📋 예측된 파라미터:")
-        print(f"    [EQ Peak 1]")
-        print(f"      - Freq: {params.get('eq_peak1.params.freq', 0):.1f} Hz")
-        print(f"      - Gain: {params.get('eq_peak1.params.gain', 0):.2f} dB")
-        print(f"      - Q: {params.get('eq_peak1.params.q', 0):.2f}")
-        print(f"    [EQ Peak 2]")
-        print(f"      - Freq: {params.get('eq_peak2.params.freq', 0):.1f} Hz")
-        print(f"      - Gain: {params.get('eq_peak2.params.gain', 0):.2f} dB")
-        print(f"      - Q: {params.get('eq_peak2.params.q', 0):.2f}")
-        print(f"    [Low Shelf]")
-        print(f"      - Freq: {params.get('eq_lowshelf.params.freq', 0):.1f} Hz")
-        print(f"      - Gain: {params.get('eq_lowshelf.params.gain', 0):.2f} dB")
-        print(f"    [High Shelf]")
-        print(f"      - Freq: {params.get('eq_highshelf.params.freq', 0):.1f} Hz")
-        print(f"      - Gain: {params.get('eq_highshelf.params.gain', 0):.2f} dB")
-        print(f"    [Effects]")
-        print(f"      - Distortion: {params.get('distortion_amount', 0):.3f}")
-        print(f"      - Delay Time: {params.get('delay.delay_time', 0):.3f}s")
-        print(f"      - Delay Feedback: {params.get('delay.feedback', 0):.2f}")
-        print(f"      - Delay Mix: {params.get('delay.mix', 0):.2f}")
-        print(f"      - Final Wet Mix: {params.get('final_wet_mix', 0):.2f}")