heybaeheef commited on
Commit
ccad488
·
verified ·
1 Parent(s): 6dd0b5f

Upload ai_effector.py

Browse files
Files changed (1) hide show
  1. models/ai_effector.py +692 -0
models/ai_effector.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI Effector - DiffVox LLM 기반 μ΄νŽ™νŠΈ νŒŒλΌλ―Έν„° 예츑
3
+ ===================================================
4
+ V6: Compressor/Reverb νŒŒλΌλ―Έν„° μΆ”κ°€
5
+ - freq: MinMax(min, max) λ³€ν™˜
6
+ - Q: MinMax(min, max) λ³€ν™˜
7
+ - delay.feedback, delay.mix: sigmoid
8
+ - distortion_amount: sigmoid * 0.1
9
+ - final_wet_mix: sigmoid
10
+ - Compressor/Reverb: 프리셋 기반 (ν•™μŠ΅λ˜μ§€ μ•ŠμŒ)
11
+ """
12
+
13
+ import os
14
+ import json
15
+ import re
16
+ import math
17
+ import torch
18
+ import numpy as np
19
+ from typing import Dict, List, Optional, Any, Tuple
20
+ from pathlib import Path
21
+ from datetime import datetime
22
+ import warnings
23
+
24
+ warnings.filterwarnings("ignore")
25
+
26
+
27
def sigmoid(x: float) -> float:
    """Logistic sigmoid of ``x``, saturating cleanly at the float limits."""
    try:
        value = 1.0 / (1.0 + math.exp(-x))
    except OverflowError:
        # math.exp(-x) overflows only for very negative x; the limit there is 0.
        value = 1.0 if x >= 0 else 0.0
    return value
33
+
34
+
35
def minmax_transform(raw: float, min_val: float, max_val: float) -> float:
    """Squash ``raw`` through the sigmoid and rescale it into [min_val, max_val]."""
    span = max_val - min_val
    return min_val + span * sigmoid(raw)
38
+
39
+
40
# =====================================================
# Parameter transform specs taken from fx.py (exact values!)
# =====================================================
# Maps each learned parameter name to the transform applied to the raw LLM
# output: "none" (pass-through), "minmax" (sigmoid rescaled into [min, max]),
# "sigmoid", or "sigmoid_scale" (sigmoid * scale). Consumed by
# AIEffector._convert_raw_to_actual.
PARAM_TRANSFORMS = {
    # Peak EQ 1 & 2
    "eq_peak1.params.freq": {"type": "minmax", "min": 33.0, "max": 17500.0},
    "eq_peak1.params.Q": {"type": "minmax", "min": 0.2, "max": 20.0},
    "eq_peak1.params.gain": {"type": "none"},

    "eq_peak2.params.freq": {"type": "minmax", "min": 33.0, "max": 17500.0},
    "eq_peak2.params.Q": {"type": "minmax", "min": 0.2, "max": 20.0},
    "eq_peak2.params.gain": {"type": "none"},

    # LowShelf
    "eq_lowshelf.params.freq": {"type": "minmax", "min": 30.0, "max": 200.0},
    "eq_lowshelf.params.gain": {"type": "none"},

    # HighShelf
    "eq_highshelf.params.freq": {"type": "minmax", "min": 2500.0, "max": 16000.0},
    "eq_highshelf.params.gain": {"type": "none"},

    # Delay
    "delay.delay_time": {"type": "none"},
    "delay.feedback": {"type": "sigmoid"},
    "delay.mix": {"type": "sigmoid"},

    # Distortion
    "distortion_amount": {"type": "sigmoid_scale", "scale": 0.1},

    # Wet Mix
    "final_wet_mix": {"type": "sigmoid"},
}
72
+
73
# =====================================================
# Default parameters (V6: Compressor/Reverb added)
# =====================================================
# Fallback values used whenever the model is unavailable or parsing fails;
# also the base dict that parsed LLM values are merged into.
DEFAULT_PARAMETERS = {
    # EQ
    "eq_peak1.params.freq": 1000.0,
    "eq_peak1.params.gain": 0.0,
    "eq_peak1.params.Q": 1.0,
    "eq_peak2.params.freq": 4000.0,
    "eq_peak2.params.gain": 0.0,
    "eq_peak2.params.Q": 1.0,
    "eq_lowshelf.params.freq": 115.0,
    "eq_lowshelf.params.gain": 0.0,
    "eq_highshelf.params.freq": 8000.0,
    "eq_highshelf.params.gain": 0.0,

    # Compressor (not learned - preset driven)
    "compressor.threshold": -18.0,
    "compressor.ratio": 2.0,

    # Distortion
    "distortion_amount": 0.0,

    # Delay
    "delay.delay_time": 0.02,
    "delay.feedback": 0.3,
    "delay.mix": 0.2,

    # Reverb (not learned - preset driven)
    "reverb.room_size": 0.3,
    "reverb.damping": 0.5,
    "reverb.wet_level": 0.0,
    "reverb.dry_level": 1.0,

    # Master
    "final_wet_mix": 0.5
}
110
+
111
+ # νŒŒλΌλ―Έν„° λ²”μœ„ μ œν•œ
112
+ PARAM_RANGES = {
113
+ "eq_peak1.params.freq": (33.0, 17500.0),
114
+ "eq_peak1.params.gain": (-12.0, 12.0),
115
+ "eq_peak1.params.Q": (0.2, 20.0),
116
+ "eq_peak2.params.freq": (33.0, 17500.0),
117
+ "eq_peak2.params.gain": (-12.0, 12.0),
118
+ "eq_peak2.params.Q": (0.2, 20.0),
119
+ "eq_lowshelf.params.freq": (30.0, 200.0),
120
+ "eq_lowshelf.params.gain": (-12.0, 12.0),
121
+ "eq_highshelf.params.freq": (2500.0, 16000.0),
122
+ "eq_highshelf.params.gain": (-12.0, 12.0),
123
+ "compressor.threshold": (-40.0, 0.0),
124
+ "compressor.ratio": (1.0, 20.0),
125
+ "distortion_amount": (0.0, 0.1),
126
+ "delay.delay_time": (0.01, 1.0),
127
+ "delay.feedback": (0.0, 0.95),
128
+ "delay.mix": (0.0, 1.0),
129
+ "reverb.room_size": (0.0, 1.0),
130
+ "reverb.damping": (0.0, 1.0),
131
+ "reverb.wet_level": (0.0, 1.0),
132
+ "reverb.dry_level": (0.0, 1.0),
133
+ "final_wet_mix": (0.0, 1.0),
134
+ }
135
+
136
+ # λ™μ˜μ–΄ λ§€ν•‘
137
+ SYNONYM_MAP = {
138
+ "calm": "warm soft",
139
+ "relaxed": "warm soft",
140
+ "chill": "warm soft",
141
+ "smooth": "warm",
142
+ "mellow": "warm soft",
143
+ "breezy": "bright spacious",
144
+ "airy": "bright spacious",
145
+ "light": "bright",
146
+ "crisp": "bright",
147
+ "clean": "bright",
148
+ "dreamy": "warm spacious",
149
+ "ethereal": "bright spacious",
150
+ "atmospheric": "spacious",
151
+ "ambient": "spacious warm",
152
+ "aggressive": "saturated bright",
153
+ "powerful": "saturated",
154
+ "punchy": "saturated bright",
155
+ "hard": "saturated",
156
+ "gritty": "saturated dark",
157
+ "soft": "warm",
158
+ "harsh": "bright saturated",
159
+ "muddy": "dark",
160
+ "thin": "bright",
161
+ "thick": "warm dark",
162
+ "full": "warm",
163
+ "reverb": "spacious",
164
+ "echo": "spacious",
165
+ "wet": "spacious",
166
+ }
167
+
168
# =====================================================
# Style presets (V6: includes Compressor/Reverb)
# =====================================================
# Per-style values for the parameters the LLM does not learn
# (compressor/reverb, plus delay_time for "spacious"). Overlaid on top of the
# predicted parameters by AIEffector._apply_preset; "warm" is the fallback
# when no style word matches the prompt.
STYLE_PRESETS = {
    "warm": {
        "compressor.threshold": -15.0,
        "compressor.ratio": 3.0,
        "reverb.room_size": 0.2,
        "reverb.wet_level": 0.1,
        "reverb.dry_level": 0.9,
    },
    "bright": {
        "compressor.threshold": -12.0,
        "compressor.ratio": 2.5,
        "reverb.room_size": 0.15,
        "reverb.wet_level": 0.08,
        "reverb.dry_level": 0.92,
    },
    "spacious": {
        "delay.delay_time": 0.05,
        "compressor.threshold": -18.0,
        "compressor.ratio": 2.0,
        "reverb.room_size": 0.6,
        "reverb.wet_level": 0.35,
        "reverb.dry_level": 0.65,
    },
    "dark": {
        "compressor.threshold": -20.0,
        "compressor.ratio": 2.5,
        "reverb.room_size": 0.4,
        "reverb.wet_level": 0.2,
        "reverb.dry_level": 0.8,
    },
    "saturated": {
        "compressor.threshold": -10.0,
        "compressor.ratio": 4.0,
        "reverb.room_size": 0.1,
        "reverb.wet_level": 0.05,
        "reverb.dry_level": 0.95,
    },
    "soft": {
        "compressor.threshold": -22.0,
        "compressor.ratio": 1.5,
        "reverb.room_size": 0.3,
        "reverb.wet_level": 0.15,
        "reverb.dry_level": 0.85,
    },
}
216
+
217
+
218
class CLAPAudioEncoder:
    """CLAP-based audio encoder.

    Wraps a pretrained CLAP model and exposes a fixed-size audio feature
    vector (``output_dim`` floats). All failures degrade to a zero vector
    rather than raising, so callers can always unpack a list.
    """

    def __init__(self, output_dim: int = 64, model_name: str = "laion/larger_clap_music"):
        self.output_dim = output_dim
        self.model_name = model_name
        # 48 kHz resample target used for both librosa.load and the processor.
        self.target_sr = 48000
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = None
        self.processor = None
        self._load_model()

    def _load_model(self):
        """Load the CLAP model/processor; on any failure leave them as None."""
        try:
            # Imported lazily so the module still imports without transformers' CLAP classes.
            from transformers import ClapModel, ClapProcessor

            print(f"[CLAPEncoder] CLAP λͺ¨λΈ λ‘œλ”© 쀑: {self.model_name}")

            self.processor = ClapProcessor.from_pretrained(self.model_name)
            self.model = ClapModel.from_pretrained(self.model_name)
            self.model = self.model.to(self.device)
            self.model.eval()

            print(f"[CLAPEncoder] βœ… CLAP λͺ¨λΈ λ‘œλ“œ μ™„λ£Œ")

        except Exception as e:
            # Deliberate best-effort: encoder stays usable and returns zeros.
            print(f"[CLAPEncoder] ❌ λͺ¨λΈ λ‘œλ“œ μ‹€νŒ¨: {e}")

    def get_audio_features(self, audio_path: str) -> List[float]:
        """Return ``output_dim`` CLAP features for the file at ``audio_path``.

        Returns an all-zero list when the model is not loaded or extraction
        fails for any reason (missing file, codec error, etc.).
        """
        if self.model is None:
            return [0.0] * self.output_dim

        try:
            import librosa

            # Mono, resampled to the CLAP sample rate.
            audio, sr = librosa.load(audio_path, sr=self.target_sr, mono=True)

            inputs = self.processor(
                audios=audio,
                sampling_rate=self.target_sr,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.get_audio_features(**inputs)

            # First (and only) batch element; CLAP's embedding is pooled down
            # to output_dim by mean pooling.
            features_512 = outputs[0].cpu().numpy()
            features_64 = self._reduce_dimension(features_512)

            return features_64.tolist()

        except Exception as e:
            print(f"[CLAPEncoder] νŠΉμ§• μΆ”μΆœ μ‹€νŒ¨: {e}")
            return [0.0] * self.output_dim

    def _reduce_dimension(self, features: np.ndarray) -> np.ndarray:
        """Mean-pool a 1-D feature vector down to ``self.output_dim`` entries.

        The first ``remainder`` pools get one extra element so the whole input
        is consumed exactly once.
        NOTE(review): if len(features) < output_dim some pools are empty and
        np.mean yields NaN (warnings are suppressed module-wide) — assumed not
        to happen with CLAP's 512-dim output; confirm if output_dim changes.
        """
        current_dim = len(features)
        if current_dim == self.output_dim:
            return features

        pool_size = current_dim // self.output_dim
        remainder = current_dim % self.output_dim

        pooled = []
        idx = 0
        for i in range(self.output_dim):
            size = pool_size + (1 if i < remainder else 0)
            pooled.append(np.mean(features[idx:idx+size]))
            idx += size

        return np.array(pooled)

    def is_loaded(self) -> bool:
        """True when the CLAP model loaded successfully."""
        return self.model is not None
294
+
295
+
296
class AIEffector:
    """AI-based effect parameter prediction (V6).

    Pipeline per request: CLAP audio features + text prompt -> LoRA-tuned
    LLM -> JSON dict of raw values -> raw-to-actual transforms
    (PARAM_TRANSFORMS) -> clamping (PARAM_RANGES) -> style-preset overlay for
    the non-learned compressor/reverb parameters. Every failure path falls
    back to DEFAULT_PARAMETERS merged with a style preset.
    """

    def __init__(
        self,
        model_repo_id: str = "heybaeheef/KU_SW_Academy",
        model_subfolder: str = "checkpoints",
        base_model_name: str = "Qwen/Qwen3-8B",
        audio_feature_dim: int = 64,
        use_huggingface: bool = True
    ):
        # model_repo_id is a HF Hub repo id when use_huggingface is True,
        # otherwise a local directory containing model_subfolder.
        self.model_repo_id = model_repo_id
        self.model_subfolder = model_subfolder
        self.base_model_name = base_model_name
        self.audio_feature_dim = audio_feature_dim
        self.use_huggingface = use_huggingface

        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        print(f"[AIEffector V6] CLAP 인코더 μ΄ˆκΈ°ν™”...")
        self.audio_encoder = CLAPAudioEncoder(output_dim=audio_feature_dim)

        self.request_count = 0
        self._load_model()

    def _load_model(self):
        """Load the 4-bit (CUDA) or fp32 (CPU) base model plus the LoRA adapter.

        On any failure both self.model and self.tokenizer are reset to None,
        which makes predict() take the preset-only path.
        """
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
            from peft import PeftModel

            print(f"[AIEffector] 베이슀 λͺ¨λΈ λ‘œλ”©: {self.base_model_name}")

            if torch.cuda.is_available():
                # NF4 double-quantized 4-bit load to fit an 8B model on GPU.
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True
                )
                base_model = AutoModelForCausalLM.from_pretrained(
                    self.base_model_name,
                    quantization_config=bnb_config,
                    device_map="auto",
                    trust_remote_code=True
                )
            else:
                base_model = AutoModelForCausalLM.from_pretrained(
                    self.base_model_name,
                    torch_dtype=torch.float32,
                    device_map="auto",
                    trust_remote_code=True
                )

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.base_model_name,
                trust_remote_code=True
            )

            # Some causal-LM tokenizers ship without a pad token.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            print(f"[AIEffector] LoRA μ–΄λŒ‘ν„° λ‘œλ”©...")

            if self.use_huggingface:
                self.model = PeftModel.from_pretrained(
                    base_model,
                    self.model_repo_id,
                    subfolder=self.model_subfolder,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                )
            else:
                local_path = os.path.join(self.model_repo_id, self.model_subfolder)
                self.model = PeftModel.from_pretrained(
                    base_model,
                    local_path,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                )

            self.model.eval()
            print(f"[AIEffector] βœ… λͺ¨λΈ λ‘œλ“œ 성곡!")

        except Exception as e:
            print(f"[AIEffector] ❌ λͺ¨λΈ λ‘œλ“œ μ‹€νŒ¨: {e}")
            import traceback
            traceback.print_exc()
            self.model = None
            self.tokenizer = None

    def is_loaded(self) -> bool:
        """True when the LLM (with adapter) is available for inference."""
        return self.model is not None

    def _preprocess_text(self, text: str) -> str:
        """Lower-case the prompt and expand SYNONYM_MAP words in place.

        Substring replacement is applied for every matching synonym, in dict
        order, so one word may be rewritten by an earlier replacement's output.
        """
        text_lower = text.lower()
        for synonym, replacement in SYNONYM_MAP.items():
            if synonym in text_lower:
                text_lower = text_lower.replace(synonym, replacement)
                print(f" [Synonym] '{synonym}' β†’ '{replacement}'")
        return text_lower

    def _apply_preset(self, prompt: str) -> Dict[str, float]:
        """Apply presets - sets the Compressor/Reverb parameters.

        Merges every STYLE_PRESETS entry whose name appears in the prompt
        (later matches overwrite earlier ones); defaults to "warm" when
        nothing matches.
        """
        params = {}
        prompt_lower = prompt.lower()

        matched = []
        for style_name, style_params in STYLE_PRESETS.items():
            if style_name in prompt_lower:
                params.update(style_params)
                matched.append(style_name)

        if matched:
            print(f" [Preset] λ§€μΉ­: {matched}")
        else:
            # Default preset when nothing matched.
            params.update(STYLE_PRESETS["warm"])
            print(f" [Preset] κΈ°λ³Έκ°’ 적용: warm")

        return params

    def _format_prompt(self, text_prompt: str, audio_features: List[float]) -> str:
        """Build the LLM prompt from the audio feature vector and user text."""
        audio_state_str = json.dumps(audio_features)
        return f"""Task: Convert text to audio parameters.
Audio: {audio_state_str}
Text: {text_prompt}
Parameters:"""

    def _preprocess_json(self, json_str: str) -> str:
        """Repair common LLM JSON glitches before json.loads.

        Removes digit-group underscores (1_000 -> 1000), trailing commas, and
        non-standard NaN/Infinity literals.
        NOTE(review): the Infinity rule already rewrites "-Infinity" to
        "-999999" (\\b matches after '-'), so the third substitution rarely
        fires — harmless either way.
        """
        json_str = re.sub(r'(\d)_(\d)', r'\1\2', json_str)
        json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
        json_str = re.sub(r'\bNaN\b', '0', json_str)
        json_str = re.sub(r'\bInfinity\b', '999999', json_str)
        json_str = re.sub(r'-Infinity\b', '-999999', json_str)
        return json_str

    def _normalize_key(self, key: str) -> str:
        """Strip torch parametrization wrapping: a.parametrizations.b.original -> a.b."""
        key = re.sub(r'\.parametrizations\.(\w+)\.original', r'.\1', key)
        return key

    def _extract_json_object(self, text: str) -> Optional[str]:
        """Return the first balanced {...} object in ``text``, or None.

        Brace-depth scan; does not account for braces inside string values,
        which is acceptable for the numeric-only payloads expected here.
        """
        start = text.find('{')
        if start == -1:
            return None

        depth = 0
        for i, char in enumerate(text[start:], start):
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return text[start:i+1]
        return None

    def _convert_raw_to_actual(self, params: Dict[str, float]) -> Dict[str, float]:
        """Convert raw model outputs to actual parameter values.

        Applies the per-key transform from PARAM_TRANSFORMS; keys without a
        transform entry pass through unchanged.
        """
        result = params.copy()

        for key, transform in PARAM_TRANSFORMS.items():
            if key not in result:
                continue

            raw = result[key]
            transform_type = transform["type"]

            if transform_type == "none":
                actual = raw

            elif transform_type == "minmax":
                min_val = transform["min"]
                max_val = transform["max"]
                actual = minmax_transform(raw, min_val, max_val)
                print(f" [MinMax] {key}: {raw:.4f} β†’ {actual:.2f} (range: {min_val}-{max_val})")

            elif transform_type == "sigmoid":
                actual = sigmoid(raw)
                print(f" [Sigmoid] {key}: {raw:.4f} β†’ {actual:.4f}")

            elif transform_type == "sigmoid_scale":
                scale = transform["scale"]
                actual = sigmoid(raw) * scale
                print(f" [Sigmoid*{scale}] {key}: {raw:.4f} β†’ {actual:.4f}")

            else:
                # Unknown transform type: pass through untouched.
                actual = raw

            result[key] = actual

        return result

    def _clamp_values(self, params: Dict[str, float]) -> Dict[str, float]:
        """Clamp every known parameter into its PARAM_RANGES bounds."""
        result = params.copy()

        for key, (min_val, max_val) in PARAM_RANGES.items():
            if key in result:
                original = result[key]
                clamped = max(min_val, min(max_val, original))
                # Only log clamps that actually moved the value.
                if abs(clamped - original) > 0.001:
                    print(f" [Clamp] {key}: {original:.4f} β†’ {clamped:.4f}")
                result[key] = clamped

        return result

    def _parse_output(self, output_text: str) -> Dict[str, float]:
        """Parse the LLM output into a full parameter dict.

        Strips <think> blocks and code fences, extracts the first JSON object,
        repairs it, and merges recognized keys into a copy of
        DEFAULT_PARAMETERS. Unknown keys are fuzzy-matched against defaults by
        first and last dotted component. Falls back to DEFAULT_PARAMETERS on
        any error.
        """

        print(f" [Parse] Raw output 길이: {len(output_text)} 문자")

        json_str = None

        try:
            text = output_text

            # Drop chain-of-thought blocks emitted by reasoning models.
            text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

            # Prefer the content of a ```json ...``` fence when present.
            code_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
            if code_match:
                text = code_match.group(1)

            json_str = self._extract_json_object(text)

            if json_str:
                print(f" [Parse] JSON 발견 (길이: {len(json_str)})")

            json_str = self._preprocess_json(json_str)

            raw_params = json.loads(json_str)

            result = DEFAULT_PARAMETERS.copy()
            parsed_count = 0

            for key, value in raw_params.items():
                try:
                    norm_key = self._normalize_key(key)
                    float_val = float(value)

                    if norm_key in DEFAULT_PARAMETERS:
                        result[norm_key] = float_val
                        parsed_count += 1
                    else:
                        # Fuzzy match: same module prefix and same leaf name
                        # (e.g. "eq_peak1.x.freq" -> "eq_peak1.params.freq").
                        for default_key in DEFAULT_PARAMETERS.keys():
                            norm_parts = norm_key.split('.')
                            default_parts = default_key.split('.')

                            if len(norm_parts) >= 3 and len(default_parts) >= 3:
                                if norm_parts[0] == default_parts[0] and norm_parts[-1] == default_parts[-1]:
                                    result[default_key] = float_val
                                    parsed_count += 1
                                    break

                except (ValueError, TypeError) as e:
                    # Non-numeric value: skip this key, keep the default.
                    print(f" [Parse] λ³€ν™˜ μ‹€νŒ¨: {key}={value}")

            print(f" [Parse] βœ… {parsed_count}개 νŒŒλΌλ―Έν„° 맀핑됨")
            return result

        except json.JSONDecodeError as e:
            print(f" [Parse] ❌ JSON μ—λŸ¬: {e}")
        except Exception as e:
            print(f" [Parse] ❌ μ˜ˆμ™Έ: {e}")

        print(f" [Parse] ⚠️ κΈ°λ³Έκ°’ 폴백")
        return DEFAULT_PARAMETERS.copy()

    def predict(self, audio_path: str, text_prompt: str = "") -> Dict[str, float]:
        """Predict effect parameters for an audio file plus a text prompt.

        Returns a dict in effect_chain format (".Q" keys renamed to ".q").
        Falls back to DEFAULT_PARAMETERS + style preset whenever the model is
        unloaded, CLAP extraction yields zeros, or any step raises.
        """

        self.request_count += 1
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        print(f"\n{'='*60}")
        print(f"[AIEffector V6] 🎡 μš”μ²­ #{self.request_count} - {timestamp}")
        print(f"{'='*60}")
        print(f" πŸ“‚ μ˜€λ””μ˜€: {Path(audio_path).name}")
        print(f" πŸ’¬ 원본: '{text_prompt}'")

        # Synonym normalization.
        processed_prompt = self._preprocess_text(text_prompt)
        if processed_prompt != text_prompt.lower():
            print(f" πŸ’¬ λ³€ν™˜: '{processed_prompt}'")

        print(f" πŸ€– λͺ¨λΈ: {'AI' if self.is_loaded() else '프리셋'}")

        # Preset-only path when the model is not loaded.
        if not self.is_loaded():
            print(f"\n ⚠️ AI λͺ¨λΈ λ―Έλ‘œλ“œ")
            params = DEFAULT_PARAMETERS.copy()
            params.update(self._apply_preset(processed_prompt))
            self._log_parameters(params)
            return self._convert_to_effect_chain_format(params)

        try:
            # 1. CLAP feature extraction.
            print(f"\n πŸ“Š [Step 1] CLAP νŠΉμ§• μΆ”μΆœ...")
            audio_features = self.audio_encoder.get_audio_features(audio_path)

            # An all-zero vector is the encoder's failure sentinel.
            if not audio_features or all(f == 0 for f in audio_features):
                print(f" ⚠️ μ‹€νŒ¨, 프리셋 폴백")
                params = DEFAULT_PARAMETERS.copy()
                params.update(self._apply_preset(processed_prompt))
                self._log_parameters(params)
                return self._convert_to_effect_chain_format(params)

            print(f" βœ… {len(audio_features)}차원")

            # 2. Build the prompt.
            print(f"\n πŸ”€ [Step 2] ν”„λ‘¬ν”„νŠΈ 생성...")
            prompt = self._format_prompt(processed_prompt, audio_features)

            # 3. Tokenize.
            print(f"\n πŸ”’ [Step 3] 토큰화...")
            # NOTE(review): moves inputs to self.device while the model was
            # loaded with device_map="auto" — assumed co-located; confirm on
            # multi-GPU setups.
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=False,
            ).to(self.device)
            print(f" 토큰 수: {inputs['input_ids'].shape[1]}")

            # 4. LLM generation.
            print(f"\n 🧠 [Step 4] LLM μΆ”λ‘ ...")
            import time
            start = time.time()

            with torch.no_grad():
                # NOTE(review): do_sample=False means greedy decoding;
                # temperature=0.1 is ignored (transformers may warn).
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=500,
                    do_sample=False,
                    temperature=0.1,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )

            print(f" μΆ”λ‘  μ‹œκ°„: {time.time()-start:.2f}초")

            # 5. Decode only the newly generated tokens.
            print(f"\n πŸ“ [Step 5] λ””μ½”λ”©...")
            gen_tokens = outputs[0][inputs['input_ids'].shape[1]:]
            output_text = self.tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()
            print(f" 좜λ ₯ (처음 500자):\n{output_text[:500]}")

            # 6. Parse the JSON payload.
            print(f"\n πŸ”§ [Step 6] νŒŒμ‹±...")
            raw_params = self._parse_output(output_text)

            # 7. Raw -> actual conversion.
            print(f"\n πŸ”„ [Step 7] Raw β†’ Actual λ³€ν™˜...")
            actual_params = self._convert_raw_to_actual(raw_params)

            # 8. Clamp into valid ranges.
            print(f"\n πŸ“ [Step 8] κ°’ ν΄λž¨ν•‘...")
            clamped_params = self._clamp_values(actual_params)

            # 9. Preset overlay for the non-learned compressor/reverb params
            #    (overwrites any LLM-provided values for those keys).
            print(f"\n πŸŽ›οΈ [Step 9] 프리셋 보완 (Compressor/Reverb)...")
            preset = self._apply_preset(processed_prompt)
            for key in preset:
                clamped_params[key] = preset[key]
                print(f" {key}: {preset[key]}")

            # 10. Logging.
            self._log_parameters(clamped_params)

            print(f"\n βœ… μ™„λ£Œ!")
            print(f"{'='*60}\n")

            return self._convert_to_effect_chain_format(clamped_params)

        except Exception as e:
            # Any failure in the AI path degrades to the preset path.
            print(f"\n ❌ μ‹€νŒ¨: {e}")
            import traceback
            traceback.print_exc()
            params = DEFAULT_PARAMETERS.copy()
            params.update(self._apply_preset(processed_prompt))
            self._log_parameters(params)
            return self._convert_to_effect_chain_format(params)

    def _convert_to_effect_chain_format(self, params: Dict[str, float]) -> Dict[str, float]:
        """Convert keys to effect_chain.py format (".Q" -> ".q")."""
        result = {}
        for key, value in params.items():
            new_key = key.replace('.Q', '.q')
            result[new_key] = value
        return result

    def _log_parameters(self, params: Dict[str, float]):
        """Pretty-print the final parameter set, grouped by effect."""
        print(f"\n πŸ“‹ μ΅œμ’… νŒŒλΌλ―Έν„°:")
        print(f" [EQ Peak 1] freq={params.get('eq_peak1.params.freq',0):.0f}Hz, gain={params.get('eq_peak1.params.gain',0):.2f}dB, Q={params.get('eq_peak1.params.Q',0):.2f}")
        print(f" [EQ Peak 2] freq={params.get('eq_peak2.params.freq',0):.0f}Hz, gain={params.get('eq_peak2.params.gain',0):.2f}dB, Q={params.get('eq_peak2.params.Q',0):.2f}")
        print(f" [Low Shelf] freq={params.get('eq_lowshelf.params.freq',0):.0f}Hz, gain={params.get('eq_lowshelf.params.gain',0):.2f}dB")
        print(f" [High Shelf] freq={params.get('eq_highshelf.params.freq',0):.0f}Hz, gain={params.get('eq_highshelf.params.gain',0):.2f}dB")
        print(f" [Compressor] threshold={params.get('compressor.threshold',-18):.1f}dB, ratio={params.get('compressor.ratio',2):.1f}")
        print(f" [Distortion] {params.get('distortion_amount',0):.4f}")
        print(f" [Delay] time={params.get('delay.delay_time',0):.3f}s, fb={params.get('delay.feedback',0):.2f}, mix={params.get('delay.mix',0):.2f}")
        print(f" [Reverb] room={params.get('reverb.room_size',0):.2f}, damp={params.get('reverb.damping',0):.2f}, wet={params.get('reverb.wet_level',0):.2f}, dry={params.get('reverb.dry_level',1):.2f}")
        print(f" [Wet Mix] {params.get('final_wet_mix',0):.2f}")