heybaeheef committed on
Commit
1242aec
ยท
verified ยท
1 Parent(s): 4336a6e

Upload ai_effector.py

Browse files
Files changed (1) hide show
  1. models/ai_effector.py +539 -0
models/ai_effector.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI Effector - DiffVox LLM ๊ธฐ๋ฐ˜ ์ดํŽ™ํŠธ ํŒŒ๋ผ๋ฏธํ„ฐ ์˜ˆ์ธก
3
+ ===================================================
4
+ V2: ํ•™์Šต๊ณผ ๋™์ผํ•œ CLAP ์ธ์ฝ”๋” + ํ”„๋กฌํ”„ํŠธ ํ˜•์‹ ์‚ฌ์šฉ
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import re
10
+ import torch
11
+ import numpy as np
12
+ from typing import Dict, List, Optional, Any
13
+ from pathlib import Path
14
+ from datetime import datetime
15
+ import warnings
16
+
17
+ warnings.filterwarnings("ignore")
18
+
19
+ # ๊ธฐ๋ณธ ํŒŒ๋ผ๋ฏธํ„ฐ (๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ ์‹œ ์‚ฌ์šฉ)
20
+ DEFAULT_PARAMETERS = {
21
+ "eq_peak1.params.freq": 1000.0,
22
+ "eq_peak1.params.gain": 0.0,
23
+ "eq_peak1.params.Q": 1.0, # ๋Œ€๋ฌธ์ž Q (ํ•™์Šต ๋ฐ์ดํ„ฐ์™€ ์ผ์น˜)
24
+ "eq_peak2.params.freq": 4000.0,
25
+ "eq_peak2.params.gain": 0.0,
26
+ "eq_peak2.params.Q": 1.0,
27
+ "eq_lowshelf.params.freq": 200.0,
28
+ "eq_lowshelf.params.gain": 0.0,
29
+ "eq_highshelf.params.freq": 8000.0,
30
+ "eq_highshelf.params.gain": 0.0,
31
+ "distortion_amount": 0.0,
32
+ "delay.delay_time": 0.02,
33
+ "delay.feedback": 0.3,
34
+ "delay.mix": 0.2,
35
+ "final_wet_mix": 0.5
36
+ }
37
+
38
+ # ์Šคํƒ€์ผ ํ”„๋ฆฌ์…‹ (AI ์—†์ด๋„ ์ž‘๋™)
39
+ STYLE_PRESETS = {
40
+ "warm": {
41
+ "eq_lowshelf.params.gain": 3.0,
42
+ "eq_highshelf.params.gain": -1.0,
43
+ "distortion_amount": 0.05,
44
+ },
45
+ "bright": {
46
+ "eq_highshelf.params.gain": 4.0,
47
+ "eq_peak2.params.gain": 2.0,
48
+ "eq_lowshelf.params.gain": -1.0,
49
+ },
50
+ "vintage": {
51
+ "eq_lowshelf.params.gain": 2.0,
52
+ "eq_highshelf.params.gain": -2.0,
53
+ "distortion_amount": 0.1,
54
+ "delay.mix": 0.15,
55
+ },
56
+ "modern": {
57
+ "eq_peak1.params.gain": 2.0,
58
+ "eq_peak2.params.gain": 3.0,
59
+ "eq_highshelf.params.gain": 2.0,
60
+ },
61
+ "spacious": {
62
+ "delay.delay_time": 0.05,
63
+ "delay.feedback": 0.4,
64
+ "delay.mix": 0.35,
65
+ },
66
+ "dry": {
67
+ "final_wet_mix": 0.2,
68
+ "delay.mix": 0.0,
69
+ },
70
+ "saturated": {
71
+ "distortion_amount": 0.15,
72
+ "eq_lowshelf.params.gain": 1.0,
73
+ }
74
+ }
75
+
76
+
77
class CLAPAudioEncoder:
    """CLAP-based audio encoder (identical setup to training).

    Uses the ``laion/larger_clap_music`` model and mean-pools the 512-dim
    CLAP audio embedding down to ``output_dim`` features (512 -> 64 by
    default, matching how the training features were produced).
    """

    def __init__(self, output_dim: int = 64, model_name: str = "laion/larger_clap_music"):
        self.output_dim = output_dim
        self.model_name = model_name
        self.target_sr = 48000  # CLAP expects 48 kHz input
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = None
        self.processor = None
        self._load_model()

    def _load_model(self):
        """Load the CLAP model and processor.

        On any failure (missing transformers, download error, ...) the
        encoder stays in a degraded state with ``self.model is None`` and
        `get_audio_features` returns a zero vector instead of raising.
        """
        try:
            from transformers import ClapModel, ClapProcessor

            print(f"[CLAPEncoder] CLAP ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘: {self.model_name}")

            self.processor = ClapProcessor.from_pretrained(self.model_name)
            self.model = ClapModel.from_pretrained(self.model_name)
            self.model = self.model.to(self.device)
            self.model.eval()

            print(f"[CLAPEncoder] โœ… CLAP ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ (512โ†’{self.output_dim} pooling)")

        except ImportError:
            print("[CLAPEncoder] โŒ transformers ๋ฏธ์„ค์น˜")
            print(" pip install transformers")
        except Exception as e:
            print(f"[CLAPEncoder] โŒ ๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ: {e}")
            import traceback
            traceback.print_exc()

    def get_audio_features(self, audio_path: str) -> List[float]:
        """Extract an ``output_dim``-dim feature vector from an audio file.

        Same pipeline as training: load mono at 48 kHz, run the CLAP audio
        tower, then mean-pool 512 -> output_dim. Returns an all-zero vector
        on any failure so callers can detect and fall back.
        """
        if self.model is None:
            print("[CLAPEncoder] ๋ชจ๋ธ์ด ๋กœ๋“œ๋˜์ง€ ์•Š์Œ, ๋นˆ ํŠน์ง• ๋ฐ˜ํ™˜")
            return [0.0] * self.output_dim

        try:
            import librosa

            # 1. Load audio (resampled to 48 kHz - CLAP requirement)
            audio, sr = librosa.load(audio_path, sr=self.target_sr, mono=True)

            # 2. Prepare CLAP inputs
            inputs = self.processor(
                audios=audio,
                sampling_rate=self.target_sr,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            # 3. Extract features
            with torch.no_grad():
                outputs = self.model.get_audio_features(**inputs)

            # Tensor of shape [1, 512]
            features_512 = outputs[0].cpu().numpy()

            # 4. Reduce 512 -> 64 dims (mean pooling, same as training)
            features_64 = self._reduce_dimension(features_512)

            return features_64.tolist()

        except Exception as e:
            print(f"[CLAPEncoder] ํŠน์ง• ์ถ”์ถœ ์‹คํŒจ: {e}")
            import traceback
            traceback.print_exc()
            return [0.0] * self.output_dim

    def _reduce_dimension(self, features: np.ndarray) -> np.ndarray:
        """Mean-pool ``features`` down to ``self.output_dim`` values.

        Uses the same chunking as training: with 512 inputs and 64 outputs,
        chunks of 8 are averaged; when the length is not a multiple, the
        first ``len % output_dim`` chunks get one extra element.

        FIX: the original looped pooling produced NaNs (mean of an empty
        slice) whenever the input was *shorter* than ``output_dim`` — and the
        module-level warnings filter hid the RuntimeWarning. Short inputs are
        now zero-padded instead.
        """
        current_dim = len(features)

        if current_dim == self.output_dim:
            return features

        if current_dim < self.output_dim:
            # Degenerate input: pad with zeros rather than emitting NaNs.
            return np.concatenate(
                [features, np.zeros(self.output_dim - current_dim, dtype=features.dtype)]
            )

        # np.array_split gives the first (current_dim % output_dim) chunks one
        # extra element — exactly the original manual pooling layout.
        chunks = np.array_split(features, self.output_dim)
        return np.array([chunk.mean() for chunk in chunks])

    def is_loaded(self) -> bool:
        """Return True when the CLAP model was loaded successfully."""
        return self.model is not None
177
+
178
+
179
class AIEffector:
    """AI-based effect-parameter prediction (V2: same setup as training).

    Loads a (4-bit quantized on GPU) causal-LM base model plus a LoRA
    adapter and, given an audio file and a text prompt, predicts
    effect-chain parameters. Falls back to keyword-matched STYLE_PRESETS
    whenever the model is unavailable or inference/parsing fails.
    """

    def __init__(
        self,
        model_repo_id: str = "heybaeheef/KU_SW_Academy",
        model_subfolder: str = "checkpoints",
        base_model_name: str = "Qwen/Qwen3-8B",
        audio_feature_dim: int = 64,
        use_huggingface: bool = True
    ):
        # Where to fetch the LoRA adapter from: a HF repo id when
        # use_huggingface is True, otherwise a local directory root.
        self.model_repo_id = model_repo_id
        self.model_subfolder = model_subfolder
        self.base_model_name = base_model_name
        self.audio_feature_dim = audio_feature_dim
        self.use_huggingface = use_huggingface

        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Key fix vs V1: use the CLAP audio encoder (identical to training).
        print(f"[AIEffector] CLAP ์˜ค๋””์˜ค ์ธ์ฝ”๋” ์ดˆ๊ธฐํ™” ์ค‘...")
        self.audio_encoder = CLAPAudioEncoder(output_dim=audio_feature_dim)

        # Request counter (for logging only)
        self.request_count = 0

        # Attempt to load the model; failure leaves us in preset-only mode.
        self._load_model()

    def _load_model(self):
        """Load the base model, tokenizer and LoRA adapter.

        On GPU the base model is loaded 4-bit quantized (nf4, fp16 compute);
        on CPU it is loaded in float32. Any exception switches the instance
        to fallback (preset) mode by nulling model and tokenizer.
        """
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
            from peft import PeftModel

            print(f"[AIEffector] ๋ฒ ์ด์Šค ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘: {self.base_model_name}")

            # 4-bit quantization config (GPU path only)
            if torch.cuda.is_available():
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True
                )
                base_model = AutoModelForCausalLM.from_pretrained(
                    self.base_model_name,
                    quantization_config=bnb_config,
                    device_map="auto",
                    trust_remote_code=True
                )
            else:
                base_model = AutoModelForCausalLM.from_pretrained(
                    self.base_model_name,
                    torch_dtype=torch.float32,
                    device_map="auto",
                    trust_remote_code=True
                )

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.base_model_name,
                trust_remote_code=True
            )

            # Some tokenizers ship without a pad token; reuse EOS.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            print(f"[AIEffector] LoRA ์–ด๋Œ‘ํ„ฐ ๋กœ๋”ฉ ์ค‘...")

            if self.use_huggingface:
                print(f"[AIEffector] HuggingFace์—์„œ LoRA ๋กœ๋”ฉ: {self.model_repo_id}/{self.model_subfolder}")
                self.model = PeftModel.from_pretrained(
                    base_model,
                    self.model_repo_id,
                    subfolder=self.model_subfolder,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                )
            else:
                local_path = os.path.join(self.model_repo_id, self.model_subfolder)
                print(f"[AIEffector] ๋กœ์ปฌ์—์„œ LoRA ์–ด๋Œ‘ํ„ฐ ๋กœ๋”ฉ: {local_path}")
                self.model = PeftModel.from_pretrained(
                    base_model,
                    local_path,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                )

            self.model.eval()
            print(f"[AIEffector] โœ… ๋ชจ๋ธ ๋กœ๋“œ ์„ฑ๊ณต!")

        except Exception as e:
            print(f"[AIEffector] โŒ ๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ: {e}")
            import traceback
            traceback.print_exc()
            print(f"[AIEffector] ํด๋ฐฑ ๋ชจ๋“œ๋กœ ์ „ํ™˜ (ํ”„๋ฆฌ์…‹ ๊ธฐ๋ฐ˜)")
            self.model = None
            self.tokenizer = None

    def is_loaded(self) -> bool:
        """Return True when the LLM (with LoRA adapter) is loaded."""
        return self.model is not None

    def _apply_preset(self, prompt: str) -> Dict[str, float]:
        """Match style presets against the prompt (substring, lowercase).

        Starts from DEFAULT_PARAMETERS and layers every matching preset on
        top, in STYLE_PRESETS iteration order (later matches win on
        overlapping keys).
        """
        params = DEFAULT_PARAMETERS.copy()
        prompt_lower = prompt.lower()

        matched_presets = []
        for style_name, style_params in STYLE_PRESETS.items():
            if style_name in prompt_lower:
                params.update(style_params)
                matched_presets.append(style_name)

        if matched_presets:
            print(f" [Preset] ๋งค์นญ๋œ ํ”„๋ฆฌ์…‹: {matched_presets}")

        return params

    def _format_prompt(self, text_prompt: str, audio_features: List[float]) -> str:
        """Build the LLM prompt in the exact format used at training time.

        NOTE(review): the comment in the original says this mirrors
        train_model.py lines 243-246 — keep the two in sync if either changes.
        """
        audio_state_str = json.dumps(audio_features)

        # Must match the training format exactly!
        prompt = f"""Task: Convert text to audio parameters.
Audio: {audio_state_str}
Text: {text_prompt}
Parameters:"""

        return prompt

    def _parse_output(self, output_text: str) -> Dict[str, float]:
        """Extract effect parameters from raw LLM output.

        Strips Qwen3 <think> blocks and markdown fences, locates the first
        balanced JSON object, cleans it up, and maps recognized keys onto a
        copy of DEFAULT_PARAMETERS. Unknown keys and non-numeric values are
        silently ignored. Falls back to DEFAULT_PARAMETERS on any failure.
        """

        print(f" [Parse] Raw output ๊ธธ์ด: {len(output_text)} ๋ฌธ์ž")

        try:
            text = output_text

            # 1. Remove <think>...</think> tags (Qwen3 thinking mode)
            text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

            # 2. Extract a markdown code block, if present
            code_block_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
            if code_block_match:
                text = code_block_match.group(1)
                print(f" [Parse] ์ฝ”๋“œ๋ธ”๋ก์—์„œ JSON ์ถ”์ถœ")

            # 3. Find a JSON object (nested braces supported)
            json_str = self._extract_json_object(text)

            if json_str:
                print(f" [Parse] ์ถ”์ถœ๋œ JSON (์ฒ˜์Œ 200์ž):\n{json_str[:200]}...")

                # 4. Preprocess the JSON string (trailing commas, NaN/Inf)
                json_str = self._preprocess_json(json_str)

                # 5. Attempt to parse
                params = json.loads(json_str)

                # 6. Validate and map onto the known parameter set
                result = DEFAULT_PARAMETERS.copy()
                for key, value in params.items():
                    # Normalize the key (Q/q case handling)
                    normalized_key = self._normalize_key(key)
                    if normalized_key in result:
                        try:
                            result[normalized_key] = float(value)
                        except (ValueError, TypeError):
                            pass

                print(f" [Parse] โœ… ํŒŒ์‹ฑ ์„ฑ๊ณต! {len(params)}๊ฐœ ํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ")
                return result
            else:
                print(f" [Parse] โŒ JSON ๊ฐ์ฒด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Œ")

        # json_str is always bound here: JSONDecodeError can only come from
        # json.loads above, which runs after json_str is assigned.
        except json.JSONDecodeError as e:
            print(f" [Parse] โŒ JSON ํŒŒ์‹ฑ ์—๋Ÿฌ: {e}")
            if json_str:
                print(f" [Parse] ๋ฌธ์ œ ์œ„์น˜ ๊ทผ์ฒ˜: ...{json_str[max(0, e.pos-20):e.pos+20]}...")
        except Exception as e:
            print(f" [Parse] โŒ ์˜ˆ์™ธ ๋ฐœ์ƒ: {e}")

        print(f" [Parse] โš ๏ธ ๊ธฐ๋ณธ๊ฐ’์œผ๋กœ ํด๋ฐฑ")
        return DEFAULT_PARAMETERS.copy()

    def _normalize_key(self, key: str) -> str:
        """Normalize a parameter key: a trailing '.q' becomes '.Q'."""
        if key.endswith('.q'):
            return key[:-2] + '.Q'
        return key

    def _extract_json_object(self, text: str) -> Optional[str]:
        """Return the first balanced {...} span in *text*, or None.

        Tracks brace depth so nested objects are captured whole.
        """
        start = text.find('{')
        if start == -1:
            return None

        depth = 0
        for i, char in enumerate(text[start:], start):
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return text[start:i+1]

        # Unbalanced braces: no complete object found.
        return None

    def _preprocess_json(self, json_str: str) -> str:
        """Clean common LLM JSON glitches before json.loads."""
        # Remove trailing commas before } or ]
        json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)

        # Replace NaN/Infinity literals (invalid in strict JSON)
        json_str = re.sub(r'\bNaN\b', '0', json_str)
        json_str = re.sub(r'\bInfinity\b', '999999', json_str)
        json_str = re.sub(r'-Infinity\b', '-999999', json_str)

        return json_str

    def predict(self, audio_path: str, text_prompt: str = "") -> Dict[str, float]:
        """Predict effect parameters for an audio file + text prompt.

        Pipeline: CLAP feature extraction -> prompt formatting -> LLM
        generation -> JSON parsing. Every failure path degrades to the
        keyword-preset fallback. The returned dict uses effect_chain.py
        key casing (lowercase '.q').
        """

        self.request_count += 1
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        print(f"\n{'='*60}")
        print(f"[AIEffector] ๐ŸŽต ์š”์ฒญ #{self.request_count} - {timestamp}")
        print(f"{'='*60}")
        print(f" ๐Ÿ“‚ ์˜ค๋””์˜ค ํŒŒ์ผ: {Path(audio_path).name}")
        print(f" ๐Ÿ’ฌ ํ…์ŠคํŠธ ํ”„๋กฌํ”„ํŠธ: '{text_prompt}'")
        print(f" ๐Ÿค– ๋ชจ๋ธ ์ƒํƒœ: {'AI ๋ชจ๋“œ' if self.is_loaded() else 'ํ”„๋ฆฌ์…‹ ๋ชจ๋“œ'}")
        print(f" ๐ŸŽง ์ธ์ฝ”๋”: CLAP (ํ•™์Šต๊ณผ ๋™์ผ)")

        # No model: use presets
        if not self.is_loaded():
            print(f"\n โš ๏ธ AI ๋ชจ๋ธ ๋ฏธ๋กœ๋“œ - ํ”„๋ฆฌ์…‹ ๋ชจ๋“œ ์‚ฌ์šฉ")
            params = self._apply_preset(text_prompt)
            self._log_parameters(params)
            return self._convert_to_effect_chain_format(params)

        try:
            # 1. CLAP audio feature extraction (same as training)
            print(f"\n ๐Ÿ“Š [Step 1] CLAP ์˜ค๋””์˜ค ํŠน์ง• ์ถ”์ถœ ์ค‘...")
            audio_features = self.audio_encoder.get_audio_features(audio_path)

            # An all-zero vector is the encoder's failure sentinel.
            if not audio_features or all(f == 0 for f in audio_features):
                print(f" โš ๏ธ ํŠน์ง• ์ถ”์ถœ ์‹คํŒจ, ํ”„๋ฆฌ์…‹์œผ๋กœ ํด๋ฐฑ")
                params = self._apply_preset(text_prompt)
                self._log_parameters(params)
                return self._convert_to_effect_chain_format(params)

            print(f" โœ… {len(audio_features)}์ฐจ์› ํŠน์ง• ์ถ”์ถœ ์™„๋ฃŒ")
            print(f" - ํŠน์ง• ๋ฒกํ„ฐ (์ฒ˜์Œ 8๊ฐœ): {[round(v, 3) for v in audio_features[:8]]}")

            # 2. Build the LLM prompt (training format)
            print(f"\n ๐Ÿ”ค [Step 2] LLM ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ ์ค‘ (ํ•™์Šต ํ˜•์‹)...")
            prompt = self._format_prompt(text_prompt, audio_features)
            print(f" - ํ”„๋กฌํ”„ํŠธ ๊ธธ์ด: {len(prompt)} ๋ฌธ์ž")

            # 3. Tokenize
            print(f"\n ๐Ÿ”ข [Step 3] ํ† ํฐํ™” ์ค‘...")
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1500  # same as training
            ).to(self.device)
            print(f" - ์ž…๋ ฅ ํ† ํฐ ์ˆ˜: {inputs['input_ids'].shape[1]}")

            # 4. LLM generation
            # NOTE(review): temperature is ignored when do_sample=False
            # (greedy decoding) — transformers may warn about this.
            print(f"\n ๐Ÿง  [Step 4] LLM ์ถ”๋ก  ์ค‘...")
            import time
            start_time = time.time()

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=500,
                    do_sample=False,
                    temperature=0.1,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )

            inference_time = time.time() - start_time
            print(f" - ์ถ”๋ก  ์‹œ๊ฐ„: {inference_time:.2f}์ดˆ")

            # 5. Decode only the newly generated tokens
            print(f"\n ๐Ÿ“ [Step 5] ์ถœ๋ ฅ ๋””์ฝ”๋”ฉ ์ค‘...")
            generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
            output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

            print(f" - LLM ์ถœ๋ ฅ (์ฒ˜์Œ 300์ž):\n{output_text[:300]}")

            # 6. Parse parameters out of the output
            print(f"\n ๐Ÿ”ง [Step 6] ํŒŒ๋ผ๋ฏธํ„ฐ ํŒŒ์‹ฑ ์ค‘...")
            params = self._parse_output(output_text)

            # 7. Log the result
            self._log_parameters(params)

            print(f"\n โœ… AI ์˜ˆ์ธก ์™„๋ฃŒ!")
            print(f"{'='*60}\n")

            # Convert to effect_chain.py key format
            return self._convert_to_effect_chain_format(params)

        except Exception as e:
            print(f"\n โŒ ์˜ˆ์ธก ์‹คํŒจ: {e}")
            import traceback
            traceback.print_exc()
            print(f" โš ๏ธ ํ”„๋ฆฌ์…‹์œผ๋กœ ํด๋ฐฑ...")
            params = self._apply_preset(text_prompt)
            self._log_parameters(params)
            return self._convert_to_effect_chain_format(params)

    def _convert_to_effect_chain_format(self, params: Dict[str, float]) -> Dict[str, float]:
        """Convert training-data keys to effect_chain.py keys.

        Mainly the Q/q casing: effect_chain.py uses lowercase '.q'.
        NOTE(review): str.replace substitutes '.Q' anywhere in the key, not
        just as a suffix — fine for the current key set.
        """
        result = {}
        for key, value in params.items():
            new_key = key.replace('.Q', '.q')
            result[new_key] = value
        return result

    def _log_parameters(self, params: Dict[str, float]):
        """Pretty-print the predicted parameters (accepts either Q/q casing)."""
        print(f"\n ๐Ÿ“‹ ์˜ˆ์ธก๋œ ํŒŒ๋ผ๋ฏธํ„ฐ:")
        print(f" [EQ Peak 1]")
        print(f" - Freq: {params.get('eq_peak1.params.freq', 0):.1f} Hz")
        print(f" - Gain: {params.get('eq_peak1.params.gain', 0):.2f} dB")
        print(f" - Q: {params.get('eq_peak1.params.Q', params.get('eq_peak1.params.q', 0)):.2f}")

        print(f" [EQ Peak 2]")
        print(f" - Freq: {params.get('eq_peak2.params.freq', 0):.1f} Hz")
        print(f" - Gain: {params.get('eq_peak2.params.gain', 0):.2f} dB")
        print(f" - Q: {params.get('eq_peak2.params.Q', params.get('eq_peak2.params.q', 0)):.2f}")

        print(f" [Low Shelf]")
        print(f" - Freq: {params.get('eq_lowshelf.params.freq', 0):.1f} Hz")
        print(f" - Gain: {params.get('eq_lowshelf.params.gain', 0):.2f} dB")

        print(f" [High Shelf]")
        print(f" - Freq: {params.get('eq_highshelf.params.freq', 0):.1f} Hz")
        print(f" - Gain: {params.get('eq_highshelf.params.gain', 0):.2f} dB")

        print(f" [Effects]")
        print(f" - Distortion: {params.get('distortion_amount', 0):.3f}")
        print(f" - Delay Time: {params.get('delay.delay_time', 0):.3f}s")
        print(f" - Delay Feedback: {params.get('delay.feedback', 0):.2f}")
        print(f" - Delay Mix: {params.get('delay.mix', 0):.2f}")
        print(f" - Final Wet Mix: {params.get('final_wet_mix', 0):.2f}")
+ print(f" - Final Wet Mix: {params.get('final_wet_mix', 0):.2f}")