heybaeheef committed on
Commit 0c3b738 · verified · 1 Parent(s): 7a4005e

Upload 3 files
Files changed (3)
  1. models/__init__.py +4 -4
  2. models/ai_effector.py +633 -0
  3. models/audio_encoder.py +189 -189
models/__init__.py CHANGED
@@ -1,4 +1,4 @@
# models package
from .ai_effector import AIEffector

__all__ = ["AIEffector"]
models/ai_effector.py ADDED
@@ -0,0 +1,633 @@
"""
AI Effector - DiffVox LLM-based effect parameter prediction
============================================================
V4: root-cause fixes
- sigmoid conversion (delay.feedback, delay.mix, distortion_amount)
- normalization of parametrizations.X.original keys
- delay.delay_time is not learned → supplemented from presets
- synonym mapping
"""

import os
import json
import re
import math
import torch
import numpy as np
from typing import Dict, List, Optional, Any
from pathlib import Path
from datetime import datetime
import warnings

warnings.filterwarnings("ignore")


def sigmoid(x: float) -> float:
    """Numerically safe sigmoid (saturates instead of overflowing)."""
    try:
        return 1 / (1 + math.exp(-x))
    except OverflowError:
        return 0.0 if x < 0 else 1.0
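# Spot checks: sigmoid(0.0) == 0.5, sigmoid(2.1972) ≈ 0.9, sigmoid(-2.1972) ≈ 0.1.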


# Default parameters (used when the model fails to load)
DEFAULT_PARAMETERS = {
    "eq_peak1.params.freq": 1000.0,
    "eq_peak1.params.gain": 0.0,
    "eq_peak1.params.Q": 1.0,
    "eq_peak2.params.freq": 4000.0,
    "eq_peak2.params.gain": 0.0,
    "eq_peak2.params.Q": 1.0,
    "eq_lowshelf.params.freq": 200.0,
    "eq_lowshelf.params.gain": 0.0,
    "eq_highshelf.params.freq": 8000.0,
    "eq_highshelf.params.gain": 0.0,
    "distortion_amount": 0.0,
    "delay.delay_time": 0.02,
    "delay.feedback": 0.3,
    "delay.mix": 0.2,
    "final_wet_mix": 0.5
}
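# Note: with all EQ gains at 0.0 dB and distortion_amount at 0.0, these
# defaults are nearly a bypass, apart from the short delay (20 ms, mix 0.2).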

# Parameter range limits (on actual values, after conversion)
PARAM_RANGES = {
    "eq_peak1.params.freq": (20.0, 20000.0),
    "eq_peak1.params.gain": (-12.0, 12.0),
    "eq_peak1.params.Q": (0.1, 10.0),
    "eq_peak2.params.freq": (20.0, 20000.0),
    "eq_peak2.params.gain": (-12.0, 12.0),
    "eq_peak2.params.Q": (0.1, 10.0),
    "eq_lowshelf.params.freq": (20.0, 2000.0),
    "eq_lowshelf.params.gain": (-12.0, 12.0),
    "eq_highshelf.params.freq": (1000.0, 20000.0),
    "eq_highshelf.params.gain": (-12.0, 12.0),
    "distortion_amount": (0.0, 0.1),   # after sigmoid * 0.1
    "delay.delay_time": (0.01, 1.0),
    "delay.feedback": (0.0, 0.9),      # after sigmoid
    "delay.mix": (0.0, 1.0),           # after sigmoid
    "final_wet_mix": (0.0, 1.0),       # after sigmoid
}

# Synonym mapping (words unseen in training → trained vocabulary)
SYNONYM_MAP = {
    "calm": "warm soft",
    "relaxed": "warm soft",
    "chill": "warm soft",
    "smooth": "warm",
    "mellow": "warm soft",
    "breezy": "bright spacious",
    "airy": "bright spacious",
    "light": "bright",
    "crisp": "bright",
    "clean": "bright",
    "dreamy": "warm spacious",
    "ethereal": "bright spacious",
    "atmospheric": "spacious",
    "ambient": "spacious warm",
    "aggressive": "saturated bright",
    "powerful": "saturated",
    "punchy": "saturated bright",
    "hard": "saturated",
    "gritty": "saturated dark",
    "soft": "warm",
    "harsh": "bright saturated",
    "muddy": "dark",
    "thin": "bright",
    "thick": "warm dark",
    "full": "warm",
    "reverb": "spacious",
    "echo": "spacious",
    "wet": "spacious",
}
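# Example: "airy and punchy" → "bright spacious and saturated bright".
# Replacement is by substring, in dict order, so mappings can cascade:
# "calm" → "warm soft", whose "soft" is then itself rewritten to "warm".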

# Style presets (also used to supplement delay.delay_time)
STYLE_PRESETS = {
    "warm": {
        "eq_lowshelf.params.gain": 3.0,
        "eq_highshelf.params.gain": -1.0,
    },
    "bright": {
        "eq_highshelf.params.gain": 4.0,
        "eq_peak2.params.gain": 2.0,
        "eq_lowshelf.params.gain": -1.0,
    },
    "spacious": {
        "delay.delay_time": 0.05,  # supplements the unlearned parameter
    },
    "dark": {
        "eq_highshelf.params.gain": -4.0,
        "eq_lowshelf.params.gain": 2.0,
    },
    "saturated": {},
    "soft": {
        "eq_highshelf.params.gain": -2.0,
        "eq_lowshelf.params.gain": 1.0,
    },
}
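# When several styles match one prompt, later entries here override shared
# keys: "warm bright" ends with eq_lowshelf.params.gain == -1.0 (from "bright").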


class CLAPAudioEncoder:
    """CLAP-based audio encoder (identical to the one used at training time)"""

    def __init__(self, output_dim: int = 64, model_name: str = "laion/larger_clap_music"):
        self.output_dim = output_dim
        self.model_name = model_name
        self.target_sr = 48000
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = None
        self.processor = None
        self._load_model()

    def _load_model(self):
        try:
            from transformers import ClapModel, ClapProcessor

            print(f"[CLAPEncoder] Loading CLAP model: {self.model_name}")

            self.processor = ClapProcessor.from_pretrained(self.model_name)
            self.model = ClapModel.from_pretrained(self.model_name)
            self.model = self.model.to(self.device)
            self.model.eval()

            print(f"[CLAPEncoder] ✅ CLAP model loaded (512→{self.output_dim} pooling)")

        except ImportError:
            print("[CLAPEncoder] ❌ transformers is not installed")
        except Exception as e:
            print(f"[CLAPEncoder] ❌ Model load failed: {e}")

    def get_audio_features(self, audio_path: str) -> List[float]:
        if self.model is None:
            return [0.0] * self.output_dim

        try:
            import librosa

            audio, sr = librosa.load(audio_path, sr=self.target_sr, mono=True)

            inputs = self.processor(
                audios=audio,
                sampling_rate=self.target_sr,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.get_audio_features(**inputs)

            features_512 = outputs[0].cpu().numpy()
            features_64 = self._reduce_dimension(features_512)

            return features_64.tolist()

        except Exception as e:
            print(f"[CLAPEncoder] Feature extraction failed: {e}")
            return [0.0] * self.output_dim

    def _reduce_dimension(self, features: np.ndarray) -> np.ndarray:
        current_dim = len(features)
        if current_dim == self.output_dim:
            return features

        # Adaptive mean pooling: split the vector into output_dim groups and
        # average each group (early groups get one extra element when the
        # dimensions do not divide evenly).
        pool_size = current_dim // self.output_dim
        remainder = current_dim % self.output_dim

        pooled = []
        idx = 0
        for i in range(self.output_dim):
            size = pool_size + (1 if i < remainder else 0)
            pooled.append(np.mean(features[idx:idx+size]))
            idx += size

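        # With CLAP's 512-dim embedding and output_dim == 64: pool_size == 8,
        # remainder == 0, so the loop above is equivalent to
        # features.reshape(64, 8).mean(axis=1).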
        return np.array(pooled)

    def is_loaded(self) -> bool:
        return self.model is not None


class AIEffector:
    """AI-based effect parameter prediction (V4)"""

    def __init__(
        self,
        model_repo_id: str = "heybaeheef/KU_SW_Academy",
        model_subfolder: str = "checkpoints",
        base_model_name: str = "Qwen/Qwen3-8B",
        audio_feature_dim: int = 64,
        use_huggingface: bool = True
    ):
        self.model_repo_id = model_repo_id
        self.model_subfolder = model_subfolder
        self.base_model_name = base_model_name
        self.audio_feature_dim = audio_feature_dim
        self.use_huggingface = use_huggingface

        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        print(f"[AIEffector] Initializing CLAP audio encoder...")
        self.audio_encoder = CLAPAudioEncoder(output_dim=audio_feature_dim)

        self.request_count = 0
        self._load_model()

    def _load_model(self):
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
            from peft import PeftModel

            print(f"[AIEffector] Loading base model: {self.base_model_name}")

            if torch.cuda.is_available():
                # 4-bit NF4 quantization keeps the 8B base model within GPU memory.
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True
                )
                base_model = AutoModelForCausalLM.from_pretrained(
                    self.base_model_name,
                    quantization_config=bnb_config,
                    device_map="auto",
                    trust_remote_code=True
                )
            else:
                base_model = AutoModelForCausalLM.from_pretrained(
                    self.base_model_name,
                    torch_dtype=torch.float32,
                    device_map="auto",
                    trust_remote_code=True
                )

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.base_model_name,
                trust_remote_code=True
            )

            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            print(f"[AIEffector] Loading LoRA adapter...")

            if self.use_huggingface:
                self.model = PeftModel.from_pretrained(
                    base_model,
                    self.model_repo_id,
                    subfolder=self.model_subfolder,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                )
            else:
                local_path = os.path.join(self.model_repo_id, self.model_subfolder)
                self.model = PeftModel.from_pretrained(
                    base_model,
                    local_path,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                )

            self.model.eval()
            print(f"[AIEffector] ✅ Model loaded successfully!")

        except Exception as e:
            print(f"[AIEffector] ❌ Model load failed: {e}")
            import traceback
            traceback.print_exc()
            self.model = None
            self.tokenizer = None

    def is_loaded(self) -> bool:
        return self.model is not None

    def _preprocess_text(self, text: str) -> str:
        """Synonym mapping"""
        text_lower = text.lower()
        for synonym, replacement in SYNONYM_MAP.items():
            if synonym in text_lower:
                text_lower = text_lower.replace(synonym, replacement)
                print(f"  [Synonym] '{synonym}' → '{replacement}'")
        return text_lower

    def _apply_preset(self, prompt: str) -> Dict[str, float]:
        """Preset matching (used to supplement delay.delay_time)"""
        params = {}
        prompt_lower = prompt.lower()

        matched = []
        for style_name, style_params in STYLE_PRESETS.items():
            if style_name in prompt_lower:
                params.update(style_params)
                matched.append(style_name)

        if matched:
            print(f"  [Preset] matched: {matched}")

        return params

    def _format_prompt(self, text_prompt: str, audio_features: List[float]) -> str:
        """Same prompt format as used at training time"""
        audio_state_str = json.dumps(audio_features)
        return f"""Task: Convert text to audio parameters.
Audio: {audio_state_str}
Text: {text_prompt}
Parameters:"""

    def _preprocess_json(self, json_str: str) -> str:
        """JSON preprocessing"""
        # Remove underscores between digits (0.30_299 → 0.30299)
        json_str = re.sub(r'(\d)_(\d)', r'\1\2', json_str)
        # Remove trailing commas
        json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
        # Replace NaN / Infinity with parseable numbers
        json_str = re.sub(r'\bNaN\b', '0', json_str)
        json_str = re.sub(r'\bInfinity\b', '999999', json_str)
        json_str = re.sub(r'-Infinity\b', '-999999', json_str)
        return json_str

    def _normalize_key(self, key: str) -> str:
        """
        Normalize parameter keys, e.g.
        eq_peak1.params.parametrizations.freq.original → eq_peak1.params.freq
        """
        # parametrizations.X.original → X
        key = re.sub(r'\.parametrizations\.(\w+)\.original', r'.\1', key)
        # Q keeps its uppercase here; it is lowered to q only for effect_chain.
        return key

    def _extract_json_object(self, text: str) -> Optional[str]:
        """Extract a JSON object by brace-depth scanning"""
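        # Example: 'Parameters: {"a": {"b": 1}} done' → '{"a": {"b": 1}}';
        # counting depth tolerates trailing text after the closing brace.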
        start = text.find('{')
        if start == -1:
            return None

        depth = 0
        for i, char in enumerate(text[start:], start):
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return text[start:i+1]
        return None

    def _convert_raw_to_actual(self, params: Dict[str, float]) -> Dict[str, float]:
        """
        ★★★ Core step: convert raw training-data values into actual values ★★★

        The training data stores the raw values of nn.Parameter; at use time,
        transforms such as sigmoid must be applied.
        """
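        # Worked example: a raw value of 0.0 maps to sigmoid(0.0) = 0.5 for
        # delay.feedback / delay.mix / final_wet_mix, and to 0.05 for
        # distortion_amount (sigmoid * 0.1).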
        result = params.copy()

        # 1. delay.feedback: sigmoid
        if 'delay.feedback' in result:
            raw = result['delay.feedback']
            actual = sigmoid(raw)
            print(f"  [Convert] delay.feedback: {raw:.4f} → sigmoid → {actual:.4f}")
            result['delay.feedback'] = actual

        # 2. delay.mix: sigmoid
        if 'delay.mix' in result:
            raw = result['delay.mix']
            actual = sigmoid(raw)
            print(f"  [Convert] delay.mix: {raw:.4f} → sigmoid → {actual:.4f}")
            result['delay.mix'] = actual

        # 3. distortion_amount: sigmoid * 0.1
        if 'distortion_amount' in result:
            raw = result['distortion_amount']
            actual = sigmoid(raw) * 0.1
            print(f"  [Convert] distortion_amount: {raw:.4f} → sigmoid*0.1 → {actual:.4f}")
            result['distortion_amount'] = actual

        # 4. final_wet_mix: sigmoid
        if 'final_wet_mix' in result:
            raw = result['final_wet_mix']
            actual = sigmoid(raw)
            print(f"  [Convert] final_wet_mix: {raw:.4f} → sigmoid → {actual:.4f}")
            result['final_wet_mix'] = actual

        return result

    def _clamp_values(self, params: Dict[str, float]) -> Dict[str, float]:
        """Clamp values into their allowed ranges"""
        result = params.copy()

        for key, (min_val, max_val) in PARAM_RANGES.items():
            if key in result:
                original = result[key]
                clamped = max(min_val, min(max_val, original))
                if clamped != original:
                    print(f"  [Clamp] {key}: {original:.4f} → {clamped:.4f}")
                result[key] = clamped

        return result

    def _parse_output(self, output_text: str) -> Dict[str, float]:
        """Parse the LLM output"""

        print(f"  [Parse] raw output length: {len(output_text)} chars")

        json_str = None

        try:
            text = output_text

            # Strip <think> tags
            text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

            # Extract a code block, if any
            code_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
            if code_match:
                text = code_match.group(1)

            # Extract the JSON object
            json_str = self._extract_json_object(text)

            if json_str:
                print(f"  [Parse] JSON found (length: {len(json_str)})")

                # Preprocess
                json_str = self._preprocess_json(json_str)

                # Parse
                raw_params = json.loads(json_str)

                # Map onto the known parameter set
                result = DEFAULT_PARAMETERS.copy()
                parsed_count = 0

                for key, value in raw_params.items():
                    try:
                        # Normalize the key
                        norm_key = self._normalize_key(key)
                        float_val = float(value)

                        # Find the matching default key
                        matched_key = None
                        for default_key in DEFAULT_PARAMETERS.keys():
                            # Exact match
                            if norm_key == default_key:
                                matched_key = default_key
                                break
                            # Partial match (same last segment, same prefix)
                            if norm_key.endswith(default_key.split('.')[-1]) and \
                               norm_key.split('.')[0] == default_key.split('.')[0]:
                                matched_key = default_key
                                break

                        if matched_key:
                            result[matched_key] = float_val
                            parsed_count += 1
                        else:
                            print(f"  [Parse] no match: {key} → {norm_key}")

                    except (ValueError, TypeError) as e:
                        print(f"  [Parse] conversion failed: {key}={value} ({e})")

                print(f"  [Parse] ✅ {parsed_count} parameters mapped")
                return result

        except json.JSONDecodeError as e:
            print(f"  [Parse] ❌ JSON error: {e}")
            if json_str:
                pos = getattr(e, 'pos', 0)
                print(f"  [Parse] location: ...{json_str[max(0,pos-20):pos+20]}...")
        except Exception as e:
            print(f"  [Parse] ❌ exception: {e}")

        print(f"  [Parse] ⚠️ falling back to defaults")
        return DEFAULT_PARAMETERS.copy()

    def predict(self, audio_path: str, text_prompt: str = "") -> Dict[str, float]:
        """Predict effect parameters"""

        self.request_count += 1
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        print(f"\n{'='*60}")
        print(f"[AIEffector V4] 🎵 request #{self.request_count} - {timestamp}")
        print(f"{'='*60}")
        print(f"  📂 audio: {Path(audio_path).name}")
        print(f"  💬 original prompt: '{text_prompt}'")

        # Synonym mapping
        processed_prompt = self._preprocess_text(text_prompt)
        if processed_prompt != text_prompt.lower():
            print(f"  💬 mapped prompt: '{processed_prompt}'")

        print(f"  🤖 mode: {'AI' if self.is_loaded() else 'preset'}")

        # Fall back to presets when no model is loaded
        if not self.is_loaded():
            print(f"\n  ⚠️ AI model not loaded")
            params = DEFAULT_PARAMETERS.copy()
            params.update(self._apply_preset(processed_prompt))
            self._log_parameters(params)
            return self._convert_to_effect_chain_format(params)

        try:
            # 1. CLAP feature extraction
            print(f"\n  📊 [Step 1] Extracting CLAP features...")
            audio_features = self.audio_encoder.get_audio_features(audio_path)

            if not audio_features or all(f == 0 for f in audio_features):
                print(f"  ⚠️ failed, falling back to presets")
                params = DEFAULT_PARAMETERS.copy()
                params.update(self._apply_preset(processed_prompt))
                self._log_parameters(params)
                return self._convert_to_effect_chain_format(params)

            print(f"  ✅ {len(audio_features)} dimensions")

            # 2. Build the prompt
            print(f"\n  🔤 [Step 2] Building prompt...")
            prompt = self._format_prompt(processed_prompt, audio_features)

            # 3. Tokenize
            print(f"\n  🔢 [Step 3] Tokenizing...")
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1500
            ).to(self.device)
            print(f"  token count: {inputs['input_ids'].shape[1]}")

            # 4. LLM generation
            print(f"\n  🧠 [Step 4] LLM inference...")
            import time
            start = time.time()

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=500,
                    do_sample=False,  # greedy decoding; temperature is then ignored
                    temperature=0.1,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )

            print(f"  inference time: {time.time()-start:.2f}s")

            # 5. Decode only the newly generated tokens
            print(f"\n  📝 [Step 5] Decoding...")
            gen_tokens = outputs[0][inputs['input_ids'].shape[1]:]
            output_text = self.tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()
            print(f"  output (first 400 chars):\n{output_text[:400]}")

            # 6. Parse
            print(f"\n  🔧 [Step 6] Parsing...")
            raw_params = self._parse_output(output_text)

            # 7. ★★★ Raw → Actual conversion ★★★
            print(f"\n  🔄 [Step 7] Raw → Actual conversion...")
            actual_params = self._convert_raw_to_actual(raw_params)

            # 8. Clamp values
            print(f"\n  📏 [Step 8] Clamping values...")
            clamped_params = self._clamp_values(actual_params)

            # 9. Preset supplement (delay.delay_time is not learned)
            print(f"\n  🎛️ [Step 9] Preset supplement...")
            preset = self._apply_preset(processed_prompt)
            if 'delay.delay_time' in preset:
                clamped_params['delay.delay_time'] = preset['delay.delay_time']
                print(f"  delay.delay_time: {preset['delay.delay_time']} (preset)")

            # 10. Logging
            self._log_parameters(clamped_params)

            print(f"\n  ✅ done!")
            print(f"{'='*60}\n")

            return self._convert_to_effect_chain_format(clamped_params)

        except Exception as e:
            print(f"\n  ❌ failed: {e}")
            import traceback
            traceback.print_exc()
            params = DEFAULT_PARAMETERS.copy()
            params.update(self._apply_preset(processed_prompt))
            self._log_parameters(params)
            return self._convert_to_effect_chain_format(params)

    def _convert_to_effect_chain_format(self, params: Dict[str, float]) -> Dict[str, float]:
        """Convert to the effect_chain.py key format (Q → q)"""
        result = {}
        for key, value in params.items():
            new_key = key.replace('.Q', '.q')
            result[new_key] = value
        return result
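        # Example: "eq_peak1.params.Q" → "eq_peak1.params.q"; keys without
        # ".Q" pass through unchanged.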

    def _log_parameters(self, params: Dict[str, float]):
        """Log the final parameters"""
        print(f"\n  📋 Final parameters:")
        print(f"  [EQ Peak 1] freq={params.get('eq_peak1.params.freq',0):.0f}Hz, gain={params.get('eq_peak1.params.gain',0):.2f}dB")
        print(f"  [EQ Peak 2] freq={params.get('eq_peak2.params.freq',0):.0f}Hz, gain={params.get('eq_peak2.params.gain',0):.2f}dB")
        print(f"  [Low Shelf] gain={params.get('eq_lowshelf.params.gain',0):.2f}dB")
        print(f"  [High Shelf] gain={params.get('eq_highshelf.params.gain',0):.2f}dB")
        print(f"  [Distortion] {params.get('distortion_amount',0):.4f}")
        print(f"  [Delay] time={params.get('delay.delay_time',0):.3f}s, fb={params.get('delay.feedback',0):.2f}, mix={params.get('delay.mix',0):.2f}")
        print(f"  [Wet Mix] {params.get('final_wet_mix',0):.2f}")
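
A minimal usage sketch for the new module (illustrative, not part of the commit; it assumes the checkpoint repo above is reachable and that a local input.wav exists):

    from models.ai_effector import AIEffector

    effector = AIEffector()  # loads Qwen3-8B (4-bit on GPU) plus the LoRA adapter
    params = effector.predict("input.wav", "warm and spacious")
    print(params["eq_lowshelf.params.gain"], params["delay.delay_time"])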
models/audio_encoder.py CHANGED
@@ -1,189 +1,189 @@
"""
Audio Encoder for MagicPath Server
===================================
Extracts feature vectors from audio files using a CLAP model.
Uses the same encoder as the DiffVox LLM.
"""

import torch
import numpy as np
from typing import List, Optional
import warnings

warnings.filterwarnings("ignore")


class AudioEncoder:
    """CLAP-based audio encoder"""

    def __init__(
        self,
        output_dim: int = 64,
        reduction_method: str = "pool",
        model_name: str = "laion/larger_clap_general"
    ):
        """
        Initialize the audio encoder.

        Args:
            output_dim: output feature dimension (default 64)
            reduction_method: dimension-reduction method ("pool", "linear";
                "pca" is accepted but currently falls back to truncation)
            model_name: CLAP model name
        """
        self.output_dim = output_dim
        self.reduction_method = reduction_method
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = None
        self.processor = None
        self.projection = None

        self._load_model()

    def _load_model(self):
        """Load the CLAP model"""
        try:
            from transformers import ClapModel, ClapProcessor

            print(f"[AudioEncoder] Loading CLAP model: {self.model_name}")

            self.processor = ClapProcessor.from_pretrained(self.model_name)
            self.model = ClapModel.from_pretrained(self.model_name)
            self.model = self.model.to(self.device)
            self.model.eval()

            # Check the CLAP output dimension (usually 512)
            clap_dim = self.model.config.projection_dim
            print(f"[AudioEncoder] CLAP output dimension: {clap_dim}")

            # Projection layer for dimension reduction
            # (note: randomly initialized at load time, never trained)
            if self.reduction_method == "linear" and clap_dim != self.output_dim:
                self.projection = torch.nn.Linear(clap_dim, self.output_dim)
                self.projection = self.projection.to(self.device)
                print(f"[AudioEncoder] Linear projection: {clap_dim} → {self.output_dim}")

            print("[AudioEncoder] ✅ Model loaded")

        except ImportError:
            print("[AudioEncoder] ❌ transformers is not installed")
            print("  pip install transformers")
        except Exception as e:
            print(f"[AudioEncoder] ❌ Model load failed: {e}")

    def get_audio_features(self, audio_path: str) -> List[float]:
        """
        Extract a feature vector from an audio file.

        Args:
            audio_path: path to the audio file

        Returns:
            feature vector (output_dim dimensions), or [] on failure
        """
        if self.model is None:
            print("[AudioEncoder] Model is not loaded")
            return []

        try:
            import librosa

            # Load audio
            audio, sr = librosa.load(audio_path, sr=48000, mono=True)

            # Prepare CLAP inputs
            inputs = self.processor(
                audios=audio,
                sampling_rate=48000,
                return_tensors="pt"
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Extract features
            with torch.no_grad():
                audio_features = self.model.get_audio_features(**inputs)

            # Move to CPU
            features = audio_features.squeeze().cpu().numpy()

            # Reduce dimensionality
            features = self._reduce_dimension(features)

            return features.tolist()

        except Exception as e:
            print(f"[AudioEncoder] Feature extraction failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def _reduce_dimension(self, features: np.ndarray) -> np.ndarray:
        """Reduce the feature vector dimension"""
        current_dim = len(features)

        if current_dim == self.output_dim:
            return features

        if self.reduction_method == "pool":
            # Reduce via mean pooling (same scheme as
            # CLAPAudioEncoder._reduce_dimension in ai_effector.py)
            if current_dim > self.output_dim:
                pool_size = current_dim // self.output_dim
                remainder = current_dim % self.output_dim

                pooled = []
                idx = 0
                for i in range(self.output_dim):
                    size = pool_size + (1 if i < remainder else 0)
                    pooled.append(np.mean(features[idx:idx+size]))
                    idx += size

                return np.array(pooled)
            else:
                # Zero-pad when the input is smaller
                padded = np.zeros(self.output_dim)
                padded[:current_dim] = features
                return padded

        elif self.reduction_method == "linear" and self.projection is not None:
            # Linear projection (with the untrained layer created in _load_model)
            with torch.no_grad():
                features_tensor = torch.tensor(features, dtype=torch.float32).to(self.device)
                projected = self.projection(features_tensor)
                return projected.cpu().numpy()

        else:
            # Default: truncate from the front
            return features[:self.output_dim]

    def get_text_features(self, text: str) -> List[float]:
        """
        Extract a feature vector from text (CLAP text encoder).

        Args:
            text: input text

        Returns:
            feature vector, or [] on failure
        """
        if self.model is None:
            return []

        try:
            inputs = self.processor(
                text=text,
                return_tensors="pt",
                padding=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                text_features = self.model.get_text_features(**inputs)

            features = text_features.squeeze().cpu().numpy()
            features = self._reduce_dimension(features)

            return features.tolist()

        except Exception as e:
            print(f"[AudioEncoder] Text feature extraction failed: {e}")
            return []
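
A matching sketch for the encoder on its own (illustrative; the file path is hypothetical):

    from models.audio_encoder import AudioEncoder

    encoder = AudioEncoder(output_dim=64, reduction_method="pool")
    vec = encoder.get_audio_features("input.wav")   # 64 floats, or [] on failure
    txt = encoder.get_text_features("warm vocals")  # same 64-dim reduction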