heybaeheef committed on
Commit
5c6cdde
·
1 Parent(s): 8212fa0

Fix model loading with subfolder parameter

Browse files
audio_processing/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # audio_processing package
2
+ from .effect_chain import EffectChain
3
+
4
+ __all__ = ["EffectChain"]
audio_processing/effect_chain.py CHANGED
@@ -1,255 +1,161 @@
1
  """
2
- Audio Effect Chain
3
- ==================
4
- 실제 오디오에 이펙트를 적용하는 처리 체인
5
-
6
- pedalboard ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์‚ฌ์šฉ (Spotify์—์„œ ๋งŒ๋“  ์˜ค๋””์˜ค ํ”Œ๋Ÿฌ๊ทธ์ธ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ)
7
- - 고품질 VST 수준의 이펙트
8
- - Python์—์„œ ์‰ฝ๊ฒŒ ์‚ฌ์šฉ ๊ฐ€๋Šฅ
9
- - 실시간 처리도 가능
10
  """
11
 
12
  import numpy as np
13
- from pathlib import Path
14
- from typing import Dict, Any, List
15
  import soundfile as sf
16
-
17
- # pedalboard - 오디오 이펙트 라이브러리
18
  from pedalboard import (
19
- Pedalboard,
20
- Compressor,
21
  Gain,
22
- LowShelfFilter,
23
  HighShelfFilter,
 
24
  PeakFilter,
25
  Delay,
26
  Reverb,
27
  Distortion,
28
- Limiter,
29
- HighpassFilter,
30
- LowpassFilter
31
  )
32
- from pedalboard.io import AudioFile
33
 
34
 
35
  class EffectChain:
36
- """์˜ค๋””์˜ค ์ดํŽ™ํŠธ ์ฒ˜๋ฆฌ ์ฒด์ธ"""
37
 
38
- AVAILABLE_EFFECTS = [
39
- "eq_lowshelf",
40
- "eq_highshelf",
41
- "eq_peak1",
42
- "eq_peak2",
43
- "compressor",
44
- "distortion",
45
- "delay",
46
- "reverb",
47
- "limiter"
48
- ]
49
-
50
- def __init__(self):
51
- """์ดํŽ™ํŠธ ์ฒด์ธ ์ดˆ๊ธฐํ™”"""
52
- pass
53
 
54
  def get_available_effects(self) -> List[str]:
55
- """์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ์ดํŽ™ํŠธ ๋ชฉ๋ก ๋ฐ˜ํ™˜"""
56
- return self.AVAILABLE_EFFECTS.copy()
57
 
58
- def process(
59
- self,
60
- input_path: str,
61
- output_path: str,
62
- parameters: Dict[str, float]
63
- ) -> None:
64
- """
65
- ์˜ค๋””์˜ค ํŒŒ์ผ์— ์ดํŽ™ํŠธ ์ฒด์ธ ์ ์šฉ
66
-
67
- Args:
68
- input_path: ์ž…๋ ฅ ์˜ค๋””์˜ค ํŒŒ์ผ ๊ฒฝ๋กœ
69
- output_path: ์ถœ๋ ฅ ์˜ค๋””์˜ค ํŒŒ์ผ ๊ฒฝ๋กœ
70
- parameters: ์ดํŽ™ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ ๋”•์…”๋„ˆ๋ฆฌ
71
- """
72
- # ์˜ค๋””์˜ค ํŒŒ์ผ ์ฝ๊ธฐ
73
- audio, sample_rate = sf.read(input_path)
74
-
75
- # ๋ชจ๋…ธ๋ฉด ์Šคํ…Œ๋ ˆ์˜ค๋กœ ๋ณ€ํ™˜ (์ผ๋ถ€ ์ดํŽ™ํŠธ๊ฐ€ ์Šคํ…Œ๋ ˆ์˜ค ํ•„์š”)
76
- if len(audio.shape) == 1:
77
- audio = np.column_stack([audio, audio])
78
-
79
- # float32๋กœ ๋ณ€ํ™˜
80
- audio = audio.astype(np.float32)
81
 
82
- # ์ดํŽ™ํŠธ ์ฒด์ธ ๊ตฌ์„ฑ
83
- board = self._build_pedalboard(parameters, sample_rate)
84
-
85
- # ์ดํŽ™ํŠธ ์ ์šฉ
86
- processed = board(audio, sample_rate)
87
-
88
- # Wet/Dry ๋ฏน์Šค ์ ์šฉ
89
- wet_mix = parameters.get("final_wet_mix", 0.5)
90
- final_audio = (1 - wet_mix) * audio + wet_mix * processed
91
-
92
- # ํด๋ฆฌํ•‘ ๋ฐฉ์ง€
93
- final_audio = np.clip(final_audio, -1.0, 1.0)
94
-
95
- # ์ถœ๋ ฅ ํŒŒ์ผ ์ €์žฅ
96
- sf.write(output_path, final_audio, sample_rate)
97
-
98
- print(f"[EffectChain] ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {output_path}")
99
-
100
- def _build_pedalboard(
101
- self,
102
- params: Dict[str, float],
103
- sample_rate: int
104
- ) -> Pedalboard:
105
- """
106
- ํŒŒ๋ผ๋ฏธํ„ฐ๋กœ๋ถ€ํ„ฐ pedalboard ์ดํŽ™ํŠธ ์ฒด์ธ ๊ตฌ์„ฑ
107
- """
108
  effects = []
109
 
110
- # === EQ Section ===
111
-
112
- # Low Shelf EQ
113
- if params.get("eq_lowshelf_gain", 0) != 0:
114
- effects.append(
115
- LowShelfFilter(
116
- cutoff_frequency_hz=params.get("eq_lowshelf_freq", 200),
117
- gain_db=params.get("eq_lowshelf_gain", 0),
118
- q=0.707
119
- )
120
- )
121
-
122
- # High Shelf EQ
123
- if params.get("eq_highshelf_gain", 0) != 0:
124
- effects.append(
125
- HighShelfFilter(
126
- cutoff_frequency_hz=params.get("eq_highshelf_freq", 8000),
127
- gain_db=params.get("eq_highshelf_gain", 0),
128
- q=0.707
129
- )
130
- )
131
-
132
- # Peak EQ 1
133
- if params.get("eq_peak1_gain", 0) != 0:
134
- effects.append(
135
- PeakFilter(
136
- cutoff_frequency_hz=params.get("eq_peak1_freq", 1000),
137
- gain_db=params.get("eq_peak1_gain", 0),
138
- q=params.get("eq_peak1_q", 1.0)
139
- )
140
- )
141
-
142
- # Peak EQ 2
143
- if params.get("eq_peak2_gain", 0) != 0:
144
- effects.append(
145
- PeakFilter(
146
- cutoff_frequency_hz=params.get("eq_peak2_freq", 3000),
147
- gain_db=params.get("eq_peak2_gain", 0),
148
- q=params.get("eq_peak2_q", 1.0)
149
- )
150
- )
151
-
152
- # === Dynamics Section ===
153
-
154
- # Compressor
155
- threshold = params.get("compressor_threshold", -24)
156
- ratio = params.get("compressor_ratio", 4.0)
157
- if ratio > 1.0:
158
- effects.append(
159
- Compressor(
160
- threshold_db=threshold,
161
- ratio=ratio,
162
- attack_ms=params.get("compressor_attack", 5),
163
- release_ms=params.get("compressor_release", 50)
164
- )
165
- )
166
-
167
- # Makeup Gain
168
- makeup = params.get("compressor_makeup", 0)
169
- if makeup != 0:
170
- effects.append(Gain(gain_db=makeup))
171
-
172
- # === Distortion Section ===
173
-
174
- distortion_amount = params.get("distortion_amount", 0)
175
- if distortion_amount > 0:
176
- # pedalboard์˜ Distortion์€ 0-100 ๋ฒ”์œ„
177
- effects.append(
178
- Distortion(drive_db=distortion_amount * 40) # 0-1 -> 0-40dB
179
- )
180
-
181
- # Distortion ํ›„ ํ†ค ์กฐ์ ˆ (Tone = LPF)
182
- tone = params.get("distortion_tone", 0.5)
183
- lpf_freq = 2000 + tone * 10000 # 2kHz ~ 12kHz
184
- effects.append(
185
- LowpassFilter(cutoff_frequency_hz=lpf_freq)
186
- )
187
-
188
- # === Time-based Effects Section ===
189
 
190
  # Delay
191
- delay_mix = params.get("delay_mix", 0)
192
- if delay_mix > 0:
193
- delay_time_ms = params.get("delay_time", 250)
194
- effects.append(
195
- Delay(
196
- delay_seconds=delay_time_ms / 1000,
197
- feedback=params.get("delay_feedback", 0.3),
198
- mix=delay_mix
199
- )
200
- )
201
-
202
- # Reverb
203
- reverb_wet = params.get("reverb_wet_dry", 0)
204
- if reverb_wet > 0:
205
- effects.append(
206
- Reverb(
207
- room_size=params.get("reverb_room_size", 0.5),
208
- damping=params.get("reverb_damping", 0.5),
209
- wet_level=reverb_wet,
210
- dry_level=1 - reverb_wet,
211
- width=1.0
212
- )
213
- )
214
-
215
- # === Output Section ===
216
-
217
- # Limiter (ํด๋ฆฌํ•‘ ๋ฐฉ์ง€)
218
- effects.append(
219
- Limiter(
220
- threshold_db=-1.0,
221
- release_ms=100
222
- )
223
- )
224
 
225
  return Pedalboard(effects)
226
 
227
- def process_realtime(
228
- self,
229
- audio_chunk: np.ndarray,
230
- sample_rate: int,
231
  parameters: Dict[str, float]
232
- ) -> np.ndarray:
233
- """
234
- ์‹ค์‹œ๊ฐ„ ์˜ค๋””์˜ค ์ฒญํฌ ์ฒ˜๋ฆฌ (์ŠคํŠธ๋ฆฌ๋ฐ์šฉ)
235
-
236
- Args:
237
- audio_chunk: ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ ๋ฐฐ์—ด
238
- sample_rate: ์ƒ˜ํ”Œ๋ ˆ์ดํŠธ
239
- parameters: ์ดํŽ™ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ
240
 
241
- Returns:
242
- ์ฒ˜๋ฆฌ๋œ ์˜ค๋””์˜ค ์ฒญํฌ
243
- """
244
- if len(audio_chunk.shape) == 1:
245
- audio_chunk = np.column_stack([audio_chunk, audio_chunk])
246
-
247
- audio_chunk = audio_chunk.astype(np.float32)
248
-
249
- board = self._build_pedalboard(parameters, sample_rate)
250
- processed = board(audio_chunk, sample_rate)
251
-
252
- wet_mix = parameters.get("final_wet_mix", 0.5)
253
- final = (1 - wet_mix) * audio_chunk + wet_mix * processed
254
-
255
- return np.clip(final, -1.0, 1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Effect Chain - Pedalboard 기반 오디오 이펙트 처리
3
+ =================================================
 
 
 
 
 
 
4
  """
5
 
6
  import numpy as np
 
 
7
  import soundfile as sf
8
+ from typing import Dict, List, Optional
 
9
  from pedalboard import (
10
+ Pedalboard,
11
+ Compressor,
12
  Gain,
 
13
  HighShelfFilter,
14
+ LowShelfFilter,
15
  PeakFilter,
16
  Delay,
17
  Reverb,
18
  Distortion,
19
+ Limiter
 
 
20
  )
 
21
 
22
 
23
  class EffectChain:
24
+ """Pedalboard 기반 이펙트 체인"""
25
 
26
+ def __init__(self, sample_rate: int = 44100):
27
+ self.sample_rate = sample_rate
28
+
29
+ self.available_effects = [
30
+ "eq_peak1", "eq_peak2",
31
+ "eq_lowshelf", "eq_highshelf",
32
+ "distortion", "delay", "compressor",
33
+ "reverb", "limiter"
34
+ ]
 
 
 
 
 
 
35
 
36
  def get_available_effects(self) -> List[str]:
37
+ """사용 가능한 이펙트 목록"""
38
+ return self.available_effects
39
 
40
+ def _build_pedalboard(self, params: Dict[str, float]) -> Pedalboard:
41
+ """파라미터로 Pedalboard 구성"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  effects = []
44
 
45
+ # Compressor (항상 적용)
46
+ effects.append(Compressor(
47
+ threshold_db=-18.0,
48
+ ratio=2.0,
49
+ attack_ms=10.0,
50
+ release_ms=100.0
51
+ ))
52
+
53
+ # EQ Peak 1
54
+ freq1 = params.get("eq_peak1.params.freq", 1000.0)
55
+ gain1 = params.get("eq_peak1.params.gain", 0.0)
56
+ q1 = params.get("eq_peak1.params.q", 1.0)
57
+ if abs(gain1) > 0.1:
58
+ effects.append(PeakFilter(
59
+ cutoff_frequency_hz=max(20, min(20000, freq1)),
60
+ gain_db=max(-12, min(12, gain1)),
61
+ q=max(0.1, min(10, q1))
62
+ ))
63
+
64
+ # EQ Peak 2
65
+ freq2 = params.get("eq_peak2.params.freq", 4000.0)
66
+ gain2 = params.get("eq_peak2.params.gain", 0.0)
67
+ q2 = params.get("eq_peak2.params.q", 1.0)
68
+ if abs(gain2) > 0.1:
69
+ effects.append(PeakFilter(
70
+ cutoff_frequency_hz=max(20, min(20000, freq2)),
71
+ gain_db=max(-12, min(12, gain2)),
72
+ q=max(0.1, min(10, q2))
73
+ ))
74
+
75
+ # Low Shelf
76
+ freq_low = params.get("eq_lowshelf.params.freq", 200.0)
77
+ gain_low = params.get("eq_lowshelf.params.gain", 0.0)
78
+ if abs(gain_low) > 0.1:
79
+ effects.append(LowShelfFilter(
80
+ cutoff_frequency_hz=max(20, min(2000, freq_low)),
81
+ gain_db=max(-12, min(12, gain_low)),
82
+ q=0.707
83
+ ))
84
+
85
+ # High Shelf
86
+ freq_high = params.get("eq_highshelf.params.freq", 8000.0)
87
+ gain_high = params.get("eq_highshelf.params.gain", 0.0)
88
+ if abs(gain_high) > 0.1:
89
+ effects.append(HighShelfFilter(
90
+ cutoff_frequency_hz=max(1000, min(20000, freq_high)),
91
+ gain_db=max(-12, min(12, gain_high)),
92
+ q=0.707
93
+ ))
94
+
95
+ # Distortion
96
+ dist_amount = params.get("distortion_amount", 0.0)
97
+ if dist_amount > 0.01:
98
+ effects.append(Distortion(
99
+ drive_db=max(0, min(20, dist_amount * 100))
100
+ ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  # Delay
103
+ delay_time = params.get("delay.delay_time", 0.02)
104
+ delay_feedback = params.get("delay.feedback", 0.3)
105
+ delay_mix = params.get("delay.mix", 0.2)
106
+ if delay_mix > 0.01:
107
+ effects.append(Delay(
108
+ delay_seconds=max(0.01, min(1.0, delay_time)),
109
+ feedback=max(0.0, min(0.9, delay_feedback)),
110
+ mix=max(0.0, min(1.0, delay_mix))
111
+ ))
112
+
113
+ # Limiter (항상 마지막에)
114
+ effects.append(Limiter(threshold_db=-1.0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  return Pedalboard(effects)
117
 
118
+ def process(
119
+ self,
120
+ input_path: str,
121
+ output_path: str,
122
  parameters: Dict[str, float]
123
+ ) -> bool:
124
+ """오디오 파일 처리"""
125
+ try:
126
+ # 오디오 로드
127
+ audio, sr = sf.read(input_path)
 
 
 
128
 
129
+ # 모노/스테레오 처리
130
+ if len(audio.shape) == 1:
131
+ audio = audio.reshape(-1, 1)
132
+
133
+ # float32로 변환
134
+ audio = audio.astype(np.float32)
135
+
136
+ # Pedalboard 구성
137
+ board = self._build_pedalboard(parameters)
138
+
139
+ # 처리
140
+ processed = board(audio, sr)
141
+
142
+ # Wet/Dry 믹스
143
+ wet_mix = parameters.get("final_wet_mix", 0.5)
144
+ wet_mix = max(0.0, min(1.0, wet_mix))
145
+
146
+ # 길이 맞추기
147
+ min_len = min(len(audio), len(processed))
148
+ output = audio[:min_len] * (1 - wet_mix) + processed[:min_len] * wet_mix
149
+
150
+ # 클리핑 방지
151
+ output = np.clip(output, -1.0, 1.0)
152
+
153
+ # 저장
154
+ sf.write(output_path, output, sr)
155
+
156
+ print(f"[EffectChain] ✅ 처리 완료: {output_path}")
157
+ return True
158
+
159
+ except Exception as e:
160
+ print(f"[EffectChain] ❌ 처리 실패: {e}")
161
+ raise e
main.py CHANGED
@@ -11,6 +11,14 @@ import tempfile
11
  import os
12
  import uuid
13
  import base64
 
 
 
 
 
 
 
 
14
 
15
  # ๋‚ด๋ถ€ ๋ชจ๋“ˆ
16
  from models.ai_effector import AIEffector
@@ -20,13 +28,14 @@ from audio_processing.effect_chain import EffectChain
20
  # ์„ค์ •
21
  # ============================================
22
 
23
- # ํ•™์Šต๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ - checkpoints ํด๋” ํฌํ•จ!
24
- MODEL_PATH = os.environ.get("DIFFVOX_MODEL_PATH", "heybaeheef/KU_SW_Academy/checkpoints")
 
25
  BASE_MODEL_NAME = os.environ.get("BASE_MODEL_NAME", "Qwen/Qwen3-8B")
26
  AUDIO_FEATURE_DIM = int(os.environ.get("AUDIO_FEATURE_DIM", "64"))
27
  USE_HUGGINGFACE = os.environ.get("USE_HUGGINGFACE", "true").lower() == "true"
28
 
29
- # ์ž„์‹œ ํŒŒ์ผ ์ €์žฅ ๊ฒฝ๋กœ - ๋จผ์ € ์ •์˜
30
  TEMP_DIR = Path(tempfile.gettempdir()) / "magicpath"
31
  TEMP_DIR.mkdir(exist_ok=True)
32
 
@@ -53,14 +62,16 @@ app.add_middleware(
53
  print("=" * 60)
54
  print("MagicPath AI Vocal Effects Server v2.0")
55
  print("=" * 60)
56
- print(f"Model Path: {MODEL_PATH}")
 
57
  print(f"Base Model: {BASE_MODEL_NAME}")
58
  print(f"Audio Feature Dim: {AUDIO_FEATURE_DIM}")
59
  print(f"Use Hugging Face: {USE_HUGGINGFACE}")
60
  print("=" * 60)
61
 
62
  ai_effector = AIEffector(
63
- model_path=MODEL_PATH,
 
64
  base_model_name=BASE_MODEL_NAME,
65
  audio_feature_dim=AUDIO_FEATURE_DIM,
66
  use_huggingface=USE_HUGGINGFACE
@@ -79,7 +90,8 @@ async def root():
79
  "status": "running",
80
  "message": "MagicPath AI Vocal Effects Server v2.0 (DiffVox LLM)",
81
  "ai_model_loaded": ai_effector.is_loaded(),
82
- "model_path": MODEL_PATH,
 
83
  "endpoints": {
84
  "POST /process": "์˜ค๋””์˜ค ํŒŒ์ผ ์ฒ˜๋ฆฌ ํ›„ ๋ฐ˜ํ™˜",
85
  "POST /predict": "ํŒŒ๋ผ๋ฏธํ„ฐ๋งŒ ์˜ˆ์ธก (JSON)",
@@ -96,7 +108,7 @@ async def health_check():
96
  "status": "healthy",
97
  "ai_model_loaded": ai_effector.is_loaded(),
98
  "supported_effects": effect_chain.get_available_effects(),
99
- "model_path": MODEL_PATH,
100
  "base_model": BASE_MODEL_NAME
101
  }
102
 
@@ -128,6 +140,7 @@ async def predict_parameters(
128
  })
129
 
130
  except Exception as e:
 
131
  raise HTTPException(status_code=500, detail=str(e))
132
 
133
 
@@ -170,6 +183,7 @@ async def process_audio(
170
  )
171
 
172
  except Exception as e:
 
173
  if input_path and Path(input_path).exists():
174
  os.remove(input_path)
175
  if output_path and Path(output_path).exists():
@@ -223,6 +237,7 @@ async def process_audio_with_params(
223
  })
224
 
225
  except Exception as e:
 
226
  if input_path and Path(input_path).exists():
227
  os.remove(input_path)
228
  if output_path and Path(output_path).exists():
 
11
  import os
12
  import uuid
13
  import base64
14
+ import logging
15
+ from datetime import datetime
16
+
17
+ # 로깅 설정
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+ print(f"\n===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====\n")
22
 
23
  # ๋‚ด๋ถ€ ๋ชจ๋“ˆ
24
  from models.ai_effector import AIEffector
 
28
  # ์„ค์ •
29
  # ============================================
30
 
31
+ # ํ•™์Šต๋œ ๋ชจ๋ธ ๊ฒฝ๋กœ - repo_id์™€ subfolder ๋ถ„๋ฆฌ!
32
+ MODEL_REPO_ID = os.environ.get("DIFFVOX_MODEL_REPO", "heybaeheef/KU_SW_Academy")
33
+ MODEL_SUBFOLDER = os.environ.get("DIFFVOX_MODEL_SUBFOLDER", "checkpoints")
34
  BASE_MODEL_NAME = os.environ.get("BASE_MODEL_NAME", "Qwen/Qwen3-8B")
35
  AUDIO_FEATURE_DIM = int(os.environ.get("AUDIO_FEATURE_DIM", "64"))
36
  USE_HUGGINGFACE = os.environ.get("USE_HUGGINGFACE", "true").lower() == "true"
37
 
38
+ # 임시 파일 저장 경로
39
  TEMP_DIR = Path(tempfile.gettempdir()) / "magicpath"
40
  TEMP_DIR.mkdir(exist_ok=True)
41
 
 
62
  print("=" * 60)
63
  print("MagicPath AI Vocal Effects Server v2.0")
64
  print("=" * 60)
65
+ print(f"Model Repo: {MODEL_REPO_ID}")
66
+ print(f"Model Subfolder: {MODEL_SUBFOLDER}")
67
  print(f"Base Model: {BASE_MODEL_NAME}")
68
  print(f"Audio Feature Dim: {AUDIO_FEATURE_DIM}")
69
  print(f"Use Hugging Face: {USE_HUGGINGFACE}")
70
  print("=" * 60)
71
 
72
  ai_effector = AIEffector(
73
+ model_repo_id=MODEL_REPO_ID,
74
+ model_subfolder=MODEL_SUBFOLDER,
75
  base_model_name=BASE_MODEL_NAME,
76
  audio_feature_dim=AUDIO_FEATURE_DIM,
77
  use_huggingface=USE_HUGGINGFACE
 
90
  "status": "running",
91
  "message": "MagicPath AI Vocal Effects Server v2.0 (DiffVox LLM)",
92
  "ai_model_loaded": ai_effector.is_loaded(),
93
+ "model_repo": MODEL_REPO_ID,
94
+ "model_subfolder": MODEL_SUBFOLDER,
95
  "endpoints": {
96
  "POST /process": "์˜ค๋””์˜ค ํŒŒ์ผ ์ฒ˜๋ฆฌ ํ›„ ๋ฐ˜ํ™˜",
97
  "POST /predict": "ํŒŒ๋ผ๋ฏธํ„ฐ๋งŒ ์˜ˆ์ธก (JSON)",
 
108
  "status": "healthy",
109
  "ai_model_loaded": ai_effector.is_loaded(),
110
  "supported_effects": effect_chain.get_available_effects(),
111
+ "model_repo": MODEL_REPO_ID,
112
  "base_model": BASE_MODEL_NAME
113
  }
114
 
 
140
  })
141
 
142
  except Exception as e:
143
+ logger.error(f"Predict error: {e}")
144
  raise HTTPException(status_code=500, detail=str(e))
145
 
146
 
 
183
  )
184
 
185
  except Exception as e:
186
+ logger.error(f"Process error: {e}")
187
  if input_path and Path(input_path).exists():
188
  os.remove(input_path)
189
  if output_path and Path(output_path).exists():
 
237
  })
238
 
239
  except Exception as e:
240
+ logger.error(f"Process with params error: {e}")
241
  if input_path and Path(input_path).exists():
242
  os.remove(input_path)
243
  if output_path and Path(output_path).exists():
models/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # models package
2
+ from .ai_effector import AIEffector
3
+
4
+ __all__ = ["AIEffector"]
models/ai_effector.py CHANGED
@@ -1,269 +1,169 @@
1
  """
2
- AI Effector Model - DiffVox LLM ํ†ตํ•ฉ ๋ฒ„์ „
3
- ==========================================
4
- CLAP ์ธ์ฝ”๋” + ํ•™์Šต๋œ LLM์„ ์‚ฌ์šฉํ•˜์—ฌ ์˜ค๋””์˜ค์—์„œ ์ดํŽ™ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ์˜ˆ์ธก
5
-
6
- DiffVox LLM ํŒŒ๋ผ๋ฏธํ„ฐ โ†’ MagicPath ์›น ํŒŒ๋ผ๋ฏธํ„ฐ ์ž๋™ ๋ณ€ํ™˜
7
  """
8
 
 
9
  import json
10
  import re
11
- import os
12
- from pathlib import Path
13
- from typing import Dict, Any, Optional
14
  import torch
 
 
 
 
15
 
16
- # AI ๋ชจ๋ธ ๊ด€๋ จ import (์„ค์น˜ ํ•„์š”)
17
- try:
18
- from transformers import AutoModelForCausalLM, AutoTokenizer
19
- from peft import PeftModel
20
- TRANSFORMERS_AVAILABLE = True
21
- except ImportError:
22
- TRANSFORMERS_AVAILABLE = False
23
- print("[AIEffector] transformers/peft ๋ฏธ์„ค์น˜ - ํ”„๋ฆฌ์…‹ ๋ชจ๋“œ๋กœ ๋™์ž‘")
24
-
25
- # CLAP ์ธ์ฝ”๋” (๋ณ„๋„ ํŒŒ์ผ)
26
- try:
27
- from models.audio_encoder import AudioEncoder
28
- AUDIO_ENCODER_AVAILABLE = True
29
- except ImportError:
30
- AUDIO_ENCODER_AVAILABLE = False
31
- print("[AIEffector] AudioEncoder ๋ฏธ์„ค์น˜ - ํ”„๋ฆฌ์…‹ ๋ชจ๋“œ๋กœ ๋™์ž‘")
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- class ParameterMapper:
35
- """DiffVox LLM ํŒŒ๋ผ๋ฏธํ„ฐ โ†” MagicPath ์›น ํŒŒ๋ผ๋ฏธํ„ฐ ๋ณ€ํ™˜"""
36
-
37
- # DiffVox LLM โ†’ MagicPath ์›น ๋งคํ•‘
38
- DIFFVOX_TO_WEB = {
39
- # EQ Low Shelf
40
- "eq_lowshelf.params.gain": "eq_lowshelf_gain",
41
- "eq_lowshelf.params.parametrizations.freq.original": "eq_lowshelf_freq",
42
- # EQ High Shelf
43
- "eq_highshelf.params.gain": "eq_highshelf_gain",
44
- "eq_highshelf.params.parametrizations.freq.original": "eq_highshelf_freq",
45
- # EQ Peak 1
46
- "eq_peak1.params.gain": "eq_peak1_gain",
47
- "eq_peak1.params.parametrizations.freq.original": "eq_peak1_freq",
48
- "eq_peak1.params.parametrizations.Q.original": "eq_peak1_q",
49
- # EQ Peak 2
50
- "eq_peak2.params.gain": "eq_peak2_gain",
51
- "eq_peak2.params.parametrizations.freq.original": "eq_peak2_freq",
52
- "eq_peak2.params.parametrizations.Q.original": "eq_peak2_q",
53
- # Delay
54
- "delay.delay_time": "delay_time",
55
- "delay.feedback": "delay_feedback",
56
- "delay.mix": "delay_mix",
57
- # Distortion
58
- "distortion_amount": "distortion_amount",
59
- # Master
60
- "final_wet_mix": "final_wet_mix",
61
- }
62
-
63
- # ์—ญ๋ฐฉํ–ฅ ๋งคํ•‘
64
- WEB_TO_DIFFVOX = {v: k for k, v in DIFFVOX_TO_WEB.items()}
65
-
66
- # ๊ฐ’ ๋ณ€ํ™˜ ๊ทœ์น™ (์ •๊ทœํ™”๋œ ๊ฐ’ โ†’ ์‹ค์ œ ๊ฐ’)
67
- VALUE_TRANSFORMS = {
68
- # EQ gain: -1~1 โ†’ -12~12 dB
69
- "eq_lowshelf_gain": lambda x: x * 12,
70
- "eq_highshelf_gain": lambda x: x * 12,
71
- "eq_peak1_gain": lambda x: x * 12,
72
- "eq_peak2_gain": lambda x: x * 12,
73
- # EQ freq: ์ •๊ทœํ™”๋œ ๊ฐ’ โ†’ Hz (๋กœ๊ทธ ์Šค์ผ€์ผ ์—ญ๋ณ€ํ™˜ ํ•„์š”ํ•  ์ˆ˜ ์žˆ์Œ)
74
- "eq_lowshelf_freq": lambda x: 20 * (20000/20) ** ((x + 1) / 2), # -1~1 โ†’ 20~20000
75
- "eq_highshelf_freq": lambda x: 20 * (20000/20) ** ((x + 1) / 2),
76
- "eq_peak1_freq": lambda x: 20 * (20000/20) ** ((x + 1) / 2),
77
- "eq_peak2_freq": lambda x: 20 * (20000/20) ** ((x + 1) / 2),
78
- # Q: -1~1 โ†’ 0.1~10
79
- "eq_peak1_q": lambda x: 0.1 * (10/0.1) ** ((x + 1) / 2),
80
- "eq_peak2_q": lambda x: 0.1 * (10/0.1) ** ((x + 1) / 2),
81
- # Delay time: -1~1 โ†’ 0~1000 ms
82
- "delay_time": lambda x: (x + 1) / 2 * 1000,
83
- # Delay feedback: -1~1 โ†’ 0~1
84
- "delay_feedback": lambda x: (x + 1) / 2,
85
- # Delay mix: -1~1 โ†’ 0~1
86
- "delay_mix": lambda x: (x + 1) / 2,
87
- # Distortion: -1~1 โ†’ 0~1
88
- "distortion_amount": lambda x: (x + 1) / 2,
89
- # Wet mix: -1~1 โ†’ 0~1
90
- "final_wet_mix": lambda x: (x + 1) / 2,
91
  }
92
-
93
- @classmethod
94
- def diffvox_to_web(cls, diffvox_params: Dict[str, float]) -> Dict[str, float]:
95
- """DiffVox LLM ์ถœ๋ ฅ โ†’ MagicPath ์›น ํŒŒ๋ผ๋ฏธํ„ฐ"""
96
- web_params = {}
97
-
98
- for diffvox_key, value in diffvox_params.items():
99
- # ํ‚ค ๋ณ€ํ™˜
100
- if diffvox_key in cls.DIFFVOX_TO_WEB:
101
- web_key = cls.DIFFVOX_TO_WEB[diffvox_key]
102
- else:
103
- # ๋งคํ•‘์— ์—†์œผ๋ฉด ์Šคํ‚ต
104
- continue
105
-
106
- # ๊ฐ’ ๋ณ€ํ™˜
107
- if web_key in cls.VALUE_TRANSFORMS:
108
- try:
109
- web_params[web_key] = cls.VALUE_TRANSFORMS[web_key](value)
110
- except:
111
- web_params[web_key] = value
112
- else:
113
- web_params[web_key] = value
114
-
115
- return web_params
116
 
117
 
118
- class ParameterParser:
119
- """LLM ์ถœ๋ ฅ์—์„œ ํŒŒ๋ผ๋ฏธํ„ฐ JSON ์ถ”์ถœ"""
120
 
121
- @staticmethod
122
- def parse(llm_output: str) -> Optional[Dict]:
123
- """LLM ์ถœ๋ ฅ์—์„œ ํŒŒ๋ผ๋ฏธํ„ฐ ๋”•์…”๋„ˆ๋ฆฌ ์ถ”์ถœ"""
124
-
125
- # ๋ฐฉ๋ฒ• 1: JSON ๋ธ”๋ก ์ฐพ๊ธฐ
126
- json_patterns = [
127
- r'\{[^{}]*\}',
128
- r'\{(?:[^{}]|\{[^{}]*\})*\}',
129
- ]
130
-
131
- for pattern in json_patterns:
132
- matches = re.findall(pattern, llm_output, re.DOTALL)
133
- for match in matches:
134
- try:
135
- params = json.loads(match)
136
- if isinstance(params, dict) and len(params) > 0:
137
- return params
138
- except json.JSONDecodeError:
139
- continue
140
 
141
- # ๋ฐฉ๋ฒ• 2: key: value ํŒจํ„ด ํŒŒ์‹ฑ
142
- param_pattern = r'"([^"]+)":\s*([-\d.]+)'
143
- matches = re.findall(param_pattern, llm_output)
144
- if matches:
145
- params = {}
146
- for key, value in matches:
147
- try:
148
- params[key] = float(value)
149
- except ValueError:
150
- params[key] = value
151
- if params:
152
- return params
153
-
154
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
 
157
  class AIEffector:
158
- """AI ๊ธฐ๋ฐ˜ ์ดํŽ™ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ ์˜ˆ์ธก ๋ชจ๋ธ - DiffVox LLM ํ†ตํ•ฉ"""
159
-
160
- # ๊ธฐ๋ณธ ํŒŒ๋ผ๋ฏธํ„ฐ
161
- DEFAULT_PARAMS = {
162
- "eq_lowshelf_gain": 0.0,
163
- "eq_lowshelf_freq": 200,
164
- "eq_highshelf_gain": 0.0,
165
- "eq_highshelf_freq": 8000,
166
- "eq_peak1_gain": 0.0,
167
- "eq_peak1_freq": 1000,
168
- "eq_peak1_q": 1.0,
169
- "eq_peak2_gain": 0.0,
170
- "eq_peak2_freq": 3000,
171
- "eq_peak2_q": 1.0,
172
- "compressor_threshold": -24,
173
- "compressor_ratio": 4.0,
174
- "compressor_attack": 5,
175
- "compressor_release": 50,
176
- "compressor_makeup": 0.0,
177
- "distortion_amount": 0.0,
178
- "distortion_tone": 0.5,
179
- "delay_time": 250,
180
- "delay_feedback": 0.3,
181
- "delay_mix": 0.0,
182
- "reverb_room_size": 0.5,
183
- "reverb_damping": 0.5,
184
- "reverb_wet_dry": 0.0,
185
- "final_wet_mix": 0.5
186
- }
187
-
188
- # ํ”„๋ฆฌ์…‹ (fallback์šฉ)
189
- PRESETS = {
190
- "warm": {
191
- "eq_lowshelf_gain": 5.5,
192
- "eq_lowshelf_freq": 200,
193
- "eq_highshelf_gain": -1.5,
194
- "eq_highshelf_freq": 8000,
195
- "eq_peak1_gain": 2.0,
196
- "eq_peak1_freq": 400,
197
- "eq_peak1_q": 1.0,
198
- "compressor_threshold": -18,
199
- "compressor_ratio": 3.0,
200
- "distortion_amount": 0.05,
201
- "reverb_room_size": 0.4,
202
- "reverb_wet_dry": 0.15,
203
- "final_wet_mix": 0.5
204
- },
205
- "bright": {
206
- "eq_lowshelf_gain": -2.0,
207
- "eq_lowshelf_freq": 150,
208
- "eq_highshelf_gain": 4.0,
209
- "eq_highshelf_freq": 6000,
210
- "eq_peak1_gain": 1.0,
211
- "eq_peak1_freq": 3000,
212
- "compressor_threshold": -20,
213
- "compressor_ratio": 6.0,
214
- "reverb_room_size": 0.3,
215
- "reverb_wet_dry": 0.1,
216
- "final_wet_mix": 0.5
217
- },
218
- }
219
 
220
  def __init__(
221
- self,
222
- model_path: Optional[str] = None,
 
223
  base_model_name: str = "Qwen/Qwen3-8B",
224
  audio_feature_dim: int = 64,
225
  use_huggingface: bool = True
226
  ):
227
- """
228
- AI ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
 
 
 
229
 
230
- Args:
231
- model_path: ํ•™์Šต๋œ LoRA ๋ชจ๋ธ ๊ฒฝ๋กœ (๋กœ์ปฌ ๋˜๋Š” Hugging Face ๋ ˆํฌ)
232
- base_model_name: ๋ฒ ์ด์Šค LLM ๋ชจ๋ธ ์ด๋ฆ„
233
- audio_feature_dim: ์˜ค๋””์˜ค ํŠน์ง• ์ฐจ์› (CLAP ์ถœ๋ ฅ)
234
- use_huggingface: True๋ฉด model_path๋ฅผ Hugging Face ๋ ˆํฌ๋กœ ๊ฐ„์ฃผ
235
- """
236
  self.model = None
237
  self.tokenizer = None
238
- self.audio_encoder = None
239
- self.model_loaded = False
240
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
241
 
242
- self.base_model_name = base_model_name
243
- self.audio_feature_dim = audio_feature_dim
244
- self.use_huggingface = use_huggingface
245
 
246
- if model_path:
247
- self._load_model(model_path)
248
 
249
- def _load_model(self, model_path: str):
250
- """ํ•™์Šต๋œ LoRA ๋ชจ๋ธ ๋กœ๋“œ (๋กœ์ปฌ ๋˜๋Š” Hugging Face)"""
251
- if not TRANSFORMERS_AVAILABLE:
252
- print("[AIEffector] transformers/peft ๋ฏธ์„ค์น˜")
253
- return
254
-
255
- # ๋กœ์ปฌ ๊ฒฝ๋กœ์ธ์ง€ Hugging Face ๋ ˆํฌ์ธ์ง€ ํ™•์ธ
256
- is_local = os.path.exists(model_path)
257
-
258
- if not is_local and not self.use_huggingface:
259
- print(f"[AIEffector] ๋กœ์ปฌ ๋ชจ๋ธ ๊ฒฝ๋กœ ์—†์Œ: {model_path}")
260
- return
261
-
262
  try:
263
- if self.use_huggingface and not is_local:
264
- print(f"[AIEffector] Hugging Face์—์„œ ๋ชจ๋ธ ๋กœ๋”ฉ: {model_path}")
265
- else:
266
- print(f"[AIEffector] ๋กœ์ปฌ ๋ชจ๋ธ ๋กœ๋”ฉ: {model_path}")
 
 
 
267
 
268
  # ํ† ํฌ๋‚˜์ด์ € ๋กœ๋“œ
269
  self.tokenizer = AutoTokenizer.from_pretrained(
@@ -276,129 +176,155 @@ class AIEffector:
276
  # ๋ฒ ์ด์Šค ๋ชจ๋ธ ๋กœ๋“œ
277
  base_model = AutoModelForCausalLM.from_pretrained(
278
  self.base_model_name,
279
- torch_dtype=torch.bfloat16,
280
- device_map="auto",
281
  trust_remote_code=True,
 
282
  )
283
 
284
- # LoRA ์–ด๋Œ‘ํ„ฐ ์ ์šฉ (Hugging Face ๋ ˆํฌ ๋˜๋Š” ๋กœ์ปฌ ๊ฒฝ๋กœ)
285
- self.model = PeftModel.from_pretrained(
286
- base_model,
287
- model_path, # Hugging Face ๋ ˆํฌ ์ด๋ฆ„ ๋˜๋Š” ๋กœ์ปฌ ๊ฒฝ๋กœ
288
- is_trainable=False
289
- )
290
- self.model.eval()
291
-
292
- # ์˜ค๋””์˜ค ์ธ์ฝ”๋” ๋กœ๋“œ
293
- if AUDIO_ENCODER_AVAILABLE:
294
- self.audio_encoder = AudioEncoder(
295
- output_dim=self.audio_feature_dim,
296
- reduction_method="pool"
 
 
 
 
297
  )
298
- print("[AIEffector] AudioEncoder ๋กœ๋“œ ์™„๋ฃŒ")
299
 
300
- self.model_loaded = True
301
- print("[AIEffector] โœ… ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ")
302
 
303
  except Exception as e:
304
  print(f"[AIEffector] โŒ ๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ: {e}")
305
- import traceback
306
- traceback.print_exc()
307
- self.model_loaded = False
308
 
309
  def is_loaded(self) -> bool:
310
- """AI ๋ชจ๋ธ ๋กœ๋“œ ์ƒํƒœ ํ™•์ธ"""
311
- return self.model_loaded
312
 
313
- def predict(self, audio_path: str, text_prompt: str) -> Dict[str, float]:
314
- """
315
- ์˜ค๋””์˜ค์™€ ํ…์ŠคํŠธ๋กœ๋ถ€ํ„ฐ ์ดํŽ™ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ ์˜ˆ์ธก
 
316
 
317
- Args:
318
- audio_path: ์ž…๋ ฅ ์˜ค๋””์˜ค ํŒŒ์ผ ๊ฒฝ๋กœ
319
- text_prompt: ์‚ฌ์šฉ์ž ํ…์ŠคํŠธ ๋ช…๋ น
320
-
321
- Returns:
322
- MagicPath ์›น ํ˜•์‹์˜ ์ดํŽ™ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ ๋”•์…”๋„ˆ๋ฆฌ
323
- """
324
- if self.model_loaded and self.audio_encoder:
325
- return self._predict_with_model(audio_path, text_prompt)
326
- else:
327
- return self._predict_with_preset(text_prompt)
328
 
329
- def _predict_with_model(self, audio_path: str, text_prompt: str) -> Dict[str, float]:
330
- """ํ•™์Šต๋œ DiffVox LLM์œผ๋กœ ์ถ”๋ก """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  try:
332
- # 1. ์˜ค๋””์˜ค ํŠน์ง• ์ถ”์ถœ
333
  audio_features = self.audio_encoder.get_audio_features(audio_path)
334
- if not audio_features:
335
- print("[AIEffector] ์˜ค๋””์˜ค ํŠน์ง• ์ถ”์ถœ ์‹คํŒจ, ํ”„๋ฆฌ์…‹ ์‚ฌ์šฉ")
336
- return self._predict_with_preset(text_prompt)
337
 
338
- # 2. ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ (train_model.py์™€ ๋™์ผํ•œ ํ˜•์‹)
339
- audio_state_str = json.dumps(audio_features)
340
- prompt = f"""Task: Convert text to audio parameters.
341
- Audio: {audio_state_str}
342
- Text: {text_prompt}
343
- Parameters:"""
344
 
345
- # 3. LLM ์ถ”๋ก 
346
  inputs = self.tokenizer(
347
  prompt,
348
  return_tensors="pt",
349
  truncation=True,
350
- max_length=1500
351
  ).to(self.device)
352
 
 
353
  with torch.no_grad():
354
  outputs = self.model.generate(
355
  **inputs,
356
- max_new_tokens=500,
357
- temperature=0.1,
358
  do_sample=False,
359
- pad_token_id=self.tokenizer.eos_token_id,
 
360
  )
361
 
362
- generated_text = self.tokenizer.decode(
363
- outputs[0][inputs['input_ids'].shape[1]:],
364
- skip_special_tokens=True
365
- ).strip()
366
-
367
- print(f"[AIEffector] LLM ์ถœ๋ ฅ: {generated_text[:200]}...")
368
-
369
- # 4. ํŒŒ๋ผ๋ฏธํ„ฐ ํŒŒ์‹ฑ
370
- diffvox_params = ParameterParser.parse(generated_text)
371
-
372
- if not diffvox_params:
373
- print("[AIEffector] ํŒŒ๋ผ๋ฏธํ„ฐ ํŒŒ์‹ฑ ์‹คํŒจ, ํ”„๋ฆฌ์…‹ ์‚ฌ์šฉ")
374
- return self._predict_with_preset(text_prompt)
375
-
376
- # 5. DiffVox โ†’ Web ํŒŒ๋ผ๋ฏธํ„ฐ ๋ณ€ํ™˜
377
- web_params = ParameterMapper.diffvox_to_web(diffvox_params)
378
 
379
- # 6. ๊ธฐ๋ณธ๊ฐ’๊ณผ ๋ณ‘ํ•ฉ
380
- result = self.DEFAULT_PARAMS.copy()
381
- result.update(web_params)
382
 
383
- print(f"[AIEffector] โœ… AI ํŒŒ๋ผ๋ฏธํ„ฐ ์ƒ์„ฑ ์™„๋ฃŒ: {len(web_params)}๊ฐœ ํŒŒ๋ผ๋ฏธํ„ฐ")
384
- return result
385
 
386
  except Exception as e:
387
- print(f"[AIEffector] ์ถ”๋ก  ์—๋Ÿฌ: {e}")
388
- import traceback
389
- traceback.print_exc()
390
- return self._predict_with_preset(text_prompt)
391
-
392
- def _predict_with_preset(self, text_prompt: str) -> Dict[str, float]:
393
- """ํ”„๋ฆฌ์…‹ ๊ธฐ๋ฐ˜ ํŒŒ๋ผ๋ฏธํ„ฐ ๋ฐ˜ํ™˜ (fallback)"""
394
- prompt_lower = text_prompt.lower()
395
-
396
- for preset_name, preset_params in self.PRESETS.items():
397
- if preset_name in prompt_lower:
398
- print(f"[AIEffector] ํ”„๋ฆฌ์…‹ ๋งค์นญ: '{preset_name}'")
399
- result = self.DEFAULT_PARAMS.copy()
400
- result.update(preset_params)
401
- return result
402
-
403
- print("[AIEffector] ํ”„๋ฆฌ์…‹ ๋งค์นญ ์‹คํŒจ, ๊ธฐ๋ณธ๊ฐ’ ๋ฐ˜ํ™˜")
404
- return self.DEFAULT_PARAMS.copy()
 
1
  """
2
+ AI Effector - DiffVox LLM 기반 이펙트 파라미터 예측
3
+ ===================================================
 
 
 
4
  """
5
 
6
+ import os
7
  import json
8
  import re
 
 
 
9
  import torch
10
+ import numpy as np
11
+ from typing import Dict, List, Optional, Any
12
+ from pathlib import Path
13
+ import warnings
14
 
15
+ warnings.filterwarnings("ignore")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ # 기본 파라미터 (모델 로드 실패 시 사용)
18
# Baseline effect parameters. Presets and parsed model output are both applied
# on top of a copy of this dict, so its key set defines which parameter names
# are recognised.
DEFAULT_PARAMETERS = {
    # Peaking EQ band 1
    "eq_peak1.params.freq": 1000.0,
    "eq_peak1.params.gain": 0.0,
    "eq_peak1.params.q": 1.0,
    # Peaking EQ band 2
    "eq_peak2.params.freq": 4000.0,
    "eq_peak2.params.gain": 0.0,
    "eq_peak2.params.q": 1.0,
    # Low-shelf EQ
    "eq_lowshelf.params.freq": 200.0,
    "eq_lowshelf.params.gain": 0.0,
    "eq_lowshelf.params.q": 0.707,
    # High-shelf EQ
    "eq_highshelf.params.freq": 8000.0,
    "eq_highshelf.params.gain": 0.0,
    "eq_highshelf.params.q": 0.707,
    # Distortion
    "distortion_amount": 0.0,
    # Delay
    "delay.delay_time": 0.02,
    "delay.feedback": 0.3,
    "delay.mix": 0.2,
    # Overall wet/dry balance
    "final_wet_mix": 0.5,
}
37
 
38
+ # 스타일 프리셋 (AI 없이도 작동)
39
# Keyword -> parameter overrides. A preset applies when its keyword occurs in
# the lower-cased prompt; insertion order matters because later matches
# overwrite keys set by earlier ones.
STYLE_PRESETS = {
    "warm": {
        "eq_lowshelf.params.gain": 3.0,
        "eq_highshelf.params.gain": -1.0,
        "distortion_amount": 0.05,
    },
    "bright": {
        "eq_highshelf.params.gain": 4.0,
        "eq_peak2.params.gain": 2.0,
        "eq_lowshelf.params.gain": -1.0,
    },
    "vintage": {
        "eq_lowshelf.params.gain": 2.0,
        "eq_highshelf.params.gain": -2.0,
        "distortion_amount": 0.1,
        "delay.mix": 0.15,
    },
    "modern": {
        "eq_peak1.params.gain": 2.0,
        "eq_peak2.params.gain": 3.0,
        "eq_highshelf.params.gain": 2.0,
    },
    "spacious": {
        "delay.delay_time": 0.05,
        "delay.feedback": 0.4,
        "delay.mix": 0.35,
    },
    "dry": {
        "final_wet_mix": 0.2,
        "delay.mix": 0.0,
    },
    "saturated": {
        "distortion_amount": 0.15,
        "eq_lowshelf.params.gain": 1.0,
    },
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
 
77
class AudioEncoder:
    """Simplified audio feature extractor (stand-in for a CLAP encoder).

    Produces a fixed-length list of summary statistics computed with librosa.
    """

    def __init__(self, output_dim: int = 64):
        """Store the feature-vector length and the sample rate used for loading.

        Args:
            output_dim: Length of the list returned by ``get_audio_features``.
        """
        self.output_dim = output_dim
        self.sr = 44100  # passed to librosa.load as the target sample rate

    def get_audio_features(self, audio_path: str) -> List[float]:
        """Extract a fixed-length feature vector from an audio file.

        At most the first 5 seconds are loaded. The raw vector is, in order:
        20 MFCC means, spectral centroid / bandwidth / roll-off means (each
        divided by 10000), mean RMS energy, mean zero-crossing rate and 12
        chroma means. It is then zero-padded or truncated to ``output_dim``.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A list of ``output_dim`` built-in floats. Non-finite values are
            replaced by 0.0. If anything fails (librosa missing, unreadable
            file, ...) a list of zeros is returned instead of raising.
        """
        import math

        try:
            # Imported lazily so the module still imports without librosa.
            import librosa

            y, sr = librosa.load(audio_path, sr=self.sr, duration=5.0)

            features = []

            # MFCC (20 coefficients, averaged over time)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
            features.extend(np.mean(mfcc, axis=1).tolist())

            # Spectral shape, scaled down by 10000
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
            features.extend([
                spectral_centroid / 10000,
                spectral_bandwidth / 10000,
                spectral_rolloff / 10000,
            ])

            # RMS energy
            features.append(np.mean(librosa.feature.rms(y=y)))

            # Zero crossing rate
            features.append(np.mean(librosa.feature.zero_crossing_rate(y)))

            # Chroma (12 pitch classes, averaged over time)
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            features.extend(np.mean(chroma, axis=1).tolist())

            # np.mean returns NumPy scalars; convert so the result really is a
            # list of built-in floats, and keep NaN/inf out of the prompt.
            features = [float(value) for value in features]
            features = [value if math.isfinite(value) else 0.0 for value in features]

            # Pad or truncate to output_dim
            if len(features) < self.output_dim:
                features.extend([0.0] * (self.output_dim - len(features)))
            else:
                features = features[:self.output_dim]

            return features

        except Exception as e:
            # Deliberate best-effort: callers always get a usable vector.
            print(f"[AudioEncoder] ํŠน์ง• ์ถ”์ถœ ์‹คํŒจ: {e}")
            return [0.0] * self.output_dim
128
 
129
 
130
  class AIEffector:
131
+ """AI ๊ธฐ๋ฐ˜ ์ดํŽ™ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ ์˜ˆ์ธก"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  def __init__(
134
+ self,
135
+ model_repo_id: str = "heybaeheef/KU_SW_Academy",
136
+ model_subfolder: str = "checkpoints",
137
  base_model_name: str = "Qwen/Qwen3-8B",
138
  audio_feature_dim: int = 64,
139
  use_huggingface: bool = True
140
  ):
141
+ self.model_repo_id = model_repo_id
142
+ self.model_subfolder = model_subfolder
143
+ self.base_model_name = base_model_name
144
+ self.audio_feature_dim = audio_feature_dim
145
+ self.use_huggingface = use_huggingface
146
 
 
 
 
 
 
 
147
  self.model = None
148
  self.tokenizer = None
 
 
149
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
150
 
151
+ # ์˜ค๋””์˜ค ์ธ์ฝ”๋”
152
+ self.audio_encoder = AudioEncoder(output_dim=audio_feature_dim)
 
153
 
154
+ # ๋ชจ๋ธ ๋กœ๋“œ ์‹œ๋„
155
+ self._load_model()
156
 
157
+ def _load_model(self):
158
+ """๋ชจ๋ธ ๋กœ๋“œ"""
 
 
 
 
 
 
 
 
 
 
 
159
  try:
160
+ from transformers import AutoModelForCausalLM, AutoTokenizer
161
+ from peft import PeftModel
162
+
163
+ print(f"[AIEffector] ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
164
+ print(f" - Base Model: {self.base_model_name}")
165
+ print(f" - Adapter Repo: {self.model_repo_id}")
166
+ print(f" - Adapter Subfolder: {self.model_subfolder}")
167
 
168
  # ํ† ํฌ๋‚˜์ด์ € ๋กœ๋“œ
169
  self.tokenizer = AutoTokenizer.from_pretrained(
 
176
  # ๋ฒ ์ด์Šค ๋ชจ๋ธ ๋กœ๋“œ
177
  base_model = AutoModelForCausalLM.from_pretrained(
178
  self.base_model_name,
179
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
180
+ device_map="auto" if torch.cuda.is_available() else None,
181
  trust_remote_code=True,
182
+ low_cpu_mem_usage=True
183
  )
184
 
185
+ # LoRA ์–ด๋Œ‘ํ„ฐ ๋กœ๋“œ (subfolder ํŒŒ๋ผ๋ฏธํ„ฐ ์‚ฌ์šฉ!)
186
+ if self.use_huggingface:
187
+ print(f"[AIEffector] Hugging Face์—์„œ LoRA ์–ด๋Œ‘ํ„ฐ ๋กœ๋”ฉ...")
188
+ self.model = PeftModel.from_pretrained(
189
+ base_model,
190
+ self.model_repo_id,
191
+ subfolder=self.model_subfolder, # ํ•ต์‹ฌ ์ˆ˜์ •!
192
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
193
+ )
194
+ else:
195
+ # ๋กœ์ปฌ ๊ฒฝ๋กœ ์‚ฌ์šฉ
196
+ local_path = os.path.join(self.model_repo_id, self.model_subfolder)
197
+ print(f"[AIEffector] ๋กœ์ปฌ์—์„œ LoRA ์–ด๋Œ‘ํ„ฐ ๋กœ๋”ฉ: {local_path}")
198
+ self.model = PeftModel.from_pretrained(
199
+ base_model,
200
+ local_path,
201
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
202
  )
 
203
 
204
+ self.model.eval()
205
+ print(f"[AIEffector] โœ… ๋ชจ๋ธ ๋กœ๋“œ ์„ฑ๊ณต!")
206
 
207
  except Exception as e:
208
  print(f"[AIEffector] โŒ ๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ: {e}")
209
+ print(f"[AIEffector] ํด๋ฐฑ ๋ชจ๋“œ๋กœ ์ „ํ™˜ (ํ”„๋ฆฌ์…‹ ๊ธฐ๋ฐ˜)")
210
+ self.model = None
211
+ self.tokenizer = None
212
 
213
  def is_loaded(self) -> bool:
214
+ """๋ชจ๋ธ ๋กœ๋“œ ์—ฌ๋ถ€"""
215
+ return self.model is not None
216
 
217
+ def _apply_preset(self, prompt: str) -> Dict[str, float]:
218
+ """ํ”„๋กฌํ”„ํŠธ์—์„œ ํ”„๋ฆฌ์…‹ ๋งค์นญ"""
219
+ params = DEFAULT_PARAMETERS.copy()
220
+ prompt_lower = prompt.lower()
221
 
222
+ for style_name, style_params in STYLE_PRESETS.items():
223
+ if style_name in prompt_lower:
224
+ params.update(style_params)
225
+
226
+ return params
 
 
 
 
 
 
227
 
228
+ def _format_prompt(self, text_prompt: str, audio_features: List[float]) -> str:
229
+ """LLM ์ž…๋ ฅ ํ”„๋กฌํ”„ํŠธ ํฌ๋งทํŒ…"""
230
+ # ์˜ค๋””์˜ค ํŠน์ง•์„ ๊ฐ„๊ฒฐํ•˜๊ฒŒ ํ‘œํ˜„
231
+ audio_summary = ", ".join([f"{v:.3f}" for v in audio_features[:8]])
232
+
233
+ prompt = f"""You are an audio effect parameter predictor.
234
+
235
+ Input:
236
+ - Text description: {text_prompt}
237
+ - Audio features (first 8): [{audio_summary}]
238
+
239
+ Output the effect parameters as JSON:
240
+ ```json
241
+ {{
242
+ "eq_peak1.params.freq": <float>,
243
+ "eq_peak1.params.gain": <float>,
244
+ "eq_peak1.params.q": <float>,
245
+ "eq_peak2.params.freq": <float>,
246
+ "eq_peak2.params.gain": <float>,
247
+ "eq_peak2.params.q": <float>,
248
+ "eq_lowshelf.params.freq": <float>,
249
+ "eq_lowshelf.params.gain": <float>,
250
+ "eq_lowshelf.params.q": <float>,
251
+ "eq_highshelf.params.freq": <float>,
252
+ "eq_highshelf.params.gain": <float>,
253
+ "eq_highshelf.params.q": <float>,
254
+ "distortion_amount": <float>,
255
+ "delay.delay_time": <float>,
256
+ "delay.feedback": <float>,
257
+ "delay.mix": <float>,
258
+ "final_wet_mix": <float>
259
+ }}
260
+ ```
261
+
262
+ JSON output:"""
263
+
264
+ return prompt
265
+
266
+ def _parse_output(self, output_text: str) -> Dict[str, float]:
267
+ """LLM ์ถœ๋ ฅ์—์„œ ํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”์ถœ"""
268
+ try:
269
+ # JSON ๋ธ”๋ก ์ฐพ๊ธฐ
270
+ json_match = re.search(r'\{[^{}]*\}', output_text, re.DOTALL)
271
+ if json_match:
272
+ params = json.loads(json_match.group())
273
+
274
+ # ์œ ํšจ์„ฑ ๊ฒ€์‚ฌ ๋ฐ ๊ธฐ๋ณธ๊ฐ’ ๋ณ‘ํ•ฉ
275
+ result = DEFAULT_PARAMETERS.copy()
276
+ for key, value in params.items():
277
+ if key in result and isinstance(value, (int, float)):
278
+ result[key] = float(value)
279
+
280
+ return result
281
+ except Exception as e:
282
+ print(f"[AIEffector] ์ถœ๋ ฅ ํŒŒ์‹ฑ ์‹คํŒจ: {e}")
283
+
284
+ return DEFAULT_PARAMETERS.copy()
285
+
286
+ def predict(self, audio_path: str, text_prompt: str = "") -> Dict[str, float]:
287
+ """ํŒŒ๋ผ๋ฏธํ„ฐ ์˜ˆ์ธก"""
288
+
289
+ # ๋ชจ๋ธ์ด ์—†์œผ๋ฉด ํ”„๋ฆฌ์…‹ ์‚ฌ์šฉ
290
+ if not self.is_loaded():
291
+ print(f"[AIEffector] ํ”„๋ฆฌ์…‹ ๋ชจ๋“œ ์‚ฌ์šฉ (prompt: {text_prompt})")
292
+ return self._apply_preset(text_prompt)
293
+
294
  try:
295
+ # ์˜ค๋””์˜ค ํŠน์ง• ์ถ”์ถœ
296
  audio_features = self.audio_encoder.get_audio_features(audio_path)
 
 
 
297
 
298
+ # ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ
299
+ prompt = self._format_prompt(text_prompt, audio_features)
 
 
 
 
300
 
301
+ # ํ† ํฐํ™”
302
  inputs = self.tokenizer(
303
  prompt,
304
  return_tensors="pt",
305
  truncation=True,
306
+ max_length=1024
307
  ).to(self.device)
308
 
309
+ # ์ƒ์„ฑ
310
  with torch.no_grad():
311
  outputs = self.model.generate(
312
  **inputs,
313
+ max_new_tokens=256,
 
314
  do_sample=False,
315
+ temperature=0.1,
316
+ pad_token_id=self.tokenizer.pad_token_id
317
  )
318
 
319
+ # ๋””์ฝ”๋”ฉ
320
+ output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
+ # ํŒŒ์‹ฑ
323
+ params = self._parse_output(output_text)
 
324
 
325
+ print(f"[AIEffector] โœ… AI ์˜ˆ์ธก ์™„๋ฃŒ")
326
+ return params
327
 
328
  except Exception as e:
329
+ print(f"[AIEffector] ์˜ˆ์ธก ์‹คํŒจ: {e}, ํ”„๋ฆฌ์…‹์œผ๋กœ ํด๋ฐฑ")
330
+ return self._apply_preset(text_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -18,3 +18,6 @@ transformers>=4.36.0
18
  peft>=0.7.0
19
  huggingface_hub>=0.20.0
20
  accelerate>=0.25.0
 
 
 
 
18
  peft>=0.7.0
19
  huggingface_hub>=0.20.0
20
  accelerate>=0.25.0
21
+
22
+ # 추가 의존성
23
+ scipy>=1.10.0