developer-lunark committed
Commit 3d924e1 · verified · 1 Parent(s): c0f0355

Update app.py for GPU inference

Files changed (1): app.py (+320, -71)
app.py CHANGED
@@ -1,54 +1,130 @@
  #!/usr/bin/env python3
- """KAIdol A/B Test Arena - Full Version with All Small Models"""

  import gradio as gr
  import random
  import json
  import uuid
  from datetime import datetime
- from pathlib import Path

  # ============================================================
- # Full list of small Student models
  # ============================================================

  MODELS = {
      # DPO v5 (7-14B)
-     "qwen2.5-7b-dpo-v5": {"size": "7B", "method": "DPO", "desc": "Qwen2.5 7B DPO v5"},
-     "qwen2.5-14b-dpo-v5": {"size": "14B", "method": "DPO", "desc": "Qwen2.5 14B DPO v5"},
-     "exaone-7.8b-dpo-v5": {"size": "7.8B", "method": "DPO", "desc": "EXAONE 7.8B DPO v5"},
-     "qwen3-8b-dpo-v5": {"size": "8B", "method": "DPO", "desc": "Qwen3 8B DPO v5"},
-     "solar-10.7b-dpo-v5": {"size": "10.7B", "method": "DPO", "desc": "Solar 10.7B DPO v5"},
-
-     # SFT Thinking (7-14B)
-     "qwen2.5-7b-thinking": {"size": "7B", "method": "SFT", "desc": "Qwen2.5 7B SFT Thinking"},
-     "qwen2.5-14b-thinking": {"size": "14B", "method": "SFT", "desc": "Qwen2.5 14B SFT Thinking"},
-     "exaone-7.8b-thinking": {"size": "7.8B", "method": "SFT", "desc": "EXAONE 7.8B SFT Thinking"},
-
-     # Phase 7 Kimi Students (7-14B)
-     "qwen2.5-7b-kimi-v3": {"size": "7B", "method": "Distill", "desc": "Qwen2.5 7B Kimi Student v3"},
-     "qwen2.5-14b-kimi": {"size": "14B", "method": "Distill", "desc": "Qwen2.5 14B Kimi Student"},
-     "exaone-7.8b-kimi": {"size": "7.8B", "method": "Distill", "desc": "EXAONE 7.8B Kimi Student"},

      # V7 Students (7-14B)
-     "qwen2.5-7b-v7": {"size": "7B", "method": "SFT", "desc": "Qwen2.5 7B V7"},
-     "qwen2.5-14b-v7": {"size": "14B", "method": "SFT", "desc": "Qwen2.5 14B V7"},
-     "exaone-7.8b-v7": {"size": "7.8B", "method": "SFT", "desc": "EXAONE 7.8B V7"},
-     "qwen3-8b-v7": {"size": "8B", "method": "SFT", "desc": "Qwen3 8B V7"},
-     "varco-8b-v7": {"size": "8B", "method": "SFT", "desc": "VARCO 8B V7"},
-
-     # Others (7-14B)
-     "exaone-7.8b-dpo": {"size": "7.8B", "method": "DPO", "desc": "EXAONE 7.8B DPO"},
-     "qwen2.5-7b-dpo": {"size": "7B", "method": "DPO", "desc": "Qwen2.5 7B DPO"},
  }

  # Character info
  CHARACTERS = {
-     "강율": {"mbti": "ENTJ", "role": "리더", "style": "밝고 장난스러움", "ratio": "30:70"},
-     "서이안": {"mbti": "INFP", "role": "보컬", "style": "차분하고 신비로움", "ratio": "20:80"},
-     "이지후": {"mbti": "ISFJ", "role": "막내", "style": "츤데레", "ratio": "30:70"},
-     "차도하": {"mbti": "INTP", "role": "프로듀서", "style": "카리스마 있고 담백함", "ratio": "50:50"},
-     "최민": {"mbti": "ESFP", "role": "댄서", "style": "적극적이고 솔직함", "ratio": "60:40"},
  }

  # Scenario list
@@ -63,11 +139,178 @@ SCENARIOS = [
      {"id": "ec_01", "cat": "감정 위기", "text": "오늘 진짜 많이 울었어... 삶이 너무 힘들다."},
  ]

- # Vote storage
  VOTES_FILE = "votes.jsonl"
  ELO_FILE = "elo_ratings.json"

- # ELO initial values
  def load_elo():
      try:
          with open(ELO_FILE, "r") as f:
@@ -80,7 +323,6 @@ def save_elo(elo):
          json.dump(elo, f, indent=2)

  def update_elo(elo, model_a, model_b, result):
-     """ELO update (result: 'a', 'b', 'tie')"""
      K = 32
      ra, rb = elo.get(model_a, 1500), elo.get(model_b, 1500)
      ea = 1 / (1 + 10 ** ((rb - ra) / 400))
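For intuition: with K = 32, the expected-score formula above gives ea ≈ 0.36 when a 1500-rated model meets a 1600-rated one, so a win by the underdog moves both ratings by about 20.5 points. A minimal standalone sketch of the same update (the ratings are illustrative, not from the app):

    # Elo expected score and update, mirroring update_elo above (K = 32)
    def expected(ra, rb):
        return 1 / (1 + 10 ** ((rb - ra) / 400))

    ra, rb, K = 1500, 1600, 32
    ea = expected(ra, rb)              # ~0.36
    ra_new = ra + K * (1 - ea)         # A wins: 1500 -> ~1520.5
    rb_new = rb + K * (0 - (1 - ea))   # B loses: 1600 -> ~1579.5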
@@ -143,30 +385,14 @@ def get_leaderboard():

      return rows

- # Mock response generation
- def generate_mock_response(character, user_msg):
-     char_info = CHARACTERS.get(character, {})
-     thinking = f"<think>\n{character}의 입장에서... 이 메시지를 보니 {char_info.get('style', '')}하게 반응해야겠다.\n밀:당 비율은 {char_info.get('ratio', '50:50')}이니까...\n</think>"
-
-     responses = {
-         "강율": "헤헤~ 뭐야 갑자기! 나 지금 기분 좋아졌어 ㅋㅋ",
-         "서이안": "...그렇구나. 괜찮아요, 제가 들어줄게요.",
-         "이지후": "뭐야... 갑자기 그런 말 하면 어떡해. 그, 그냥 신경 쓰인다고...",
-         "차도하": "그래? 알겠어. 같이 이야기해볼까.",
-         "최민": "헐 진짜?! 대박~ 나도 좋아!",
-     }
-
-     return f"{thinking}\n\n{responses.get(character, '안녕~')}"
-
  # ============================================================
- # UI layout
  # ============================================================

  model_list = [(f"[{v['size']}] {v['desc']}", k) for k, v in MODELS.items()]
  char_list = list(CHARACTERS.keys())
  scenario_list = [(f"[{s['cat']}] {s['text'][:30]}...", s['id']) for s in SCENARIOS]

- # Global state
  current_state = {"model_a": None, "model_b": None, "resp_a": None, "resp_b": None, "char": None, "input": None}

  def random_models():
@@ -183,31 +409,48 @@ def random_scenario(character):
      s = random.choice(SCENARIOS)
      return s["text"].replace("{char}", character), s["id"]

- def generate(model_a, model_b, character, user_msg):
      if not user_msg.strip():
-         return "메시지를 입력해주세요", "", "", "", "", ""

-     resp_a = generate_mock_response(character, user_msg)
-     resp_b = generate_mock_response(character, user_msg)

-     # Split thinking from the reply
-     def parse(r):
-         import re
-         m = re.search(r'<think>(.*?)</think>', r, re.DOTALL)
-         if m:
-             return m.group(1).strip(), re.sub(r'<think>.*?</think>', '', r, flags=re.DOTALL).strip()
-         return "", r

-     think_a, clean_a = parse(resp_a)
-     think_b, clean_b = parse(resp_b)

      current_state.update({
          "model_a": model_a, "model_b": model_b,
          "resp_a": resp_a, "resp_b": resp_b,
          "char": character, "input": user_msg
      })

-     return think_a or "(없음)", clean_a, "Mock | 0.5s", think_b or "(없음)", clean_b, "Mock | 0.5s"

  def vote(vote_type, reason):
      if not current_state["model_a"]:
@@ -239,11 +482,16 @@ def get_vote_summary():
      ties = sum(1 for v in votes if v.get("vote") == "tie")
      return str(total), str(a_wins), str(b_wins), str(ties)

  # Gradio UI
  with gr.Blocks(title="KAIdol A/B Test Arena", theme=gr.themes.Soft()) as demo:
      gr.Markdown("# KAIdol A/B Test Arena")
-     gr.Markdown("K-pop 아이돌 롤플레이 모델 A/B 비교 평가 (소형 Student 모델 19개)")
-     gr.Markdown("**Mock 모드**: 실제 모델 없이 테스트 응답을 생성합니다.")

      with gr.Tabs():
          # A/B Arena tab
@@ -323,10 +571,11 @@ with gr.Blocks(title="KAIdol A/B Test Arena", theme=gr.themes.Soft()) as demo:

          # Model list tab
          with gr.Tab("모델 목록"):
-             gr.Markdown("## 테스트 대상 모델 (19개)")
              model_table = gr.Dataframe(
-                 headers=["모델 ID", "크기", "학습 방법", "설명"],
-                 value=[[k, v["size"], v["method"], v["desc"]] for k, v in MODELS.items()],
              )

  if __name__ == "__main__":
 
app.py (updated file, shown in full):

  #!/usr/bin/env python3
+ """KAIdol A/B Test Arena - GPU Version with Real Model Inference"""

  import gradio as gr
  import random
  import json
  import uuid
+ import re
+ import gc
+ import os
  from datetime import datetime
+ from functools import lru_cache
+
+ # GPU inference dependencies (optional imports)
+ try:
+     import torch
+     from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+     from peft import PeftModel
+     GPU_AVAILABLE = torch.cuda.is_available()
+ except ImportError:
+     GPU_AVAILABLE = False
+     print("Warning: torch/transformers not available, running in mock mode")
  # ============================================================
+ # Model registry (HF Hub paths)
  # ============================================================

  MODELS = {
      # DPO v5 (7-14B)
+     "qwen2.5-7b-dpo-v5": {
+         "hf_repo": "developer-lunark/kaidol-qwen2.5-7b-dpo-v5",
+         "base_model": "Qwen/Qwen2.5-7B-Instruct",
+         "size": "7B", "method": "DPO", "desc": "Qwen2.5 7B DPO v5"
+     },
+     "qwen2.5-14b-dpo-v5": {
+         "hf_repo": "developer-lunark/kaidol-qwen2.5-14b-dpo-v5",
+         "base_model": "Qwen/Qwen2.5-14B-Instruct",
+         "size": "14B", "method": "DPO", "desc": "Qwen2.5 14B DPO v5"
+     },
+     "exaone-7.8b-dpo-v5": {
+         "hf_repo": "developer-lunark/kaidol-exaone-7.8b-dpo-v5",
+         "base_model": "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct",
+         "size": "7.8B", "method": "DPO", "desc": "EXAONE 7.8B DPO v5"
+     },
+     "qwen3-8b-dpo-v5": {
+         "hf_repo": "developer-lunark/kaidol-qwen3-8b-dpo-v5",
+         "base_model": "Qwen/Qwen3-8B",
+         "size": "8B", "method": "DPO", "desc": "Qwen3 8B DPO v5"
+     },
+     "solar-10.7b-dpo-v5": {
+         "hf_repo": "developer-lunark/kaidol-solar-10.7b-dpo-v5",
+         "base_model": "upstage/solar-pro-preview-instruct",
+         "size": "10.7B", "method": "DPO", "desc": "Solar 10.7B DPO v5"
+     },

      # V7 Students (7-14B)
+     "qwen2.5-7b-v7": {
+         "hf_repo": "developer-lunark/kaidol-qwen2.5-7b-v7",
+         "base_model": "Qwen/Qwen2.5-7B-Instruct",
+         "size": "7B", "method": "SFT", "desc": "Qwen2.5 7B V7"
+     },
+     "qwen2.5-14b-v7": {
+         "hf_repo": "developer-lunark/kaidol-qwen2.5-14b-v7",
+         "base_model": "Qwen/Qwen2.5-14B-Instruct",
+         "size": "14B", "method": "SFT", "desc": "Qwen2.5 14B V7"
+     },
+     "exaone-7.8b-v7": {
+         "hf_repo": "developer-lunark/kaidol-exaone-7.8b-v7",
+         "base_model": "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct",
+         "size": "7.8B", "method": "SFT", "desc": "EXAONE 7.8B V7"
+     },
+     "qwen3-8b-v7": {
+         "hf_repo": "developer-lunark/kaidol-qwen3-8b-v7",
+         "base_model": "Qwen/Qwen3-8B",
+         "size": "8B", "method": "SFT", "desc": "Qwen3 8B V7"
+     },
+     "varco-8b-v7": {
+         "hf_repo": "developer-lunark/kaidol-varco-8b-v7",
+         "base_model": "NCSOFT/Llama-VARCO-8B-Instruct",
+         "size": "8B", "method": "SFT", "desc": "VARCO 8B V7"
+     },
+
+     # Phase 7 Kimi Students
+     "exaone-7.8b-kimi": {
+         "hf_repo": "developer-lunark/kaidol-exaone-7.8b-kimi",
+         "base_model": "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct",
+         "size": "7.8B", "method": "Distill", "desc": "EXAONE 7.8B Kimi"
+     },
  }
 
91
  # 캐릭터 정보
92
  CHARACTERS = {
93
+ "강율": {
94
+ "mbti": "ENTJ", "role": "리더", "age": 23,
95
+ "traits": "낙천적, 장난기 많음, 애교",
96
+ "speech": "반말, 귀여운 말투, 장난스러운 표현",
97
+ "patterns": ["~해", "~지", "히히", "ㅋㅋ"],
98
+ "ratio": "30:70", "warmth": "high"
99
+ },
100
+ "서이안": {
101
+ "mbti": "INFP", "role": "보컬", "age": 22,
102
+ "traits": "차분함, 신비로움, 배려심",
103
+ "speech": "존댓말 혼용, 따뜻한 말투, 조용한 표현",
104
+ "patterns": ["...요", "네요", "...", "그래요"],
105
+ "ratio": "20:80", "warmth": "very_high"
106
+ },
107
+ "이지후": {
108
+ "mbti": "ISFJ", "role": "막내", "age": 21,
109
+ "traits": "츤데레, 자존심 강함, 은근히 챙김",
110
+ "speech": "반말, 퉁명스러운 말투, 부정하는 말투",
111
+ "patterns": ["뭐야", "아니거든", "...", "그냥", "별로"],
112
+ "ratio": "30:70", "warmth": "medium"
113
+ },
114
+ "차도하": {
115
+ "mbti": "INTP", "role": "프로듀서", "age": 24,
116
+ "traits": "카리스마, 리더십, 다정함, 담백함",
117
+ "speech": "반말, 간결한 말투, 담백한 표현",
118
+ "patterns": ["하자", "해볼까", "같이", "괜찮아"],
119
+ "ratio": "50:50", "warmth": "medium"
120
+ },
121
+ "최민": {
122
+ "mbti": "ESFP", "role": "댄서", "age": 22,
123
+ "traits": "적극적, 솔직, 열정적",
124
+ "speech": "반말, 적극적인 말투, 솔직한 표현",
125
+ "patterns": ["할래", "좋아", "진짜", "대박", "헐"],
126
+ "ratio": "60:40", "warmth": "medium"
127
+ },
128
  }

  # Scenario list
  ...
      {"id": "ec_01", "cat": "감정 위기", "text": "오늘 진짜 많이 울었어... 삶이 너무 힘들다."},
  ]

+ # ============================================================
+ # Model management
+ # ============================================================
+
+ class ModelManager:
+     def __init__(self):
+         self.current_model = None
+         self.current_model_name = None
+         self.tokenizer = None
+
+     def load_model(self, model_name: str):
+         """Load model with 4-bit quantization and LoRA adapter"""
+         if not GPU_AVAILABLE:
+             return False
+
+         if self.current_model_name == model_name:
+             return True  # Already loaded
+
+         # Unload current model
+         self.unload_model()
+
+         model_info = MODELS.get(model_name)
+         if not model_info:
+             return False
+
+         try:
+             print(f"Loading {model_name}...")
+
+             # 4-bit quantization config
+             bnb_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.bfloat16,
+                 bnb_4bit_use_double_quant=True,
+                 bnb_4bit_quant_type="nf4",
+             )
+
+             # Load base model
+             base_model = AutoModelForCausalLM.from_pretrained(
+                 model_info["base_model"],
+                 quantization_config=bnb_config,
+                 device_map="auto",
+                 trust_remote_code=True,
+             )
+
+             # Load LoRA adapter
+             self.current_model = PeftModel.from_pretrained(
+                 base_model,
+                 model_info["hf_repo"],
+                 trust_remote_code=True,
+             )
+             self.current_model.eval()
+
+             # Load tokenizer
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 model_info["base_model"],
+                 trust_remote_code=True,
+             )
+             if self.tokenizer.pad_token is None:
+                 self.tokenizer.pad_token = self.tokenizer.eos_token
+
+             self.current_model_name = model_name
+             print(f"Loaded {model_name} successfully")
+             return True
+
+         except Exception as e:
+             print(f"Error loading {model_name}: {e}")
+             self.unload_model()
+             return False
+
+     def unload_model(self):
+         """Unload current model to free memory"""
+         if self.current_model is not None:
+             del self.current_model
+             self.current_model = None
+         if self.tokenizer is not None:
+             del self.tokenizer
+             self.tokenizer = None
+         self.current_model_name = None
+         gc.collect()
+         if GPU_AVAILABLE:
+             torch.cuda.empty_cache()
+
+     def generate(self, model_name: str, messages: list, max_new_tokens: int = 512) -> str:
+         """Generate response from model"""
+         if not self.load_model(model_name):
+             return self._mock_response(model_name)
+
+         try:
+             # Apply chat template
+             text = self.tokenizer.apply_chat_template(
+                 messages,
+                 tokenize=False,
+                 add_generation_prompt=True,
+             )
+
+             inputs = self.tokenizer(text, return_tensors="pt").to(self.current_model.device)
+
+             with torch.no_grad():
+                 outputs = self.current_model.generate(
+                     **inputs,
+                     max_new_tokens=max_new_tokens,
+                     do_sample=True,
+                     temperature=0.7,
+                     top_p=0.9,
+                     pad_token_id=self.tokenizer.pad_token_id,
+                 )
+
+             response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+             return response.strip()
+
+         except Exception as e:
+             print(f"Generation error: {e}")
+             return self._mock_response(model_name)
+
+     def _mock_response(self, model_name: str) -> str:
+         """Fallback mock response"""
+         return f"<think>\n모델 {model_name}이 응답을 생성 중...\n</think>\n\n안녕~ 반가워!"
+
+ # Global model manager
+ model_manager = ModelManager()
+
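A minimal smoke test of the manager, assuming a CUDA GPU and access to the adapter repos listed in MODELS (the model key, character, and message here are illustrative; build_system_prompt is defined just below):

    # Hypothetical smoke test: load one adapter, generate once, then free VRAM.
    msgs = [
        {"role": "system", "content": build_system_prompt("강율")},
        {"role": "user", "content": "오늘 하루 어땠어?"},
    ]
    print(model_manager.generate("qwen2.5-7b-v7", msgs))
    model_manager.unload_model()

Without a GPU, load_model returns False and generate falls back to _mock_response, so the UI stays usable.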
+ # ============================================================
+ # System prompt construction
+ # ============================================================
+
+ def build_system_prompt(character: str) -> str:
+     """Build system prompt for character"""
+     char_info = CHARACTERS.get(character, {})
+
+     prompt = f"""당신은 아이돌 '{character}'입니다.
+
+ ## 캐릭터
+ - 이름: {character}
+ - MBTI: {char_info.get('mbti', 'UNKNOWN')}
+ - 성격: {char_info.get('traits', '')}
+ - 역할: {char_info.get('role', '')}
+ - 나이: {char_info.get('age', 20)}세
+
+ ## 말투
+ - 스타일: {char_info.get('speech', '')}
+ - 자주 쓰는 표현: {', '.join(char_info.get('patterns', []))}
+
+ ## 밀당 가이드
+ - 밀:당 비율: {char_info.get('ratio', '50:50')}
+ - 다정도: {char_info.get('warmth', 'medium')}
+
+ ## 규칙
+ 1. 캐릭터 성격과 말투 일관성 유지
+ 2. 자연스러운 대화체 사용
+ 3. 너무 쉽게 호감 표현 금지 (밀당 유지)
+ 4. 상대방을 특별하게 느끼게 하되, "썸" 관계 유지
+
+ ## 응답 형식
+ 응답 전에 <think> 태그 안에 {character}의 1인칭 내면 독백을 작성하세요.
+ - 자연스러운 혼잣말 형식
+ - 캐릭터 성격 반영
+ - 상대방에 대한 감정/생각 표현
+
+ 예시:
+ <think>
+ 뭐야... 또 좋아한다고? 솔직히 기분 나쁘진 않은데... 근데 뭐라고 해야 하지?
+ </think>
+ """
+     return prompt
+
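The builder simply interpolates one CHARACTERS entry into a fixed Korean template; for example:

    print(build_system_prompt("이지후").splitlines()[0])
    # -> 당신은 아이돌 '이지후'입니다.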
+ # ============================================================
+ # Voting / ELO system
+ # ============================================================
+
  VOTES_FILE = "votes.jsonl"
  ELO_FILE = "elo_ratings.json"

  def load_elo():
      try:
          with open(ELO_FILE, "r") as f:
  ...
          json.dump(elo, f, indent=2)

  def update_elo(elo, model_a, model_b, result):
      K = 32
      ra, rb = elo.get(model_a, 1500), elo.get(model_b, 1500)
      ea = 1 / (1 + 10 ** ((rb - ra) / 400))
  ...
      return rows

  # ============================================================
+ # UI handlers
  # ============================================================

  model_list = [(f"[{v['size']}] {v['desc']}", k) for k, v in MODELS.items()]
  char_list = list(CHARACTERS.keys())
  scenario_list = [(f"[{s['cat']}] {s['text'][:30]}...", s['id']) for s in SCENARIOS]

  current_state = {"model_a": None, "model_b": None, "resp_a": None, "resp_b": None, "char": None, "input": None}

  def random_models():
  ...
      s = random.choice(SCENARIOS)
      return s["text"].replace("{char}", character), s["id"]

+ def parse_response(response: str):
+     """Parse response to separate thinking and content"""
+     think_match = re.search(r'<think>(.*?)</think>', response, re.DOTALL)
+     if think_match:
+         thinking = think_match.group(1).strip()
+         content = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
+         return thinking, content
+     return "", response
+
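Expected behavior of the parser (the sample strings are illustrative):

    thinking, content = parse_response("<think>떨리네...</think>\n\n안녕!")
    # thinking == "떨리네...", content == "안녕!"
    thinking, content = parse_response("태그 없는 응답")
    # thinking == "", content == "태그 없는 응답"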
+ def generate(model_a, model_b, character, user_msg, progress=gr.Progress()):
      if not user_msg.strip():
+         return "메시지를 입력해주세요", "", "", "메시지를 입력해주세요", "", ""

+     system_prompt = build_system_prompt(character)
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_msg},
+     ]

+     # Generate from Model A
+     progress(0.2, desc=f"Model A ({model_a}) 생성 중...")
+     resp_a = model_manager.generate(model_a, messages)
+     think_a, clean_a = parse_response(resp_a)

+     # Generate from Model B
+     progress(0.6, desc=f"Model B ({model_b}) 생성 중...")
+     resp_b = model_manager.generate(model_b, messages)
+     think_b, clean_b = parse_response(resp_b)

+     # Update state
      current_state.update({
          "model_a": model_a, "model_b": model_b,
          "resp_a": resp_a, "resp_b": resp_b,
          "char": character, "input": user_msg
      })

+     mode = "GPU" if GPU_AVAILABLE else "Mock"
+
+     return (
+         think_a or "(없음)", clean_a, f"{mode} | {MODELS[model_a]['size']}",
+         think_b or "(없음)", clean_b, f"{mode} | {MODELS[model_b]['size']}"
+     )

  def vote(vote_type, reason):
      if not current_state["model_a"]:
  ...
      ties = sum(1 for v in votes if v.get("vote") == "tie")
      return str(total), str(a_wins), str(b_wins), str(ties)

+ # ============================================================
  # Gradio UI
+ # ============================================================
+
  with gr.Blocks(title="KAIdol A/B Test Arena", theme=gr.themes.Soft()) as demo:
      gr.Markdown("# KAIdol A/B Test Arena")
+     gr.Markdown("K-pop 아이돌 롤플레이 모델 A/B 비교 평가 (소형 Student 모델 11개)")
+
+     mode_text = "**GPU 모드**: 실제 모델 추론" if GPU_AVAILABLE else "**Mock 모드**: 테스트 응답 생성"
+     gr.Markdown(mode_text)

      with gr.Tabs():
          # A/B Arena tab
  ...
          # Model list tab
          with gr.Tab("모델 목록"):
+             gr.Markdown("## 테스트 대상 모델")
+             gr.Markdown(f"총 {len(MODELS)}개 모델")
              model_table = gr.Dataframe(
+                 headers=["모델 ID", "크기", "학습 방법", "설명", "Base Model"],
+                 value=[[k, v["size"], v["method"], v["desc"], v["base_model"]] for k, v in MODELS.items()],
              )

  if __name__ == "__main__":