import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Configuration
MODEL_ID = "solonsophy/kf-deberta-gen"        # fine-tuned model (weights)
BASE_MODEL_ID = "kakaobank/kf-deberta-base"   # base model (tokenizer source)
MAX_LEN = 256      # hard cap on total sequence length fed to the model
Q_MAX_LEN = 100    # hard cap on question tokens

# Model loading (module import time — the demo is a single-process app)
print("🔄 Loading model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)   # tokenizer from the base model
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID)     # fine-tuned weights
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()
print(f"✅ Model loaded on {device}")

MASK_ID = tokenizer.mask_token_id
PAD_ID = tokenizer.pad_token_id
CLS_ID = tokenizer.cls_token_id
SEP_ID = tokenizer.sep_token_id


def generate_response(question, num_steps, temperature, top_k, max_answer_len):
    """Generate an answer via diffusion-style iterative denoising.

    Builds a sequence ``[CLS] question [SEP] [MASK]*max_answer_len`` and, over
    ``num_steps`` rounds, samples tokens for the remaining mask positions and
    commits the highest-confidence samples each round until no masks remain.

    Args:
        question: User question text; empty/whitespace input short-circuits.
        num_steps: Number of denoising iterations.
        temperature: Softmax temperature applied to the logits before sampling.
        top_k: Keep only the top-k logits per position (<= 0 disables filtering).
        max_answer_len: Number of [MASK] slots reserved for the answer.

    Returns:
        The decoded answer string, or a fallback message on empty input /
        failed generation.
    """
    if not question.strip():
        return "질문을 입력해주세요."

    # Gradio Slider delivers floats; range() and list multiplication below
    # raise TypeError on floats, so cast the count-like parameters up front.
    num_steps = int(num_steps)
    top_k = int(top_k)
    max_answer_len = int(max_answer_len)

    # Tokenize the question (truncated to Q_MAX_LEN tokens).
    q_tokens = tokenizer.encode(question, add_special_tokens=False)[:Q_MAX_LEN]

    # Initial sequence: [CLS] Q [SEP] [MASK]*N, truncated to MAX_LEN.
    input_ids = [CLS_ID] + q_tokens + [SEP_ID] + [MASK_ID] * max_answer_len
    input_ids = input_ids[:MAX_LEN]
    answer_start = len(q_tokens) + 2   # first position after [CLS] Q [SEP]
    answer_end = len(input_ids)

    input_ids = torch.tensor([input_ids], device=device)
    attention_mask = torch.ones_like(input_ids)

    # Iterative denoising loop.
    for step in range(num_steps):
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

        # Locate positions in the answer span that are still [MASK].
        mask_positions = (
            input_ids[0, answer_start:answer_end] == MASK_ID
        ).nonzero(as_tuple=True)[0]
        mask_positions = mask_positions + answer_start
        if len(mask_positions) == 0:
            break  # everything unmasked — done early

        # Spread the remaining masks evenly over the remaining steps.
        remaining_steps = num_steps - step
        tokens_per_step = max(1, len(mask_positions) // remaining_steps)

        # Temperature-scaled logits at the masked positions only.
        mask_logits = logits[0, mask_positions] / temperature

        # Top-k filtering: everything below the k-th logit goes to -inf.
        if top_k > 0:
            top_k_values, _ = torch.topk(
                mask_logits, min(top_k, mask_logits.size(-1)), dim=-1
            )
            threshold = top_k_values[:, -1].unsqueeze(-1)
            mask_logits = torch.where(
                mask_logits < threshold, float('-inf'), mask_logits
            )

        # Sample one candidate token per masked position.
        probs = F.softmax(mask_logits, dim=-1)
        sampled_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)

        # Confidence = probability of the sampled token; commit the most
        # confident samples this step, leave the rest masked for later.
        confidences = probs.gather(1, sampled_tokens.unsqueeze(-1)).squeeze(-1)
        _, top_indices = torch.topk(
            confidences, min(tokens_per_step, len(confidences))
        )
        selected_positions = mask_positions[top_indices]
        selected_tokens = sampled_tokens[top_indices]
        input_ids[0, selected_positions] = selected_tokens

    # Extract the answer, dropping any leftover [MASK]/[PAD] tokens.
    answer_tokens = input_ids[0, answer_start:answer_end]
    valid_mask = (answer_tokens != MASK_ID) & (answer_tokens != PAD_ID)
    answer_tokens = answer_tokens[valid_mask]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer.strip() if answer.strip() else "(생성 실패)"


# Gradio UI
with gr.Blocks(title="kf-deberta-gen Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🌀 kf-deberta-gen Demo

    **Generative Diffusion BERT** - 한국어 Diffusion 기반 생성 언어 모델 (실험적)

    > ⚠️ 이 모델은 PoC 단계입니다. 생성 품질이 불안정하며 반복 생성 등의 문제가 있을 수 있습니다.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            question_input = gr.Textbox(
                label="질문",
                placeholder="질문을 입력하세요...",
                lines=2
            )
            submit_btn = gr.Button("🚀 생성", variant="primary")
        with gr.Column(scale=1):
            num_steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
            temperature = gr.Slider(0.1, 2.0, value=0.5, step=0.1, label="Temperature")
            top_k = gr.Slider(1, 50, value=10, step=1, label="Top-K")
            max_len = gr.Slider(20, 150, value=80, step=10, label="Max Answer Length")

    output = gr.Textbox(label="답변", lines=5)

    gr.Examples(
        examples=[
            ["오늘 날씨 어때?"],
            ["파이썬을 배우려면 어떻게 해야 하나요?"],
            ["안녕하세요"],
        ],
        inputs=question_input
    )

    # Both the button click and Enter-in-textbox trigger generation.
    submit_btn.click(
        fn=generate_response,
        inputs=[question_input, num_steps, temperature, top_k, max_len],
        outputs=output
    )
    question_input.submit(
        fn=generate_response,
        inputs=[question_input, num_steps, temperature, top_k, max_len],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()