import gradio as gr
import torch
import transformers
import os

# --- Model configuration ---
# Hugging Face Hub ID of the model to serve.
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"

# --- Model loading (runs once, when the Space starts) ---
print("모델을 로드하는 중입니다... 초기 실행 시 시간이 다소 걸릴 수 있습니다.")

# Pre-define both names so that code running after a failed load can test for
# None instead of crashing with a NameError.
model = None
tokenizer = None
try:
    # Native bfloat16 requires compute capability >= 8.0 (Ampere or newer).
    # A T4 is 7.5, so fall back to float16 there.
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        compute_dtype = torch.bfloat16
    else:
        compute_dtype = torch.float16

    # 4-bit quantization keeps VRAM usage low enough for a T4.  The bare
    # `load_in_4bit=True` kwarg to from_pretrained is deprecated; the supported
    # way is an explicit BitsAndBytesConfig passed as `quantization_config`.
    quantization_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )
    model = transformers.AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=compute_dtype,
        device_map="auto",  # let accelerate place the weights on the GPU
        quantization_config=quantization_config,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_ID)

    # Build the text-generation pipeline up front so it is ready for requests.
    text_generator = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    print("✅ 모델 로딩 완료!")
except Exception as e:
    print(f"❌ 모델 로딩 실패: {e}")
    # Drop anything that was only partially loaded so callers see a
    # consistent "not loaded" state.
    model = None
    tokenizer = None

    # Replace the pipeline with a dummy generator that reports the failure.
    def text_generator(*args, **kwargs):
        yield "모델을 로드하는 데 실패했습니다. Space의 하드웨어 설정을 확인하거나 모델 이름이 올바른지 확인해주세요."


def respond(
    message,
    history: list,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the assistant's reply to ``message``.

    Args:
        message: The latest user message.
        history: Previous turns. Accepts both Gradio history formats: a list
            of ``{"role": ..., "content": ...}`` dicts (``type="messages"``)
            or a list of ``(user, assistant)`` pairs.
        system_message: Text placed in the leading system turn.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The reply accumulated so far; each yield replaces the previous one.
        If the model or tokenizer is unavailable, a single failure message
        is yielded instead.

    Raises:
        Exception: Re-raises whatever ``model.generate`` raised in the
            worker thread.
    """
    from threading import Thread

    # Look the globals up defensively: when model loading failed these names
    # may be None or not defined at all.
    loaded_model = globals().get("model")
    loaded_tokenizer = globals().get("tokenizer")
    if loaded_model is None or loaded_tokenizer is None:
        yield "모델을 로드하는 데 실패했습니다. Space의 하드웨어 설정을 확인하거나 모델 이름이 올바른지 확인해주세요."
        return

    # Build the conversation in the role/content format the chat template expects.
    messages = [{"role": "system", "content": system_message}]
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" format: one dict per turn.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
        else:
            # Pair format: (user, assistant); either side may be None.
            user_msg, bot_msg = turn
            if user_msg is not None:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg is not None:
                messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Render the conversation into a prompt string with the tokenizer's chat template.
    prompt = loaded_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = loaded_tokenizer([prompt], return_tensors="pt").to(loaded_model.device)

    # The pipeline has no streaming mode, so generate in a worker thread and
    # read decoded text from a TextIteratorStreamer.
    streamer = transformers.TextIteratorStreamer(
        loaded_tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        do_sample=True,
    )
    errors = []

    def _generate():
        try:
            loaded_model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            # Close the stream so the consuming loop below cannot block forever.
            streamer.end()

    worker = Thread(target=_generate, daemon=True)
    worker.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    worker.join()
    if errors:
        raise errors[0]


"""
Gradio ChatInterface를 사용하여 챗봇 UI를 만듭니다.
"""
# Extra controls shown in the accordion under the chat box. Their values are
# passed to `respond` after (message, history), in this order.
_system_message_box = gr.Textbox(
    label="System message",
    value="You are Qwen2.5-Coder, created by Alibaba Cloud. You are a helpful assistant specialized in coding and programming.",
)
_max_tokens_slider = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=4096,
    step=1,
    value=1024,
)
_temperature_slider = gr.Slider(
    label="Temperature",
    minimum=0.1,
    maximum=4.0,
    step=0.1,
    value=0.7,
)
_top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    minimum=0.1,
    maximum=1.0,
    step=0.05,
    value=0.95,
)

# Example prompts offered to the user (one-element lists: message only).
_example_prompts = [
    ["PyTorch로 간단한 CNN 모델을 만들어줘."],
    ["이 파이썬 코드를 최적화해줘:\n\n```python\nfor i in range(len(my_list)):\n    print(my_list[i])\n```"],
    ["FastAPI로 'hello world'를 출력하는 API 엔드포인트를 만들어줘."],
]

chatbot = gr.ChatInterface(
    fn=respond,
    type="messages",  # use the "messages" chat history format
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
    additional_inputs_accordion="⚙️ 매개변수 설정",
    examples=_example_prompts,
    cache_examples=False,  # do not cache example outputs (saves memory)
)

# Page layout: a title, a short description, and the chat interface below them.
demo = gr.Blocks(theme=gr.themes.Soft(), title="나만의 AI 코드 리더")
with demo:
    gr.Markdown("# 🤖 나만의 AI 코드 리더 (Qwen2.5-Coder)")
    gr.Markdown("이 챗봇은 **Qwen2.5-Coder-7B-Instruct** 모델을 기반으로 코드를 생성하고 분석합니다.")
    # Render the ChatInterface built above inside this layout.
    chatbot.render()

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()