"""
Hugging Face LLM Chatbot with Gradio
Using transformers library to run models locally
"""

import os
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Optional Hugging Face access token, read from the HF_TOKEN environment
# variable; stays None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Memory warning shown for the large models; {gb} is the required RAM in GB.
_MEMORY_WARNING = (
    "이 모델은 {gb}GB 이상의 메모리가 필요합니다. "
    "HF Spaces 무료 tier에서는 메모리 부족으로 실행되지 않을 수 있습니다."
)

# Selectable models, keyed by Hugging Face model id. Each entry carries:
#   name       - label shown in the UI dropdown
#   max_length - number of new tokens generated per reply
#   language   - language code of the model ("en" or "ko")
#   warning    - optional memory warning shown when the model is selected
MODELS = {
    "microsoft/DialoGPT-small": dict(
        name="DialoGPT Small (영어, 빠름)",
        max_length=80,
        language="en",
    ),
    "microsoft/DialoGPT-medium": dict(
        name="DialoGPT Medium (영어, 고품질)",
        max_length=100,
        language="en",
    ),
    "gpt2": dict(
        name="GPT-2 (영어, 범용)",
        max_length=80,
        language="en",
    ),
    "beomi/llama-2-ko-7b": dict(
        name="Llama-2-Ko 7B (한글 대화형, ⚠️ 14GB+ RAM 필요)",
        max_length=150,
        language="ko",
        warning=_MEMORY_WARNING.format(gb=14),
    ),
    "kyujinpy/KoT-Llama2-7B-Chat": dict(
        name="KoT-Llama2-7B-Chat (한글 대화, ⚠️ 14GB+ RAM 필요)",
        max_length=150,
        language="ko",
        warning=_MEMORY_WARNING.format(gb=14),
    ),
    "beomi/KoAlpaca-Polyglot-5.8B": dict(
        name="KoAlpaca 5.8B (한글 대화형, ⚠️ 12GB+ RAM 필요)",
        max_length=150,
        language="ko",
        warning=_MEMORY_WARNING.format(gb=12),
    ),
    "nlpai-lab/kullm-polyglot-5.8b-v2": dict(
        name="KULLM-Polyglot 5.8B (한글 대화, ⚠️ 12GB+ RAM 필요)",
        max_length=150,
        language="ko",
        warning=_MEMORY_WARNING.format(gb=12),
    ),
}

# In-memory caches keyed by model id: load_model() fills both together so a
# model/tokenizer pair is only loaded once per process.
loaded_models: dict = {}
loaded_tokenizers: dict = {}


def load_model(model_name):
    """Return the ``(model, tokenizer)`` pair for *model_name*, loading on first use.

    A pair that is already cached in ``loaded_models`` / ``loaded_tokenizers``
    is returned as-is. Otherwise the tokenizer and model are fetched with
    ``from_pretrained``, the model is moved to ``device`` and switched to eval
    mode, and both objects are stored in the caches.

    Args:
        model_name: Hugging Face model id to load.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` when any step of
        loading raises (the error is printed, not re-raised).
    """
    # Cache hit: nothing to load.
    if model_name in loaded_models:
        return loaded_models.get(model_name), loaded_tokenizers.get(model_name)

    try:
        print(f"Loading model: {model_name}")

        tok = AutoTokenizer.from_pretrained(
            model_name,
            token=HF_TOKEN,
            padding_side='left',
            trust_remote_code=True,
        )

        # Tokenizers without a pad token reuse the end-of-sequence token.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        # Keyword arguments shared by both model-loading attempts.
        # NOTE(review): trust_remote_code=True lets the model repository supply
        # Python code that runs locally — confirm this is intended for every
        # entry that can reach this function.
        shared_kwargs = {
            "token": HF_TOKEN,
            "torch_dtype": torch.float32,
            "low_cpu_mem_usage": True,
            "trust_remote_code": True,
        }

        try:
            # First attempt: require safetensors weights.
            net = AutoModelForCausalLM.from_pretrained(
                model_name,
                use_safetensors=True,
                **shared_kwargs,
            )
        except Exception as safetensors_error:
            # Second attempt: let the library pick the weight format.
            print(f"⚠️ Safetensors loading failed, trying default method: {safetensors_error}")
            net = AutoModelForCausalLM.from_pretrained(model_name, **shared_kwargs)

        net.to(device)
        net.eval()

        loaded_models[model_name] = net
        loaded_tokenizers[model_name] = tok

        print(f"✅ Model {model_name} loaded successfully")

    except Exception as load_error:
        print(f"❌ Failed to load model {model_name}: {load_error}")
        return None, None

    return loaded_models.get(model_name), loaded_tokenizers.get(model_name)


def _build_prompt(history, message):
    """Join prior user/assistant turns and the new message, one per line."""
    turns = [
        f"{msg['content']}"
        for msg in (history or [])
        if msg["role"] in ("user", "assistant")
    ]
    turns.append(f"{message}")
    # Every turn, including the current message, is newline-terminated.
    # NOTE(review): the DialoGPT model card separates turns with eos_token
    # rather than newlines — consider per-model prompt formats.
    return "".join(f"{turn}\n" for turn in turns)


def chat_response(message, history, model_name):
    """
    Generate a chatbot reply for the selected model.

    The prompt is the chat history plus the new message, one turn per line.
    Only the newly generated tokens are decoded, so the reply never contains
    (or loses characters to) the prompt.

    Args:
        message: User input text.
        history: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts (Gradio "messages" format); ``None`` or empty means no
            prior turns.
        model_name: Model id; must be a key of ``MODELS``.

    Returns:
        The reply text, a fallback sentence when the model generates nothing,
        or a user-facing error message when loading or generation fails
        (exceptions are logged to stdout, never propagated).
    """
    try:
        # Load model and tokenizer (served from the cache after first use)
        model, tokenizer = load_model(model_name)

        if model is None or tokenizer is None:
            return f"❌ 모델 '{model_name}'을 로드할 수 없습니다. 다른 모델을 선택해주세요."

        max_new_tokens = MODELS[model_name]["max_length"]

        conversation = _build_prompt(history, message)

        # Tokenize; only input_ids and attention_mask are forwarded to
        # generate(), so extra tokenizer outputs cannot reach the model.
        encoded = tokenizer(conversation, return_tensors="pt")
        input_ids = encoded["input_ids"]
        attention_mask = encoded.get("attention_mask")
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Keep only the most recent tokens so prompt + reply fit the model's
        # position limit; skipped when the config does not expose one.
        context_limit = getattr(model.config, "max_position_embeddings", None)
        if isinstance(context_limit, int) and context_limit > 0:
            budget = max(context_limit - max_new_tokens, 1)
            input_ids = input_ids[:, -budget:]
            attention_mask = attention_mask[:, -budget:]

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                temperature=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # The output sequence starts with the prompt tokens; decode only what
        # follows them instead of slicing the decoded text by character
        # count, which breaks when decoding does not reproduce the prompt
        # exactly.
        prompt_length = input_ids.shape[-1]
        response = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        ).strip()

        # If empty, return a default message
        if not response:
            response = "I understand. Could you tell me more?"

        return response

    except Exception as e:
        # Top-level boundary for the UI callback: log everything, then map
        # the failure to a short user-facing message.
        import traceback
        error_msg = str(e)
        error_type = type(e).__name__

        print("=" * 50)
        print(f"Error Type: {error_type}")
        print(f"Error Message: {error_msg}")
        print(f"Traceback:\n{traceback.format_exc()}")
        print("=" * 50)

        lowered = error_msg.lower()
        if "out of memory" in lowered or "oom" in lowered:
            return "❌ 메모리 부족. 더 작은 모델을 선택하거나 앱을 재시작하세요."
        elif "cuda" in lowered and device == "cpu":
            return "⚠️ GPU 없이 CPU로 실행 중입니다. 응답이 느릴 수 있습니다."
        else:
            return f"❌ 오류: {error_type}\n{error_msg[:200]}\n\n터미널에서 전체 로그를 확인하세요."


# Id of the currently selected model; on_model_change() reassigns it.
current_model = "microsoft/DialoGPT-small"

# Load the default model at import time so it is already in the cache
# before the UI is built.
print("Preloading default model...")
load_model(model_name=current_model)

# Create Gradio interface: intro text, model selector, optional memory
# warning, chat area, and footer notes.
# NOTE(review): the Markdown copy below mentions KoGPT-2, which has no entry
# in MODELS — confirm the text is up to date.
with gr.Blocks(
    title="🤖 Hugging Face Chatbot",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        """
        # 🤖 Hugging Face LLM Chatbot

        **로컬 모델 실행 방식** - API 제한 없음!

        **사용 방법:**
        1. 모델을 선택하세요 (처음에는 로딩 시간 필요)
        2. 메시지를 입력하고 대화하세요
        3. CPU에서 실행되므로 응답이 조금 느릴 수 있습니다

        **언어별 추천 모델:**
        - 🇬🇧 영어: DialoGPT, GPT-2
        - 🇰🇷 한글: KoGPT-2, KoAlpaca (5.8B는 큰 모델, 느림)

        **장점:** API 제한 없음, 완전 무료, 오프라인 작동 가능
        """
    )

    # Model selector: shows each model's display name, submits its model id.
    # The initial value is the model that was preloaded at startup.
    model_dropdown = gr.Dropdown(
        choices=[(config["name"], model_id) for model_id, config in MODELS.items()],
        value=current_model,
        label="🎯 모델 선택",
        info="모델을 변경하면 새 모델을 다운로드합니다 (처음 한 번만)",
    )

    # Warning message for model requirements (hidden until a model with a
    # "warning" entry is selected)
    model_warning = gr.Markdown("", visible=False)

    # Chat interface; the dropdown value is passed to chat_response as
    # model_name via additional_inputs.
    chatbot = gr.ChatInterface(
        fn=chat_response,
        type="messages",
        additional_inputs=[model_dropdown],
        chatbot=gr.Chatbot(
            height=500,
            placeholder="메시지를 입력하세요...",
            type="messages",
        ),
        textbox=gr.Textbox(
            placeholder="메시지를 입력하세요 (영어 권장)...",
            container=False,
            scale=7,
        ),
        examples=[
            ["Hello! How are you?", "microsoft/DialoGPT-small"],
            ["Tell me a joke", "microsoft/DialoGPT-medium"],
            ["안녕하세요! 오늘 날씨가 어때요?", "beomi/llama-2-ko-7b"],
            ["인공지능에 대해 간단히 설명해주세요.", "kyujinpy/KoT-Llama2-7B-Chat"],
        ],
    )

    def on_model_change(new_model):
        """Switch models: record the choice, preload it, reset chat and warning.

        Args:
            new_model: Model id chosen in the dropdown; must be a key of MODELS.

        Returns:
            A 3-tuple matching the ``outputs`` wired below: an empty message
            list for the visible chat, an empty list for the stored chat
            state, and a single update carrying the warning text together
            with its visibility.
        """
        global current_model
        current_model = new_model

        # Show the warning only for models that declare one.
        has_warning = "warning" in MODELS[new_model]
        warning_text = f"⚠️ **경고**: {MODELS[new_model]['warning']}" if has_warning else ""

        # Preload new model so the next message does not trigger the load
        load_model(new_model)

        # Text and visibility travel in one update, so model_warning appears
        # only once in the outputs list.
        return [], [], gr.update(value=warning_text, visible=has_warning)

    # Clear both the displayed chat component and the stored history state;
    # resetting only the state would leave the old messages on screen.
    model_dropdown.change(
        fn=on_model_change,
        inputs=[model_dropdown],
        outputs=[chatbot.chatbot, chatbot.chatbot_state, model_warning],
    )

    gr.Markdown(
        """
        ---

        **⚠️ 참고:**
        - 모델은 로컬에서 실행됩니다 (첫 실행 시 다운로드)
        - CPU에서 실행되므로 GPU보다 느립니다
        - 각 모델은 특정 언어에 최적화되어 있습니다

        **💾 디스크 사용량:**
        - DialoGPT-small: ~350MB
        - DialoGPT-medium: ~800MB
        - GPT-2: ~500MB
        - KoGPT-2: ~500MB
        - KoAlpaca-5.8B: ~12GB (큰 모델, 메모리 8GB+ 필요)

        **💡 팁:**
        - 영어 대화는 DialoGPT 추천
        - 한글 대화는 KoGPT-2 추천 (KoAlpaca는 리소스 충분할 때만)
        - 짧은 문장으로 대화하면 더 나은 결과
        - 모델이 한 번 로드되면 다시 다운로드하지 않습니다
        """
    )

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()