| """ | |
| Hugging Face LLM Chatbot with Gradio | |
| Using transformers library to run models locally | |
| """ | |
| import os | |
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import torch | |
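
# A minimal local run, assuming only the dependencies imported here plus
# accelerate (needed because low_cpu_mem_usage=True is used below):
#   pip install gradio transformers torch accelerate
#   python app.py
# On a Hugging Face Space, the Gradio SDK launches this file as app.py.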

# Get HF token from environment (Spaces uses Secrets, local uses .env)
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Available models (optimized for local execution)
MODELS = {
    "microsoft/DialoGPT-small": {
        "name": "DialoGPT Small (English, fast)",
        "max_length": 80,
        "language": "en",
    },
    "microsoft/DialoGPT-medium": {
        "name": "DialoGPT Medium (English, higher quality)",
        "max_length": 100,
        "language": "en",
    },
    "gpt2": {
        "name": "GPT-2 (English, general purpose)",
        "max_length": 80,
        "language": "en",
    },
    "beomi/llama-2-ko-7b": {
        "name": "Llama-2-Ko 7B (Korean chat, ⚠️ needs 14GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 14GB of memory. It may not run on the free HF Spaces tier due to insufficient memory.",
    },
    "kyujinpy/KoT-Llama2-7B-Chat": {
        "name": "KoT-Llama2-7B-Chat (Korean chat, ⚠️ needs 14GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 14GB of memory. It may not run on the free HF Spaces tier due to insufficient memory.",
    },
    "beomi/KoAlpaca-Polyglot-5.8B": {
        "name": "KoAlpaca 5.8B (Korean chat, ⚠️ needs 12GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 12GB of memory. It may not run on the free HF Spaces tier due to insufficient memory.",
    },
    "nlpai-lab/kullm-polyglot-5.8b-v2": {
        "name": "KULLM-Polyglot 5.8B (Korean chat, ⚠️ needs 12GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 12GB of memory. It may not run on the free HF Spaces tier due to insufficient memory.",
    },
}

# Model cache
loaded_models = {}
loaded_tokenizers = {}


def load_model(model_name):
    """Load model and tokenizer"""
    if model_name not in loaded_models:
        try:
            print(f"Loading model: {model_name}")

            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                token=HF_TOKEN,
                padding_side='left',
                trust_remote_code=True,
            )

            # Add pad token if missing
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            # Load model with safetensors support
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    token=HF_TOKEN,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                    use_safetensors=True,
                )
            except Exception as e:
                # Fall back to default loading if safetensors fails
                print(f"⚠️ Safetensors loading failed, trying default method: {e}")
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    token=HF_TOKEN,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                )

            model.to(device)
            model.eval()

            loaded_models[model_name] = model
            loaded_tokenizers[model_name] = tokenizer
            print(f"✅ Model {model_name} loaded successfully")
        except Exception as e:
            print(f"❌ Failed to load model {model_name}: {e}")
            return None, None

    return loaded_models.get(model_name), loaded_tokenizers.get(model_name)
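
# Note: padding_side='left' above matters mainly when prompts are batched;
# decoder-only models continue generating from the right edge of the input,
# so any padding has to sit on the left. Loaded models stay cached in process
# memory for the lifetime of the app.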


def chat_response(message, history, model_name):
    """
    Generate chatbot response

    Args:
        message: User input
        history: Chat history in Gradio "messages" format
        model_name: Selected model

    Returns:
        Response text
    """
    try:
        # Load model and tokenizer
        model, tokenizer = load_model(model_name)
        if model is None or tokenizer is None:
            return f"❌ Could not load model '{model_name}'. Please select a different model."

        model_config = MODELS[model_name]

        # Build conversation context (previous turns joined by newlines)
        conversation = ""
        for msg in history:
            if msg["role"] in ("user", "assistant"):
                conversation += f"{msg['content']}\n"

        # Add current message
        conversation += f"{message}\n"

        # Tokenize; keeping the attention mask avoids warnings when pad == eos
        inputs = tokenizer(conversation, return_tensors="pt").to(device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=model_config["max_length"],
                temperature=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, excluding the input prompt
        new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # If empty, return a default message
        if not response:
            response = "I understand. Could you tell me more?"

        return response

    except Exception as e:
        import traceback

        error_msg = str(e)
        error_type = type(e).__name__
        print("=" * 50)
        print(f"Error Type: {error_type}")
        print(f"Error Message: {error_msg}")
        print(f"Traceback:\n{traceback.format_exc()}")
        print("=" * 50)

        if "out of memory" in error_msg.lower() or "oom" in error_msg.lower():
            return "❌ Out of memory. Please select a smaller model or restart the app."
        elif "cuda" in error_msg.lower() and device == "cpu":
            return "⚠️ Running on CPU without a GPU. Responses may be slow."
        else:
            return f"❌ Error: {error_type}\n{error_msg[:200]}\n\nCheck the terminal for the full log."

# Global state
current_model = "microsoft/DialoGPT-small"

# Preload default model
print("Preloading default model...")
load_model(current_model)


# Create Gradio interface
with gr.Blocks(
    title="🤖 Hugging Face Chatbot",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        """
        # 🤖 Hugging Face LLM Chatbot

        **Runs models locally** - no API limits!

        **How to use:**
        1. Select a model (the first load takes a while)
        2. Type a message and chat
        3. Responses may be a bit slow because everything runs on CPU

        **Recommended models by language:**
        - 🇬🇧 English: DialoGPT, GPT-2
        - 🇰🇷 Korean: Llama-2-Ko, KoAlpaca (large 5.8B/7B models, slow)

        **Advantages:** no API limits, completely free, works offline
        """
    )

    # Model selector
    model_dropdown = gr.Dropdown(
        choices=[(config["name"], model_id) for model_id, config in MODELS.items()],
        value="microsoft/DialoGPT-small",
        label="🎯 Select a model",
        info="Changing the model downloads it first (only the first time)",
    )
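    # The dropdown choices are (label, value) pairs: the UI shows the readable
    # name while callbacks receive the model id (the MODELS dict key).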

    # Warning message for model requirements
    model_warning = gr.Markdown("", visible=False)

    # Chat interface
    chatbot = gr.ChatInterface(
        fn=chat_response,
        type="messages",
        additional_inputs=[model_dropdown],
        chatbot=gr.Chatbot(
            height=500,
            placeholder="Type a message...",
            type="messages",
        ),
        textbox=gr.Textbox(
            placeholder="Type a message (English recommended)...",
            container=False,
            scale=7,
        ),
        examples=[
            ["Hello! How are you?", "microsoft/DialoGPT-small"],
            ["Tell me a joke", "microsoft/DialoGPT-medium"],
            # Korean prompts for the Korean models
            # ("Hello! How is the weather today?" / "Please briefly explain AI.")
            ["안녕하세요! 오늘 날씨가 어때요?", "beomi/llama-2-ko-7b"],
            ["인공지능에 대해 간단히 설명해주세요.", "kyujinpy/KoT-Llama2-7B-Chat"],
        ],
    )
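
    # additional_inputs passes the dropdown value to chat_response as its third
    # argument, so the selected model id accompanies every message.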

    # Show warning and clear chat when model changes
    def on_model_change(new_model):
        global current_model
        current_model = new_model

        # Check if the model has a memory warning
        warning_text = ""
        warning_visible = False
        if "warning" in MODELS[new_model]:
            warning_text = f"⚠️ **Warning**: {MODELS[new_model]['warning']}"
            warning_visible = True

        # Preload the new model
        load_model(new_model)

        # Return: empty chat history, plus the warning text and its visibility
        return [], gr.update(value=warning_text, visible=warning_visible)

    model_dropdown.change(
        fn=on_model_change,
        inputs=[model_dropdown],
        outputs=[chatbot.chatbot_state, model_warning],
    )
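
    # Note: `chatbot_state` is ChatInterface's internal history State (present
    # in Gradio 4.x); clearing it resets the context chat_response receives.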

    gr.Markdown(
        """
        ---
        **⚠️ Notes:**
        - Models run locally (downloaded on first run)
        - Running on CPU is slower than on a GPU
        - Each model is optimized for a specific language

        **💾 Approximate download sizes:**
        - DialoGPT-small: ~350MB
        - DialoGPT-medium: ~800MB
        - GPT-2: ~500MB
        - Korean 5.8B/7B models: ~12GB or more (12-14GB+ RAM needed)

        **💡 Tips:**
        - DialoGPT is recommended for English chat
        - Use Llama-2-Ko / KoAlpaca for Korean only if you have enough memory
        - Short sentences give better results
        - Once a model is loaded it is not downloaded again
        """
    )


if __name__ == "__main__":
    demo.launch()