"""
Hugging Face LLM Chatbot with Gradio
Using transformers library to run models locally
"""
import os
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Get HF token from environment (Spaces uses Secrets, local uses .env)
HF_TOKEN = os.getenv("HF_TOKEN", None)
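# Note: os.getenv does not read .env files by itself; when running locally you would
# typically load one first (assuming python-dotenv is installed):
#   from dotenv import load_dotenv
#   load_dotenv()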
# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Available models (optimized for local execution)
MODELS = {
    "microsoft/DialoGPT-small": {
        "name": "DialoGPT Small (English, fast)",
        "max_length": 80,
        "language": "en",
    },
    "microsoft/DialoGPT-medium": {
        "name": "DialoGPT Medium (English, higher quality)",
        "max_length": 100,
        "language": "en",
    },
    "gpt2": {
        "name": "GPT-2 (English, general purpose)",
        "max_length": 80,
        "language": "en",
    },
    "beomi/llama-2-ko-7b": {
        "name": "Llama-2-Ko 7B (Korean chat, ⚠️ needs 14GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 14GB of memory. It may fail to run on the free HF Spaces tier due to insufficient memory.",
    },
    "kyujinpy/KoT-Llama2-7B-Chat": {
        "name": "KoT-Llama2-7B-Chat (Korean chat, ⚠️ needs 14GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 14GB of memory. It may fail to run on the free HF Spaces tier due to insufficient memory.",
    },
    "beomi/KoAlpaca-Polyglot-5.8B": {
        "name": "KoAlpaca 5.8B (Korean chat, ⚠️ needs 12GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 12GB of memory. It may fail to run on the free HF Spaces tier due to insufficient memory.",
    },
    "nlpai-lab/kullm-polyglot-5.8b-v2": {
        "name": "KULLM-Polyglot 5.8B (Korean chat, ⚠️ needs 12GB+ RAM)",
        "max_length": 150,
        "language": "ko",
        "warning": "This model needs at least 12GB of memory. It may fail to run on the free HF Spaces tier due to insufficient memory.",
    },
}
# Model cache
loaded_models = {}
loaded_tokenizers = {}

def load_model(model_name):
    """Load model and tokenizer"""
    if model_name not in loaded_models:
        try:
            print(f"Loading model: {model_name}")
            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                token=HF_TOKEN,
                padding_side='left',
                trust_remote_code=True
            )
            # Add pad token if missing
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
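            # GPT-2/DialoGPT tokenizers define no pad token, so EOS is reused for
            # padding; padding_side='left' keeps the prompt right-aligned, which is
            # what decoder-only models expect during generation.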
            # Load model with safetensors support
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    token=HF_TOKEN,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                    use_safetensors=True
                )
            except Exception as e:
                # Fall back to default loading if safetensors fails
                print(f"⚠️ Safetensors loading failed, trying default method: {e}")
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    token=HF_TOKEN,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True
                )
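            # Note: use_safetensors=True forces safetensors checkpoints, which avoids
            # pickle-based .bin loading; repositories that only publish .bin weights
            # raise an error there, which is what the fallback above handles.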
            model.to(device)
            model.eval()
            loaded_models[model_name] = model
            loaded_tokenizers[model_name] = tokenizer
            print(f"✅ Model {model_name} loaded successfully")
        except Exception as e:
            print(f"❌ Failed to load model {model_name}: {e}")
            return None, None
    return loaded_models.get(model_name), loaded_tokenizers.get(model_name)

def chat_response(message, history, model_name):
    """
    Generate chatbot response

    Args:
        message: User input
        history: Chat history in Gradio "messages" format
        model_name: Selected model

    Returns:
        Response text
    """
    try:
        # Load model and tokenizer
        model, tokenizer = load_model(model_name)
        if model is None or tokenizer is None:
            return f"❌ Could not load model '{model_name}'. Please select a different model."
        model_config = MODELS[model_name]
        # Build conversation context
        conversation = ""
        for msg in history:
            if msg["role"] == "user":
                conversation += f"{msg['content']}\n"
            elif msg["role"] == "assistant":
                conversation += f"{msg['content']}\n"
        # Add current message
        conversation += f"{message}\n"
        # Tokenize
        inputs = tokenizer.encode(conversation, return_tensors="pt").to(device)
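        # Caution: the prompt is not truncated, so a long chat history can exceed the
        # model's context window (1024 tokens for GPT-2/DialoGPT). One option is to
        # keep only the most recent tokens, e.g.:
        #   inputs = inputs[:, -1024:]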
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=model_config["max_length"],
                temperature=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
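        # Note: generate() is called without an attention_mask, so transformers may
        # log a warning; building the inputs with tokenizer(conversation,
        # return_tensors="pt") and passing its attention_mask would avoid that.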
        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Remove the input prompt from response
        response = response[len(conversation):].strip()
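        # Caution: slicing by len(conversation) assumes the decoded text reproduces the
        # prompt character-for-character, which tokenization does not always guarantee.
        # A more robust alternative is to decode only the newly generated tokens:
        #   response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)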
        # If empty, return a default message
        if not response:
            response = "I understand. Could you tell me more?"
        return response
    except Exception as e:
        import traceback
        error_msg = str(e)
        error_type = type(e).__name__
        print("=" * 50)
        print(f"Error Type: {error_type}")
        print(f"Error Message: {error_msg}")
        print(f"Traceback:\n{traceback.format_exc()}")
        print("=" * 50)
        if "out of memory" in error_msg.lower() or "oom" in error_msg.lower():
            return "❌ Out of memory. Select a smaller model or restart the app."
        elif "cuda" in error_msg.lower() and device == "cpu":
            return "⚠️ Running on CPU without a GPU. Responses may be slow."
        else:
            return f"❌ Error: {error_type}\n{error_msg[:200]}\n\nCheck the terminal for the full log."
# Global state
current_model = "microsoft/DialoGPT-small"
# Preload default model
print("Preloading default model...")
load_model(current_model)
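# Note: preloading happens at import time, so app startup (including the first Spaces
# build) blocks until the default model has been downloaded and loaded.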
# Create Gradio interface
with gr.Blocks(
    title="🤖 Hugging Face Chatbot",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        """
        # 🤖 Hugging Face LLM Chatbot

        **Runs models locally** - no API rate limits!

        **How to use:**
        1. Pick a model (the first load takes a while)
        2. Type a message and start chatting
        3. Everything runs on CPU, so responses can be a bit slow

        **Recommended models by language:**
        - 🇬🇧 English: DialoGPT, GPT-2
        - 🇰🇷 Korean: Llama-2-Ko, KoAlpaca (the 5.8B+ models are large and slow)

        **Advantages:** no API limits, completely free, can run offline
        """
    )
    # Model selector
    model_dropdown = gr.Dropdown(
        choices=[(config["name"], model_id) for model_id, config in MODELS.items()],
        value="microsoft/DialoGPT-small",
        label="🎯 Select a model",
        info="Changing the model downloads the new model (first time only)",
    )
    # Warning message for model requirements
    model_warning = gr.Markdown("", visible=False)
    # Chat interface
    chatbot = gr.ChatInterface(
        fn=chat_response,
        type="messages",
        additional_inputs=[model_dropdown],
        chatbot=gr.Chatbot(
            height=500,
            placeholder="Type a message...",
            type="messages",
        ),
        textbox=gr.Textbox(
            placeholder="Type a message (English recommended)...",
            container=False,
            scale=7,
        ),
        examples=[
            ["Hello! How are you?", "microsoft/DialoGPT-small"],
            ["Tell me a joke", "microsoft/DialoGPT-medium"],
            ["안녕하세요! 오늘 날씨가 어때요?", "beomi/llama-2-ko-7b"],
            ["인공지능에 대해 간단히 설명해주세요.", "kyujinpy/KoT-Llama2-7B-Chat"],
        ],
    )
    # Show warning and clear chat when model changes
    def on_model_change(new_model):
        global current_model
        current_model = new_model
        # Check if model has warning
        warning_text = ""
        warning_visible = False
        if "warning" in MODELS[new_model]:
            warning_text = f"⚠️ **Warning**: {MODELS[new_model]['warning']}"
            warning_visible = True
        # Preload new model
        load_model(new_model)
        # Return: cleared chat history and a single warning update (text + visibility)
        return [], gr.update(value=warning_text, visible=warning_visible)
    model_dropdown.change(
        fn=on_model_change,
        inputs=[model_dropdown],
        outputs=[chatbot.chatbot_state, model_warning],
    )
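    # Note: on_model_change calls load_model synchronously, so switching to a large
    # model blocks the UI until the download and load have finished.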
    gr.Markdown(
        """
        ---
        **⚠️ Notes:**
        - Models run locally (downloaded on first use)
        - Everything runs on CPU, so it is slower than GPU
        - Each model is optimized for a specific language

        **💾 Disk usage:**
        - DialoGPT-small: ~350MB
        - DialoGPT-medium: ~800MB
        - GPT-2: ~500MB
        - KoAlpaca-5.8B: ~12GB (large model, needs 12GB+ RAM)

        **💡 Tips:**
        - For English chat, DialoGPT is recommended
        - Use the large Korean models (5.8B/7B) only when resources allow
        - Shorter messages tend to give better results
        - Once a model is loaded, it is not downloaded again
        """
    )
if __name__ == "__main__":
    demo.launch()