Spaces:

MuangMuangE
/

test3

Sleeping

App Files Files Community

test3 / app.py

MuangMuangE

Create app.py

e2f81da verified 3 months ago

raw

history blame contribute delete

3.28 kB

	import gradio as gr
	from llama_cpp import Llama

	# ✏️ Change the two settings below (REPO_ID and FILENAME) to point at your own model.
	# Fine-tuned model: "YOUR_USERNAME/qwen3-4b-ft-gguf"
	# Public model example: "unsloth/Qwen3-0.6B-GGUF" (for testing)
	REPO_ID = "unsloth/Qwen3-4B-GGUF" # ✏️ HF Hub repository ID
	# REPO_ID = "MuangMuangE/Qwen3-4B-GGUF" # ✏️ HF Hub repository ID
	FILENAME = "Qwen3-4B-Q4_K_M.gguf" # ✏️ file name (.gguf extension required)

	# ✏️ System prompt — defines the chatbot's role and tone.
	SYSTEM_PROMPT = "당신은 친절한 한국어 AI 어시스턴트입니다."

	# ⚠️ Loading the model right at app startup triggers the HF Spaces health-check (30 min) timeout.
	# → Fix: defer model loading until the first user message (lazy loading).
	# `llm` holds the loaded model and stays None until get_model() first runs.
	llm = None

	def get_model():
	    """Return the shared Llama instance, creating it on first use.

	    The first call downloads and loads the GGUF file identified by
	    REPO_ID / FILENAME through ``Llama.from_pretrained``. Every later
	    call returns the instance cached in the module-level ``llm``.

	    Returns:
	        The shared ``Llama`` model object.
	    """
	    global llm

	    if llm is not None:
	        # Already loaded by an earlier call; reuse it.
	        return llm

	    load_options = {
	        # ✏️ Context length, kept small to save memory
	        # (the original note gives the model maximum as 40960).
	        "n_ctx": 2048,
	        # ✏️ Matches the 2 vCPUs of the CPU Basic tier.
	        "n_threads": 2,
	        # Hide the detailed log output during loading.
	        "verbose": False,
	    }
	    llm = Llama.from_pretrained(
	        repo_id=REPO_ID,
	        filename=FILENAME,
	        **load_options,
	    )
	    return llm

	def _content_to_text(content):
	    """Flatten one history entry's content into a plain string."""
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content
	    if isinstance(content, (list, tuple)):
	        # Structured content (a list of parts): keep the text parts and
	        # skip anything without text, which a text-only model cannot use.
	        texts = []
	        for part in content:
	            if isinstance(part, str):
	                texts.append(part)
	            elif isinstance(part, dict) and isinstance(part.get("text"), str):
	                texts.append(part["text"])
	        return "".join(texts)
	    return str(content)


	def _history_to_messages(history):
	    """Convert Gradio chat history into a list of role/content message dicts."""
	    converted = []
	    for entry in history or []:
	        if isinstance(entry, dict):
	            # "messages" format: keep only the keys the chat template needs.
	            converted.append(
	                {
	                    "role": entry["role"],
	                    "content": _content_to_text(entry.get("content")),
	                }
	            )
	            continue

	        # Legacy pair format: (user_text, assistant_text); either may be None.
	        user_text, assistant_text = entry
	        if user_text is not None:
	            converted.append(
	                {"role": "user", "content": _content_to_text(user_text)}
	            )
	        if assistant_text is not None:
	            converted.append(
	                {"role": "assistant", "content": _content_to_text(assistant_text)}
	            )
	    return converted


	def respond(message, history):
	    """Stream a reply to ``message``, given the earlier conversation.

	    Args:
	        message: The current user input.
	        history: Previous turns, managed by Gradio's ChatInterface. Both
	            role/content dicts and legacy ``[user, assistant]`` pairs are
	            accepted, because the format depends on the Gradio version and
	            configuration; ``None`` or an empty list means no prior turns.

	    Yields:
	        The reply accumulated so far, once per streamed chunk, so the UI
	        can update in real time.
	    """
	    # The first call loads the model (the UI text says this may take 1-2 minutes).
	    model = get_model()

	    # Conversation = system prompt + earlier turns + the current input.
	    messages = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	        *_history_to_messages(history),
	        {"role": "user", "content": message},
	    ]

	    stream = model.create_chat_completion(
	        messages=messages,
	        temperature=0.7,  # ✏️ creativity (0.0 = deterministic, 1.0 = creative)
	        max_tokens=512,  # ✏️ maximum reply length
	        stream=True,  # emit tokens as they are generated
	    )

	    response = ""
	    for chunk in stream:
	        # Some chunks carry no text: the key may be missing or explicitly
	        # None, so fall back to "" instead of concatenating None.
	        delta = chunk["choices"][0]["delta"].get("content") or ""
	        response += delta
	        yield response  # hand the partial reply to Gradio

	# ✏️ Build the chat UI: gr.ChatInterface generates the chatbot layout
	# automatically and calls `respond` for every user message.
	demo = gr.ChatInterface(
	    respond,
	    examples=["안녕하세요!", "파이썬이란 무엇인가요?"],  # ✏️ example questions
	    description="첫 응답 시 모델을 로딩합니다 (1~2분 소요)",  # ✏️ description
	    title="Qwen3 GGUF 챗봇",  # ✏️ title
	)

	# server_port=7860 is the HF Spaces default port;
	# server_name="0.0.0.0" allows external access (required under Docker).
	demo.launch(
	    server_port=7860,
	    server_name="0.0.0.0",
	)