# llama / app.py
# han145's picture
# Update app.py
# ed809ec verified
import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import time
import threading
# ===== Configuration (CPU-friendly model) =====
# Hugging Face repository and GGUF file to fetch from it.
MODEL_REPO, MODEL_FILENAME = (
    "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    "qwen2.5-1_5b-instruct-q4_k_m.gguf",
)
# Directory the model file is stored in. The original note calls this the
# Spaces persistent directory -- TODO confirm it really survives restarts.
MODEL_DIR = "/app/models"

# Module-level model state, intended to avoid loading the model repeatedly:
# the loaded model object, an "in progress" flag, and the last error text.
# NOTE(review): Streamlit re-executes the script on every rerun, so these
# names are re-initialised each time -- verify they are not relied on to
# persist across reruns.
llm_instance, model_loading, model_error = None, False, None
# Only the most recent messages are sent to the model. This is a simple
# heuristic to keep the prompt inside the 2048-token context window as a
# conversation grows (each reply is capped at 256 tokens below).
MAX_HISTORY_MESSAGES = 8


@st.cache_resource(show_spinner=False)
def _load_model_once():
    """Download the GGUF file if it is missing, then load it with llama.cpp.

    Wrapped in ``st.cache_resource`` so the returned object is kept across
    script reruns and sessions instead of being rebuilt every time the
    script executes.

    Returns:
        tuple: ``(llm, load_seconds)`` where ``llm`` is the ``Llama``
        instance and ``load_seconds`` is how long constructing it took.

    Raises:
        Exception: anything raised by ``hf_hub_download`` or ``Llama``
        propagates to the caller.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)
    model_path = os.path.join(MODEL_DIR, MODEL_FILENAME)

    # Skip the hub round-trip entirely when the file is already on disk.
    if not os.path.exists(model_path):
        # ``resume_download`` is deprecated in huggingface_hub (interrupted
        # downloads are resumed automatically) and ``token=None`` is the
        # default, so neither is passed any more.
        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            local_dir=MODEL_DIR,
        )

    start = time.time()
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        # Never ask for more threads than the machine has cores.
        n_threads=min(4, os.cpu_count() or 1),
        n_gpu_layers=0,  # CPU only
        verbose=False,
        n_batch=512,
    )
    return llm, time.time() - start


def background_model_load():
    """Load the model and publish the outcome through the module globals.

    The name is kept for compatibility, but this function must be called
    from the Streamlit script thread, not from a worker thread. It never
    touches ``st.session_state`` (a bare thread has no script-run context,
    so such writes do not reach the user's session); the actual work is
    delegated to the cached ``_load_model_once``.

    Side effects:
        Sets ``llm_instance`` (the model, or ``None`` on failure),
        ``model_error`` (error text, or ``None`` on success) and toggles
        ``model_loading`` while the load is running.

    Returns:
        The load time in seconds on success, ``None`` on failure.
    """
    global llm_instance, model_loading, model_error
    model_loading = True
    try:
        llm_instance, load_seconds = _load_model_once()
        model_error = None
        return load_seconds
    except Exception as e:  # UI boundary: report the failure, don't crash
        llm_instance = None
        model_error = str(e)
        return None
    finally:
        model_loading = False


# ===== Streamlit UI =====
st.set_page_config(page_title="🦙 CPU LLM Demo", page_icon="🦙", layout="wide")

# Top status bar: a placeholder that is overwritten as loading progresses.
col1, col2 = st.columns([3, 1])
with col1:
    status_box = st.empty()
with col2:
    st.caption("💡 首次加载需1-2分钟 | 休眠后需重新下载")

# Show what is about to happen before the (possibly long) blocking call.
if os.path.exists(os.path.join(MODEL_DIR, MODEL_FILENAME)):
    status_box.info("🧠 正在加载模型到内存(约60-90秒)...")
else:
    status_box.info("⬇️ 正在下载模型 (1.0GB)...")

load_time = background_model_load()

# Block the chat UI when the model could not be loaded. Nothing is cached
# on failure, so the next rerun simply tries again.
if llm_instance is None:
    st.session_state.download_status = "error"
    status_box.error(f"❌ 加载失败: {model_error}")
    st.stop()

# Written from the script thread, so these session keys are reliable now.
st.session_state.download_status = "ready"
st.session_state.load_time = load_time
status_box.info(f"✅ 模型就绪!加载耗时 {load_time:.1f} 秒")

# Chat interface
st.title("🦙 本地CPU大模型 (Qwen2.5-1.5B)")
st.caption("完全离线运行 · 无外部API调用 · 适合演示用途")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

# Handle a new user message.
if prompt := st.chat_input("问点什么吧..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""

        # System prompt plus the most recent turns only (see
        # MAX_HISTORY_MESSAGES); the full history stays in session state.
        recent = st.session_state.messages[-MAX_HISTORY_MESSAGES:]
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            *({"role": m["role"], "content": m["content"]} for m in recent),
        ]

        # Streamed generation; slow on CPU, so partial text is shown as it
        # arrives.
        # NOTE(review): the model object is shared by every session; confirm
        # whether concurrent generations need to be serialised.
        try:
            for chunk in llm_instance.create_chat_completion(
                messages=messages,
                max_tokens=256,  # cap reply length to keep latency bounded
                temperature=0.7,
                stream=True,
            ):
                # Some chunks carry no text: the key may be absent or None.
                piece = chunk["choices"][0]["delta"].get("content")
                if piece:
                    full_response += piece
                    message_placeholder.markdown(full_response + "▌")
            message_placeholder.markdown(full_response)
            st.session_state.messages.append(
                {"role": "assistant", "content": full_response}
            )
        except Exception as e:
            st.error(f"生成失败: {str(e)}")
            message_placeholder.markdown("❌ 生成超时,请缩短问题长度重试")