"""Streamlit demo: chat with a CPU-only GGUF model served through llama.cpp.

The model is downloaded (if missing) and loaded by a background thread so the
page can show progress. Streamlit re-executes this script on every
interaction, so loader state is kept in a process-wide ``st.cache_resource``
object instead of module globals or ``st.session_state`` (the latter is not
usable from a thread that has no script-run context).
"""

import os
import threading
import time

import streamlit as st
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ===== Configuration (CPU-friendly model) =====
MODEL_REPO = "Qwen/Qwen2.5-1.5B-Instruct-GGUF"
MODEL_FILENAME = "qwen2.5-1_5b-instruct-q4_k_m.gguf"
MODEL_DIR = "/app/models"  # Spaces persistent directory

# Seconds to wait between automatic refreshes while the model is loading.
_POLL_INTERVAL_SECONDS = 1.0

# Legacy module-level names. They are re-bound from the shared loader state
# on every script run (see below); on their own they would be reset by each
# rerun, so they must not be treated as the source of truth.
llm_instance = None
model_loading = False
model_error = None


class _ModelState:
    """Loader state shared by every session and every rerun of the script."""

    def __init__(self):
        self.lock = threading.Lock()
        self.loading = False   # True while a loader thread is running
        self.status = "idle"   # one of the keys of ``status_map`` below
        self.load_time = 0.0   # seconds spent constructing the Llama object
        self.error = None      # str(exception) of the last failed attempt
        self.llm = None        # the loaded Llama instance, once ready

    def begin_load(self):
        """Atomically claim the loader role.

        Returns:
            True if the caller must now run the load; False if a load is
            already in progress or the model is already available.
        """
        with self.lock:
            if self.loading or self.llm is not None:
                return False
            self.loading = True
            self.error = None
            self.status = "idle"
            return True


@st.cache_resource(show_spinner=False)
def _get_model_state():
    """Return the single process-wide ``_ModelState`` instance."""
    return _ModelState()


def background_model_load(state=None):
    """Download (if needed) and load the model; meant to run in a thread.

    Progress is reported only through ``state`` so that no Streamlit API is
    touched from the worker thread.

    Args:
        state: A ``_ModelState`` on which the caller has ALREADY obtained
            ``begin_load() == True``. When omitted, the shared state is
            looked up and claimed here, and the call returns immediately if
            another load is running or the model is already loaded.

    Any exception is caught and recorded in ``state.error`` with
    ``state.status == "error"``; nothing is raised to the caller.
    """
    if state is None:
        state = _get_model_state()
        if not state.begin_load():
            return

    try:
        os.makedirs(MODEL_DIR, exist_ok=True)
        model_path = os.path.join(MODEL_DIR, MODEL_FILENAME)

        # Skip the download when the file is already on disk.
        if not os.path.exists(model_path):
            state.status = "downloading"
            # ``resume_download`` is intentionally not passed: it is
            # deprecated in huggingface_hub and interrupted downloads are
            # resumed by default.
            model_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILENAME,
                local_dir=MODEL_DIR,
                token=None,  # public model, no token required
            )
            state.status = "downloaded"

        # Load the model into memory.
        state.status = "loading"
        start = time.time()
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_threads=4,     # Spaces CPU instances usually have 4 cores
            n_gpu_layers=0,  # CPU only
            verbose=False,
            n_batch=512,     # batch size tuning
        )
        # Publish the model and timing BEFORE flipping the status, so a
        # reader that observes "ready" always finds both already set.
        state.load_time = time.time() - start
        state.llm = llm
        state.status = "ready"
    except Exception as e:  # top-level boundary of the worker thread
        state.error = str(e)
        state.status = "error"
    finally:
        with state.lock:
            state.loading = False


# ===== Streamlit UI =====
st.set_page_config(page_title="🦙 CPU LLM Demo", page_icon="🦙", layout="wide")

shared_state = _get_model_state()

# First run of a session: initialise the session keys and start the loader
# thread, unless another session already did (or the model is loaded).
if "download_status" not in st.session_state:
    st.session_state.download_status = "idle"
    st.session_state.load_time = 0
    if shared_state.begin_load():
        threading.Thread(
            target=background_model_load,
            args=(shared_state,),
            daemon=True,
        ).start()

# Mirror the shared loader state into this run. The status is read first:
# the loader sets ``llm``/``load_time`` before "ready", so a "ready" status
# read here guarantees the values read afterwards are populated.
current_status = shared_state.status
st.session_state.download_status = current_status
st.session_state.load_time = shared_state.load_time
llm_instance = shared_state.llm
model_loading = shared_state.loading
model_error = shared_state.error

# Top status bar
col1, col2 = st.columns([3, 1])
with col1:
    status_map = {
        "idle": "⏳ 准备加载模型...",
        "downloading": "⬇️ 正在下载模型 (1.0GB)...",
        "downloaded": "✅ 模型下载完成,正在加载到内存...",
        "loading": "🧠 正在加载模型到内存(约60-90秒)...",
        "ready": f"✅ 模型就绪!加载耗时 {st.session_state.load_time:.1f} 秒",
        "error": f"❌ 加载失败: {model_error}",
    }
    st.info(status_map.get(st.session_state.download_status, "❓ 未知状态"))
with col2:
    st.caption("💡 首次加载需1-2分钟 | 休眠后需重新下载")

# Chat is disabled until the model is ready.
if current_status != "ready":
    if current_status != "error":
        # Still in progress: refresh shortly so the status bar advances.
        # Without this the page would stay on its first status forever.
        time.sleep(_POLL_INTERVAL_SECONDS)
        st.rerun()
    st.stop()

# Chat interface
st.title("🦙 本地CPU大模型 (Qwen2.5-1.5B)")
st.caption("完全离线运行 · 无外部API调用 · 适合演示用途")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

# User input
if prompt := st.chat_input("问点什么吧..."):
    # Store and echo the user message.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate the reply.
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""

        # System prompt followed by the whole history (including the
        # message that was just appended).
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            *[
                {"role": m["role"], "content": m["content"]}
                for m in st.session_state.messages
            ],
        ]

        # Streamed generation (slow on CPU).
        # NOTE(review): the model object is shared by all sessions; confirm
        # whether concurrent generation needs to be serialised.
        try:
            for chunk in llm_instance.create_chat_completion(
                messages=messages,
                max_tokens=256,  # cap the length to avoid timeouts
                temperature=0.7,
                stream=True,
            ):
                delta = chunk["choices"][0]["delta"]
                if "content" in delta:
                    full_response += delta["content"]
                    # Trailing cursor glyph while tokens are arriving.
                    message_placeholder.markdown(full_response + "▌")

            message_placeholder.markdown(full_response)
            st.session_state.messages.append(
                {"role": "assistant", "content": full_response}
            )

        except Exception as e:
            st.error(f"生成失败: {str(e)}")
            message_placeholder.markdown("❌ 生成超时,请缩短问题长度重试")