| import streamlit as st |
| from llama_cpp import Llama |
| from huggingface_hub import hf_hub_download |
| import os |
| import time |
| import threading |
|
|
| |
# Hugging Face repository and GGUF file name of the quantized model to serve.
MODEL_REPO = "Qwen/Qwen2.5-1.5B-Instruct-GGUF"
MODEL_FILENAME = "qwen2.5-1_5b-instruct-q4_k_m.gguf"
# Directory the model file is stored in. Defaults to the original hard-coded
# location; set LLM_MODEL_DIR to run somewhere /app is not writable. An empty
# value is treated the same as an unset one.
MODEL_DIR = os.environ.get("LLM_MODEL_DIR") or "/app/models"
|
|
| |
# Seconds to wait between status polls while the model is still loading.
_POLL_SECONDS = 1.0


@st.cache_resource(show_spinner=False)
def _shared_model_state():
    """Return the single model-state dict shared by every rerun and session.

    Streamlit re-executes this script on each rerun, so plain module-level
    globals are re-initialised every time and cannot carry the loaded model
    from the worker thread to later runs. ``st.cache_resource`` hands back
    the same object on every call, which makes this dict the one place where
    the worker publishes progress and the page reads it.

    Keys:
        lock: guards the "is a load already running?" check.
        infer_lock: serialises generations on the shared model.
        loading: True while a worker is downloading/loading.
        status: one of idle / downloading / downloaded / loading / ready / error.
        llm: the loaded ``Llama`` instance, or None.
        error: text of the last load failure, or None.
        load_time: seconds spent constructing the ``Llama`` instance.
    """
    return {
        "lock": threading.Lock(),
        "infer_lock": threading.Lock(),
        "loading": False,
        "status": "idle",
        "llm": None,
        "error": None,
        "load_time": 0.0,
    }


def background_model_load(state=None):
    """Download the model if it is missing, then load it into memory.

    Intended to run in a worker thread so the Streamlit script thread is not
    blocked. Progress is written to the shared state dict only; the function
    never touches ``st.session_state``, because a plain worker thread has no
    Streamlit script-run context and such writes would not reach the session.

    Args:
        state: the dict produced by ``_shared_model_state()``. When omitted it
            is looked up here; callers on the script thread should pass it in
            so the worker does not need to call into Streamlit at all.

    Returns:
        None. Results are published through ``state``: on success ``llm``,
        ``load_time`` and ``status == "ready"``; on failure ``error`` and
        ``status == "error"``. Exceptions are recorded, not raised.
    """
    shared = _shared_model_state() if state is None else state

    # Atomically claim the load so concurrent sessions start at most one.
    with shared["lock"]:
        if shared["loading"] or shared["llm"] is not None:
            return
        shared["loading"] = True
        shared["error"] = None
        shared["status"] = "idle"

    try:
        os.makedirs(MODEL_DIR, exist_ok=True)
        model_path = os.path.join(MODEL_DIR, MODEL_FILENAME)

        if not os.path.exists(model_path):
            shared["status"] = "downloading"
            # `resume_download` is deprecated in huggingface_hub and is
            # therefore no longer passed.
            model_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILENAME,
                local_dir=MODEL_DIR,
            )
            shared["status"] = "downloaded"

        shared["status"] = "loading"
        start = time.time()
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_threads=4,
            n_gpu_layers=0,
            verbose=False,
            n_batch=512,
        )
        shared["load_time"] = time.time() - start
        # Publish the model before flipping the status so a reader that sees
        # "ready" always finds a usable instance.
        shared["llm"] = llm
        shared["status"] = "ready"
    except Exception as e:  # thread boundary: record the failure for the UI
        shared["error"] = str(e)
        shared["status"] = "error"
    finally:
        shared["loading"] = False


st.set_page_config(page_title="🦙 CPU LLM Demo", page_icon="🦙", layout="wide")

model_state = _shared_model_state()

# The first run of a browser session requests a load (which also retries
# after an earlier failure); later reruns only observe the shared state.
if "download_status" not in st.session_state and model_state["llm"] is None:
    threading.Thread(
        target=background_model_load,
        args=(model_state,),
        daemon=True,
    ).start()
    # The worker has only just been started; do not show a stale status
    # (for example an "error" left over from a previous attempt).
    current_status = "idle"
else:
    current_status = model_state["status"]

# Keep the session keys populated for this run.
st.session_state.download_status = current_status
st.session_state.load_time = model_state["load_time"]
model_error = model_state["error"]

col1, col2 = st.columns([3, 1])
with col1:
    status_map = {
        "idle": "⏳ 准备加载模型...",
        "downloading": "⬇️ 正在下载模型 (1.0GB)...",
        "downloaded": "✅ 模型下载完成,正在加载到内存...",
        "loading": "🧠 正在加载模型到内存(约60-90秒)...",
        "ready": f"✅ 模型就绪!加载耗时 {st.session_state.load_time:.1f} 秒",
        "error": f"❌ 加载失败: {model_error}",
    }
    st.info(status_map.get(current_status, "❓ 未知状态"))
with col2:
    st.caption("💡 首次加载需1-2分钟 | 休眠后需重新下载")

if current_status == "error":
    # Terminal state: nothing more will change until a new session retries.
    st.stop()

if current_status != "ready":
    # Nothing on the page triggers a rerun while loading, so poll: wait a
    # moment, then re-execute the script to pick up the worker's progress.
    time.sleep(_POLL_SECONDS)
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()

llm = model_state["llm"]

st.title("🦙 本地CPU大模型 (Qwen2.5-1.5B)")
st.caption("完全离线运行 · 无外部API调用 · 适合演示用途")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

if prompt := st.chat_input("问点什么吧..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""

        # System prompt followed by the whole history, newest message last.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            *[
                {"role": m["role"], "content": m["content"]}
                for m in st.session_state.messages
            ],
        ]

        try:
            # The model is shared by all sessions; run one generation at a
            # time rather than driving the same instance concurrently.
            with model_state["infer_lock"]:
                for chunk in llm.create_chat_completion(
                    messages=messages,
                    max_tokens=256,
                    temperature=0.7,
                    stream=True,
                ):
                    piece = chunk["choices"][0]["delta"].get("content")
                    if piece:
                        full_response += piece
                        # Trailing block cursor marks the reply as in progress.
                        message_placeholder.markdown(full_response + "▌")
            message_placeholder.markdown(full_response)
            st.session_state.messages.append(
                {"role": "assistant", "content": full_response}
            )
        except Exception as e:
            st.error(f"生成失败: {str(e)}")
            message_placeholder.markdown("❌ 生成超时,请缩短问题长度重试")