Marcus719 committed on
Commit
d9a1250
·
verified ·
1 Parent(s): 2a8403d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +254 -24
app.py CHANGED
@@ -1,30 +1,260 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForCausalLM
3
 
4
- model_name = "Marcus719/Llama-3.2-3B-Instruct-Lab2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- # load tokenizer and model
7
- tokenizer = AutoTokenizer.from_pretrained(model_name)
8
- model = AutoModelForCausalLM.from_pretrained(
9
- model_name,
10
- low_cpu_mem_usage=True,
11
- device_map="auto"
12
- )
13
 
14
- # define generate function
15
- def generate_text(input_text):
16
- inputs = tokenizer(input_text, return_tensors="pt")
17
- outputs = model.generate(inputs["input_ids"], max_length=100, num_return_sequences=1)
18
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
19
-
20
- # create gradio interface
21
- interface = gr.Interface(
22
- fn=generate_text,
23
- inputs="text",
24
- outputs="text",
25
- title="Hugging Face model Demo",
26
- description="say something"
27
  )
28
 
29
- # launch the app
30
- interface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
 
3
+ from huggingface_hub import hf_hub_download
4
+ from llama_cpp import Llama
5
+ import os
6
+
7
+ # ============================================
8
+ # 配置区域 - KTH ID2223 Lab 2
9
+ # ============================================
10
+ MODEL_REPO = "Marcus719/Llama-3.2-3B-Instruct-FineTome-Lab2-GGUF"
11
+ # ⚠️ 请确认你仓库中的 GGUF 文件名,常见格式:
12
+ # - unsloth.Q4_K_M.gguf (推荐,较小较快)
13
+ # - unsloth.Q8_0.gguf (更精确但较慢)
14
+ MODEL_FILENAME = "unsloth.Q4_K_M.gguf"
15
+
16
+ # ============================================
17
+ # 下载并加载模型
18
+ # ============================================
19
+ print(f"📥 Downloading model from {MODEL_REPO}...")
20
+
21
+ try:
22
+ model_path = hf_hub_download(
23
+ repo_id=MODEL_REPO,
24
+ filename=MODEL_FILENAME,
25
+ )
26
+ print(f"✅ Model downloaded: {model_path}")
27
+ except Exception as e:
28
+ print(f"❌ Error downloading model: {e}")
29
+ print("Please check MODEL_FILENAME matches your repository file.")
30
+ raise e
31
+
32
+ print("🔄 Loading model (this may take a few minutes on CPU)...")
33
+
34
 
 
 
 
 
 
 
 
35
 
36
+ # 加载 GGUF 模型 - 针对 HuggingFace Spaces 免费 CPU 优化
37
+ llm = Llama(
38
+ model_path=model_path,
39
+ n_ctx=2048, # 上下文长度 (降低以节省内存)
40
+ n_threads=2, # HF Spaces 免费 CPU 线程数
41
+ n_gpu_layers=0, # 纯 CPU 推理
42
+ verbose=False
 
 
 
 
 
 
43
  )
44
 
45
+ print("✅ Model loaded successfully!")
46
+
47
# ============================================
# Llama 3.2 Instruct chat template
# ============================================
def format_prompt(message: str, history: list, system_prompt: str) -> str:
    """Build a Llama 3.2 Instruct prompt string from a chat transcript.

    See https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_2
    for the special-token layout.
    """
    segments = [
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
    ]

    # Replay prior turns, skipping any empty halves of a (user, assistant) pair.
    for past_user, past_assistant in history:
        if past_user:
            segments.append(
                f"<|start_header_id|>user<|end_header_id|>\n\n{past_user}<|eot_id|>"
            )
        if past_assistant:
            segments.append(
                f"<|start_header_id|>assistant<|end_header_id|>\n\n{past_assistant}<|eot_id|>"
            )

    # Current user message, then an open assistant header for the model to complete.
    segments.append(f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>")
    segments.append("<|start_header_id|>assistant<|end_header_id|>\n\n")

    return "".join(segments)
69
+
70
# ============================================
# Response generation (streaming output)
# ============================================
def chat(message: str, history: list, system_prompt: str, max_tokens: int, temperature: float, top_p: float):
    """Stream a reply from the fine-tuned LLM, yielding the growing text."""

    full_prompt = format_prompt(message, history, system_prompt)

    partial = ""
    # llama-cpp streaming call: yields chunks until a stop token or max_tokens.
    token_stream = llm(
        full_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        stream=True
    )

    for piece in token_stream:
        partial += piece["choices"][0]["text"]
        yield partial
92
+
93
# ============================================
# Gradio UI
# ============================================
# Default system prompt; the user can edit it under "Advanced Settings".
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

with gr.Blocks(
    theme=gr.themes.Soft(),
    title="🦙 Llama 3.2 Fine-tuned ChatBot | KTH ID2223 Lab 2"
) as demo:

    gr.Markdown(
        """
        # 🦙 Llama 3.2 3B Instruct - Fine-tuned on FineTome Dataset

        **KTH ID2223 Scalable Machine Learning - Lab 2**

        This chatbot uses a Llama 3.2 3B model fine-tuned on the [FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k) instruction dataset using QLoRA (4-bit quantization with LoRA adapters).

        📦 **Model**: [Marcus719/Llama-3.2-3B-Instruct-FineTome-Lab2-GGUF](https://huggingface.co/Marcus719/Llama-3.2-3B-Instruct-FineTome-Lab2-GGUF)
        """
    )

    # Conversation display; history is a list of [user, assistant] pairs.
    chatbot = gr.Chatbot(
        label="Conversation",
        height=450,
        show_copy_button=True,
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here...",
            scale=4,
            container=False,
            autofocus=True
        )
        submit_btn = gr.Button("Send 🚀", scale=1, variant="primary")

    # Sampling controls, collapsed by default.
    with gr.Accordion("⚙️ Advanced Settings", open=False):
        system_prompt = gr.Textbox(
            label="System Prompt",
            value=DEFAULT_SYSTEM_PROMPT,
            lines=4
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=64,
                maximum=1024,
                value=256,
                step=32,
                label="Max Tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p"
            )

    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear Chat")
        retry_btn = gr.Button("🔄 Regenerate")

    # Example prompts
    gr.Examples(
        examples=[
            "Hello! Can you introduce yourself?",
            "Explain machine learning in simple terms.",
            "What is the difference between supervised and unsupervised learning?",
            "Write a short poem about artificial intelligence.",
            "How does fine-tuning improve a language model?",
        ],
        inputs=msg,
        label="💡 Example Prompts"
    )

    # Event handlers
    def user_input(message, history):
        """Clear the textbox and append the user turn with a pending (None) reply slot."""
        return "", history + [[message, None]]

    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        """Stream the model's reply into the last history entry (generator)."""
        if not history:
            # In a generator this `return` just ends iteration; the value is unused.
            return history
        message = history[-1][0]
        # Exclude the pending turn itself from the context sent to the model.
        history_for_model = history[:-1]

        for response in chat(message, history_for_model, system_prompt, max_tokens, temperature, top_p):
            history[-1][1] = response
            yield history

    def retry_last(history, system_prompt, max_tokens, temperature, top_p):
        """Re-generate the assistant reply for the most recent user message."""
        if history and len(history) > 0:
            # Drop the old reply, then stream a fresh one for the same message.
            history[-1][1] = None
            message = history[-1][0]
            history_for_model = history[:-1]

            for response in chat(message, history_for_model, system_prompt, max_tokens, temperature, top_p):
                history[-1][1] = response
                yield history

    # Submit message (Enter key): add the user turn un-queued, then stream the reply.
    msg.submit(
        user_input,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )

    # Same flow for the Send button.
    submit_btn.click(
        user_input,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )

    # Clear chat
    clear_btn.click(lambda: [], None, chatbot, queue=False)

    # Retry last response
    retry_btn.click(
        retry_last,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )

    gr.Markdown(
        """
        ---
        ### 📝 About This Project

        **Fine-tuning Details:**
        - Base Model: `meta-llama/Llama-3.2-3B-Instruct`
        - Dataset: [FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k)
        - Method: QLoRA (4-bit quantization + LoRA)
        - Framework: [Unsloth](https://github.com/unslothai/unsloth)

        **Tips:**
        - Lower temperature (0.1-0.5) for more focused responses
        - Higher temperature (0.7-1.0) for creative responses
        - Adjust max tokens based on expected response length

        Built with ❤️ using Gradio & llama.cpp | KTH ID2223 Lab 2
        """
    )

# ============================================
# Launch
# ============================================
if __name__ == "__main__":
    # queue() enables generator (streaming) event handlers on Spaces.
    demo.queue().launch()