simonper committed
Commit 83cbea2 · verified · 1 Parent(s): 14d11ec

Update app.py

Files changed (1):
  1. app.py +76 -86
app.py CHANGED
@@ -1,107 +1,92 @@
  import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- from threading import Thread
-
- # --- 1. SETUP MODEL & TOKENIZER ---
- # User requested the BASE (Untrained) version, not Instruct.
- MODEL_ID = "meta-llama/Llama-3.2-1B"
-
- # Check for a GPU, otherwise fall back to CPU
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Loading base model on: {device}")
-
- try:
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-     model = AutoModelForCausalLM.from_pretrained(
-         MODEL_ID,
-         torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
-         device_map="auto"
-     )
-
-     # CRITICAL FIX FOR BASE MODELS:
-     # Base models often do not have a 'chat_template' defined in their config
-     # because they aren't meant for chat. We must manually assign the Llama 3
-     # template so the code doesn't crash when using apply_chat_template.
-     if tokenizer.chat_template is None:
-         print("Base model detected: Assigning default Llama 3 chat template...")
-         tokenizer.chat_template = (
-             "{% set loop_messages = messages %}"
-             "{% for message in loop_messages %}"
-             "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
-             "{% if loop.index0 == 0 %}"
-             "{% set content = '<|begin_of_text|>' + content %}"
-             "{% endif %}"
-             "{{ content }}"
-             "{% endfor %}"
-             "{% if add_generation_prompt %}"
-             "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
-             "{% endif %}"
          )
-     # Reuse EOS as the pad token so generation has a valid pad_token_id
-     tokenizer.pad_token_id = tokenizer.eos_token_id
-
- except Exception as e:
-     print(f"Error loading model. Ensure you have a valid HF_TOKEN and access to the gated repo. Error: {e}")
-     raise e

- # --- 2. GENERATION FUNCTION ---
  def respond(
      message,
      history: list[dict],
-     system_message_dummy,
      max_tokens,
      temperature,
      top_p,
      repetition_penalty,
      style_mode,
  ):
-     # Base models mostly ignore system prompts, but we include one for structure
-     system_prompt = "You are an AI assistant."
-     if style_mode == "Shakespeare":
-         system_prompt = "You are William Shakespeare. Speak in Early Modern English."
-     elif style_mode == "Funny/Ironic":
-         system_prompt = "You are a sarcastic comedian."
-
-     # Context Window Management
-     if len(history) > 10:
-         history = history[-10:]
-
-     # Build messages
-     messages = [{"role": "system", "content": system_prompt}]
-     for turn in history:
          messages.append({"role": turn['role'], "content": turn['content']})
      messages.append({"role": "user", "content": message})

-     # Apply Template
-     input_ids = tokenizer.apply_chat_template(
-         messages,
-         add_generation_prompt=True,
-         return_tensors="pt"
-     ).to(model.device)
-
-     terminators = [
-         tokenizer.eos_token_id,
-         tokenizer.convert_tokens_to_ids("<|eot_id|>")
-     ]
-
-     # Generate
-     outputs = model.generate(
-         input_ids,
-         max_new_tokens=int(max_tokens),
-         eos_token_id=terminators,
          temperature=float(temperature),
          top_p=float(top_p),
-         repetition_penalty=float(repetition_penalty),
-         do_sample=True,
      )

-     response = outputs[0][input_ids.shape[-1]:]
-     decoded_response = tokenizer.decode(response, skip_special_tokens=True)
-     return decoded_response

- # --- 3. GUI SETUP ---
- # (Kept identical to previous, just updated title)
  chatbot = gr.ChatInterface(
      respond,
      type="messages",
@@ -111,13 +96,18 @@ chatbot = gr.ChatInterface(
          gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
          gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
-         gr.Dropdown(choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"], value="Normal", label="Style"),
      ],
  )

  with gr.Blocks() as demo:
-     gr.Markdown("# Chat with Llama 3.2 1B (Base/Untrained)")
-     gr.Markdown("> **Warning:** You are running the base model. It will likely hallucinate or autocomplete text rather than chatting normally.")
      chatbot.render()

  if __name__ == "__main__":
 
  import gradio as gr
+ from llama_cpp import Llama
+ from transformers import AutoTokenizer
+
+
+ MODEL_REPO = "simonper/Llama-3.2-1B-bnb-4bit_untrained_gguf_4bit"
+ MODEL_FILE = "Llama-3.2-1B.Q4_K_M.gguf"
+
+
+ TOKENIZER_ID = "chthees/lora_model_full_finetome-tokenizer"
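+ # The tokenizer is used only to build the prompt string and to supply stop
+ # tokens in respond(); llama.cpp does the actual tokenization from the GGUF file.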
+
+ print("Loading Tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
+
+ print("Loading Model...")
+ llm = Llama.from_pretrained(
+     repo_id=MODEL_REPO,
+     filename=MODEL_FILE,
+     n_ctx=2048,
+     n_threads=2,
+     verbose=False
+ )
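+ # n_ctx caps prompt + completion at 2048 tokens; n_threads=2 is a conservative
+ # setting for a small CPU host and can be raised when more cores are available.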
+
+ # --- SYSTEM PROMPT LOGIC ---
+ def get_system_prompt(style_mode):
+     base_instruction = "You are a helpful and intelligent AI assistant."

+     prompts = {
+         "Normal": f"{base_instruction} Answer clearly and concisely.",
+         "Professional": (
+             f"{base_instruction} You are a senior corporate executive. "
+             "Your tone is strictly professional, polite, and business-oriented."
+         ),
+         "Shakespeare": (
+             f"{base_instruction} You are William Shakespeare. "
+             "Speak only in Early Modern English (thee, thou, hath). Be poetic and dramatic."
+         ),
+         "Funny/Ironic": (
+             f"{base_instruction} You are a sarcastic comedian. "
+             "Wrap your answers in dry humor, irony, and witty remarks."
          )
+     }
+     return prompts.get(style_mode, prompts["Normal"])
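+ # Unknown style values fall back to the "Normal" persona via dict.get(),
+ # so a new dropdown choice cannot crash respond() with a KeyError.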

+ # --- CORE RESPONSE FUNCTION ---
  def respond(
      message,
      history: list[dict],
+     system_message_dummy,
      max_tokens,
      temperature,
      top_p,
      repetition_penalty,
      style_mode,
  ):
+     messages = []
+
+     # Add System Persona
+     system_prompt = get_system_prompt(style_mode)
+     messages.append({"role": "system", "content": system_prompt})
+
+     # Add Conversation History
+     # We slice to the last 10 messages (5 exchanges) to keep the context window manageable
+     for turn in history[-10:]:
          messages.append({"role": turn['role'], "content": turn['content']})
+
+     # Add Current User Message
      messages.append({"role": "user", "content": message})

+     prompt_str = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
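+     # tokenize=False returns the formatted prompt as plain text; llama-cpp-python
+     # re-tokenizes it internally, so no tensors are needed here.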
+
+     # Generate Response
+     output = llm(
+         prompt_str,
+         max_tokens=int(max_tokens),
          temperature=float(temperature),
          top_p=float(top_p),
+         repeat_penalty=float(repetition_penalty),
+         stop=[tokenizer.eos_token, "<|eot_id|>"],
+         echo=False
      )
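+     # llm() returns an OpenAI-style completion dict; the generated text lives in
+     # output["choices"][0]["text"]. Generation halts at tokenizer.eos_token or at
+     # Llama 3's <|eot_id|> end-of-turn marker, whichever appears first.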

+     return output["choices"][0]["text"].strip()

+ # --- GUI SETUP ---
  chatbot = gr.ChatInterface(
      respond,
      type="messages",
 
          gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
          gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
+         gr.Dropdown(
+             choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"],
+             value="Normal",
+             label="Choose the Style / Tone"
+         )
      ],
  )
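+ # additional_inputs map positionally onto respond()'s parameters after
+ # (message, history): system_message_dummy, max_tokens, temperature, top_p,
+ # repetition_penalty, style_mode; the inputs before the sliders sit outside
+ # this hunk.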

  with gr.Blocks() as demo:
+     gr.Markdown("# Styled Chat Bot")
+     with gr.Sidebar():
+         gr.LoginButton()
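+         # LoginButton renders a Hugging Face sign-in control; it is intended
+         # for Spaces with OAuth enabled and is not consumed elsewhere in this app.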
      chatbot.render()

  if __name__ == "__main__":