Update app.py
app.py CHANGED
@@ -1,125 +1,123 @@

This commit swaps the llama.cpp/GGUF backend for a transformers-based loader of the base (non-instruct) meta-llama/Llama-3.2-1B checkpoint, manually attaching a Llama 3 chat template since base checkpoints ship without one.
Removed (old llama.cpp/GGUF backend). The diff viewer preserved only fragments of the removed lines; unrecoverable removed lines are collapsed to `-…`:

 import gradio as gr
-…

-# …
-…
-    filename="Llama-3.2-1B.Q8_0.gguf",
-    n_ctx=2048,
-    n_threads=2,
-    verbose=False
-)

-# …
-…
-    Formats the conversation using official Llama 3 special tokens.
-    """
-    formatted_prompt = "<|begin_of_text|>"
-
-    # Add System Message
-    formatted_prompt += f"<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>"
-
-    # Add History
-    for turn in history:
-        role = turn['role']
-        content = turn['content']
-        formatted_prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
-
-    # Add Current User Message
-    formatted_prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|>"
-
-    # Add Assistant Header (ready for generation)
-    formatted_prompt += f"<|start_header_id|>assistant<|end_header_id|>\n\n"
-
-    return formatted_prompt

-…   (old lines 37-64, not preserved: apparently the per-style prompt strings and the completion call; only stray quotes and closing parentheses survived)

 def respond(
     message,
     history: list[dict],
-    system_message_dummy,
     max_tokens,
     temperature,
     top_p,
     repetition_penalty,
     style_mode,
 ):
-    …

     if len(history) > 10:
         history = history[-10:]

-    # …
-    …

-    # …
-    …
         temperature=float(temperature),
         top_p=float(top_p),
-    …
-        echo=False
     )

-    …

 # --- 3. GUI SETUP ---
 chatbot = gr.ChatInterface(
     respond,
     type="messages",
     additional_inputs=[
         gr.Textbox(value="", label="System Prompt (Hidden)", visible=False),
-        …
         gr.Slider(minimum=1, maximum=1024, value=512, label="Max New Tokens"),
         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
-        …
         gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
-        …
-        gr.Dropdown(
-            choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"],
-            value="Normal",
-            label="Choose the Style / Tone"
-        )
     ],
 )

 with gr.Blocks() as demo:
-    gr.Markdown("# …
-    …
-    gr.LoginButton()
     chatbot.render()

 if __name__ == "__main__":
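For reference, the removed formatter built raw Llama 3 prompt strings of this shape (one user turn, empty history; {system_message} and {user_message} stand for the runtime values):

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>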
Added (new transformers backend):

 import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer  # streamer is unused below; see the note after the diff
+from threading import Thread  # likewise unused below

+# --- 1. SETUP MODEL & TOKENIZER ---
+# User requested the BASE (untrained) version, not Instruct.
+MODEL_ID = "meta-llama/Llama-3.2-1B"

+# Check for a GPU, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Loading base model on: {device}")

+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+        device_map="auto"
+    )

+    # CRITICAL FIX FOR BASE MODELS:
+    # Base models often do not have a 'chat_template' defined in their config
+    # because they aren't meant for chat. We must manually assign the Llama 3
+    # template so the code doesn't crash when calling apply_chat_template.
+    if tokenizer.chat_template is None:
+        print("Base model detected: assigning default Llama 3 chat template...")
+        tokenizer.chat_template = (
+            "{% set loop_messages = messages %}"
+            "{% for message in loop_messages %}"
+            "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
+            "{% if loop.index0 == 0 %}"
+            "{% set content = '<|begin_of_text|>' + content %}"
+            "{% endif %}"
+            "{{ content }}"
+            "{% endfor %}"
+            "{% if add_generation_prompt %}"
+            "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
+            "{% endif %}"
         )
+    # Llama 3 ships without a pad token; reuse EOS so generation can pad safely.
+    tokenizer.pad_token_id = tokenizer.eos_token_id

+except Exception as e:
+    print(f"Error loading model. Ensure you have a valid HF_TOKEN and access to the gated repo. Error: {e}")
+    raise

+# --- 2. GENERATION FUNCTION ---
 def respond(
     message,
     history: list[dict],
+    system_message_dummy,
     max_tokens,
     temperature,
     top_p,
     repetition_penalty,
     style_mode,
 ):
+    # Base models mostly ignore system prompts, but we include one for structure.
+    system_prompt = "You are an AI assistant."
+    if style_mode == "Shakespeare":
+        system_prompt = "You are William Shakespeare. Speak in Early Modern English."
+    elif style_mode == "Funny/Ironic":
+        system_prompt = "You are a sarcastic comedian."
+    elif style_mode == "Professional":
+        system_prompt = "You are a formal, professional assistant."

+    # Context window management: keep only the last 10 turns.
     if len(history) > 10:
         history = history[-10:]

+    # Build the message list.
+    messages = [{"role": "system", "content": system_prompt}]
+    for turn in history:
+        messages.append({"role": turn['role'], "content": turn['content']})
+    messages.append({"role": "user", "content": message})

+    # Apply the chat template.
+    input_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    ).to(model.device)

+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>")
+    ]

+    # Generate.
+    outputs = model.generate(
+        input_ids,
+        max_new_tokens=int(max_tokens),
+        eos_token_id=terminators,
         temperature=float(temperature),
         top_p=float(top_p),
+        repetition_penalty=float(repetition_penalty),
+        do_sample=True,
+        pad_token_id=tokenizer.pad_token_id,
     )

+    # Strip the prompt tokens and decode only the new completion.
+    response = outputs[0][input_ids.shape[-1]:]
+    decoded_response = tokenizer.decode(response, skip_special_tokens=True)
+    return decoded_response

 # --- 3. GUI SETUP ---
+# (Kept identical to the previous version; only the title changed.)
 chatbot = gr.ChatInterface(
     respond,
     type="messages",
     additional_inputs=[
         gr.Textbox(value="", label="System Prompt (Hidden)", visible=False),
         gr.Slider(minimum=1, maximum=1024, value=512, label="Max New Tokens"),
         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
         gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
+        gr.Dropdown(choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"], value="Normal", label="Style"),
     ],
 )

 with gr.Blocks() as demo:
+    gr.Markdown("# Chat with Llama 3.2 1B (Base/Untrained)")
+    gr.Markdown("> **Warning:** You are running the base model. It will likely hallucinate or autocomplete text rather than chatting normally.")
     chatbot.render()

 if __name__ == "__main__":
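The manually assigned template should reproduce exactly the layout the removed formatter built by hand. A quick sanity check, not part of the commit (the message contents are illustrative):

# Render the template as a string instead of token IDs and inspect it.
messages = [
    {"role": "system", "content": "You are an AI assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# Expected: <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n...
# ...ending with <|start_header_id|>assistant<|end_header_id|>\n\n

The diff imports TextIteratorStreamer and Thread but never uses them, so each reply arrives only when generation finishes. If streaming was the intent, a minimal sketch of a streaming variant (everything except the two imports and the module-level model/tokenizer is illustrative):

from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(input_ids, max_tokens):
    # generate() runs in a background thread; the streamer yields decoded
    # text chunks as they arrive, which gr.ChatInterface renders incrementally
    # when the respond function is a generator.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs=dict(
        input_ids=input_ids,
        max_new_tokens=int(max_tokens),
        do_sample=True,
        streamer=streamer,
    )).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial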