astegaras committed
Commit 485a33e · verified · 1 Parent(s): e4aa198

app update for mlx

Files changed (1)
  1. app.py +44 -35
app.py CHANGED
@@ -1,43 +1,52 @@
 import gradio as gr
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
-
-# Download GGUF file from HuggingFace
-model_path = hf_hub_download(
-    repo_id="astegaras/Llama3.2_3B",
-    filename="model-Q2_K.gguf",
-)
-
-# Load model
-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,
-    n_gpu_layers=0,
-    chat_format=None,
-    add_bos_token=False,
-    add_eos_token=False,
-)
-
-# EXACT SAME BEHAVIOR AS mlx_lm.generate
-def respond(user_input):
-    output = llm(
-        user_input,  # <-- only this!
+from mlx_lm import load, generate
+
+# ----------------------------------------------------
+# 1. Load your quantized MLX model from HuggingFace
+# ----------------------------------------------------
+MODEL_REPO = "astegaras/my-mlx-llama3"  # <-- change to your repo
+
+print("Loading model...")
+model, tokenizer = load(MODEL_REPO)
+print("Model loaded!")
+
+# ----------------------------------------------------
+# 2. Chat / inference function
+# ----------------------------------------------------
+def respond(user_input, history):
+    """
+    user_input: new user message
+    history: list of [user, assistant] pairs from Gradio
+    """
+
+    # Build a conversation prompt (simple version)
+    messages = []
+    for user_msg, assistant_msg in history:
+        messages.append(f"User: {user_msg}\nAssistant: {assistant_msg}")
+    messages.append(f"User: {user_input}\nAssistant:")
+
+    prompt = "\n".join(messages)
+
+    # Generate with mlx_lm
+    output = generate(
+        model,
+        tokenizer,
+        prompt,
         max_tokens=256,
-        temperature=0.7,
+        temp=0.7,  # mlx_lm expects `temp`, not `temperature` (see note below)
         top_p=0.9,
-        stop=None,
     )
-
-    return output["choices"][0]["text"].strip()
-
-gr.Interface(
-    fn=respond,
-    inputs="text",
-    outputs="text",
-    title="Llama3.2-3B Fine-tuned Model"
-).launch()
-
-
+
+    # generate() returns only the newly generated text (the prompt is
+    # not echoed back), so no slicing is needed
+    assistant_reply = output.strip()
+
+    return assistant_reply
+
+# ----------------------------------------------------
+# 3. Launch Gradio chat interface
+# ----------------------------------------------------
+gr.ChatInterface(
+    fn=respond,
+    title="My MLX Llama Model",
+    description="Chat with your fine-tuned MLX model!",
+).launch()
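
Two caveats on the new `respond()` worth noting. `mlx_lm`'s sampling interface has shifted between releases: older versions take `temp`/`top_p` directly as keyword arguments to `generate()`, while recent ones expect a sampler built with `mlx_lm.sample_utils.make_sampler`. Separately, an instruction-tuned Llama 3.2 checkpoint will usually respond better when the prompt is built with the tokenizer's chat template rather than handwritten `User:`/`Assistant:` framing. A minimal sketch of both adjustments, assuming a recent `mlx_lm` release, a Gradio version that passes `history` as `[user, assistant]` pairs, and the placeholder repo id from the diff:

```python
import gradio as gr
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

# Placeholder repo id carried over from the diff -- change to your own.
model, tokenizer = load("astegaras/my-mlx-llama3")

def respond(user_input, history):
    # Rebuild the conversation as role/content messages so the model's
    # own chat template (e.g. Llama 3 header tokens) is applied.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": user_input})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Recent mlx_lm releases take sampling parameters via a sampler
    # object instead of temp/top_p keyword arguments on generate().
    sampler = make_sampler(temp=0.7, top_p=0.9)
    output = generate(model, tokenizer, prompt, max_tokens=256, sampler=sampler)
    return output.strip()

gr.ChatInterface(fn=respond, title="My MLX Llama Model").launch()
```

Pinning `mlx-lm` in the Space's `requirements.txt` keeps whichever call style the app uses from silently breaking on the next release.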