astegaras committed
Commit 485a33e · verified · 1 Parent(s): e4aa198

app update for mlx

Files changed (1)
  1. app.py +44 -35
app.py CHANGED
@@ -1,43 +1,52 @@
 import gradio as gr
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
-
-# Download GGUF file from HuggingFace
-model_path = hf_hub_download(
-    repo_id="astegaras/Llama3.2_3B",
-    filename="model-Q2_K.gguf",
-)
-
-# Load model
-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,
-    n_gpu_layers=0,
-    chat_format=None,
-    add_bos_token=False,
-    add_eos_token=False,
-)
-
-# EXACT SAME BEHAVIOR AS mlx_lm.generate
-def respond(user_input):
-    output = llm(
-        user_input,  # <-- only this!
+from mlx_lm import load, generate
+
+# ----------------------------------------------------
+# 1. Load your quantized MLX model from HuggingFace
+# ----------------------------------------------------
+MODEL_REPO = "astegaras/my-mlx-llama3"  # <-- change to your repo
+
+print("Loading model...")
+model, tokenizer = load(MODEL_REPO)
+print("Model loaded!")
+
+# ----------------------------------------------------
+# 2. Chat / inference function
+# ----------------------------------------------------
+def respond(user_input, history):
+    """
+    user_input: new user message
+    history: list of [user, assistant] pairs from Gradio
+    """
+
+    # Build a conversation prompt (simple version)
+    messages = []
+    for user_msg, assistant_msg in history:
+        messages.append(f"User: {user_msg}\nAssistant: {assistant_msg}")
+    messages.append(f"User: {user_input}\nAssistant:")
+
+    prompt = "\n".join(messages)
+
+    # Generate with mlx_lm
+    output = generate(
+        model,
+        tokenizer,
+        prompt,
         max_tokens=256,
-        temperature=0.7,
+        temp=0.7,  # mlx_lm expects `temp`, not `temperature` (see note below)
         top_p=0.9,
-        stop=None,
     )
-
-    return output["choices"][0]["text"].strip()
-
-gr.Interface(
-    fn=respond,
-    inputs="text",
-    outputs="text",
-    title="Llama3.2-3B Fine-tuned Model"
-).launch()
-
-
+
+    # generate() returns only the newly generated text (the prompt is
+    # not echoed back), so no slicing is needed
+    assistant_reply = output.strip()
+
+    return assistant_reply
+
+# ----------------------------------------------------
+# 3. Launch Gradio chat interface
+# ----------------------------------------------------
+gr.ChatInterface(
+    fn=respond,
+    title="My MLX Llama Model",
+    description="Chat with your fine-tuned MLX model!",
+).launch()
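
Two caveats on the new `respond()` worth noting. `mlx_lm`'s sampling interface has shifted between releases: older versions take `temp`/`top_p` directly as keyword arguments to `generate()`, while recent ones expect a sampler built with `mlx_lm.sample_utils.make_sampler`. Separately, an instruction-tuned Llama 3.2 checkpoint will usually respond better when the prompt is built with the tokenizer's chat template rather than handwritten `User:`/`Assistant:` framing. A minimal sketch of both adjustments, assuming a recent `mlx_lm` release, a Gradio version that passes `history` as `[user, assistant]` pairs, and the placeholder repo id from the diff:

```python
import gradio as gr
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

# Placeholder repo id carried over from the diff -- change to your own.
model, tokenizer = load("astegaras/my-mlx-llama3")

def respond(user_input, history):
    # Rebuild the conversation as role/content messages so the model's
    # own chat template (e.g. Llama 3 header tokens) is applied.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": user_input})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Recent mlx_lm releases take sampling parameters via a sampler
    # object instead of temp/top_p keyword arguments on generate().
    sampler = make_sampler(temp=0.7, top_p=0.9)
    output = generate(model, tokenizer, prompt, max_tokens=256, sampler=sampler)
    return output.strip()

gr.ChatInterface(fn=respond, title="My MLX Llama Model").launch()
```

Pinning `mlx-lm` in the Space's `requirements.txt` keeps whichever call style the app uses from silently breaking on the next release.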