Selinaliu1030 committed on
Commit
b83ade3
·
1 Parent(s): c661be3

use automodel instead

Browse files
Files changed (2) hide show
  1. app.py +32 -32
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,20 +1,18 @@
1
  import gradio as gr
2
- from llama_cpp import Llama
3
- from huggingface_hub import hf_hub_download
4
 
5
- # Load your local .gguf model
6
- repo_id = "Selinaliu1030/lora_model"
7
- filename = "llama-3.2-3b-finetuned-q8_0.gguf"
8
 
9
- model_path = hf_hub_download(
10
- repo_id=repo_id,
11
- filename=filename,
12
- local_dir=".", # where to download in the Space
13
- )
14
- llm = Llama(
15
- model_path=model_path, # <-- modify path if needed
16
- n_ctx=4096,
17
- n_threads=4,
18
  )
19
 
20
  def respond(
@@ -24,35 +22,37 @@ def respond(
24
  max_tokens,
25
  temperature,
26
  top_p,
27
- hf_token, # unused now, but keep for interface compatibility
28
  ):
29
- # Combine system + history + new user message
30
  messages = [{"role": "system", "content": system_message}]
31
  messages.extend(history)
32
  messages.append({"role": "user", "content": message})
33
 
34
- # Convert to llama.cpp style input
35
  prompt = ""
36
  for msg in messages:
37
- role = msg["role"]
38
- content = msg["content"]
39
- prompt += f"<{role}>: {content}\n"
40
  prompt += "<assistant>: "
41
 
42
- # Stream tokens
43
- stream = llm(
44
- prompt,
45
- max_tokens=max_tokens,
 
 
 
46
  temperature=temperature,
47
  top_p=top_p,
48
- stream=True,
 
49
  )
50
 
51
- response = ""
52
- for chunk in stream:
53
- token = chunk["choices"][0]["text"]
54
- response += token
55
- yield response
 
56
 
57
 
58
  # Gradio UI
@@ -61,8 +61,8 @@ chatbot = gr.ChatInterface(
61
  type="messages",
62
  additional_inputs=[
63
  gr.Textbox(value="You are a helpful assistant.", label="System message"),
64
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
65
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
66
  gr.Slider(
67
  minimum=0.1,
68
  maximum=1.0,
 
1
  import gradio as gr
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ from huggingface_hub import login
4
 
5
+ # Hugging Face model repo ID (must contain HF model weights, NOT .gguf)
6
+ MODEL_ID = "Selinaliu1030/lora_model"
 
7
 
8
+ # Load tokenizer + model
9
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
10
+
11
+ model = AutoModelForCausalLM.from_pretrained(
12
+ MODEL_ID,
13
+ device_map="auto", # uses GPU if available
14
+ torch_dtype="auto", # automatically picks fp16/bf16
15
+ low_cpu_mem_usage=True,
 
16
  )
17
 
18
  def respond(
 
22
  max_tokens,
23
  temperature,
24
  top_p,
25
+ hf_token, # still required by UI signature; unused
26
  ):
27
+ # Build prompt
28
  messages = [{"role": "system", "content": system_message}]
29
  messages.extend(history)
30
  messages.append({"role": "user", "content": message})
31
 
 
32
  prompt = ""
33
  for msg in messages:
34
+ prompt += f"<{msg['role']}>: {msg['content']}\n"
 
 
35
  prompt += "<assistant>: "
36
 
37
+ # Tokenize
38
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
39
+
40
+ # Generate
41
+ output = model.generate(
42
+ **inputs,
43
+ max_new_tokens=max_tokens,
44
  temperature=temperature,
45
  top_p=top_p,
46
+ do_sample=True,
47
+ pad_token_id=tokenizer.eos_token_id,
48
  )
49
 
50
+ # Decode
51
+ result = tokenizer.decode(output[0], skip_special_tokens=True)
52
+ # Extract only the assistant's response
53
+ assistant_reply = result.split("<assistant>:")[-1].strip()
54
+
55
+ yield assistant_reply
56
 
57
 
58
  # Gradio UI
 
61
  type="messages",
62
  additional_inputs=[
63
  gr.Textbox(value="You are a helpful assistant.", label="System message"),
64
+ gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="Max new tokens"),
65
+ gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
66
  gr.Slider(
67
  minimum=0.1,
68
  maximum=1.0,
requirements.txt CHANGED
@@ -1,3 +1,2 @@
1
- llama-cpp-python==0.2.79
2
  huggingface_hub
3
  gradio
 
 
1
  huggingface_hub
2
  gradio