ghosthets committed on
Commit
58b71d4
·
verified ·
1 Parent(s): aa4e8c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -18
app.py CHANGED
@@ -6,23 +6,20 @@ import torch
6
  app = flask.Flask(__name__)
7
 
8
  # ---------------------------
9
- # SMALL LLM MODEL (1–2 GB)
10
  # ---------------------------
11
- # Best small model: SmolLM-1.7B-Chat
12
- model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
13
 
14
  print(f"🔄 Loading {model_id} model...")
15
 
16
- # Load tokenizer
17
  tokenizer = AutoTokenizer.from_pretrained(model_id)
18
 
19
- # Load model (auto dtype to avoid errors)
20
  model = AutoModelForCausalLM.from_pretrained(
21
  model_id,
22
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
23
  )
24
 
25
- # Device setup
26
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
  model.to(device)
28
 
@@ -31,7 +28,7 @@ print(f"✅ {model_id} loaded successfully!")
31
  # ---------------------------
32
  # Chat Endpoint
33
  # ---------------------------
34
- @app.route('/chat', methods=['POST'])
35
  def chat():
36
  try:
37
  data = request.get_json()
@@ -40,25 +37,23 @@ def chat():
40
  if not msg:
41
  return jsonify({"error": "No message sent"}), 400
42
 
43
- # SmolLM uses normal text prompt (no ChatML)
44
- prompt = f"<|user|>\n{msg}\n<|assistant|>\n"
45
 
46
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
47
 
48
  output = model.generate(
49
  **inputs,
50
- max_new_tokens=256,
51
- do_sample=True,
52
- temperature=0.6,
53
- top_p=0.8,
54
  pad_token_id=tokenizer.eos_token_id,
55
  )
56
 
57
  reply = tokenizer.decode(output[0], skip_special_tokens=True)
58
 
59
- # Extract only assistant part
60
- if "<|assistant|>" in reply:
61
- reply = reply.split("<|assistant|>")[-1].strip()
62
 
63
  return jsonify({"reply": reply})
64
 
@@ -67,4 +62,4 @@ def chat():
67
 
68
 
69
  if __name__ == "__main__":
70
- app.run(host='0.0.0.0', port=7860)
 
6
  app = flask.Flask(__name__)
7
 
8
  # ---------------------------
9
+ # SUPER FAST SMALL MODEL
10
  # ---------------------------
11
+ model_id = "HuggingFaceTB/SmolLM2-360M-Instruct"
 
12
 
13
  print(f"🔄 Loading {model_id} model...")
14
 
 
15
  tokenizer = AutoTokenizer.from_pretrained(model_id)
16
 
17
+ # Use bfloat16 when a CUDA GPU is available; fall back to float32 on CPU
18
  model = AutoModelForCausalLM.from_pretrained(
19
  model_id,
20
+ torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
21
  )
22
 
 
23
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
  model.to(device)
25
 
 
28
  # ---------------------------
29
  # Chat Endpoint
30
  # ---------------------------
31
+ @app.route("/chat", methods=["POST"])
32
  def chat():
33
  try:
34
  data = request.get_json()
 
37
  if not msg:
38
  return jsonify({"error": "No message sent"}), 400
39
 
40
+ # Plain-text "User:/Assistant:" prompt without chat special tokens (NOTE: verify against the model's chat template)
41
+ prompt = f"User: {msg}\nAssistant:"
42
 
43
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
44
 
45
  output = model.generate(
46
  **inputs,
47
+ max_new_tokens=128, # fast
48
+ do_sample=False, # FASTEST
 
 
49
  pad_token_id=tokenizer.eos_token_id,
50
  )
51
 
52
  reply = tokenizer.decode(output[0], skip_special_tokens=True)
53
 
54
+ # Remove prompt text from output
55
+ if "Assistant:" in reply:
56
+ reply = reply.split("Assistant:")[-1].strip()
57
 
58
  return jsonify({"reply": reply})
59
 
 
62
 
63
 
64
  if __name__ == "__main__":
65
+ app.run(host="0.0.0.0", port=7860)