Valtry committed
Commit bdc78db · verified · 1 parent: 9bd6495

Update app.py

Files changed (1)
  1. app.py +20 -18
app.py CHANGED
@@ -24,24 +24,28 @@ print("Model loaded successfully!")
 
 
 class ChatRequest(BaseModel):
-    model: str
+    model: str = "auric-ai"
     messages: list
     stream: bool = False
+    max_tokens: int = 512
+    temperature: float = 0.1
 
 
 @app.post("/v1/chat/completions")
 async def chat(req: ChatRequest):
 
-    user_message = req.messages[-1]["content"]
-
-    prompt = f"""
-You are a helpful assistant.
-
-User: {user_message}
-Assistant:
-"""
+    # Use Qwen2's chat template for proper system/user/assistant formatting
+    prompt = tokenizer.apply_chat_template(
+        req.messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
 
     inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
+    prompt_length = inputs["input_ids"].shape[-1]
+
+    temperature = max(req.temperature, 0.01)
+    max_tokens = min(req.max_tokens, 2048)
 
     # ---------------- STREAM MODE ----------------
 
@@ -56,8 +60,8 @@ Assistant:
     generation_kwargs = dict(
         **inputs,
         streamer=streamer,
-        max_new_tokens=512,
-        temperature=0.7,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
         do_sample=True
     )
 
@@ -96,22 +100,20 @@ Assistant:
 
     output = model.generate(
         **inputs,
-        max_new_tokens=80,
-        temperature=0.7,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
         do_sample=True
     )
 
-    response = tokenizer.decode(output[0], skip_special_tokens=True)
-
-    if "Assistant:" in response:
-        response = response.split("Assistant:")[-1].strip()
+    # Only decode the newly generated tokens, not the prompt
+    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
 
     return {
         "choices": [
             {
                 "message": {
                     "role": "assistant",
-                    "content": response
+                    "content": response.strip()
                 }
             }
         ]
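
For reference, a client can exercise the new request fields like so. This is a minimal sketch, not part of the commit: the host and port are assumptions (whatever uvicorn serves the app on), while the endpoint path, field names, clamp limits, and response shape all come from the diff above.

# Hypothetical client for the updated endpoint; assumes the app is
# reachable at http://localhost:8000 (host/port are an assumption).
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "auric-ai",  # matches the new ChatRequest default
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"},
        ],
        "stream": False,
        "max_tokens": 256,    # server clamps this to at most 2048
        "temperature": 0.2,   # server clamps this to at least 0.01
    },
)
print(resp.json()["choices"][0]["message"]["content"])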