Neon-AI committed on
Commit
2f41b1c
·
verified ·
1 Parent(s): e1352c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -6
app.py CHANGED
@@ -9,7 +9,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
9
  MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
10
 
11
  app = FastAPI(
12
- title="General AI REST API - Phi-3 Mini",
13
  version="1.0.0"
14
  )
15
 
@@ -27,7 +27,7 @@ model.eval()
27
  # ------------------------------
28
  class ChatRequest(BaseModel):
29
  prompt: str
30
- max_tokens: int = 150
31
  temperature: float = 0.25
32
  top_p: float = 0.95
33
 
@@ -49,25 +49,43 @@ def chat(req: ChatRequest):
49
  if not req.prompt or len(req.prompt.strip()) == 0:
50
  raise HTTPException(status_code=400, detail="Prompt is empty")
51
 
 
52
  # Safety caps
53
- prompt = req.prompt[:500] # limit prompt length
54
- max_tokens = min(req.max_tokens, 150) # limit max tokens
 
55
 
56
- # Build messages for Phi-3 Mini instruct
 
 
57
  messages = [
58
- {"role": "system", "content": "You are a concise, intelligent assistant. Answer clearly and briefly."},
 
 
 
 
 
 
 
 
59
  {"role": "user", "content": prompt}
60
  ]
61
 
 
62
  input_ids = tokenizer.apply_chat_template(
63
  messages,
64
  tokenize=True,
65
  return_tensors="pt"
66
  )
 
67
 
 
 
 
68
  with torch.no_grad():
69
  output = model.generate(
70
  input_ids,
 
71
  max_new_tokens=max_tokens,
72
  temperature=req.temperature,
73
  top_p=req.top_p,
@@ -75,6 +93,7 @@ def chat(req: ChatRequest):
75
  do_sample=True
76
  )
77
 
 
78
  reply = tokenizer.decode(
79
  output[0][input_ids.shape[-1]:],
80
  skip_special_tokens=True
 
9
  MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
10
 
11
  app = FastAPI(
12
+ title="Neon Tech Chatbot",
13
  version="1.0.0"
14
  )
15
 
 
27
  # ------------------------------
28
  class ChatRequest(BaseModel):
29
  prompt: str
30
+ max_tokens: int = 120
31
  temperature: float = 0.25
32
  top_p: float = 0.95
33
 
 
49
  if not req.prompt or len(req.prompt.strip()) == 0:
50
  raise HTTPException(status_code=400, detail="Prompt is empty")
51
 
52
+ # ------------------------------
53
  # Safety caps
54
+ # ------------------------------
55
+ prompt = req.prompt[:500] # limit prompt length
56
+ max_tokens = min(req.max_tokens, 150) # limit max tokens
57
 
58
+ # ------------------------------
59
+ # Build messages for instruct
60
+ # ------------------------------
61
  messages = [
62
+ {
63
+ "role": "system",
64
+ "content": (
65
+ "You are a concise, intelligent assistant. "
66
+ "Always respond in plain text. "
67
+ "Never output JSON, code blocks, or structured data. "
68
+ "Answer clearly and briefly."
69
+ )
70
+ },
71
  {"role": "user", "content": prompt}
72
  ]
73
 
74
+ # Tokenize + create attention mask explicitly
75
  input_ids = tokenizer.apply_chat_template(
76
  messages,
77
  tokenize=True,
78
  return_tensors="pt"
79
  )
80
+ attention_mask = torch.ones_like(input_ids)
81
 
82
+ # ------------------------------
83
+ # Generate response
84
+ # ------------------------------
85
  with torch.no_grad():
86
  output = model.generate(
87
  input_ids,
88
+ attention_mask=attention_mask,
89
  max_new_tokens=max_tokens,
90
  temperature=req.temperature,
91
  top_p=req.top_p,
 
93
  do_sample=True
94
  )
95
 
96
+ # Decode output, skip the prompt tokens
97
  reply = tokenizer.decode(
98
  output[0][input_ids.shape[-1]:],
99
  skip_special_tokens=True