Neon-AI committed on
Commit
ee65cee
·
verified ·
1 Parent(s): a72e972

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -64
app.py CHANGED
@@ -3,28 +3,14 @@ from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
6
- # ------------------------------
7
- # Model configuration
8
- # ------------------------------
9
  MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
10
 
11
- app = FastAPI(
12
- title="Neon Tech Chatbot",
13
- version="1.0.0"
14
- )
15
 
16
- # Load model & tokenizer once at startup
17
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
18
- model = AutoModelForCausalLM.from_pretrained(
19
- MODEL_ID,
20
- device_map="cpu",
21
- torch_dtype=torch.float32
22
- )
23
- model.eval()
24
 
25
- # ------------------------------
26
- # Schemas
27
- # ------------------------------
28
  class ChatRequest(BaseModel):
29
  prompt: str
30
  max_tokens: int = 120
@@ -34,70 +20,56 @@ class ChatRequest(BaseModel):
34
  class ChatResponse(BaseModel):
35
  reply: str
36
 
37
- # ------------------------------
38
- # Health check
39
- # ------------------------------
40
  @app.get("/health")
41
  def health():
42
  return {"status": "ok"}
43
 
44
- # ------------------------------
45
- # Chat endpoint
46
- # ------------------------------
 
 
 
 
 
 
 
 
47
  @app.post("/chat", response_model=ChatResponse)
48
  def chat(req: ChatRequest):
49
- if not req.prompt or len(req.prompt.strip()) == 0:
50
- raise HTTPException(status_code=400, detail="Prompt is empty")
51
-
52
- # ------------------------------
53
- # Safety caps
54
- # ------------------------------
55
- prompt = req.prompt[:500] # limit prompt length
56
- max_tokens = min(req.max_tokens, 150) # limit max tokens
57
 
58
- # ------------------------------
59
- # Build messages for instruct
60
- # ------------------------------
61
- messages = [
62
- {
63
- "role": "system",
64
- "content": (
65
- "You are a concise, intelligent assistant. "
66
- "Always respond in plain text. "
67
- "Never output JSON, code blocks, or structured data. "
68
- "Answer clearly and briefly."
69
- "The name if your owner is Neon, and you are always happy to meet him"
70
- )
71
- },
72
- {"role": "user", "content": prompt}
73
- ]
74
 
75
- # Tokenize + create attention mask explicitly
76
- input_ids = tokenizer.apply_chat_template(
77
- messages,
78
- tokenize=True,
79
- return_tensors="pt"
 
 
 
80
  )
81
- attention_mask = torch.ones_like(input_ids)
82
 
83
- # ------------------------------
84
- # Generate response
85
- # ------------------------------
86
  with torch.no_grad():
87
  output = model.generate(
88
- input_ids,
89
  attention_mask=attention_mask,
90
- max_new_tokens=max_tokens,
91
  temperature=req.temperature,
92
  top_p=req.top_p,
93
  repetition_penalty=1.1,
94
  do_sample=True
95
  )
96
 
97
- # Decode output, skip the prompt tokens
98
- reply = tokenizer.decode(
99
- output[0][input_ids.shape[-1]:],
100
- skip_special_tokens=True
101
- ).strip()
102
 
103
  return {"reply": reply}
 
3
  from pydantic import BaseModel
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
 
 
 
6
  MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
7
 
8
+ app = FastAPI(title="Neon Tech Chatbot", version="1.0.0")
 
 
 
9
 
10
+ # Initialize global variables as None
11
+ tokenizer = None
12
+ model = None
 
 
 
 
 
13
 
 
 
 
14
  class ChatRequest(BaseModel):
15
  prompt: str
16
  max_tokens: int = 120
 
20
  class ChatResponse(BaseModel):
21
  reply: str
22
 
 
 
 
23
  @app.get("/health")
24
  def health():
25
  return {"status": "ok"}
26
 
27
+ def load_model():
28
+ global tokenizer, model
29
+ if model is None or tokenizer is None:
30
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
31
+ model = AutoModelForCausalLM.from_pretrained(
32
+ MODEL_ID,
33
+ device_map="cpu",
34
+ torch_dtype=torch.float32
35
+ )
36
+ model.eval()
37
+
38
  @app.post("/chat", response_model=ChatResponse)
39
  def chat(req: ChatRequest):
40
+ load_model() # lazy load model only on first request
 
 
 
 
 
 
 
41
 
42
+ if not req.prompt.strip():
43
+ raise HTTPException(status_code=400, detail="Prompt is empty")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
+ # Build manual prompt string (no apply_chat_template)
46
+ full_prompt = (
47
+ "You are a concise, intelligent assistant. "
48
+ "Always respond in plain text. "
49
+ "Never output JSON, code blocks, or structured data. "
50
+ "Answer clearly and briefly. "
51
+ "The name of your owner is Neon, and you are always happy to meet him.\n\n"
52
+ f"User: {req.prompt}\nAssistant:"
53
  )
 
54
 
55
+ inputs = tokenizer(full_prompt, return_tensors="pt")
56
+ attention_mask = torch.ones_like(inputs.input_ids)
57
+
58
  with torch.no_grad():
59
  output = model.generate(
60
+ inputs.input_ids,
61
  attention_mask=attention_mask,
62
+ max_new_tokens=min(req.max_tokens, 150),
63
  temperature=req.temperature,
64
  top_p=req.top_p,
65
  repetition_penalty=1.1,
66
  do_sample=True
67
  )
68
 
69
+ reply = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
70
+
71
+ # Strip leftover system prefix if present
72
+ if reply.lower().startswith("system"):
73
+ reply = reply.split("\n", 1)[-1].strip()
74
 
75
  return {"reply": reply}