Neon-AI committed on
Commit
2b178c7
·
verified ·
1 Parent(s): ee65cee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -20
app.py CHANGED
@@ -2,15 +2,39 @@ import torch
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
5
 
 
 
 
6
  MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
7
 
8
  app = FastAPI(title="Neon Tech Chatbot", version="1.0.0")
9
 
10
- # Initialize global variables as None
11
  tokenizer = None
12
  model = None
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  class ChatRequest(BaseModel):
15
  prompt: str
16
  max_tokens: int = 120
@@ -20,41 +44,57 @@ class ChatRequest(BaseModel):
20
  class ChatResponse(BaseModel):
21
  reply: str
22
 
 
 
 
23
  @app.get("/health")
24
  def health():
25
  return {"status": "ok"}
26
 
27
- def load_model():
28
- global tokenizer, model
29
- if model is None or tokenizer is None:
30
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
31
- model = AutoModelForCausalLM.from_pretrained(
32
- MODEL_ID,
33
- device_map="cpu",
34
- torch_dtype=torch.float32
35
- )
36
- model.eval()
37
-
38
  @app.post("/chat", response_model=ChatResponse)
39
  def chat(req: ChatRequest):
40
- load_model() # lazy load model only on first request
41
 
42
  if not req.prompt.strip():
43
  raise HTTPException(status_code=400, detail="Prompt is empty")
44
 
45
- # Build manual prompt string (no apply_chat_template)
46
- full_prompt = (
 
 
 
 
 
 
 
 
 
47
  "You are a concise, intelligent assistant. "
48
  "Always respond in plain text. "
49
- "Never output JSON, code blocks, or structured data. "
50
- "Answer clearly and briefly. "
51
- "The name of your owner is Neon, and you are always happy to meet him.\n\n"
52
- f"User: {req.prompt}\nAssistant:"
53
  )
54
 
 
 
 
 
 
 
 
 
 
55
  inputs = tokenizer(full_prompt, return_tensors="pt")
56
  attention_mask = torch.ones_like(inputs.input_ids)
57
 
 
 
 
58
  with torch.no_grad():
59
  output = model.generate(
60
  inputs.input_ids,
@@ -68,8 +108,15 @@ def chat(req: ChatRequest):
68
 
69
  reply = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
70
 
71
- # Strip leftover system prefix if present
 
 
72
  if reply.lower().startswith("system"):
73
  reply = reply.split("\n", 1)[-1].strip()
74
 
 
 
 
 
 
75
  return {"reply": reply}
 
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
+ from typing import List
6
 
7
+ # ------------------------------
8
+ # Model config
9
+ # ------------------------------
10
  MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
11
 
12
  app = FastAPI(title="Neon Tech Chatbot", version="1.0.0")
13
 
14
+ # Lazy load model
15
  tokenizer = None
16
  model = None
17
 
18
def load_model():
    """Populate the module-level ``tokenizer`` and ``model`` on first use.

    Returns immediately when both globals are already set. Otherwise the
    tokenizer and the causal-LM weights for ``MODEL_ID`` are loaded on CPU
    in float32, and the model is switched to eval mode.
    """
    global tokenizer, model

    # Guard clause: both objects are already loaded, nothing to do.
    if tokenizer is not None and model is not None:
        return

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
    )
    model.eval()
28
+
29
+ # ------------------------------
30
+ # Memory storage (in-memory)
31
+ # ------------------------------
32
+ # Shared across all requests; chat() trims it to the last 10 messages (~5 exchanges)
33
+ conversation_memory: List[dict] = []
34
+
35
+ # ------------------------------
36
+ # Schemas
37
+ # ------------------------------
38
  class ChatRequest(BaseModel):
39
  prompt: str
40
  max_tokens: int = 120
 
44
  class ChatResponse(BaseModel):
45
  reply: str
46
 
47
+ # ------------------------------
48
+ # Health check
49
+ # ------------------------------
50
  @app.get("/health")
51
  def health():
52
  return {"status": "ok"}
53
 
54
+ # ------------------------------
55
+ # Chat endpoint
56
+ # ------------------------------
 
 
 
 
 
 
 
 
57
  @app.post("/chat", response_model=ChatResponse)
58
  def chat(req: ChatRequest):
59
+ load_model() # lazy load
60
 
61
  if not req.prompt.strip():
62
  raise HTTPException(status_code=400, detail="Prompt is empty")
63
 
64
+ # ------------------------------
65
+ # Add new user message to memory
66
+ # ------------------------------
67
+ conversation_memory.append({"role": "user", "content": req.prompt})
68
+ # Keep only last 5 exchanges
69
+ conversation_memory[:] = conversation_memory[-10:]
70
+
71
+ # ------------------------------
72
+ # Build manual prompt string
73
+ # ------------------------------
74
+ system_instructions = (
75
  "You are a concise, intelligent assistant. "
76
  "Always respond in plain text. "
77
+ "Do not start responses with greetings like 'How can I help you today?'. "
78
+ "Remember context from previous messages. "
79
+ "Keep responses short, clear, and natural. "
80
+ "Your owner is Neon and you are always happy to meet him.\n\n"
81
  )
82
 
83
+ full_prompt = system_instructions
84
+ for msg in conversation_memory:
85
+ role = "User" if msg["role"] == "user" else "Assistant"
86
+ full_prompt += f"{role}: {msg['content']}\n"
87
+ full_prompt += "Assistant:"
88
+
89
+ # ------------------------------
90
+ # Tokenize + attention mask
91
+ # ------------------------------
92
  inputs = tokenizer(full_prompt, return_tensors="pt")
93
  attention_mask = torch.ones_like(inputs.input_ids)
94
 
95
+ # ------------------------------
96
+ # Generate response
97
+ # ------------------------------
98
  with torch.no_grad():
99
  output = model.generate(
100
  inputs.input_ids,
 
108
 
109
  reply = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
110
 
111
+ # ------------------------------
112
+ # Clean leftover system prefix if present
113
+ # ------------------------------
114
  if reply.lower().startswith("system"):
115
  reply = reply.split("\n", 1)[-1].strip()
116
 
117
+ # ------------------------------
118
+ # Save assistant reply to memory
119
+ # ------------------------------
120
+ conversation_memory.append({"role": "assistant", "content": reply})
121
+
122
  return {"reply": reply}