Neon-AI committed on
Commit
1c348b1
·
verified ·
1 Parent(s): eaf7d6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -88
app.py CHANGED
@@ -1,104 +1,75 @@
1
- import torch
2
- from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
- from transformers import AutoTokenizer, AutoModelForCausalLM
5
-
6
- # ------------------------------
7
- # Model configuration
8
- # ------------------------------
9
- MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
10
 
11
- app = FastAPI(
12
- title="Niche Chatbot",
13
- version="1.0.0"
14
- )
 
 
15
 
16
- # Lazy-load model
17
- tokenizer = None
18
- model = None
 
 
19
 
20
- def load_model():
21
- global tokenizer, model
22
- if model is None or tokenizer is None:
23
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
24
- model = AutoModelForCausalLM.from_pretrained(
25
- MODEL_ID,
26
- device_map="cpu",
27
- torch_dtype=torch.float32
28
- )
29
- model.eval()
30
 
31
- # ------------------------------
32
- # Schemas
33
- # ------------------------------
34
- class ChatRequest(BaseModel):
35
  prompt: str
36
- max_tokens: int = 120
37
- temperature: float = 0.25
38
- top_p: float = 0.95
39
-
40
- class ChatResponse(BaseModel):
41
- reply: str
42
 
43
- # ------------------------------
44
- # Health check
45
- # ------------------------------
46
- @app.get("/health")
47
- def health():
48
- return {"status": "ok"}
49
 
50
- # ------------------------------
51
- # Chat endpoint
52
- # ------------------------------
53
- @app.post("/chat", response_model=ChatResponse)
54
- def chat(req: ChatRequest):
55
- load_model() # lazy-load on first request
56
-
57
- if not req.prompt.strip():
58
- raise HTTPException(status_code=400, detail="Prompt is empty")
59
-
60
- # ------------------------------
61
- # Build manual prompt
62
- # ------------------------------
63
  system_instructions = (
64
- "You are Niche, a concise and intelligent AI that answers questions directly. "
65
- "Never begin replies with greetings, offers to assist, or questions like 'How can I help you?'. "
66
- "Always respond naturally, clearly, and only provide the requested information or explanation. "
67
- "Always respond in plain text. "
68
- "Do not start responses with greetings like 'How can I help you today?'. "
69
- "Keep answers clear, short, and natural. "
70
- "Your owner is Neon. Mention your owner only if asked about them, otherwise focus on answering the user naturally.\n\n"
 
71
  )
72
 
73
- full_prompt = system_instructions + f"User: {req.prompt}\nAssistant:"
 
 
 
74
 
75
- # ------------------------------
76
- # Tokenize + attention mask
77
- # ------------------------------
78
- inputs = tokenizer(full_prompt, return_tensors="pt")
79
- attention_mask = torch.ones_like(inputs.input_ids)
 
80
 
81
- # ------------------------------
82
- # Generate response
83
- # ------------------------------
84
- with torch.no_grad():
85
- output = model.generate(
86
- inputs.input_ids,
87
- attention_mask=attention_mask,
88
- max_new_tokens=min(req.max_tokens, 150),
89
- temperature=req.temperature,
90
- top_p=req.top_p,
91
- repetition_penalty=1.1,
92
- do_sample=True
93
- )
94
 
95
- reply = tokenizer.decode(
96
- output[0][inputs.input_ids.shape[-1]:],
97
- skip_special_tokens=True
98
- ).strip()
99
 
100
- # Clean leftover system prefix if present
101
- if reply.lower().startswith("system"):
102
- reply = reply.split("\n", 1)[-1].strip()
 
 
103
 
104
- return {"reply": reply}
 
1
+ from fastapi import FastAPI
2
+ from fastapi.responses import StreamingResponse
3
  from pydantic import BaseModel
4
+ import torch
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
6
+ import threading
 
 
 
7
 
8
# ---------------- CONFIG ----------------
# Checkpoint identifier handed to both `from_pretrained` calls below.
MODEL_ID = "Neon-AI/Niche"
# Sampling settings forwarded to `model.generate` by the /generate endpoint.
MAX_NEW_TOKENS = 16384
TEMPERATURE = 0.7
TOP_P = 0.9
# ----------------------------------------

# Tokenizer and model are created once at import time and shared by every
# request handled by this process.
# NOTE(review): trust_remote_code=True is passed for the tokenizer only, not
# for the model — confirm that this asymmetry is intentional.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# float32 weights, placed on the CPU and switched to inference mode.
model = (
    AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
    .to("cpu")
    .eval()
)

app = FastAPI()
 
 
 
 
 
 
 
 
 
22
 
23
# Request body accepted by POST /generate.
# (Documented with comments rather than a docstring so the generated API
# schema for this model is left untouched.)
class PromptRequest(BaseModel):
    # User message; inserted as the "user" turn of the chat sent to the model.
    prompt: str
 
 
 
 
 
 
25
 
26
def _sse_event(text: str) -> str:
    """Frame *text* as a single Server-Sent Events message.

    In the SSE wire format a blank line terminates the event and every line
    is parsed as its own field, so a payload that contains line breaks has to
    be sent as one ``data:`` line per payload line. The receiving side joins
    those lines back together with a newline, which preserves the original
    text exactly.
    """
    payload_lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return "".join(f"data: {line}\n" for line in payload_lines) + "\n"


def _run_generation(streamer, gen_kwargs):
    """Run ``model.generate`` and make sure the stream consumer is released.

    Intended as the target of the background generation thread.
    """
    try:
        model.generate(**gen_kwargs)
    except Exception:
        # The streamer is only told that the stream is over when generation
        # completes normally. On failure, close it explicitly so the HTTP
        # response does not block forever waiting for more text.
        streamer.end()
        raise


@app.post("/generate")
async def generate(request: PromptRequest):
    """Stream a model reply to ``request.prompt`` as Server-Sent Events.

    The prompt is wrapped in a fixed system instruction, rendered with the
    tokenizer's chat template and generated in a background thread. Decoded
    text chunks are forwarded as ``data:`` events as soon as they are
    available; a final ``data: [DONE]`` event marks the end of the stream.
    """
    prompt = request.prompt

    system_instructions = (
        "You are Niche, a concise and intelligent AI. "
        "Answer directly and naturally. "
        "Do not use greetings, pleasantries, or offers of help. "
        "Respond only with the requested information or explanation. "
        "Use plain and rich code markdowns. "
        "Keep responses short, clear, and focused. "
        "Your owner is Neon. Mention Neon only if explicitly asked. "
        "Neon is a man; the pronoun should always be 'him'."
    )

    chat = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": prompt},
    ]

    inputs = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    # skip_prompt=True: only newly generated text is streamed, not the prompt.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    gen_kwargs = dict(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    # generate() blocks until the whole reply is produced, so it runs in a
    # worker thread while this handler streams from the streamer's queue.
    thread = threading.Thread(
        target=_run_generation, args=(streamer, gen_kwargs)
    )
    thread.start()

    def event_generator():
        # No unterminated "data: " preamble: it would be concatenated with the
        # first real event and corrupt that event's payload.
        for chunk in streamer:
            if chunk:
                yield _sse_event(chunk)
        yield "data: [DONE]\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")