Spaces:

Neon-AI
/

chatbot

Paused

App Files Files Community

Neon-AI committed on Jan 15

Commit

1c348b1

verified ·

1 Parent(s): eaf7d6c

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -88

app.py CHANGED Viewed

@@ -1,104 +1,75 @@
-import torch
-from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForCausalLM
-# ------------------------------
-# Model configuration
-# ------------------------------
-MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
-app = FastAPI(
-    title="Niche Chatbot",
-    version="1.0.0"
-)
-# Lazy-load model
-tokenizer = None
-model = None
-def load_model():
-    global tokenizer, model
-    if model is None or tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="cpu",
-            torch_dtype=torch.float32
-        )
-        model.eval()
-# ------------------------------
-# Schemas
-# ------------------------------
-class ChatRequest(BaseModel):
     prompt: str
-    max_tokens: int = 120
-    temperature: float = 0.25
-    top_p: float = 0.95
-class ChatResponse(BaseModel):
-    reply: str
-# ------------------------------
-# Health check
-# ------------------------------
-@app.get("/health")
-def health():
-    return {"status": "ok"}
-# ------------------------------
-# Chat endpoint
-# ------------------------------
-@app.post("/chat", response_model=ChatResponse)
-def chat(req: ChatRequest):
-    load_model()  # lazy-load on first request
-    if not req.prompt.strip():
-        raise HTTPException(status_code=400, detail="Prompt is empty")
-    # ------------------------------
-    # Build manual prompt
-    # ------------------------------
     system_instructions = (
-    "You are Niche, a concise and intelligent AI that answers questions directly. "
-    "Never begin replies with greetings, offers to assist, or questions like 'How can I help you?'. "
-    "Always respond naturally, clearly, and only provide the requested information or explanation. "
-    "Always respond in plain text. "
-    "Do not start responses with greetings like 'How can I help you today?'. "
-    "Keep answers clear, short, and natural. "
-    "Your owner is Neon. Mention your owner only if asked about them, otherwise focus on answering the user naturally.\n\n"
     )
-    full_prompt = system_instructions + f"User: {req.prompt}\nAssistant:"
-    # ------------------------------
-    # Tokenize + attention mask
-    # ------------------------------
-    inputs = tokenizer(full_prompt, return_tensors="pt")
-    attention_mask = torch.ones_like(inputs.input_ids)
-    # ------------------------------
-    # Generate response
-    # ------------------------------
-    with torch.no_grad():
-        output = model.generate(
-            inputs.input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=min(req.max_tokens, 150),
-            temperature=req.temperature,
-            top_p=req.top_p,
-            repetition_penalty=1.1,
-            do_sample=True
-        )
-    reply = tokenizer.decode(
-        output[0][inputs.input_ids.shape[-1]:],
-        skip_special_tokens=True
-    ).strip()
-    # Clean leftover system prefix if present
-    if reply.lower().startswith("system"):
-        reply = reply.split("\n", 1)[-1].strip()
-    return {"reply": reply}

+from fastapi import FastAPI
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import threading
+# ---------------- CONFIG ----------------
+MODEL_ID = "Neon-AI/Niche"
+MAX_NEW_TOKENS = 16384
+TEMPERATURE = 0.7
+TOP_P = 0.9
+# ----------------------------------------
+# Load model once
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
+model.to("cpu")
+model.eval()
+app = FastAPI()
+class PromptRequest(BaseModel):
     prompt: str
+def _sse_event(text: str) -> str:
+    """Frame *text* as a single Server-Sent Events message.
+
+    Every line of the payload is emitted as its own ``data:`` field and the
+    message is closed with a blank line. SSE clients join consecutive
+    ``data:`` fields with a newline, so a chunk that contains line breaks
+    (common in code answers) arrives intact instead of being truncated.
+    CR and CRLF are normalised to LF first because the event-stream format
+    treats all three as line terminators.
+    """
+    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    return "".join(f"data: {line}\n" for line in lines) + "\n"
+@app.post("/generate")
+async def generate(request: PromptRequest):
+    """Stream the model's reply to ``request.prompt`` as Server-Sent Events.
+
+    The prompt is wrapped in a system + user chat, rendered with the
+    tokenizer's chat template, and generated on a background thread that
+    feeds a ``TextIteratorStreamer``. Each decoded chunk is sent as one SSE
+    message; a final ``[DONE]`` message marks the end of the stream.
+    """
+    prompt = request.prompt
     system_instructions = (
+        "You are Niche, a concise and intelligent AI. "
+        "Answer directly and naturally. "
+        "Do not use greetings, pleasantries, or offers of help. "
+        "Respond only with the requested information or explanation. "
+        "Use plain and rich code markdowns. "
+        "Keep responses short, clear, and focused. "
+        "Your owner is Neon. Mention Neon only if explicitly asked. "
+        "Neon is a man; the pronoun should always be 'him'."
     )
+    chat = [
+        {"role": "system", "content": system_instructions},
+        {"role": "user", "content": prompt}
+    ]
+    inputs = tokenizer.apply_chat_template(
+        chat,
+        add_generation_prompt=True,
+        return_tensors="pt",
+        return_dict=True
+    )
+    # skip_prompt=True keeps the echoed prompt out of the streamed text.
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    gen_kwargs = dict(
+        **inputs,
+        max_new_tokens=MAX_NEW_TOKENS,
+        do_sample=True,
+        temperature=TEMPERATURE,
+        top_p=TOP_P,
+        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.eos_token_id,
+        streamer=streamer
+    )
+    # model.generate blocks until generation finishes, so it runs on its own
+    # thread while the response body drains the streamer.
+    # NOTE(review): the streamer is created without a timeout; if generate()
+    # raises, the loop below presumably waits forever - confirm and consider
+    # passing timeout= to TextIteratorStreamer.
+    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+    thread.start()
+    def event_generator():
+        # No bare "data: " preamble: without a line terminator it would fuse
+        # with the first chunk and reach the client as the text "data: ...".
+        for token in streamer:
+            yield _sse_event(token)
+        yield "data: [DONE]\n\n"
+    return StreamingResponse(event_generator(), media_type="text/event-stream")