Spaces:

Neon-AI
/

chatbot

Paused

App Files Files Community

Neon-AI committed on Jan 8

Commit

2b178c7

verified ·

1 Parent(s): ee65cee

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -20

app.py CHANGED Viewed

@@ -2,15 +2,39 @@ import torch
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
 app = FastAPI(title="Neon Tech Chatbot", version="1.0.0")
-# Initialize global variables as None
 tokenizer = None
 model = None
 class ChatRequest(BaseModel):
     prompt: str
     max_tokens: int = 120
@@ -20,41 +44,57 @@ class ChatRequest(BaseModel):
 class ChatResponse(BaseModel):
     reply: str
 @app.get("/health")
 def health():
     return {"status": "ok"}
-def load_model():
-    global tokenizer, model
-    if model is None or tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="cpu",
-            torch_dtype=torch.float32
-        )
-        model.eval()
 @app.post("/chat", response_model=ChatResponse)
 def chat(req: ChatRequest):
-    load_model()  # lazy load model only on first request
     if not req.prompt.strip():
         raise HTTPException(status_code=400, detail="Prompt is empty")
-    # Build manual prompt string (no apply_chat_template)
-    full_prompt = (
         "You are a concise, intelligent assistant. "
         "Always respond in plain text. "
-        "Never output JSON, code blocks, or structured data. "
-        "Answer clearly and briefly. "
-        "The name of your owner is Neon, and you are always happy to meet him.\n\n"
-        f"User: {req.prompt}\nAssistant:"
     )
     inputs = tokenizer(full_prompt, return_tensors="pt")
     attention_mask = torch.ones_like(inputs.input_ids)
     with torch.no_grad():
         output = model.generate(
             inputs.input_ids,
@@ -68,8 +108,15 @@ def chat(req: ChatRequest):
     reply = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
-    # Strip leftover system prefix if present
     if reply.lower().startswith("system"):
         reply = reply.split("\n", 1)[-1].strip()
     return {"reply": reply}

 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from typing import List
+# ------------------------------
+# Model config
+# ------------------------------
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
 app = FastAPI(title="Neon Tech Chatbot", version="1.0.0")
+# Lazy load model: both globals stay None until load_model() assigns them
 tokenizer = None
 model = None
+def load_model():  # Fill the module-level tokenizer/model once; a no-op when both are already set
+    global tokenizer, model
+    if model is None or tokenizer is None:  # (re)load both if either global is still unset
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            device_map="cpu",  # request CPU placement for the weights
+            torch_dtype=torch.float32  # load the weights as float32
+        )
+        model.eval()  # put the model in evaluation mode before serving requests
+# ------------------------------
+# Memory storage (in-memory)
+# ------------------------------
+# Module-level chat history (one list for every request); chat() trims it to the last 10 messages, i.e. about 5 user/assistant exchanges
+conversation_memory: List[dict] = []
+# ------------------------------
+# Schemas
+# ------------------------------
 class ChatRequest(BaseModel):
     prompt: str
     max_tokens: int = 120
 class ChatResponse(BaseModel):
     """Response body returned by the POST /chat endpoint."""
     reply: str  # the assistant's reply text
+# ------------------------------
+# Health check
+# ------------------------------
 @app.get("/health")
 def health():
     """Return a fixed ``{"status": "ok"}`` payload for GET /health."""
     return {"status": "ok"}
+# ------------------------------
+# Chat endpoint
+# ------------------------------
 @app.post("/chat", response_model=ChatResponse)
 def chat(req: ChatRequest):
+    load_model()  # lazy load
     if not req.prompt.strip():
         raise HTTPException(status_code=400, detail="Prompt is empty")
+    # ------------------------------
+    # Add new user message to memory
+    # ------------------------------
+    conversation_memory.append({"role": "user", "content": req.prompt})
+    # Keep only last 5 exchanges
+    conversation_memory[:] = conversation_memory[-10:]
+    # ------------------------------
+    # Build manual prompt string
+    # ------------------------------
+    system_instructions = (
         "You are a concise, intelligent assistant. "
         "Always respond in plain text. "
+        "Do not start responses with greetings like 'How can I help you today?'. "
+        "Remember context from previous messages. "
+        "Keep responses short, clear, and natural. "
+        "Your owner is Neon and you are always happy to meet him.\n\n"
     )
+    full_prompt = system_instructions
+    for msg in conversation_memory:
+        role = "User" if msg["role"] == "user" else "Assistant"
+        full_prompt += f"{role}: {msg['content']}\n"
+    full_prompt += "Assistant:"
+    # ------------------------------
+    # Tokenize + attention mask
+    # ------------------------------
     inputs = tokenizer(full_prompt, return_tensors="pt")
     attention_mask = torch.ones_like(inputs.input_ids)
+    # ------------------------------
+    # Generate response
+    # ------------------------------
     with torch.no_grad():
         output = model.generate(
             inputs.input_ids,
     reply = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
+    # ------------------------------
+    # Clean leftover system prefix if present
+    # ------------------------------
     if reply.lower().startswith("system"):
         reply = reply.split("\n", 1)[-1].strip()
+    # ------------------------------
+    # Save assistant reply to memory
+    # ------------------------------
+    conversation_memory.append({"role": "assistant", "content": reply})
     return {"reply": reply}