Spaces:

Neon-AI
/

chatbot

Paused

App Files Community

Neon-AI committed on Jan 8

Commit

ee65cee

verified ·

1 Parent(s): a72e972

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -64

app.py CHANGED Viewed

@@ -3,28 +3,14 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# ------------------------------
-# Model configuration
-# ------------------------------
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
-app = FastAPI(
-    title="Neon Tech Chatbot",
-    version="1.0.0"
-)
-# Load model & tokenizer once at startup
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    device_map="cpu",
-    torch_dtype=torch.float32
-)
-model.eval()
-# ------------------------------
-# Schemas
-# ------------------------------
 class ChatRequest(BaseModel):
     prompt: str
     max_tokens: int = 120
@@ -34,70 +20,56 @@ class ChatRequest(BaseModel):
 class ChatResponse(BaseModel):
     reply: str
-# ------------------------------
-# Health check
-# ------------------------------
 @app.get("/health")
 def health():
     return {"status": "ok"}
-# ------------------------------
-# Chat endpoint
-# ------------------------------
 @app.post("/chat", response_model=ChatResponse)
 def chat(req: ChatRequest):
-    if not req.prompt or len(req.prompt.strip()) == 0:
-        raise HTTPException(status_code=400, detail="Prompt is empty")
-    # ------------------------------
-    # Safety caps
-    # ------------------------------
-    prompt = req.prompt[:500]               # limit prompt length
-    max_tokens = min(req.max_tokens, 150)  # limit max tokens
-    # ------------------------------
-    # Build messages for instruct
-    # ------------------------------
-    messages = [
-        {
-            "role": "system",
-            "content": (
-                "You are a concise, intelligent assistant. "
-                "Always respond in plain text. "
-                "Never output JSON, code blocks, or structured data. "
-                "Answer clearly and briefly."
-                "The name if your owner is Neon, and you are always happy to meet him"
-            )
-        },
-        {"role": "user", "content": prompt}
-    ]
-    # Tokenize + create attention mask explicitly
-    input_ids = tokenizer.apply_chat_template(
-        messages,
-        tokenize=True,
-        return_tensors="pt"
     )
-    attention_mask = torch.ones_like(input_ids)
-    # ------------------------------
-    # Generate response
-    # ------------------------------
     with torch.no_grad():
         output = model.generate(
-            input_ids,
             attention_mask=attention_mask,
-            max_new_tokens=max_tokens,
             temperature=req.temperature,
             top_p=req.top_p,
             repetition_penalty=1.1,
             do_sample=True
         )
-    # Decode output, skip the prompt tokens
-    reply = tokenizer.decode(
-        output[0][input_ids.shape[-1]:],
-        skip_special_tokens=True
-    ).strip()
     return {"reply": reply}

 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+app = FastAPI(title="Neon Tech Chatbot", version="1.0.0")
+# Initialize global variables as None
+tokenizer = None
+model = None
 class ChatRequest(BaseModel):
     prompt: str
     max_tokens: int = 120
 class ChatResponse(BaseModel):
     reply: str
 @app.get("/health")
 def health():
     return {"status": "ok"}
+def load_model():
+    global tokenizer, model
+    if model is None or tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            device_map="cpu",
+            torch_dtype=torch.float32
+        )
+        model.eval()
 @app.post("/chat", response_model=ChatResponse)
 def chat(req: ChatRequest):
+    load_model()  # lazy load model only on first request
+    if not req.prompt.strip():
+        raise HTTPException(status_code=400, detail="Prompt is empty")
+    # Build manual prompt string (no apply_chat_template)
+    full_prompt = (
+        "You are a concise, intelligent assistant. "
+        "Always respond in plain text. "
+        "Never output JSON, code blocks, or structured data. "
+        "Answer clearly and briefly. "
+        "The name of your owner is Neon, and you are always happy to meet him.\n\n"
+        f"User: {req.prompt}\nAssistant:"
     )
+    inputs = tokenizer(full_prompt, return_tensors="pt")
+    attention_mask = torch.ones_like(inputs.input_ids)
     with torch.no_grad():
         output = model.generate(
+            inputs.input_ids,
             attention_mask=attention_mask,
+            max_new_tokens=min(req.max_tokens, 150),
             temperature=req.temperature,
             top_p=req.top_p,
             repetition_penalty=1.1,
             do_sample=True
         )
+    reply = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
+    # Strip leftover system prefix if present
+    if reply.lower().startswith("system"):
+        reply = reply.split("\n", 1)[-1].strip()
     return {"reply": reply}