CooLLaMACEO committed on
Commit
3320b3e
·
verified ·
1 Parent(s): ad08817

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -42
app.py CHANGED
@@ -1,17 +1,13 @@
1
- import os
2
  from fastapi import FastAPI, Request, HTTPException, Depends
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
  from fastapi.responses import JSONResponse
6
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
- import torch
8
  import uvicorn
9
 
10
- # -------------------------------
11
- # FastAPI setup
12
- # -------------------------------
13
- app = FastAPI(title="ChatMPT API (Transformers)")
14
 
 
15
  app.add_middleware(
16
  CORSMiddleware,
17
  allow_origins=["*"],
@@ -19,58 +15,47 @@ app.add_middleware(
19
  allow_headers=["*"],
20
  )
21
 
 
22
  security = HTTPBearer()
23
- MY_API_KEY = os.environ.get("API_KEY", "my-secret-key-456")
 
 
 
 
 
 
 
 
24
 
25
  def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
26
  if credentials.credentials != MY_API_KEY:
27
  raise HTTPException(status_code=403, detail="Unauthorized")
28
  return credentials.credentials
29
 
30
- # -------------------------------
31
- # Load model with Transformers
32
- # -------------------------------
33
- MODEL_PATH = "./mpt-7b-q2.gguf" # path to downloaded model
34
-
35
- print("Loading tokenizer and model...")
36
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
37
- model = AutoModelForCausalLM.from_pretrained(
38
- MODEL_PATH,
39
- device_map="auto", # will use GPU if available, CPU otherwise
40
- torch_dtype=torch.float16 # use float16 if possible for efficiency
41
- )
42
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
43
-
44
- # -------------------------------
45
- # Chat Endpoint
46
- # -------------------------------
47
  @app.post("/v1/chat")
48
  async def chat(request: Request, _ = Depends(verify_token)):
49
  try:
50
  data = await request.json()
51
- user_input = data.get("prompt", "").strip()
52
- if not user_input:
53
  return JSONResponse(status_code=400, content={"error": "No prompt provided"})
54
 
55
- # Generate response
56
- output = generator(user_input, do_sample=True, temperature=0.7)
57
- reply = output[0]["generated_text"]
 
 
 
 
 
 
 
58
 
 
59
  return JSONResponse(content={"reply": reply})
60
 
61
  except Exception as e:
62
  return JSONResponse(status_code=500, content={"error": str(e)})
63
 
64
- # -------------------------------
65
- # Health Check
66
- # -------------------------------
67
- @app.get("/health")
68
- async def health():
69
- return {"status": "ok"}
70
-
71
- # -------------------------------
72
- # Run app
73
- # -------------------------------
74
  if __name__ == "__main__":
75
- port = int(os.environ.get("PORT", 8080))
76
- uvicorn.run(app, host="0.0.0.0", port=port)
 
 
import asyncio
import os

import uvicorn
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from llama_cpp import Llama
7
 
8
+ app = FastAPI()
 
 
 
9
 
10
+ # Allow all origins (for frontend access)
11
  app.add_middleware(
12
  CORSMiddleware,
13
  allow_origins=["*"],
 
15
  allow_headers=["*"],
16
  )
17
 
18
+ # Simple API key auth
19
  security = HTTPBearer()
20
+ MY_API_KEY = "my-secret-key-456"
21
+
22
+ # Load GGUF model (CPU only, small threads for Spaces)
23
+ llm = Llama(
24
+ model_path="./mpt-7b-chat.gguf", # Make sure this is a tokenizer-included GGUF
25
+ n_ctx=2048,
26
+ n_threads=2, # Reduce for free tier
27
+ n_gpu_layers=0 # Force CPU
28
+ )
29
 
30
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the bearer token against MY_API_KEY; raise 403 on mismatch."""
    token = credentials.credentials
    if token == MY_API_KEY:
        return token
    raise HTTPException(status_code=403, detail="Unauthorized")
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
# Serialize access to the model: llama_cpp's Llama object is not
# thread-safe, so only one generation may run at a time.
_llm_lock = asyncio.Lock()


@app.post("/v1/chat")
async def chat(request: Request, _ = Depends(verify_token)):
    """Generate a chat completion for a JSON body of the form {"prompt": "..."}.

    Returns {"reply": "..."} on success, 400 when the prompt is missing,
    empty, or not a string, and 500 with the error text on any failure.
    """
    try:
        data = await request.json()
        # Guard against non-string prompts (e.g. numbers): they would
        # otherwise crash on .strip() and surface as a 500 instead of 400.
        raw_prompt = data.get("prompt")
        user_prompt = raw_prompt.strip() if isinstance(raw_prompt, str) else ""
        if not user_prompt:
            return JSONResponse(status_code=400, content={"error": "No prompt provided"})

        # MPT chat format (ChatML-style markers).
        prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"

        # Run the blocking generation in a worker thread so the event loop
        # stays responsive; the lock keeps model calls serialized.
        async with _llm_lock:
            output = await asyncio.to_thread(
                llm,
                prompt,
                max_tokens=512,
                temperature=0.7,
                stop=["<|im_end|>", "<|im_start|>"],
                echo=False,
            )

        reply = output["choices"][0]["text"].strip()
        return JSONResponse(content={"reply": reply})

    except Exception as e:
        # NOTE(review): str(e) can leak internal details to clients —
        # acceptable for a demo Space, tighten before production use.
        return JSONResponse(status_code=500, content={"error": str(e)})
59
 
 
 
 
 
 
 
 
 
 
 
60
# -------------------------------
# Run app
# -------------------------------
if __name__ == "__main__":
    # Honor the PORT env var (as the previous revision did); default to
    # 7860, the port Hugging Face Spaces expects.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))