CooLLaMACEO committed on
Commit
c828ba8
·
verified ·
1 Parent(s): 2257bba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -24
app.py CHANGED
@@ -3,15 +3,15 @@ from fastapi import FastAPI, Request, HTTPException, Depends
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
  from fastapi.responses import JSONResponse
6
- from llama_cpp import Llama
 
7
  import uvicorn
8
 
9
  # -------------------------------
10
  # FastAPI setup
11
  # -------------------------------
12
- app = FastAPI(title="ChatMPT API")
13
 
14
- # Enable CORS for all origins (adjust for production)
15
  app.add_middleware(
16
  CORSMiddleware,
17
  allow_origins=["*"],
@@ -19,9 +19,8 @@ app.add_middleware(
19
  allow_headers=["*"],
20
  )
21
 
22
- # API key security
23
  security = HTTPBearer()
24
- MY_API_KEY = os.environ.get("API_KEY", "my-secret-key-456") # can override with env variable
25
 
26
  def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
27
  if credentials.credentials != MY_API_KEY:
@@ -29,14 +28,18 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
29
  return credentials.credentials
30
 
31
  # -------------------------------
32
- # Load MPT-7B Model
33
  # -------------------------------
34
- llm = Llama(
35
- model_path="./mpt-7b-q2.gguf", # downloaded in Dockerfile
36
- n_ctx=2048,
37
- n_threads=4, # adjust for CPU cores
38
- n_gpu_layers=0 # force CPU (change if GPU available)
 
 
 
39
  )
 
40
 
41
  # -------------------------------
42
  # Chat Endpoint
@@ -46,23 +49,13 @@ async def chat(request: Request, _ = Depends(verify_token)):
46
  try:
47
  data = await request.json()
48
  user_input = data.get("prompt", "").strip()
49
-
50
  if not user_input:
51
  return JSONResponse(status_code=400, content={"error": "No prompt provided"})
52
 
53
- # Format for MPT-Chat
54
- prompt = f"<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"
55
-
56
  # Generate response
57
- output = llm(
58
- prompt,
59
- max_tokens=512,
60
- temperature=0.7,
61
- stop=["<|im_end|>", "<|im_start|>"],
62
- echo=False
63
- )
64
 
65
- reply = output["choices"][0]["text"].strip()
66
  return JSONResponse(content={"reply": reply})
67
 
68
  except Exception as e:
@@ -79,5 +72,5 @@ async def health():
79
  # Run app
80
  # -------------------------------
81
  if __name__ == "__main__":
82
- port = int(os.environ.get("PORT", 8080)) # Hugging Face Spaces sets PORT
83
  uvicorn.run(app, host="0.0.0.0", port=port)
 
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
  from fastapi.responses import JSONResponse
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
+ import torch
8
  import uvicorn
9
 
10
  # -------------------------------
11
  # FastAPI setup
12
  # -------------------------------
13
+ app = FastAPI(title="ChatMPT API (Transformers)")
14
 
 
15
  app.add_middleware(
16
  CORSMiddleware,
17
  allow_origins=["*"],
 
19
  allow_headers=["*"],
20
  )
21
 
 
22
  security = HTTPBearer()
23
+ MY_API_KEY = os.environ.get("API_KEY", "my-secret-key-456")
24
 
25
  def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
26
  if credentials.credentials != MY_API_KEY:
 
28
  return credentials.credentials
29
 
30
  # -------------------------------
31
+ # Load model with Transformers
32
  # -------------------------------
33
+ MODEL_PATH = "./mpt-7b-q2.gguf" # path to downloaded model
34
+
35
+ print("Loading tokenizer and model...")
36
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
37
+ model = AutoModelForCausalLM.from_pretrained(
38
+ MODEL_PATH,
39
+ device_map="auto", # will use GPU if available, CPU otherwise
40
+ torch_dtype=torch.float16 # use float16 if possible for efficiency
41
  )
42
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
43
 
44
  # -------------------------------
45
  # Chat Endpoint
 
49
  try:
50
  data = await request.json()
51
  user_input = data.get("prompt", "").strip()
 
52
  if not user_input:
53
  return JSONResponse(status_code=400, content={"error": "No prompt provided"})
54
 
 
 
 
55
  # Generate response
56
+ output = generator(user_input, do_sample=True, temperature=0.7)
57
+ reply = output[0]["generated_text"]
 
 
 
 
 
58
 
 
59
  return JSONResponse(content={"reply": reply})
60
 
61
  except Exception as e:
 
72
  # Run app
73
  # -------------------------------
74
  if __name__ == "__main__":
75
+ port = int(os.environ.get("PORT", 8080))
76
  uvicorn.run(app, host="0.0.0.0", port=port)