Spaces:

Valtry
/

AI-Machine

Sleeping

App Files Community

Valtry committed 28 days ago

Commit

f8fbbce

verified ·

1 Parent(s): da301ab

Update app.py

Browse files

Files changed (1) hide show

app.py +187 -73

app.py CHANGED Viewed

@@ -1,89 +1,203 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import uvicorn
-# -----------------------
-# LOAD MODEL
-# -----------------------
-MODEL_ID = "microsoft/phi-2"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    device_map="cpu",
-    torch_dtype=torch.float32,
-    low_cpu_mem_usage=True
-)
-torch.set_num_threads(2)
-# -----------------------
-# FASTAPI
-# -----------------------
-app = FastAPI()
 class ChatRequest(BaseModel):
     message: str
-@app.get("/")
-def home():
-    return {"status": "API running 🚀"}
-# -----------------------
-# CHAT (NO STREAMING)
-# -----------------------
-@app.post("/chat")
-def chat(req: ChatRequest):
-    prompt = f"""You are a concise assistant.
-Return plain text only.
-No markdown.
-No bullet points.
-No numbering.
-No symbols like # * -.
-Only simple readable sentence.
-User: {req.message}
-Assistant:"""
-    inputs = tokenizer(prompt, return_tensors="pt")
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=80,
-        temperature=0.5,
-        do_sample=True,
-        eos_token_id=tokenizer.eos_token_id,
-        pad_token_id=tokenizer.eos_token_id
     )
-    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # 🔥 CLEAN OUTPUT
-    if "Assistant:" in text:
-        text = text.split("Assistant:")[-1]
-    if "User:" in text:
-        text = text.split("User:")[0]
-    text = text.strip()
-    # remove unwanted formatting
-    text = text.replace("\n", " ")
-    text = text.replace("  ", " ")
     return {
-        "response": text
     }
-# -----------------------
-# START SERVER
-# -----------------------
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 from fastapi import FastAPI
+from fastapi.responses import StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+from supabase import create_client
+import os, json, uvicorn, threading
+from contextlib import asynccontextmanager
+# =========================
+# CONFIG
+# =========================
# Secrets and endpoints are read from the process environment; a missing
# variable yields None rather than raising here.
HF_TOKEN = os.environ.get("HF_TOKEN")
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")

# Single Supabase client shared by the whole module, created at import time.
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Filled in by the application lifespan hook once the weights are loaded.
model = None

# Set by the stop endpoint and polled by the streaming generator.
stop_flag = False
+# =========================
+# REQUEST
+# =========================
 class ChatRequest(BaseModel):
     message: str
+    temperature: float = 0.7
+    stream: bool = False
+# =========================
+# CLEAN OUTPUT
+# =========================
def clean_output(text):
    """Trim raw model output down to the assistant's own reply.

    The text is cut at the earliest end-of-turn token or role label, because
    anything after one of those is the model continuing the dialogue on its
    own. A role label echoed at the very start of the reply is dropped first;
    cutting at it would otherwise discard the whole answer.

    Args:
        text: raw completion text; ``None`` or ``""`` is tolerated.

    Returns:
        The cleaned reply with surrounding whitespace removed, or ``""`` when
        there is nothing to return.
    """
    if not text:
        return ""

    role_labels = ("Human:", "Assistant:", "User:")
    stop_markers = ("<|eot_id|>", "<|end_of_text|>", "<|eof|>") + role_labels

    text = text.lstrip()

    # "Assistant: Hi" should become "Hi", not an empty string.
    for label in role_labels:
        if text.startswith(label):
            text = text[len(label):]
            break

    # Cut at whichever marker appears first.
    cut = len(text)
    for marker in stop_markers:
        position = text.find(marker)
        if position != -1:
            cut = min(cut, position)

    return text[:cut].strip()
+# =========================
+# PROMPT (NO HISTORY)
+# =========================
def build_prompt(user_msg):
    """Render a single-turn Llama 3 chat prompt (no conversation history).

    The layout follows the published Llama 3 instruct template: each header is
    followed by a blank line, and ``<|eot_id|>`` closes a turn directly after
    its text. The prompt ends with an open assistant header so the model
    writes the reply next.

    Args:
        user_msg: the user's message text.

    Returns:
        The complete prompt string.
    """
    control_tokens = (
        "<|begin_of_text|>",
        "<|end_of_text|>",
        "<|start_header_id|>",
        "<|end_header_id|>",
        "<|eot_id|>",
    )

    # The message is untrusted request text. If it carried template control
    # tokens it could end the user turn early or forge a system turn
    # (presumably the runtime parses these as special tokens; verify against
    # the llama-cpp-python version in use). Repeat until none remain so that
    # removing one token cannot splice another together.
    while any(token in user_msg for token in control_tokens):
        for token in control_tokens:
            user_msg = user_msg.replace(token, "")

    return (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful AI assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_msg}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
+# =========================
+# MODEL LOAD
+# =========================
def load_model():
    """Download the quantised GGUF weights and wrap them in a ``Llama`` model.

    Returns:
        A ``Llama`` instance with a 2048-token context window.
    """
    # Resolve the weights file first; the download uses /data as its cache
    # directory and authenticates with HF_TOKEN.
    weights_path = hf_hub_download(
        repo_id="Valtry/llama3.2-3b-q4-gguf",
        filename="llama3.2-3b-q4.gguf",
        token=HF_TOKEN,
        cache_dir="/data",
    )

    return Llama(
        model_path=weights_path,
        n_ctx=2048,
        n_threads=4,
        n_batch=512,
        use_mmap=True,
        use_mlock=True,
        f16_kv=True,
        verbose=False,
    )
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook that loads the model once at startup.

    The code before ``yield`` runs before the app starts serving. Nothing
    follows the ``yield``, so there is no explicit teardown step.
    """
    global model
    # Publish the loaded model through the module-level name the handlers use.
    model = load_model()
    yield
+# =========================
+# APP
+# =========================
# Build the application; the model is loaded by the lifespan hook.
app = FastAPI(lifespan=lifespan)

# CORS is left fully open: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
+# =========================
+# SAVE
+# =========================
def save_message(role, content):
    """Insert one chat message into the Supabase ``messages`` table.

    Args:
        role: value stored in the ``role`` column.
        content: value stored in the ``content`` column.
    """
    # No timestamp is sent; per the original note the database fills it in.
    row = {"role": role, "content": content}
    supabase.table("messages").insert(row).execute()
+# =========================
+# STOP
+# =========================
@app.post("/v1/stop")
def stop():
    """Request that a streamed generation stop.

    Only the module-level ``stop_flag`` is raised here; the streaming
    generator polls it between chunks and lowers it once it has reacted.
    """
    global stop_flag
    stop_flag = True
    acknowledgement = {"status": "stopped"}
    return acknowledgement
+# =========================
+# CHAT
+# =========================
@app.post("/v1/chat")
async def chat(req: ChatRequest):
    """Answer one user message, either streamed or as a single JSON object.

    Args:
        req: request body with the message, the sampling temperature and the
            ``stream`` switch.

    Returns:
        With ``req.stream`` set, a ``StreamingResponse`` of server-sent
        events: one ``data:`` event per generated piece, then an
        ``event: done`` marker and a final ``data: [DONE]``. Otherwise a dict
        holding the cleaned reply under ``choices[0].message.content`` plus
        ``done: True``.

    In both modes the user message and the cleaned reply are saved on a
    background thread after generation ends.
    """
    prompt = build_prompt(req.message)

    # One set of sampling arguments for both response modes, so the two
    # branches cannot drift apart.
    sampling = {
        "max_tokens": 2048,
        "temperature": req.temperature,
        "top_p": 0.9,
        "repeat_penalty": 1.15,
        "stop": ["<|eot_id|>", "<|end_of_text|>", "<|eof|>"],
    }

    def persist(reply):
        """Save the exchange on a background thread so the response is not delayed."""

        def _save():
            save_message("user", req.message)
            save_message("assistant", reply)

        threading.Thread(target=_save).start()

    if req.stream:
        def generate():
            global stop_flag
            # A stop request received while nothing was streaming leaves the
            # flag raised. Clear it here so it cannot abort this generation
            # before the first token.
            stop_flag = False

            pieces = []
            for chunk in model(prompt, stream=True, **sampling):
                if stop_flag:
                    # Consume the request so it affects this stream only.
                    stop_flag = False
                    break
                token = chunk["choices"][0]["text"]
                pieces.append(token)
                yield f"data: {json.dumps({'choices':[{'delta':{'content':token}}]})}\n\n"

            yield "event: done\ndata: {}\n\n"
            yield "data: [DONE]\n\n"

            # Reached only if the client consumed the stream to the end.
            persist(clean_output("".join(pieces)))

        return StreamingResponse(generate(), media_type="text/event-stream")

    # NOTE(review): this call runs synchronously inside an async handler, so
    # other requests presumably wait until it returns. Confirm whether that
    # serialisation is intended before moving it to a worker thread.
    result = model(prompt, **sampling)
    text = clean_output(result["choices"][0]["text"])
    persist(text)

    return {
        "choices":[{"message":{"role":"assistant","content":text}}],
        "done":True
    }
+# =========================
+# ROOT
+# =========================
@app.get("/")
def root():
    """Return a fixed status payload for the root path."""
    status_payload = {"status": "Minimal LLaMA API running 🚀"}
    return status_payload
+# =========================
+# RUN
+# =========================
 if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860)