natalieparker committed on
Commit
667a0a8
·
verified ·
1 Parent(s): 6a3b1d6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -0
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from fastapi import FastAPI
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM
5
+
6
# Hub repository id used for both the tokenizer and the model weights.
MODEL_NAME = "natalieparker/LumaAI-160M-v3"

# Probe CUDA once; the device string and the weight dtype both follow from it.
_cuda_available = torch.cuda.is_available()
device = "cuda" if _cuda_available else "cpu"
_weights_dtype = torch.float16 if _cuda_available else torch.float32

print("🔥 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("🔥 Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=_weights_dtype,
    low_cpu_mem_usage=True,
)
model.to(device)

# ASGI application object; routes are registered on it below.
app = FastAPI()
22
+
23
class GenerateRequest(BaseModel):
    """Request body for ``POST /api/generate``.

    Only ``prompt`` is required; the sampling fields fall back to the
    defaults below. The numeric bounds reject values that the sampling call
    in ``generate`` cannot use (non-positive token budget or temperature, a
    ``top_p`` that is not a probability), so such requests fail request
    validation instead of raising inside the handler.
    """

    # Text the model is asked to continue.
    prompt: str
    # Token budget passed to ``model.generate``; must be positive.
    # NOTE(review): there is no upper bound, so a client can request an
    # arbitrarily long generation — consider adding ``le=`` once a limit is
    # agreed on.
    max_new_tokens: int = Field(default=150, gt=0)
    # Sampling temperature; the handler always samples, which needs a
    # strictly positive value.
    temperature: float = Field(default=0.9, gt=0)
    # Nucleus-sampling probability mass, so it must lie within [0, 1].
    top_p: float = Field(default=0.9, ge=0, le=1)
28
+
29
@app.post("/api/generate")
def generate(req: GenerateRequest):
    """Sample text for ``req.prompt`` and return it under the ``response`` key.

    The prompt is tokenized and moved to the module-level ``device``, the
    module-level ``model`` samples with the request's ``max_new_tokens``,
    ``temperature`` and ``top_p`` plus a fixed repetition penalty of 1.05,
    and the first returned sequence is decoded with special tokens removed.

    NOTE(review): ``generate`` on a causal LM normally returns the prompt
    tokens followed by the new ones, so the response presumably echoes the
    prompt — confirm whether callers rely on that.
    """
    encoded_prompt = tokenizer(req.prompt, return_tensors="pt").to(device)

    # Per-request values come from the body; the last two are fixed here.
    sampling_options = {
        "max_new_tokens": req.max_new_tokens,
        "temperature": req.temperature,
        "top_p": req.top_p,
        "do_sample": True,
        "repetition_penalty": 1.05,
    }
    sequences = model.generate(**encoded_prompt, **sampling_options)

    first_sequence = sequences[0]
    decoded = tokenizer.decode(first_sequence, skip_special_tokens=True)
    return {"response": decoded}