from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

# Hugging Face repo of the model to serve (1.1B-parameter chat model,
# small enough to run on CPU).
MODEL_REPO = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Lazily populated by load_model() on the first /generate request so that
# importing this module (and serving / and /status) stays fast.
tokenizer = None
model = None


def load_model():
    """Lazily initialize the global tokenizer and model on CPU.

    Safe to call on every request: becomes a no-op once both globals
    are populated.
    """
    global tokenizer, model

    # Guard clause: nothing to do if both pieces are already in memory.
    if tokenizer is not None and model is not None:
        return

    print("🔥 Loading TinyLlama model on CPU...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)

    # Deliberately no device_map and no float16 — plain float32 weights
    # loaded for CPU execution.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )

    print("✅ TinyLlama loaded!")


@app.get("/")
async def home():
    return {
        "message": "🚀 TinyLlama Chat API Running",
        "endpoints": ["/", "/status", "/generate"],
        "model": MODEL_REPO
    }


@app.get("/status")
async def status():
    return {
        "status": "ok",
        "model": MODEL_REPO,
        "loaded": model is not None
    }


class InputText(BaseModel):
    """Request body for POST /generate; ``text`` is the user's message."""

    text: str


@app.post("/generate")
async def generate_text(data: InputText):
    load_model()

    prompt = f"<|system|>You are a helpful assistant.<|user|>{data.text}<|assistant|>"

    inputs = tokenizer(prompt, return_tensors="pt")

    # Move to CPU explicitly
    inputs = {k: v.to("cpu") for k, v in inputs.items()}
    model.to("cpu")

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    result = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"response": result}