hello-ram committed
Commit 4fcfac7 · verified · 1 Parent(s): 7d72c0a

Update app.py

Files changed (1)
  1. app.py +24 -98
app.py CHANGED
@@ -1,109 +1,35 @@
- import os
- import torch
  from fastapi import FastAPI
- from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel
-
- from unsloth import FastLanguageModel
- from transformers import AutoTokenizer
-
- # -------------------------------
- # HF cache paths
- # -------------------------------
- os.environ["HF_HOME"] = "/tmp"
- os.environ["TRANSFORMERS_CACHE"] = "/tmp"
-
- # -------------------------------
- # FastAPI
- # -------------------------------
-
- os.environ["CUDA_VISIBLE_DEVICES"] = "" # Force CPU
-
- import unsloth # Now it won't try to use GPU
-
- app = FastAPI(title="Unsolth GPT OSS API")
-
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_methods=["*"],
-     allow_headers=["*"],
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ app = FastAPI()
+
+ # ---- Load your HF model repo ----
+ MODEL_REPO = "hello-ram/mpt-model"
+
+ print("Loading tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
+
+ print("Loading model...")
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_REPO,
+     torch_dtype=torch.float16,
+     device_map="auto"
  )
-
- # -------------------------------
- # Model variables
- # -------------------------------
- model = None
- tokenizer = None
-
- # Paths (exact as in your repo)
- base_model_name = "unsolth_gpt.20" # your folder
- lora_model_path = "unsolth_gpt.20" # LoRA files are inside same folder
-
- # -------------------------------
- # Load model
- # -------------------------------
- def load_model():
-     global model, tokenizer
-     if model is None or tokenizer is None:
-         tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
-
-         # Load base model on CPU
-         base_model = FastLanguageModel.from_pretrained(
-             base_model_name,
-             trust_remote_code=True,
-             device="cpu"
-         )
-
-         # Inject LoRA weights on CPU
-         model = FastLanguageModel.get_peft_model(
-             base_model,
-             r=8,
-             target_modules=[
-                 "q_proj", "k_proj", "v_proj", "o_proj",
-                 "gate_proj", "up_proj", "down_proj"
-             ],
-             lora_alpha=16,
-             lora_dropout=0,
-             bias="none",
-             state_dict=torch.load(os.path.join(lora_model_path, "model.safetensors"), map_location="cpu")
-         )
-         model.eval()
-
- # -------------------------------
- # Input schema
- # -------------------------------
- class QueryRequest(BaseModel):
-     question: str
-     max_new_tokens: int = 64
-     temperature: float = 0.7
-     top_p: float = 0.9
-     reasoning_effort: str = "medium"
-
- # -------------------------------
- # Health
- # -------------------------------
- @app.get("/")
- def health():
-     return {"status": "ok"}
-
- # -------------------------------
- # Predict
- # -------------------------------
- @app.post("/predict")
- def predict(req: QueryRequest):
-     load_model()
-
-     inputs = tokenizer.apply_chat_template(
-         [{"role": "user", "content": req.question}],
-         add_generation_prompt=True,
-         return_tensors="pt",
-         return_dict=True,
-         reasoning_effort=req.reasoning_effort
-     ).to("cpu") # force CPU
-
-     with torch.no_grad():
-         output = model.generate(**inputs, max_new_tokens=req.max_new_tokens)
-
-     answer = tokenizer.decode(output[0], skip_special_tokens=True)
-     return {"question": req.question, "answer": answer}
+
+ class InputText(BaseModel):
+     text: str
+
+ @app.post("/generate")
+ async def generate_text(data: InputText):
+     inputs = tokenizer(data.text, return_tensors="pt").to(model.device)
+
+     output = model.generate(
+         **inputs,
+         max_new_tokens=200,
+         temperature=0.7
+     )
+
+     generated = tokenizer.decode(output[0], skip_special_tokens=True)
+     return {"response": generated}