hello-ram committed on
Commit
1b99533
·
verified ·
1 Parent(s): 34202a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -58
app.py CHANGED
@@ -1,106 +1,117 @@
1
- from fastapi import FastAPI,Query
2
- from transformers import AutoTokenizer, AutoModelForCausalLM
3
- import torch
4
- import os
5
  from pydantic import BaseModel
6
  from fastapi.middleware.cors import CORSMiddleware
 
 
 
7
 
8
-
9
- # βœ… Force Hugging Face cache to /tmp (writable in Spaces)
 
10
  os.environ["HF_HOME"] = "/tmp"
11
  os.environ["TRANSFORMERS_CACHE"] = "/tmp"
12
 
13
-
 
 
14
  model_id = "hello-ram/unsolth_gpt.20"
15
- #helloram
16
  tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
17
  model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="/tmp")
 
18
 
19
-
20
- app = FastAPI(title="QA GPT2 API", description="Serving HuggingFace model with FastAPI")
 
 
21
 
22
  app.add_middleware(
23
  CORSMiddleware,
24
  allow_origins=["*"],
25
- allow_credentials=True,
26
  allow_methods=["*"],
27
  allow_headers=["*"],
28
  )
29
- # Request schema
 
30
  class QueryRequest(BaseModel):
31
  question: str
32
- max_new_tokens: int = 50
33
  temperature: float = 0.7
34
  top_p: float = 0.9
 
35
 
36
 
 
 
 
37
  @app.get("/")
38
  def home():
39
- return {"message": "Welcome to QA GPT2 API πŸš€"}
40
-
41
- @app.get("/ask")
42
- def ask(question: str, max_new_tokens: int = 50):
43
- inputs = tokenizer(question, return_tensors="pt")
44
- outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
45
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
46
- return {"question": question, "answer": answer}
47
-
48
 
49
 
50
- # Health check endpoint
 
 
51
  @app.get("/health")
52
  def health():
53
  return {"status": "ok"}
54
 
55
- # Inference endpoint
 
 
 
56
  @app.post("/predict")
57
- def predict(request: QueryRequest):
 
 
58
  messages = [
59
- {"role": "system", "content": "reasoning language: english\n\nYou are a helpful assistant."},
60
- {"role": "user", "content": request.question}
61
  ]
62
 
63
- inputs = tokenizer.apply_chat_template(
 
64
  messages,
65
  add_generation_prompt=True,
 
66
  return_tensors="pt"
67
- ).to(model.device)
68
-
69
- outputs = model.generate(
70
- **inputs,
71
- max_new_tokens=request.max_new_tokens,
72
- do_sample=True,
73
- temperature=request.temperature,
74
- top_p=request.top_p,
75
- pad_token_id=tokenizer.eos_token_id
76
  )
77
 
78
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
79
- return {"question": request.question, "answer": answer}
 
 
 
 
 
 
 
 
80
 
 
 
81
 
 
 
 
 
82
 
83
 
84
- @app.get("/answers")
85
- def predict(question: str = Query(..., description="The question to ask"), max_new_tokens: int = Query(50, description="Max new tokens to generate")):
86
- # Tokenize the input question
 
 
 
87
  inputs = tokenizer(question, return_tensors="pt")
88
 
89
- # Generate output from model
90
- outputs = model.generate(
91
- **inputs,
92
- max_new_tokens=max_new_tokens,
93
- do_sample=True,
94
- temperature=0.7,
95
- top_p=0.9,
96
- pad_token_id=tokenizer.eos_token_id,
97
- return_dict_in_generate=True
98
- )
99
 
100
- # Decode output
101
- answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
102
 
103
- return {
104
- "question": question,
105
- "answer": answer
106
- }
 
1
import os

# ─────────────────────────────────────────────
# Make HF Spaces writable (required)
# ─────────────────────────────────────────────
# Hugging Face Spaces only allows writes under /tmp. These env vars must be
# exported BEFORE `transformers` is imported, because the library resolves its
# cache directories at import time — setting them afterwards (as the previous
# revision did) may be silently ignored.
os.environ["HF_HOME"] = "/tmp"
os.environ["TRANSFORMERS_CACHE"] = "/tmp"

import torch
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

# ─────────────────────────────────────────────
# Load model & tokenizer
# ─────────────────────────────────────────────
model_id = "hello-ram/unsolth_gpt.20"

# cache_dir is passed explicitly as well, as a belt-and-braces guarantee that
# downloads land in the writable /tmp area.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="/tmp")
model.eval()  # VERY IMPORTANT on CPU: disables dropout / training behavior

# ─────────────────────────────────────────────
# FastAPI config
# ─────────────────────────────────────────────
app = FastAPI(title="Unsloth GPT API", description="LoRA + Chat Template + Reasoning API")

# Allow browser clients from any origin to call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
34
+
35
+ # POST input schema
36
class QueryRequest(BaseModel):
    """Request body for the POST /predict endpoint."""

    # The user's question, inserted as the "user" turn of the chat template.
    question: str
    # Generation controls; defaults favor short, fast answers on CPU.
    max_new_tokens: int = 64
    temperature: float = 0.7
    top_p: float = 0.9
    # Forwarded to tokenizer.apply_chat_template; assumes the model's chat
    # template accepts this kwarg — TODO confirm for this checkpoint.
    reasoning_effort: str = "medium"  # low / medium / high
42
 
43
 
44
+ # ─────────────────────────────────────────────
45
+ # Home
46
+ # ─────────────────────────────────────────────
47
@app.get("/")
def home():
    """Root endpoint: returns a static welcome/liveness message."""
    welcome_text = "Unsloth GPT API Running Successfully πŸš€"
    return {"message": welcome_text}
 
 
 
 
 
 
 
 
50
 
51
 
52
+ # ─────────────────────────────────────────────
53
+ # HEALTH CHECK
54
+ # ─────────────────────────────────────────────
55
@app.get("/health")
def health():
    """Health-check endpoint for deployment/uptime probes."""
    payload = {"status": "ok"}
    return payload
58
 
59
+
60
+ # ─────────────────────────────────────────────
61
+ # MAIN PREDICTION ENDPOINT (FULL CHAT TEMPLATE)
62
+ # ─────────────────────────────────────────────
63
@app.post("/predict")
def predict(req: QueryRequest):
    """Chat-template inference endpoint.

    Wraps the user's question in the model's chat template, samples a
    completion with the parameters supplied in the request body, and returns
    only the newly generated text as the answer.
    """
    # Build correct Unsloth chat template input: system + user turns.
    messages = [
        {"role": "system", "content": "reasoning language: english\nYou are a helpful assistant."},
        {"role": "user", "content": req.question},
    ]

    # Apply the chat template. Move the tensor onto the model's device so the
    # endpoint also works if the model is ever loaded on GPU (the previous
    # revision dropped the .to(model.device) call).
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        reasoning_effort=req.reasoning_effort,
        return_tensors="pt",
    ).to(model.device)

    # Generate without gradient tracking — saves memory and time at inference.
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=req.max_new_tokens,
            do_sample=True,
            temperature=req.temperature,
            top_p=req.top_p,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + completion; slice off the prompt tokens so
    # the chat-template/system text is not echoed back to the caller.
    new_tokens = output[0][input_ids.shape[-1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return {
        "question": req.question,
        "answer": answer
    }
98
 
99
 
100
+ # ─────────────────────────────────────────────
101
+ # SIMPLE NON-CHAT ENDPOINT (/ask)
102
+ # ─────────────────────────────────────────────
103
@app.get("/ask")
def ask(question: str, max_new_tokens: int = 50):
    """Simple non-chat endpoint: tokenize the raw question, generate greedily,
    and return the decoded text (prompt included, as before)."""
    encoded = tokenizer(question, return_tensors="pt")

    # No gradients needed for inference.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

    decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    return {"question": question, "answer": decoded_text}