rabiyulfahim committed
Commit 3b5d156 · verified · 1 Parent(s): 8c386df

Update app.py

Files changed (1): app.py +29 -47
app.py CHANGED
@@ -1,32 +1,33 @@
 from fastapi import FastAPI
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-import os
 from pydantic import BaseModel
+import os
 
-# ✅ Force Hugging Face cache to /tmp (writable in Spaces)
-# os.environ["HF_HOME"] = "/tmp"
-# os.environ["TRANSFORMERS_CACHE"] = "/tmp"
-CACHE_DIR = os.path.expanduser("~/.cache/huggingface")
+# Hugging Face cache directory
+CACHE_DIR = "/app/.cache/huggingface"
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ["HF_HOME"] = CACHE_DIR
 os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
 
+# Model ID (FP8 requires GPU)
+MODEL_ID = "deepseek-ai/DeepSeek-R1"
+FALLBACK_MODEL_ID = "gpt2"  # CPU-friendly fallback
 
-# model_id = "rabiyulfahim/qa_python_gpt2"
-
-model_id = "deepseek-ai/DeepSeek-R1"
-
-# tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
-# model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="/tmp")
-
-
-tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
-model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=CACHE_DIR)
+# Detect GPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR).to(device)
+except Exception as e:
+    print(f"⚠️ Failed to load GPU FP8 model: {e}")
+    print(f"🔹 Falling back to CPU-friendly model: {FALLBACK_MODEL_ID}")
+    tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL_ID, cache_dir=CACHE_DIR)
+    model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL_ID, cache_dir=CACHE_DIR).to(device)
 
-
-app = FastAPI(title="QA GPT2 API", description="Serving HuggingFace model with FastAPI")
-
+# FastAPI app
+app = FastAPI(title="QA GPT API", description="Hugging Face model served via FastAPI")
 
 # Request schema
 class QueryRequest(BaseModel):
@@ -35,51 +36,32 @@ class QueryRequest(BaseModel):
     temperature: float = 0.7
     top_p: float = 0.9
 
-
 @app.get("/")
 def home():
-    return {"message": "Welcome to QA GPT2 API 🚀"}
+    return {"message": "Welcome to QA GPT API 🚀"}
 
 @app.get("/ask")
 def ask(question: str, max_new_tokens: int = 50):
-    inputs = tokenizer(question, return_tensors="pt")
+    inputs = tokenizer(question, return_tensors="pt").to(device)
     outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"question": question, "answer": answer}
 
-
-
-# Health check endpoint
 @app.get("/health")
 def health():
     return {"status": "ok"}
 
-# Inference endpoint
 @app.post("/predict")
 def predict(request: QueryRequest):
-    inputs = tokenizer(request.question, return_tensors="pt")
-    # outputs = model.generate(
-    #     **inputs,
-    #     max_new_tokens=request.max_new_tokens,
-    #     do_sample=True,
-    #     temperature=0.7,
-    #     top_p=0.9,
-    #     pad_token_id=tokenizer.eos_token_id,
-    #     return_dict_in_generate=True
-    # )
+    inputs = tokenizer(request.question, return_tensors="pt").to(device)
     outputs = model.generate(
-        **inputs,
-        max_new_tokens=request.max_new_tokens,
-        do_sample=True,
-        temperature=request.temperature,
-        top_p=request.top_p,
-        pad_token_id=tokenizer.eos_token_id,
-        return_dict_in_generate=True
+        **inputs,
+        max_new_tokens=request.max_new_tokens,
+        do_sample=True,
+        temperature=request.temperature,
+        top_p=request.top_p,
+        pad_token_id=tokenizer.eos_token_id,
+        return_dict_in_generate=True
    )
-
-
     answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-    return {
-        "question": request.question,
-        "answer": answer
-    }
+    return {"question": request.question, "answer": answer}