hello-ram committed on
Commit
42e23ac
·
verified ·
1 Parent(s): 8d765ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -99
app.py CHANGED
@@ -1,100 +1,100 @@
1
- from fastapi import FastAPI,Query
2
- from transformers import AutoTokenizer, AutoModelForCausalLM
3
- import torch
4
- import os
5
- from pydantic import BaseModel
6
- from fastapi.middleware.cors import CORSMiddleware
7
-
8
-
9
- # ✅ Force Hugging Face cache to /tmp (writable in Spaces)
10
- os.environ["HF_HOME"] = "/tmp"
11
- os.environ["TRANSFORMERS_CACHE"] = "/tmp"
12
-
13
-
14
- model_id = "hello-ram/unsolth"
15
-
16
- tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
17
- model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="/tmp")
18
-
19
-
20
- app = FastAPI(title="QA GPT2 API", description="Serving HuggingFace model with FastAPI")
21
-
22
- app.add_middleware(
23
- CORSMiddleware,
24
- allow_origins=["*"],
25
- allow_credentials=True,
26
- allow_methods=["*"],
27
- allow_headers=["*"],
28
- )
29
- # Request schema
30
- class QueryRequest(BaseModel):
31
- question: str
32
- max_new_tokens: int = 50
33
- temperature: float = 0.7
34
- top_p: float = 0.9
35
-
36
-
37
- @app.get("/")
38
- def home():
39
- return {"message": "Welcome to QA GPT2 API 🚀"}
40
-
41
- @app.get("/ask")
42
- def ask(question: str, max_new_tokens: int = 50):
43
- inputs = tokenizer(question, return_tensors="pt")
44
- outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
45
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
46
- return {"question": question, "answer": answer}
47
-
48
-
49
-
50
- # Health check endpoint
51
- @app.get("/health")
52
- def health():
53
- return {"status": "ok"}
54
-
55
- # Inference endpoint
56
- @app.post("/predict")
57
- def predict(request: QueryRequest):
58
- inputs = tokenizer(request.question, return_tensors="pt")
59
- outputs = model.generate(
60
- **inputs,
61
- max_new_tokens=request.max_new_tokens,
62
- do_sample=True,
63
- temperature=0.7,
64
- top_p=0.9,
65
- pad_token_id=tokenizer.eos_token_id,
66
- return_dict_in_generate=True
67
- )
68
-
69
- answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
70
- return {
71
- "question": request.question,
72
- "answer": answer
73
- }
74
-
75
-
76
-
77
-
78
- @app.get("/answers")
79
- def predict(question: str = Query(..., description="The question to ask"), max_new_tokens: int = Query(50, description="Max new tokens to generate")):
80
- # Tokenize the input question
81
- inputs = tokenizer(question, return_tensors="pt")
82
-
83
- # Generate output from model
84
- outputs = model.generate(
85
- **inputs,
86
- max_new_tokens=max_new_tokens,
87
- do_sample=True,
88
- temperature=0.7,
89
- top_p=0.9,
90
- pad_token_id=tokenizer.eos_token_id,
91
- return_dict_in_generate=True
92
- )
93
-
94
- # Decode output
95
- answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
96
-
97
- return {
98
- "question": question,
99
- "answer": answer
100
  }
 
1
+ from fastapi import FastAPI,Query
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ import torch
4
+ import os
5
+ from pydantic import BaseModel
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+
8
+
9
# ✅ Force Hugging Face cache to /tmp (writable in Spaces)
os.environ["HF_HOME"] = "/tmp"
os.environ["TRANSFORMERS_CACHE"] = "/tmp"


# FIX: a Hub repo id has exactly two segments ("namespace/name").
# The previous value "hello-ram/unsolth/finetuned_module" makes
# from_pretrained fail repo-id validation; the third segment is a folder
# inside the repo and must be passed via the `subfolder` argument.
model_id = "hello-ram/unsolth"
_subfolder = "finetuned_module"

# Load tokenizer and model once at import time so all requests share them.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp", subfolder=_subfolder)
model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="/tmp", subfolder=_subfolder)
18
+
19
+
20
# FastAPI application object exposed by this module.
app = FastAPI(title="QA GPT2 API", description="Serving HuggingFace model with FastAPI")

# Open the API to browser clients from any origin.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True means
# the literal wildcard cannot be used for credentialed requests — confirm
# whether credentials (cookies/auth headers) are actually needed here.
_cors_options = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)
29
# Request schema
class QueryRequest(BaseModel):
    """JSON body accepted by POST /predict."""

    # The prompt to answer.
    question: str
    # Cap on the number of tokens generated beyond the prompt.
    max_new_tokens: int = 50
    # Sampling temperature used during generation.
    temperature: float = 0.7
    # Nucleus-sampling probability threshold.
    top_p: float = 0.9
35
+
36
+
37
@app.get("/")
def home():
    """Landing endpoint: confirms the service is reachable."""
    payload = {"message": "Welcome to QA GPT2 API 🚀"}
    return payload
40
+
41
@app.get("/ask")
def ask(question: str, max_new_tokens: int = 50):
    """Greedy-decode an answer to *question* (no sampling).

    Returns a dict with the original question and the decoded answer,
    which includes the prompt since the full sequence is decoded.
    """
    inputs = tokenizer(question, return_tensors="pt")
    # Inference only — skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # FIX: GPT-2-style models have no pad token; set it explicitly,
            # consistent with the /predict and /answers endpoints.
            pad_token_id=tokenizer.eos_token_id,
        )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"question": question, "answer": answer}
47
+
48
+
49
+
50
# Health check endpoint
@app.get("/health")
def health():
    """Liveness probe for the hosting platform."""
    status = {"status": "ok"}
    return status
54
+
55
# Inference endpoint
@app.post("/predict")
def predict(request: QueryRequest):
    """Sample an answer for the question in *request*.

    FIX: the request schema declares `temperature` and `top_p`, but the
    generate() call hard-coded 0.7 / 0.9 and silently ignored whatever the
    client sent. The request values are used now; the schema defaults keep
    the old behavior for clients that omit them.
    """
    inputs = tokenizer(request.question, return_tensors="pt")
    # Inference only — skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_new_tokens,
            do_sample=True,
            temperature=request.temperature,
            top_p=request.top_p,
            # GPT-2-style models have no pad token; use EOS explicitly.
            pad_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
        )

    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    return {
        "question": request.question,
        "answer": answer,
    }
74
+
75
+
76
+
77
+
78
@app.get("/answers")
def answers(
    question: str = Query(..., description="The question to ask"),
    max_new_tokens: int = Query(50, description="Max new tokens to generate"),
):
    """GET variant of /predict: sample an answer for *question*.

    FIX: this function was named `predict`, silently shadowing the POST
    /predict handler at module level (both routes still worked because
    FastAPI binds handlers at decoration time, but the name collision is
    confusing and breaks introspection/testing). Renamed to match its
    route; the HTTP interface is unchanged.
    """
    # Tokenize the input question
    inputs = tokenizer(question, return_tensors="pt")

    # Generate output from model (sampled; same settings as /predict defaults).
    # Inference only — skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
        )

    # Decode output (includes the prompt, since the full sequence is decoded)
    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

    return {
        "question": question,
        "answer": answer,
    }