thedeba committed on
Commit
11e76a2
·
verified ·
1 Parent(s): 2c19a01

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +17 -0
  2. app.py +75 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# Install dependencies before copying the application so this layer can be
# reused from the build cache when only app.py changes.
COPY requirements.txt .
# A single RUN with --no-cache-dir avoids an extra layer and keeps pip's
# download cache out of the final image.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy app
COPY app.py .

# Expose the port FastAPI runs on
EXPOSE 7860

# Start FastAPI with uvicorn
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ import torch
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from peft import PeftModel
7
+
8
+
9
# -------------------------------
# Load model & tokenizer from HF Hub
# -------------------------------
BASE_MODEL = "thedeba/debai-8b"
LORA_ADAPTER = "thedeba/Friday-lora"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Weight placement and dtype are both left to the loader ("auto").
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype="auto",
)
# Attach the LoRA adapter on top of the base weights and switch to
# inference mode (disables dropout and similar training-only behavior).
model = PeftModel.from_pretrained(model, LORA_ADAPTER)
model.eval()

# Because placement is delegated to device_map="auto", take the device from
# the loaded model instead of hard-coding "cpu"; a fixed value would put the
# inputs on a different device than the weights whenever a GPU is present.
device = model.device
25
+
26
# -------------------------------
# FastAPI setup
# -------------------------------
app = FastAPI()

# Any origin may call the API. Credentials stay disabled: a wildcard origin
# must not be combined with credentialed requests, and none of the endpoints
# in this file read cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
39
+
40
# Request body accepted by POST /generate.
class Query(BaseModel):
    # The user's message; it becomes the single "user" turn sent to the model.
    text: str
42
+
43
@app.post("/generate")
def generate(query: Query):
    """Generate an assistant reply for a single user message.

    Args:
        query: Request body; ``query.text`` is sent as the only user turn.

    Returns:
        A dict of the form ``{"response": <generated text>}``.
    """
    messages = [{"role": "user", "content": query.text}]

    # Render the conversation with the tokenizer's chat template and move the
    # prompt to the device the model lives on, so inputs and weights can
    # never end up on different devices.
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # temperature / min_p only take effect when sampling is enabled, so
    # request it explicitly instead of relying on the model's defaults.
    outputs = model.generate(
        input_ids=prompt_ids,
        max_new_tokens=2048,
        use_cache=True,
        do_sample=True,
        temperature=0.5,
        min_p=0.1,
    )

    # The output sequence starts with the prompt tokens. Decode only what was
    # generated after them; splitting the decoded text on the word
    # "assistant" would cut off any reply that itself contains that word.
    generated_ids = outputs[0][prompt_ids.shape[-1]:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return {"response": response}
68
+
69
# Landing endpoint: returns a fixed payload so a caller can confirm that the
# service is reachable.
@app.get("/")
def root():
    status = {"debai": "API is running!"}
    return status
72
+
73
if __name__ == "__main__":
    # Direct execution (python app.py): serve the app on all interfaces on
    # port 7860, the same address the Dockerfile's uvicorn command uses.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=2.1.0
2
+ transformers>=4.34.0
3
+ bitsandbytes
4
+ peft
5
+ fastapi
6
+ uvicorn[standard]