from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from fastapi.middleware.cors import CORSMiddleware
# -------------------------------
# Load model & tokenizer
# -------------------------------
# Hub repository id used for both the tokenizer and the model weights.
model_name = "thedeba/deb-8B"

# Inference device is pinned to CPU (HF Spaces free tier usually has no GPU).
device = "cpu"

# Both objects are created once at import time and reused by every request.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# -------------------------------
# FastAPI setup
# -------------------------------
app = FastAPI()

# CORS stays open to every origin, method and header so that browser
# front-ends hosted elsewhere can call the API.
#
# Credentials are switched off: a wildcard origin combined with
# allow_credentials=True lets any website issue credentialed cross-origin
# requests, and the FastAPI CORS documentation states that wildcards must not
# be used together with credentials. No endpoint in this file reads cookies
# or Authorization headers, so nothing here relies on credentialed requests.
# NOTE(review): if a front-end really needs cookies, list its exact origin in
# allow_origins and re-enable allow_credentials for that origin only.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# -------------------------------
# Pydantic model for input
# -------------------------------
class Query(BaseModel):
    # Request body accepted by POST /generate.
    # `text` carries the raw prompt that is tokenized and fed to the model.
    text: str
# -------------------------------
# Single generate endpoint
# -------------------------------
@app.post("/generate")
def generate(query: Query):
    """Generate a continuation for the submitted prompt.

    Args:
        query: Request body whose ``text`` field holds the prompt.

    Returns:
        A dict with a single ``"response"`` key containing the decoded text
        of the first generated sequence, with special tokens removed. The
        whole sequence is decoded, so for decoder-only models the text is
        expected to start with the prompt itself.
    """
    # Tokenize the prompt into PyTorch tensors and move them to the device
    # the model was loaded on.
    inputs = tokenizer(query.text, return_tensors="pt").to(device)

    # `temperature` only has an effect in sampling mode; without
    # do_sample=True generation is greedy and the value is ignored, so
    # sampling is enabled explicitly here.
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.8,
    )

    # Only one prompt is submitted, so the first sequence is the only result.
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response_text}
# -------------------------------
# Root endpoint
# -------------------------------
@app.get("/")
def root():
    # Liveness probe: answers with a fixed payload and never touches the model.
    status_payload = {"status": "API is running!"}
    return status_payload
# -------------------------------
# Run locally (optional)
# -------------------------------
if __name__ == "__main__":
    # Only reached when the file is executed directly; uvicorn is imported
    # here so that importing this module does not require it up front.
    import uvicorn

    # Listen on every interface, port 7860.
    uvicorn.run(
        app,
        port=7860,
        host="0.0.0.0",
    )
"""from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from fastapi.middleware.cors import CORSMiddleware
# -------------------------------
# Load model & tokenizer from HF Hub
# -------------------------------
model_name = "thedeba/deb-8B" # HF Hub model path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = "auto" # Spaces free tier uses CPU; you can switch to "cuda" if GPU granted
#model.to(device)
# -------------------------------
# FastAPI setup
# -------------------------------
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # or ["https://<username>.github.io"]
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class Query(BaseModel):
text: str
@app.post("/generate")
def generate(query: Query):
messages = [{"role": "user", "content": query.text}]
# Convert to model input using chat template
inputs = tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_tensors="pt",
).to(device)
# Generate
outputs = model.generate(
input_ids=inputs,
max_new_tokens=200,
use_cache=True,
temperature=0.8,
min_p=0.1,
)
# Decode & extract assistant response
output_string = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
response = output_string.split("assistant")[-1].strip()
return {"response": response}
@app.get("/")
def root():
return {"deb": "API is running!"}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)"""
|