import os

# Hugging Face cache locations must be set before transformers is imported,
# which is why these assignments sit at the very top of the file.
# TRANSFORMERS_CACHE is deprecated in recent transformers releases, but it is
# kept here for compatibility with older versions.
os.environ["HF_HOME"] = "/home/appuser/.cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/home/appuser/.cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/home/appuser/.cache/huggingface"

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
import torch

MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

app = FastAPI()

print("🔄 Loading tokenizer...") |
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) |
|
|
|
|
|
print("🔄 Loading model in 4-bit...") |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
MODEL_NAME, |
|
|
device_map="auto", |
|
|
torch_dtype=torch.float16, |
|
|
load_in_4bit=True, |
|
|
trust_remote_code=True |
|
|
) |
|
|
|
|
|
print("✅ Model loaded.") |
|
|
|
|
|
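# Optional sanity check that the 4-bit load kept the weights small;
# get_memory_footprint() is part of transformers' PreTrainedModel API.
print(f"ℹ️ Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
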
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
)

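# Optional smoke test before serving traffic (illustrative prompt;
# return_full_text=False makes the pipeline return only the completion):
# print(pipe("Hello!", return_full_text=False)[0]["generated_text"])
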
class GenerateRequest(BaseModel):
    # Named GenerateRequest to avoid clashing with fastapi.Request.
    prompt: str

@app.get("/")
def root():
    return {"message": "Model Ready 🚀"}

@app.post("/generate")
def generate_text(data: GenerateRequest):
    # Format the conversation with the model's chat template so the
    # instruct model sees proper system/user turns, not raw text.
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": data.prompt},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Approach 1: tokenize, generate, and decode by hand. Slicing off the
    # input tokens keeps the echoed prompt out of the decoded reply.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs, max_new_tokens=200, do_sample=True, temperature=0.7
    )
    text = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )

    # Approach 2: the pipeline handles tokenization and decoding itself;
    # return_full_text=False likewise strips the prompt from the output.
    result = pipe(prompt, return_full_text=False)[0]

    # Each key holds an independently sampled completion; keep only one
    # of the two approaches in production.
    return {"response": result, "text": text}

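# Optional direct launch, assuming the uvicorn package is installed;
# the host and port below are illustrative defaults, not requirements.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the model has loaded (adjust host/port to match):
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Tell me a joke."}'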