import os

# Point the Hugging Face caches at a writable directory; set these before
# importing transformers, since cache paths are resolved at import time.
# (TRANSFORMERS_CACHE is deprecated but kept for older transformers versions.)
os.environ["HF_HOME"] = "/home/appuser/.cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/home/appuser/.cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/home/appuser/.cache/huggingface"


from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

import torch

MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"  # "ynsbyrm/clone-c4ai-command-a-03-2025"  # put your own model repo name here

app = FastAPI()

print("🔄 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

print("🔄 Loading model in 4-bit...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    # Passing load_in_4bit directly to from_pretrained is deprecated;
    # wrap it in a BitsAndBytesConfig instead. This is where the memory
    # saving happens: weights are quantized to 4-bit on load.
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    trust_remote_code=True
)
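
# Optional sanity check: get_memory_footprint() is a standard transformers
# PreTrainedModel helper; for a 4-bit ~4B-parameter model this should land
# in the low single-digit GB range (a rough expectation, not a measurement).
print(f"🔎 Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")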

print("✅ Model loaded.")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7
)

class GenerateRequest(BaseModel):
    prompt: str

@app.get("/")
def root():
    return {"message": "Model Ready 🚀"}
    
@app.post("/generate")
def generate_text(data: GenerateRequest):
    # Build a chat-formatted prompt from the model's own chat template.
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": data.prompt}
        ],
        tokenize=False,
        add_generation_prompt=True
    )

    # Generate once, via the pipeline; return_full_text=False strips the
    # echoed prompt so only the newly generated completion is returned.
    result = pipe(prompt, return_full_text=False)[0]["generated_text"]
    return {"response": result}
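
# --- Usage sketch (assumptions: this file is saved as app.py and port 8000
# is free; neither is stated in the source). Serve the app with uvicorn,
# then exercise the endpoint with curl:
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a haiku about GPUs."}'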