import os

# Point all Hugging Face caches to a writable directory (e.g. inside the container).
os.environ["HF_HOME"] = "/home/appuser/.cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/home/appuser/.cache/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/home/appuser/.cache/huggingface"

import torch
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# Put your own model repo name here, e.g. "ynsbyrm/clone-c4ai-command-a-03-2025".
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

app = FastAPI()

print("🔄 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

print("🔄 Loading model in 4-bit...")
# 4-bit quantization via bitsandbytes; this is where the magic happens.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=quant_config,
    trust_remote_code=True,
)
print("✅ Model loaded.")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
)


class Request(BaseModel):
    prompt: str


@app.get("/")
def root():
    return {"message": "Model Ready 🚀"}


@app.post("/generate")
def generate_text(data: Request):
    # Build a chat-formatted prompt from a system message and the user's prompt.
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": data.prompt},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Direct generation with model.generate().
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Second generation through the pipeline. Note: this runs the model again;
    # keep only one of the two paths if you don't need both outputs.
    result = pipe(prompt)[0]

    return {"response": result, "text": text}
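
# --- Example request (a minimal sketch; assumes this file is saved as app.py and
# served with `uvicorn app:app --host 0.0.0.0 --port 8000`) ---
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a one-line greeting."}'
#
# The JSON response contains the pipeline output under "response" and the decoded
# model.generate() output under "text".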