# LLM_Model/app3.py
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastapi.middleware.cors import CORSMiddleware
import torch
import os

# Ensure the Hugging Face cache uses a writable path
# (TRANSFORMERS_CACHE is deprecated in newer transformers releases; HF_HOME is the preferred variable)
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
os.environ["HF_HOME"] = "./.cache"

app = FastAPI()

# ✅ Allow all origins (fine for a public demo; restrict origins in production)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 200  # default to shorter responses for speed
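# For illustration (hypothetical values, not part of the app): FastAPI validates the
# JSON body into this model, so a request of {"message": "hi"} is parsed into
# ChatRequest(message="hi", max_tokens=200), with the default applied automatically.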

# 🔹 Choose a model (smaller = faster on CPU)
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model_name = "deepseek-ai/deepseek-coder-1.3b-base"

print("🚀 Loading model... this may take a minute ⏳")
try:
    if torch.cuda.is_available():
        # ✅ GPU path: load the weights in 4-bit to cut memory use (requires bitsandbytes)
        from transformers import BitsAndBytesConfig

        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config,
        )
    else:
        # ✅ CPU fallback, no quantization (device_map="auto" requires accelerate)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="auto",
        )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("✅ Model loaded successfully!")
except Exception as e:
    print("❌ Model loading failed:", str(e))
    raise
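
# Optional sanity check, not in the original: get_memory_footprint() is a standard
# transformers helper that reports parameter memory in bytes, which makes it easy
# to confirm that 4-bit loading actually shrank the model on GPU runs.
print(f"📦 Approx. model memory: {model.get_memory_footprint() / 1e6:.0f} MB")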

@app.get("/")
def root():
    return {"status": "ok"}

@app.post("/chat")
def chat(request: ChatRequest):
    """Chat endpoint: generate a completion for the incoming message."""
    inputs = tokenizer(request.message, return_tensors="pt").to(model.device)
    with torch.inference_mode():  # skip gradient tracking during generation
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
        )
    # 🔹 Only decode the newly generated tokens, not the echoed prompt
    reply_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(reply_tokens, skip_special_tokens=True)
    return {"reply": reply}