# LLM_Model / app2.py
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
import torch
import os

# Ensure the Hugging Face cache uses a writable path.
# These must be set before transformers is imported, since it reads them at import time.
os.environ["TRANSFORMERS_CACHE"] = "/app/.cache"
os.environ["HF_HOME"] = "/app/.cache"

from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

# ✅ Allow all origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],      # allow all origins
    allow_credentials=True,
    allow_methods=["*"],      # allow all HTTP methods
    allow_headers=["*"],      # allow all headers
)

class ChatRequest(BaseModel):
    message: str
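
# Illustrative shape only (not in the original file): the /chat endpoint below
# expects a JSON body like {"message": "..."} and responds with {"reply": "..."}.
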
# Load DeepSeek model (small one for local use)
model_name = "deepseek-ai/deepseek-coder-1.3b-base"
# model_name = "deepseek-ai/deepseek-llm-7b-base"
# model_name = "Qwen/Qwen2.5-1.5B-Instruct"
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print("Loading model... this may take a minute ⏳")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    offload_folder="offload",
)
print("Model loaded ✅")

@app.get("/")
def root():
    return {"status": "ok"}

@app.post("/chat")
def chat(request: ChatRequest):
    """Chat endpoint using the DeepSeek model."""
    inputs = tokenizer(request.message, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    # Decode only the newly generated tokens so the prompt is not echoed back in the reply.
    reply = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return {"reply": reply}