# LLM_Model/app3.py
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastapi.middleware.cors import CORSMiddleware
import torch
import os

# Ensure the Hugging Face cache uses a writable path
# (TRANSFORMERS_CACHE is deprecated in newer transformers releases; HF_HOME is the preferred variable)
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
os.environ["HF_HOME"] = "./.cache"

app = FastAPI()

# ✅ Allow all origins (fine for a public demo; restrict origins in production)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 200  # default to shorter responses for speed
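# For illustration (hypothetical values, not part of the app): FastAPI validates the
# JSON body into this model, so a request of {"message": "hi"} is parsed into
# ChatRequest(message="hi", max_tokens=200), with the default applied automatically.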

# 🔹 Choose a model (smaller = faster on CPU)
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model_name = "deepseek-ai/deepseek-coder-1.3b-base"

print("🚀 Loading model... this may take a minute ⏳")
try:
    if torch.cuda.is_available():
        # ✅ GPU path: load the weights in 4-bit to cut memory use (requires bitsandbytes)
        from transformers import BitsAndBytesConfig

        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config,
        )
    else:
        # ✅ CPU fallback, no quantization (device_map="auto" requires accelerate)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="auto",
        )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("✅ Model loaded successfully!")
except Exception as e:
    print("❌ Model loading failed:", str(e))
    raise
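
# Optional sanity check, not in the original: get_memory_footprint() is a standard
# transformers helper that reports parameter memory in bytes, which makes it easy
# to confirm that 4-bit loading actually shrank the model on GPU runs.
print(f"📦 Approx. model memory: {model.get_memory_footprint() / 1e6:.0f} MB")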

@app.get("/")
def root():
    return {"status": "ok"}

@app.post("/chat")
def chat(request: ChatRequest):
    """Chat endpoint: generate a completion for the incoming message."""
    inputs = tokenizer(request.message, return_tensors="pt").to(model.device)
    with torch.inference_mode():  # skip gradient tracking during generation
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
        )
    # 🔹 Only decode the newly generated tokens, not the echoed prompt
    reply_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(reply_tokens, skip_special_tokens=True)
    return {"reply": reply}