from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import asyncio
# Set cache directories (TRANSFORMERS_CACHE is a legacy alias; HF_HOME covers
# current versions of transformers and huggingface_hub)
cache_dir = "/tmp/hf_home"
os.environ["HF_HOME"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir

# Create cache directory with proper permissions
os.makedirs(cache_dir, exist_ok=True)
os.chmod(cache_dir, 0o777)
# Load model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=cache_dir
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=cache_dir,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

# Move the model to GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Initialize FastAPI
app = FastAPI()

# Enable CORS so browser clients on other origins can call the API
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Input model
class Question(BaseModel):
    question: str

# System prompt
SYSTEM_PROMPT = "You are Orion, an intelligent AI assistant created by Abdullah Ali, a 13-year-old from Lahore. Respond kindly and wisely."
async def generate_response_chunks(prompt: str):
    # Build the chat history for the chat template
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]

    # Apply the model's chat template
    qwen_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize, then run the blocking generate() call in a worker thread
    # so it does not stall the event loop while the model runs
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
    outputs = await asyncio.to_thread(
        model.generate,
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens; slicing the token sequence is
    # more reliable than slicing the decoded string by prompt length
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Stream the response word by word
    for word in response.split():
        yield word + " "
        await asyncio.sleep(0.05)
# The route decorator was missing, so the endpoint was never registered;
# the path "/ask" is an assumed choice -- adjust it to match your client
@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain"
    )
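
# Example client call (a sketch, not part of the app): assumes the server is
# reachable at http://localhost:7860, the default port for Hugging Face Spaces,
# and that the route is mounted at /ask as above.
#
#   import requests
#   with requests.post("http://localhost:7860/ask",
#                      json={"question": "Hello, Orion!"}, stream=True) as r:
#       r.raise_for_status()
#       for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="", flush=True)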