Spaces:

build-small-hackathon
/

tinybard

Sleeping

tinybard / modal_inference.py

Hermes Bot

feat: add projects/tinybard/modal_inference.py and update configs

523e468 unverified 22 days ago

3.82 kB

	import os
	import modal

	# Define the Modal app
	app = modal.App("tinybard-inference")

	# Default model (can be overridden via the MODEL_ID environment variable).
	# This lookup runs wherever the module is imported: on the machine that
	# deploys the app and again inside every container.
	MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")

	# Debian-slim base image (not a CUDA base image) with vLLM and the web
	# stack pip-installed.
	image = (
	    modal.Image.debian_slim(python_version="3.10")
	    .pip_install(
	        "vllm==0.5.4",
	        "huggingface_hub",
	        "fastapi",
	        "uvicorn",
	    )
	    # Bake the resolved model id into the image environment so the
	    # module-level lookup above yields the same value inside the container
	    # as it did at deploy time. Without this, a MODEL_ID override set only
	    # in the deploying shell is lost remotely. Kept as the last layer so
	    # changing the model does not invalidate the pip layer.
	    .env({"MODEL_ID": MODEL_ID})
	)


	@app.cls(
	    gpu="L4",  # L4 is cost-efficient and widely available
	    image=image,
	    secrets=[modal.Secret.from_name("huggingface-secret")],  # Optional, for gated models
	    timeout=600,
	)
	class InferenceModel:
	    """Modal class that holds a vLLM async engine and serves text completions."""

	    @modal.enter()
	    def load_model(self):
	        """Build the vLLM engine for ``MODEL_ID`` and store it on ``self.engine``."""
	        # vLLM is imported lazily: it is only needed where this hook runs.
	        from vllm import AsyncLLMEngine
	        from vllm.engine.arg_utils import AsyncEngineArgs

	        self.engine = AsyncLLMEngine.from_engine_args(
	            AsyncEngineArgs(
	                model=MODEL_ID,
	                max_model_len=2048,
	                gpu_memory_utilization=0.9,
	                # NOTE: allows code shipped in the model repository to run
	                # while the model is loaded.
	                trust_remote_code=True,
	            )
	        )
	        print(f"Model {MODEL_ID} loaded successfully.")

	    @modal.method()
	    async def generate(self, prompt: str, max_tokens: int = 226, temperature: float = 0.7) -> str:
	        """Generate a completion for ``prompt`` and return its text.

	        Args:
	            prompt: Raw prompt string handed to the engine unchanged.
	            max_tokens: Forwarded to ``SamplingParams(max_tokens=...)``.
	            temperature: Forwarded to ``SamplingParams(temperature=...)``.

	        Returns:
	            The text of the first candidate in the last result the engine
	            yielded, or ``""`` when nothing (or no candidate) was yielded.
	        """
	        from vllm import SamplingParams
	        import uuid

	        params = SamplingParams(temperature=temperature, max_tokens=max_tokens)
	        # Each call gets its own random request id.
	        stream = self.engine.generate(prompt, params, str(uuid.uuid4()))

	        # Drain the stream, remembering only the most recent item.
	        latest = None
	        async for item in stream:
	            latest = item

	        if not latest or not latest.outputs:
	            return ""
	        return latest.outputs[0].text


	# Create a FastAPI wrapper to expose an OpenAI-compatible /v1/chat/completions endpoint
	from fastapi import FastAPI, Request, HTTPException
	from fastapi.responses import JSONResponse

	# ASGI application object: the chat-completions handler below is registered
	# on it, and `api()` returns it.
	web_app = FastAPI(title="TinyBard Inference API")


	# Transcript header for each chat role that is rendered into the prompt;
	# messages with any other role are skipped.
	_ROLE_HEADERS = {
	    "system": "System Instructions",
	    "user": "User",
	    "assistant": "Assistant",
	}


	def _message_text(content) -> str:
	    """Flatten a chat message ``content`` value into stripped plain text.

	    Accepts a string, ``None`` (rendered as ``""``), or a list of content
	    parts, from which only ``{"type": "text", "text": ...}`` entries are kept.
	    """
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content.strip()
	    if isinstance(content, list):
	        texts = [
	            str(part.get("text") or "")
	            for part in content
	            if isinstance(part, dict) and part.get("type") == "text"
	        ]
	        return "\n".join(texts).strip()
	    return str(content).strip()


	def _build_prompt(messages) -> str:
	    """Render chat messages as a plain ``Role:`` transcript ending with an open assistant turn.

	    This is a basic dialogue layout, not ChatML and not the model's own chat
	    template.
	    """
	    sections = []
	    for msg in messages:
	        role = msg.get("role", "user")
	        header = _ROLE_HEADERS.get(role) if isinstance(role, str) else None
	        if header is None:
	            continue
	        text = _message_text(msg.get("content"))
	        sections.append(f"{header}:\n{text}\n\n")
	    sections.append("Assistant:\n")
	    return "".join(sections)


	@web_app.post("/v1/chat/completions")
	async def chat_completions(request: Request):
	    """Handle an OpenAI-style chat completion request.

	    Reads ``messages``, ``model``, ``max_tokens`` and ``temperature`` from the
	    JSON body, renders the messages into a single prompt, runs it through
	    ``InferenceModel.generate`` and wraps the text in a ``chat.completion``
	    payload. A field sent as ``null`` is treated like an omitted field.

	    Raises:
	        HTTPException: 400 when the body is not a JSON object, ``messages`` is
	            missing, empty or malformed, or a sampling field has an invalid
	            value; 500 when the remote inference call fails.
	    """
	    # Malformed input is the client's fault: answer 400 rather than letting
	    # the parse error surface as an unhandled 500.
	    try:
	        body = await request.json()
	    except ValueError as e:
	        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {e}") from e
	    if not isinstance(body, dict):
	        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

	    messages = body.get("messages")
	    if not isinstance(messages, list) or not messages:
	        raise HTTPException(status_code=400, detail="'messages' must be a non-empty list")
	    if not all(isinstance(msg, dict) for msg in messages):
	        raise HTTPException(status_code=400, detail="Each message must be a JSON object")

	    model = body.get("model") or MODEL_ID

	    max_tokens = body.get("max_tokens")
	    if max_tokens is None:
	        max_tokens = 220
	    # bool is an int subclass, so it has to be excluded explicitly.
	    if isinstance(max_tokens, bool) or not isinstance(max_tokens, int) or max_tokens < 1:
	        raise HTTPException(status_code=400, detail="'max_tokens' must be a positive integer")

	    temperature = body.get("temperature")
	    if temperature is None:
	        temperature = 0.7
	    if (
	        isinstance(temperature, bool)
	        or not isinstance(temperature, (int, float))
	        or temperature < 0
	    ):
	        raise HTTPException(status_code=400, detail="'temperature' must be a non-negative number")

	    prompt = _build_prompt(messages)

	    # Call the Modal class method
	    try:
	        model_instance = InferenceModel()
	        text = await model_instance.generate.remote.aio(
	            prompt, max_tokens=max_tokens, temperature=temperature
	        )
	    except Exception as e:
	        # Boundary handler: report any inference failure as a 500 and keep
	        # the original exception chained for the server logs.
	        raise HTTPException(status_code=500, detail=f"Modal inference failed: {str(e)}") from e

	    # Return OpenAI-compatible JSON structure
	    return JSONResponse(
	        {
	            "id": "chatcmpl-modal",
	            "object": "chat.completion",
	            "model": model,
	            "choices": [
	                {
	                    "index": 0,
	                    "message": {
	                        "role": "assistant",
	                        "content": text.strip(),
	                    },
	                    "finish_reason": "stop",
	                }
	            ],
	        }
	    )


	# Modal entry point for the HTTP API. It is built from the same image as
	# the model class; no GPU is requested for it.
	@app.function(image=image)
	@modal.asgi_app()
	def api():
	    """Return the FastAPI application for Modal to serve as an ASGI app."""
	    return web_app