import subprocess
from typing import Optional

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Hard wall-clock limit (seconds) for one generation run, so a hung
# llama_main process cannot pin a worker thread forever.
GENERATION_TIMEOUT_S = 600

# Path to the ExecuTorch llama runner binary.
LLAMA_MAIN = "/content/executorch/cmake-out/examples/models/llama/llama_main"


class GenerationRequest(BaseModel):
    """Request body for POST /generate/.

    NOTE(review): ``model_path`` and ``tokenizer_path`` are client-supplied,
    so a caller can point the runner at arbitrary files on this host. If the
    API is ever exposed to untrusted users, pin these server-side (or drop
    them from the schema) — confirm whether overriding them is intentional.
    """
    prompt: str
    seq_len: Optional[int] = 128000
    temperature: Optional[float] = 0.8
    cpu_threads: Optional[int] = -1  # -1 presumably means "let llama_main choose" — TODO confirm
    model_path: Optional[str] = "/content/llama-model/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte"
    tokenizer_path: Optional[str] = "/content/llama-model/tokenizer.model"


@app.post("/generate/")
def generate_text(request: GenerationRequest):
    """Run the llama_main binary for one prompt and return its stdout.

    Declared as a plain ``def`` (not ``async def``) on purpose:
    ``subprocess.run`` blocks, and FastAPI executes sync endpoints on a
    worker thread, so a long generation no longer stalls the event loop
    for every other request.

    Returns:
        ``{"generated_text": <stdout>}`` on success, or
        ``{"error": <message>}`` when the runner fails, times out, or
        cannot be launched.
    """
    # List-form argv (shell=False): the prompt is passed as a single argument,
    # so shell metacharacters in user input cannot inject commands.
    command = [
        LLAMA_MAIN,
        f"--model_path={request.model_path}",
        f"--tokenizer_path={request.tokenizer_path}",
        f"--prompt={request.prompt}",
        f"--temperature={request.temperature}",
        f"--seq_len={request.seq_len}",
        f"--cpu_threads={request.cpu_threads}",
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=GENERATION_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired:
        return {"error": f"generation timed out after {GENERATION_TIMEOUT_S} seconds"}
    except OSError as exc:
        # Binary missing or not executable — report it in the endpoint's
        # normal error shape instead of leaking an unhandled 500.
        return {"error": str(exc)}
    if result.returncode != 0:
        return {"error": result.stderr}
    return {"generated_text": result.stdout}