| | import subprocess |
| | from fastapi import FastAPI |
| | from pydantic import BaseModel |
| | from typing import Optional |
| |
|
# FastAPI application instance; exposes the llama_main text-generation endpoint below.
app = FastAPI()
| |
|
class GenerationRequest(BaseModel):
    """Request body for the ``/generate/`` endpoint.

    Defaults target a local Llama-3.2-1B SpinQuant INT4 ExecuTorch build;
    every field may be overridden per request.
    """

    # User prompt, forwarded verbatim to llama_main via --prompt=...
    prompt: str
    # The fields below all have concrete defaults and are never meaningfully
    # None.  They were previously declared Optional[...], which let an
    # explicit JSON null validate and later become the literal string "None"
    # inside the subprocess command line (e.g. --seq_len=None).  Declaring
    # them non-Optional rejects null while keeping identical defaults for
    # omitted fields.
    seq_len: int = 128000
    temperature: float = 0.8
    # -1 is passed through to llama_main's --cpu_threads flag
    # (presumably "auto-detect" — TODO confirm against llama_main docs).
    cpu_threads: int = -1
    model_path: str = "/content/llama-model/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8.pte"
    tokenizer_path: str = "/content/llama-model/tokenizer.model"
| |
|
@app.post("/generate/")
def generate_text(request: GenerationRequest):
    """Run the ExecuTorch llama_main binary on the prompt and return its output.

    Declared as a plain (sync) ``def`` so FastAPI executes it in its worker
    threadpool: the original ``async def`` ran the blocking ``subprocess.run``
    directly on the asyncio event loop, freezing every other request for the
    entire duration of the generation.

    Returns:
        ``{"generated_text": <stdout>}`` when the binary exits 0, otherwise
        ``{"error": <stderr>}`` (still with HTTP 200, matching the original
        behavior callers may depend on).
    """
    # Argument-list form (shell=False) keeps the user-supplied prompt as a
    # single argv entry, so it cannot be shell-injected.
    command = [
        "/content/executorch/cmake-out/examples/models/llama/llama_main",
        f"--model_path={request.model_path}",
        f"--tokenizer_path={request.tokenizer_path}",
        f"--prompt={request.prompt}",
        f"--temperature={request.temperature}",
        f"--seq_len={request.seq_len}",
        f"--cpu_threads={request.cpu_threads}",
    ]

    # NOTE(review): no timeout is set — a hung binary blocks this worker
    # thread indefinitely; consider subprocess.run(..., timeout=...) if that
    # matters for this deployment.
    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode != 0:
        return {"error": result.stderr}

    return {"generated_text": result.stdout}
| |
|