"""
HuggingFace Space: Small LLM
Runs Phi-2 or a similar small model on ZeroGPU.
"""
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
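
# Example client call against the /api/generate endpoint defined below
# (illustrative values only; requires the `requests` package, which is not a
# dependency of this Space itself):
#
#   import requests
#   r = requests.post(
#       "http://localhost:7860/api/generate",
#       json={"prompt": "Explain attention in one sentence.", "max_tokens": 100},
#   )
#   print(r.json()["text"])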

app = FastAPI(
    title="Small LLM Space",
    description="Small LLM inference (Phi-2)"
)

# Model configuration
MODEL_NAME = "microsoft/phi-2" # 2.7B parameters - fits in ZeroGPU
model = None
tokenizer = None

def load_model():
    """Lazy load the model"""
    global model, tokenizer
    if model is None:
        print(f"Loading {MODEL_NAME}...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        # Set pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True
        )
        print(f"Model loaded on {next(model.parameters()).device}")
    return model, tokenizer
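
# If cold-start latency matters, the model could instead be loaded eagerly at
# startup rather than on the first request (a sketch; eager loading here is an
# assumption, not what this Space does by default):
#
#   @app.on_event("startup")
#   async def _warm_up():
#       load_model()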

class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 200
    temperature: float = 0.7
    top_p: float = 0.9


class GenerateResponse(BaseModel):
    text: str
    tokens_generated: int
    model: str
    error: Optional[str] = None

@app.get("/")
async def root():
    return {
        "status": "running",
        "service": "llm",
        "model": MODEL_NAME,
        "gpu": torch.cuda.is_available()
    }

@app.post("/api/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    """Generate a text completion for the given prompt"""
    try:
        model, tokenizer = load_model()

        # Tokenize the prompt
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048
        )

        # Move inputs to GPU if available
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                top_p=request.top_p,
                do_sample=request.temperature > 0,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode the full sequence (note: this includes the prompt as well as
        # the completion)
        generated_text = tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )

        # Count only the newly generated tokens
        input_length = inputs["input_ids"].shape[1]
        output_length = outputs.shape[1]
        tokens_generated = output_length - input_length

        return GenerateResponse(
            text=generated_text,
            tokens_generated=tokens_generated,
            model=MODEL_NAME
        )
    except Exception as e:
        return GenerateResponse(
            text="",
            tokens_generated=0,
            model=MODEL_NAME,
            error=str(e)
        )
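
# If only the completion (without the echoed prompt) is wanted, one option is
# to decode just the newly generated tokens instead (a sketch, not wired into
# the endpoint above):
#
#   completion = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)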

# ZeroGPU decorator when running on HuggingFace Spaces (skipped elsewhere).
# Note: this rebinding only affects direct callers such as the Gradio wrapper
# below; the FastAPI route registered above keeps a reference to the
# undecorated function.
try:
    import spaces
    generate = spaces.GPU(generate)
except ImportError:
    pass  # Not on HF Spaces

# Gradio interface
def gradio_interface():
    import gradio as gr

    def generate_wrapper(prompt, max_tokens, temperature):
        from asyncio import run
        response = run(generate(GenerateRequest(
            prompt=prompt,
            max_tokens=int(max_tokens),  # sliders return floats
            temperature=temperature
        )))
        return response.text or f"Error: {response.error}"

    iface = gr.Interface(
        fn=generate_wrapper,
        inputs=[
            gr.Textbox(lines=5, label="Prompt"),
            gr.Slider(50, 500, value=200, label="Max Tokens"),
            gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
        ],
        outputs=gr.Textbox(lines=10, label="Generated Text"),
        title="Small LLM (Phi-2)",
        description="Generate text using the Phi-2 model"
    )
    return iface
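
# One way to expose the Gradio UI alongside the JSON API (a sketch; mounting
# the UI at /ui is an assumption, not something this Space currently does):
#
#   import gradio as gr
#   app = gr.mount_gradio_app(app, gradio_interface(), path="/ui")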

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)