# llm/app.py
"""
HuggingFace Space: Small LLM
Runs Phi-2 or similar small model on ZeroGPU
"""
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
app = FastAPI(
    title="Small LLM Space",
    description="Small LLM inference (Phi-2)"
)
# Model configuration
MODEL_NAME = "microsoft/phi-2" # 2.7B parameters - fits in ZeroGPU
model = None
tokenizer = None
def load_model():
    """Lazily load the model and tokenizer on first use."""
    global model, tokenizer
    if model is None:
        print(f"Loading {MODEL_NAME}...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        # Fall back to the EOS token when the tokenizer defines no pad token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # fp16 on GPU, fp32 on CPU; device_map="auto" places weights on the GPU
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True
        )
        print(f"Model loaded on {next(model.parameters()).device}")
    return model, tokenizer
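# Optional warm-up (a sketch, not part of the original design): eagerly loading
# at startup avoids first-request latency at the cost of a slower boot. On
# ZeroGPU the lazy load above is often preferable, since GPU hardware is
# attached per request rather than held continuously.
#
# @app.on_event("startup")
# async def warmup():
#     load_model()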
class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 200
    temperature: float = 0.7
    top_p: float = 0.9

class GenerateResponse(BaseModel):
    text: str
    tokens_generated: int
    model: str
    error: Optional[str] = None
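# Illustrative payloads for POST /api/generate (field values are examples only):
#   request:  {"prompt": "Explain attention in one sentence.",
#              "max_tokens": 100, "temperature": 0.7, "top_p": 0.9}
#   response: {"text": "...", "tokens_generated": 42,
#              "model": "microsoft/phi-2", "error": null}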
@app.get("/")
async def root():
    return {
        "status": "running",
        "service": "llm",
        "model": MODEL_NAME,
        "gpu": torch.cuda.is_available()
    }
@app.post("/api/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    """Generate a text completion for the given prompt."""
    try:
        model, tokenizer = load_model()

        # Tokenize, truncating long prompts to the model's context window
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048
        )

        # Move inputs to the GPU if one is available
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Sampling parameters only apply when do_sample is True; pass them
        # conditionally to avoid transformers warnings on greedy decoding
        do_sample = request.temperature > 0
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature if do_sample else None,
                top_p=request.top_p if do_sample else None,
                do_sample=do_sample,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens, not the echoed prompt
        input_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            outputs[0][input_length:],
            skip_special_tokens=True
        )
        tokens_generated = outputs.shape[1] - input_length

        return GenerateResponse(
            text=generated_text,
            tokens_generated=tokens_generated,
            model=MODEL_NAME
        )
    except Exception as e:
        return GenerateResponse(
            text="",
            tokens_generated=0,
            model=MODEL_NAME,
            error=str(e)
        )
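# Client usage sketch (illustrative values; kept as a comment so nothing runs
# at import time), assuming the `requests` package and a server on port 7860:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/api/generate",
#       json={"prompt": "Once upon a time", "max_tokens": 50},
#       timeout=120,
#   )
#   print(resp.json()["text"])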
# ZeroGPU decorator for HuggingFace Spaces.
# Caveat: the route registered by @app.post above already holds a reference to
# the original function, so rebinding `generate` here only affects direct
# callers (e.g. the Gradio wrapper below), not the FastAPI route itself.
try:
    import spaces
    generate = spaces.GPU(generate)
except ImportError:
    pass  # Not running on HF Spaces
# Gradio interface
def gradio_interface():
    import gradio as gr

    def generate_wrapper(prompt, max_tokens, temperature):
        from asyncio import run
        # Sliders yield floats; coerce max_tokens so pydantic accepts it
        response = run(generate(GenerateRequest(
            prompt=prompt,
            max_tokens=int(max_tokens),
            temperature=temperature
        )))
        return response.text or f"Error: {response.error}"

    iface = gr.Interface(
        fn=generate_wrapper,
        inputs=[
            gr.Textbox(lines=5, label="Prompt"),
            gr.Slider(50, 500, value=200, step=1, label="Max Tokens"),
            gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
        ],
        outputs=gr.Textbox(lines=10, label="Generated Text"),
        title="Small LLM (Phi-2)",
        description="Generate text using the Phi-2 model"
    )
    return iface
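# Note: gradio_interface() builds the UI but nothing launches it here. One
# option (an assumption, not in the original) is mounting it onto the FastAPI
# app so both are served by the same uvicorn process:
#
#   import gradio as gr
#   app = gr.mount_gradio_app(app, gradio_interface(), path="/ui")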
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
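# To run locally: `python app.py`, or equivalently
# `uvicorn app:app --host 0.0.0.0 --port 7860` (7860 is the HF Spaces default port).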