"""
HuggingFace Space: Small LLM
Runs Phi-2 or similar small model on ZeroGPU
"""
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI(
    title="Small LLM Space",
    description="Small LLM inference (Phi-2)"
)

# Model configuration
MODEL_NAME = "microsoft/phi-2"  # 2.7B parameters - fits in ZeroGPU
model = None
tokenizer = None


def load_model():
    """Lazy load the model"""
    global model, tokenizer
    
    if model is None:
        print(f"Loading {MODEL_NAME}...")
        
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        
        # Set pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True
        )
        
        print(f"Model loaded on {next(model.parameters()).device}")
    
    return model, tokenizer


class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 200
    temperature: float = 0.7
    top_p: float = 0.9


class GenerateResponse(BaseModel):
    text: str
    tokens_generated: int
    model: str
    error: Optional[str] = None


@app.get("/")
async def root():
    return {
        "status": "running",
        "service": "llm",
        "model": MODEL_NAME,
        "gpu": torch.cuda.is_available()
    }


def run_generation(request: GenerateRequest) -> GenerateResponse:
    """Generate a text completion. Factored out of the route handler so the
    ZeroGPU decorator below can wrap the GPU-heavy work."""
    
    try:
        model, tokenizer = load_model()
        
        # Tokenize
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048
        )
        
        # Move to device
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        
        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                top_p=request.top_p,
                do_sample=request.temperature > 0,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
        # Decode only the newly generated tokens (the output tensor
        # includes the prompt tokens, so slice them off)
        input_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            outputs[0][input_length:],
            skip_special_tokens=True
        )
        tokens_generated = outputs.shape[1] - input_length
        
        return GenerateResponse(
            text=generated_text,
            tokens_generated=tokens_generated,
            model=MODEL_NAME
        )
        
    except Exception as e:
        return GenerateResponse(
            text="",
            tokens_generated=0,
            model=MODEL_NAME,
            error=str(e)
        )


# ZeroGPU decorator for HuggingFace. It must wrap the function that actually
# does the GPU work; re-wrapping a handler after @app.post has registered it
# would have no effect on the route.
try:
    import spaces
    run_generation = spaces.GPU(run_generation)
except ImportError:
    pass  # Not on HF Spaces


@app.post("/api/generate", response_model=GenerateResponse)
def generate(request: GenerateRequest) -> GenerateResponse:
    """Generate text completion (sync handler, so FastAPI runs it in a
    worker thread and the blocking model call doesn't stall the event loop)"""
    return run_generation(request)


# Gradio interface
def gradio_interface():
    import gradio as gr
    
    def generate_wrapper(prompt, max_tokens, temperature):
        # Call the generation helper directly; no event loop needed
        response = run_generation(GenerateRequest(
            prompt=prompt,
            max_tokens=int(max_tokens),  # Gradio sliders return floats
            temperature=temperature
        ))
        return response.text or f"Error: {response.error}"
    
    iface = gr.Interface(
        fn=generate_wrapper,
        inputs=[
            gr.Textbox(lines=5, label="Prompt"),
            gr.Slider(50, 500, value=200, label="Max Tokens"),
            gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
        ],
        outputs=gr.Textbox(lines=10, label="Generated Text"),
        title="Small LLM (Phi-2)",
        description="Generate text using Phi-2 model"
    )
    
    return iface


if __name__ == "__main__":
    import uvicorn
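
    # Sketch: mount the Gradio UI alongside the API when gradio is installed
    # (via gr.mount_gradio_app; the "/ui" path is an arbitrary choice here)
    try:
        import gradio as gr

        app = gr.mount_gradio_app(app, gradio_interface(), path="/ui")
    except ImportError:
        pass
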
    uvicorn.run(app, host="0.0.0.0", port=7860)
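

# Example request, assuming the server is running locally on port 7860:
#   curl -X POST http://localhost:7860/api/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Once upon a time", "max_tokens": 50}'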