File size: 4,167 Bytes
f17dc57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94dec36
 
 
f17dc57
277f293
94dec36
f17dc57
94dec36
277f293
 
 
94dec36
277f293
 
 
 
 
 
 
94dec36
 
277f293
94dec36
 
 
 
277f293
94dec36
 
 
 
 
f17dc57
 
277f293
94dec36
 
277f293
 
 
 
 
 
 
 
 
 
 
94dec36
 
 
 
 
 
cd60f0d
f17dc57
277f293
 
 
 
 
 
 
 
 
 
94dec36
 
277f293
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# FILE 1: minimal_service.py (same as Step 1)
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Global variables
# Lazily-populated module-level singletons shared by the functions below.
# _tokenizer is loaded eagerly at import time (via MinimalService.__init__);
# _model is loaded on first GPU call so it happens inside the GPU context.
_model = None
_tokenizer = None
_model_name = "microsoft/DialoGPT-small"

def initialize_tokenizer():
    """Load and cache the global tokenizer, returning the cached instance.

    Idempotent: the tokenizer is fetched only on the first call; later
    calls return the already-cached object. A pad token is guaranteed
    (falling back to the EOS token) so batched generation does not fail.
    """
    global _tokenizer
    if _tokenizer is not None:
        return _tokenizer

    print("[MinimalService] Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(_model_name)
    # DialoGPT ships without a pad token; reuse EOS as is conventional.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    _tokenizer = tokenizer
    print("[MinimalService] Tokenizer loaded successfully.")
    return _tokenizer

@spaces.GPU
def generate_text_gpu(prompt: str, max_tokens: int = 50):
    """GPU function for text generation.

    Lazily loads the tokenizer and the model (inside the GPU context, as
    required by the @spaces.GPU decorator) and samples a continuation of
    *prompt*.

    Args:
        prompt: Input text to continue.
        max_tokens: Maximum number of new tokens to sample.

    Returns:
        The decoded output (prompt plus continuation) with special tokens
        stripped.
    """
    global _model, _tokenizer

    print("[MinimalService] GPU function called")

    # Initialize tokenizer
    if _tokenizer is None:
        initialize_tokenizer()

    # Load model in GPU context
    if _model is None:
        print("[MinimalService] Loading model...")
        _model = AutoModelForCausalLM.from_pretrained(
            _model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        print("[MinimalService] Model loaded.")

    # Tokenize via __call__ so we also get an attention_mask. The previous
    # .encode() path passed no mask; because pad_token == eos_token (set in
    # initialize_tokenizer), transformers cannot infer the mask, warns, and
    # padded positions may be attended to incorrectly.
    encoded = _tokenizer(prompt, return_tensors="pt")
    device = next(_model.parameters()).device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        outputs = _model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=_tokenizer.eos_token_id
        )

    response = _tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

class MinimalService:
    """Thin facade over the module-level GPU generation function."""

    def __init__(self):
        # Warm the tokenizer cache up front so the first generate() call
        # only pays the model-loading cost.
        print("[MinimalService] Service initialized")
        initialize_tokenizer()

    def generate(self, prompt: str):
        """Public method to generate text"""
        result = generate_text_gpu(prompt)
        return result

# Create instance
# NOTE: instantiating at import time loads the tokenizer as a module
# side effect (see MinimalService.__init__).
service = MinimalService()

# Print confirmation
print(f"[MinimalService] GPU function available: {generate_text_gpu.__name__}")

# ====================================

# FILE 2: app.py (Step 2 - with FastAPI)
import gradio as gr
import spaces

# Import the service
from minimal_service import service, generate_text_gpu

# Additional GPU function at app level
@spaces.GPU
def app_gpu_test():
    """Smoke-test GPU function declared at the app level.

    Exists only to verify that @spaces.GPU functions defined directly in
    app.py (not just imported ones) are detected by the Spaces runtime.
    """
    return "App GPU function works"

# Confirm both GPU entry points survived import (their decorators ran).
print("[App] GPU functions imported successfully")
print(f"[App] Service GPU function: {generate_text_gpu.__name__}")
print(f"[App] App GPU function: {app_gpu_test.__name__}")

# ADD FASTAPI - Step 2 change
from fastapi import FastAPI
from fastapi.responses import RedirectResponse

def generate_response(user_input):
    """Generate response using the service.

    Blank (or whitespace-only) input yields a friendly prompt message;
    any generation failure is returned as a formatted error string rather
    than raised, so the Gradio UI never crashes.
    """
    if not user_input.strip():
        return "Please enter some text!"

    try:
        result = service.generate(user_input)
    except Exception as e:
        return f"Error: {str(e)}"
    return f"Generated: {result}"

# Create Gradio interface
with gr.Blocks(title="Step 2: FastAPI Test") as demo:
    gr.Markdown("# Step 2: Testing FastAPI + GPU")
    gr.Markdown("Testing if adding FastAPI breaks GPU detection.")

    with gr.Row():
        prompt_box = gr.Textbox(
            label="Enter text",
            placeholder="Type something...",
            value="Hello, how are you?",
        )
        result_box = gr.Textbox(label="Generated response", interactive=False)

    run_button = gr.Button("Generate", variant="primary")

    # Wire the button to the service-backed handler.
    run_button.click(fn=generate_response, inputs=[prompt_box], outputs=[result_box])

# ADD FASTAPI MOUNTING
app = FastAPI()

@app.get("/")
async def root():
    """Redirect the bare root URL to the mounted Gradio UI."""
    return RedirectResponse(url="/gradio")

# Mount Gradio on FastAPI
# mount_gradio_app attaches the Gradio routes under /gradio and returns the
# app; rebinding keeps `app` as the single ASGI entry point.
app = gr.mount_gradio_app(app, demo, path="/gradio")

print("[App] FastAPI + Gradio setup completed")

if __name__ == "__main__":
    # NOTE(review): presumably the hosting platform imports `app` directly;
    # this branch is only for running the server locally — confirm.
    print("[App] Starting application...")
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)