|
|
|
|
|
import spaces |
|
|
import torch |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
|
|
|
|
|
# Lazily-initialized singletons shared across calls; populated on first use
# by initialize_tokenizer() / generate_text_gpu().
_model = None
_tokenizer = None
# Hugging Face model id; DialoGPT-small keeps the download/VRAM footprint minimal.
_model_name = "microsoft/DialoGPT-small"
|
|
|
|
|
def initialize_tokenizer():
    """Load the tokenizer once and cache it in the module-level global.

    Subsequent calls return the cached instance. Ensures a pad token exists
    by falling back to the EOS token when the model defines none.
    """
    global _tokenizer
    if _tokenizer is not None:
        return _tokenizer

    print("[MinimalService] Loading tokenizer...")
    _tokenizer = AutoTokenizer.from_pretrained(_model_name)
    # DialoGPT ships without a pad token; reuse EOS so padding is well-defined.
    if _tokenizer.pad_token is None:
        _tokenizer.pad_token = _tokenizer.eos_token
    print("[MinimalService] Tokenizer loaded successfully.")
    return _tokenizer
|
|
|
|
|
@spaces.GPU
def generate_text_gpu(prompt: str, max_tokens: int = 50):
    """Generate a text continuation for ``prompt`` on the GPU.

    Lazily loads the tokenizer and model on first call (cached in module
    globals). Returns the full decoded sequence (prompt + continuation).

    Args:
        prompt: Input text to continue.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The decoded output string with special tokens stripped.
    """
    global _model, _tokenizer

    print("[MinimalService] GPU function called")

    if _tokenizer is None:
        initialize_tokenizer()

    if _model is None:
        print("[MinimalService] Loading model...")
        _model = AutoModelForCausalLM.from_pretrained(
            _model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        print("[MinimalService] Model loaded.")

    # Tokenize via __call__ so we also get an attention mask. Without an
    # explicit mask, generate() infers one from pad tokens, which is
    # ambiguous here because pad_token == eos_token for DialoGPT (transformers
    # warns about this and generation can be unreliable).
    encoded = _tokenizer(prompt, return_tensors="pt")
    device = next(_model.parameters()).device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        outputs = _model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=_tokenizer.eos_token_id
        )

    response = _tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
|
|
|
|
|
class MinimalService:
    """Thin wrapper that warms the tokenizer cache and delegates generation."""

    def __init__(self):
        # Warm the tokenizer eagerly so the first generate() call only
        # has to pay for loading the model.
        print("[MinimalService] Service initialized")
        initialize_tokenizer()

    def generate(self, prompt: str):
        """Generate text for ``prompt`` via the GPU-decorated module function."""
        return generate_text_gpu(prompt)
|
|
|
|
|
|
|
|
# Module-level singleton: importing this module loads the tokenizer as a
# side effect of MinimalService.__init__.
service = MinimalService()

# Sanity check that the function survived @spaces.GPU decoration intact.
print(f"[MinimalService] GPU function available: {generate_text_gpu.__name__}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
import spaces |
|
|
|
|
|
|
|
|
from minimal_service import service, generate_text_gpu |
|
|
|
|
|
|
|
|
@spaces.GPU
def app_gpu_test():
    """Minimal GPU-decorated probe defined at the app level (no real work)."""
    message = "App GPU function works"
    return message
|
|
|
|
|
# Startup diagnostics: confirm both GPU-decorated callables resolved at import.
print("[App] GPU functions imported successfully")
print(f"[App] Service GPU function: {generate_text_gpu.__name__}")
print(f"[App] App GPU function: {app_gpu_test.__name__}")
|
|
|
|
|
|
|
|
from fastapi import FastAPI |
|
|
from fastapi.responses import RedirectResponse |
|
|
|
|
|
def generate_response(user_input):
    """Run the shared service on ``user_input`` and format the outcome.

    Empty or whitespace-only input yields a prompt to the user; any service
    failure is reported as an "Error: ..." string rather than raising, so
    the Gradio UI never sees an exception.
    """
    if not user_input.strip():
        return "Please enter some text!"

    try:
        result = service.generate(user_input)
    except Exception as e:
        return f"Error: {str(e)}"
    return f"Generated: {result}"
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
# Single-page layout: one input textbox and one read-only output textbox,
# wired to generate_response through the button's click event.
with gr.Blocks(title="Step 2: FastAPI Test") as demo:
    gr.Markdown("# Step 2: Testing FastAPI + GPU")
    gr.Markdown("Testing if adding FastAPI breaks GPU detection.")

    with gr.Row():
        input_text = gr.Textbox(
            label="Enter text",
            placeholder="Type something...",
            value="Hello, how are you?"
        )
        output_text = gr.Textbox(
            label="Generated response",
            interactive=False  # display-only; populated by the click handler
        )

    generate_btn = gr.Button("Generate", variant="primary")

    generate_btn.click(
        fn=generate_response,
        inputs=[input_text],
        outputs=[output_text]
    )
|
|
|
|
|
|
|
|
# FastAPI wrapper around the Gradio UI: "/" redirects into the mounted app.
app = FastAPI()

@app.get("/")
async def root():
    # Send visitors hitting the bare root straight to the Gradio interface.
    return RedirectResponse(url="/gradio")

# Mount the Gradio Blocks app under /gradio. mount_gradio_app returns the
# combined ASGI application, so rebind `app` to the result.
app = gr.mount_gradio_app(app, demo, path="/gradio")

print("[App] FastAPI + Gradio setup completed")
|
|
|
|
|
# Entry point: serve the combined FastAPI+Gradio app on the Spaces default
# port (7860), bound to all interfaces.
if __name__ == "__main__":
    print("[App] Starting application...")
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)