# FILE 1: minimal_service.py (same as Step 1)
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Global variables
_model = None
_tokenizer = None
_model_name = "microsoft/DialoGPT-small"


def initialize_tokenizer():
    """Initialize tokenizer"""
    global _tokenizer
    if _tokenizer is None:
        print("[MinimalService] Loading tokenizer...")
        _tokenizer = AutoTokenizer.from_pretrained(_model_name)
        if _tokenizer.pad_token is None:
            _tokenizer.pad_token = _tokenizer.eos_token
        print("[MinimalService] Tokenizer loaded successfully.")
    return _tokenizer


@spaces.GPU
def generate_text_gpu(prompt: str, max_tokens: int = 50):
    """GPU function for text generation"""
    global _model, _tokenizer
    print("[MinimalService] GPU function called")

    # Initialize tokenizer
    if _tokenizer is None:
        initialize_tokenizer()

    # Load model in GPU context
    if _model is None:
        print("[MinimalService] Loading model...")
        _model = AutoModelForCausalLM.from_pretrained(
            _model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        print("[MinimalService] Model loaded.")

    # Simple generation
    inputs = _tokenizer.encode(prompt, return_tensors="pt")
    device = next(_model.parameters()).device
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = _model.generate(
            inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=_tokenizer.eos_token_id
        )

    response = _tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


class MinimalService:
    def __init__(self):
        print("[MinimalService] Service initialized")
        initialize_tokenizer()

    def generate(self, prompt: str):
        """Public method to generate text"""
        return generate_text_gpu(prompt)


# Create instance
service = MinimalService()

# Print confirmation
print(f"[MinimalService] GPU function available: {generate_text_gpu.__name__}")
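
# ====================================
# Optional sanity check (a sketch, not part of the original step files):
# before wiring in FastAPI, FILE 1 can be exercised on its own. The one-liner
# below is hypothetical; it assumes minimal_service.py is on the import path
# and is run on a CUDA machine. Outside a ZeroGPU Space the @spaces.GPU
# decorator has no effect, so this simply calls the function directly:
#
#     python -c "from minimal_service import service; print(service.generate('Hello'))"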

# ====================================
# FILE 2: app.py (Step 2 - with FastAPI)
import gradio as gr
import spaces

# Import the service
from minimal_service import service, generate_text_gpu


# Additional GPU function at app level
@spaces.GPU
def app_gpu_test():
    """Test GPU function at app level"""
    return "App GPU function works"


print("[App] GPU functions imported successfully")
print(f"[App] Service GPU function: {generate_text_gpu.__name__}")
print(f"[App] App GPU function: {app_gpu_test.__name__}")

# ADD FASTAPI - Step 2 change
from fastapi import FastAPI
from fastapi.responses import RedirectResponse


def generate_response(user_input):
    """Generate response using the service"""
    if not user_input.strip():
        return "Please enter some text!"
    try:
        response = service.generate(user_input)
        return f"Generated: {response}"
    except Exception as e:
        return f"Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Step 2: FastAPI Test") as demo:
    gr.Markdown("# Step 2: Testing FastAPI + GPU")
    gr.Markdown("Testing if adding FastAPI breaks GPU detection.")

    with gr.Row():
        input_text = gr.Textbox(
            label="Enter text",
            placeholder="Type something...",
            value="Hello, how are you?"
        )
        output_text = gr.Textbox(
            label="Generated response",
            interactive=False
        )

    generate_btn = gr.Button("Generate", variant="primary")
    generate_btn.click(
        fn=generate_response,
        inputs=[input_text],
        outputs=[output_text]
    )

# ADD FASTAPI MOUNTING
app = FastAPI()


@app.get("/")
async def root():
    return RedirectResponse(url="/gradio")


# Mount Gradio on FastAPI
app = gr.mount_gradio_app(app, demo, path="/gradio")

print("[App] FastAPI + Gradio setup completed")

if __name__ == "__main__":
    print("[App] Starting application...")
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
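
# ====================================
# FILE 3 (optional): smoke_test.py, a minimal sketch that is not part of the
# original two files. The file name and the smoke_test() helper are
# hypothetical. It assumes app.py is already running on localhost:7860 and
# checks only the FastAPI routing, not GPU generation.
import requests


def smoke_test():
    # root() above returns a RedirectResponse to /gradio; Starlette's
    # RedirectResponse defaults to status code 307.
    resp = requests.get("http://localhost:7860/", allow_redirects=False)
    assert resp.status_code == 307, resp.status_code
    assert resp.headers["location"] == "/gradio"

    # The mounted Gradio UI should serve its page at /gradio.
    resp = requests.get("http://localhost:7860/gradio", allow_redirects=True)
    assert resp.status_code == 200


if __name__ == "__main__":
    smoke_test()
    print("[SmokeTest] FastAPI routing and Gradio mount look OK")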