Spaces:
Configuration error
Configuration error
| """ | |
| FastAPI Deployment Server | |
| ========================== | |
| One-click deployment bridge for fine-tuned models. | |
| """ | |
| import os | |
| from pathlib import Path | |
| from typing import Optional, List, Dict, Any | |
| from dataclasses import dataclass | |
| from datetime import datetime | |
| from rich.console import Console | |
| console = Console() | |
@dataclass
class GenerationRequest:
    """Request model for text generation.

    Declared as a dataclass so the annotated fields below become
    constructor arguments (``GenerationRequest(prompt="...")``) with the
    listed defaults, plus a generated ``__repr__`` and ``__eq__``.
    """

    # Instruction text to complete (required).
    prompt: str
    # Optional text used in place of the default instruction preamble.
    system_prompt: Optional[str] = None
    # Upper bound on the number of newly generated tokens.
    max_tokens: int = 512
    # Sampling temperature.
    temperature: float = 0.7
    # Nucleus (top-p) sampling threshold.
    top_p: float = 0.9
    # NOTE(review): no code visible in this file reads ``stream`` --
    # confirm whether streaming is implemented elsewhere.
    stream: bool = False
@dataclass
class GenerationResponse:
    """Response model for text generation.

    Declared as a dataclass so it can be built with keyword arguments,
    which is how ``DeploymentServer.generate`` constructs it.
    """

    # Text produced by the model for the request.
    generated_text: str
    # The user prompt the text was generated for.
    prompt: str
    # Identifier of the model that produced the text (the server passes
    # its ``model_path`` here).
    model: str
    # Number of tokens attributed to the generated text.
    tokens_generated: int
    # Elapsed generation time in seconds.
    generation_time: float
class DeploymentServer:
    """
    FastAPI-based deployment server for fine-tuned models.

    Features:
    - RESTful API for inference
    - Health check endpoint
    - Batch generation support
    - Automatic model loading
    """

    def __init__(
        self,
        model_path: str,
        host: str = "0.0.0.0",
        port: int = 8000,
        max_seq_length: int = 2048
    ):
        """
        Initialize the deployment server.

        Nothing is loaded here; the model, tokenizer and FastAPI app are
        created lazily by ``load_model()`` / ``create_app()`` (``run()``
        calls both when needed).

        Args:
            model_path: Path to the fine-tuned model
            host: Server host
            port: Server port
            max_seq_length: Maximum sequence length
        """
        self.model_path = model_path
        self.host = host
        self.port = port
        self.max_seq_length = max_seq_length
        self.model = None
        self.tokenizer = None
        self.app = None

    def load_model(self) -> None:
        """Load the fine-tuned model and tokenizer into this instance.

        Unsloth's ``FastLanguageModel`` is tried first (4-bit load, then
        switched to inference mode). If that raises ``ImportError`` the
        model is loaded with plain ``transformers`` instead.
        """
        console.print(f"\n[bold blue]📂 Loading model from:[/] {self.model_path}")
        try:
            from unsloth import FastLanguageModel

            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=self.model_path,
                max_seq_length=self.max_seq_length,
                dtype=None,
                load_in_4bit=True,
            )
            FastLanguageModel.for_inference(self.model)
            console.print("[green]✓ Model loaded successfully[/]")
        except ImportError:
            console.print("[yellow]⚠️ Unsloth not available, trying transformers...[/]")
            from transformers import AutoModelForCausalLM, AutoTokenizer

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                device_map="auto",
                torch_dtype="auto"
            )
            console.print("[green]✓ Model loaded with transformers[/]")

    @staticmethod
    def _format_prompt(prompt: str, system_prompt: Optional[str] = None) -> str:
        """Wrap ``prompt`` in the instruction/response template.

        A non-empty ``system_prompt`` replaces the default preamble; an
        empty string or ``None`` selects the default.
        """
        # NOTE(review): the commonly published Alpaca template separates
        # the sections with blank lines; the template in this file shows
        # none -- confirm it matches the format used during fine-tuning.
        preamble = system_prompt or (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request."
        )
        return f"{preamble}\n### Instruction:\n{prompt}\n### Response:\n"

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9
    ) -> "GenerationResponse":
        """
        Generate text from the model.

        Args:
            prompt: User prompt
            system_prompt: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature; a value <= 0 selects
                greedy decoding instead of sampling
            top_p: Top-p sampling parameter (ignored for greedy decoding)

        Returns:
            GenerationResponse with generated text

        Raises:
            RuntimeError: If the model has not been loaded yet.
        """
        import time

        if self.model is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()

        formatted_prompt = self._format_prompt(prompt, system_prompt)

        # Tokenize
        inputs = self.tokenizer(
            formatted_prompt,
            return_tensors="pt"
        ).to(self.model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # Sampling needs a strictly positive temperature; fall back to
        # greedy decoding otherwise instead of failing inside generate().
        sampling = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_tokens,
            "do_sample": sampling,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if sampling:
            generation_kwargs["temperature"] = temperature
            generation_kwargs["top_p"] = top_p

        # Generate
        outputs = self.model.generate(**inputs, **generation_kwargs)

        # The output sequence starts with the prompt tokens; slicing them
        # off isolates the completion even if the model itself emits a
        # "### Response:" marker.
        new_tokens = outputs[0][prompt_length:]
        generated_text = self.tokenizer.decode(
            new_tokens, skip_special_tokens=True
        ).strip()

        generation_time = time.perf_counter() - started
        # Count the token ids produced after the prompt rather than
        # re-encoding the decoded text.
        tokens_generated = int(new_tokens.shape[0])

        return GenerationResponse(
            generated_text=generated_text,
            prompt=prompt,
            model=self.model_path,
            tokens_generated=tokens_generated,
            generation_time=generation_time
        )

    def create_app(self):
        """Create the FastAPI application.

        Builds the app, installs CORS middleware, registers the ``/``,
        ``/health``, ``/generate`` and ``/generate/batch`` routes, stores
        the app on ``self.app`` and returns it.
        """
        from fastapi import FastAPI, HTTPException
        from fastapi.middleware.cors import CORSMiddleware
        from pydantic import BaseModel
        from typing import List, Optional

        app = FastAPI(
            title="Auto-FineTune-Ops Inference API",
            description="API for serving fine-tuned LLM models",
            version="1.0.0"
        )

        # CORS middleware. Wildcard origins must not be combined with
        # credentialed requests, and no endpoint here uses cookies or
        # sessions, so credentials stay disabled.
        app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=False,
            allow_methods=["*"],
            allow_headers=["*"],
        )

        # Pydantic models for API
        class GenerateRequest(BaseModel):
            prompt: str
            system_prompt: Optional[str] = None
            max_tokens: int = 512
            temperature: float = 0.7
            top_p: float = 0.9

        class GenerateResponse(BaseModel):
            generated_text: str
            prompt: str
            model: str
            tokens_generated: int
            generation_time: float

        class BatchGenerateRequest(BaseModel):
            prompts: List[str]
            system_prompt: Optional[str] = None
            max_tokens: int = 512
            temperature: float = 0.7
            top_p: float = 0.9

        class HealthResponse(BaseModel):
            status: str
            model: str
            model_loaded: bool

        def to_api_response(result) -> GenerateResponse:
            """Copy a generate() result into the API response schema."""
            return GenerateResponse(
                generated_text=result.generated_text,
                prompt=result.prompt,
                model=result.model,
                tokens_generated=result.tokens_generated,
                generation_time=result.generation_time
            )

        # Each handler is registered under the path that the root
        # endpoint advertises; without registration the app would expose
        # no routes at all.
        @app.get("/health", response_model=HealthResponse)
        async def health_check():
            """Health check endpoint."""
            return HealthResponse(
                status="healthy",
                model=self.model_path,
                model_loaded=self.model is not None
            )

        @app.post("/generate", response_model=GenerateResponse)
        async def generate_text(request: GenerateRequest):
            """Generate text from a single prompt."""
            if self.model is None:
                raise HTTPException(status_code=503, detail="Model not loaded")
            try:
                result = self.generate(
                    prompt=request.prompt,
                    system_prompt=request.system_prompt,
                    max_tokens=request.max_tokens,
                    temperature=request.temperature,
                    top_p=request.top_p
                )
            except Exception as e:
                # API boundary: report any generation failure as a 500.
                raise HTTPException(status_code=500, detail=str(e)) from e
            return to_api_response(result)

        @app.post("/generate/batch", response_model=List[GenerateResponse])
        async def batch_generate(request: BatchGenerateRequest):
            """Generate text from multiple prompts.

            Prompts are processed one after another. A failure on one
            prompt is reported in that prompt's entry ("Error: ...")
            instead of aborting the whole batch.
            """
            if self.model is None:
                raise HTTPException(status_code=503, detail="Model not loaded")
            results = []
            for prompt in request.prompts:
                try:
                    result = self.generate(
                        prompt=prompt,
                        system_prompt=request.system_prompt,
                        max_tokens=request.max_tokens,
                        temperature=request.temperature,
                        top_p=request.top_p
                    )
                    results.append(to_api_response(result))
                except Exception as e:
                    results.append(GenerateResponse(
                        generated_text=f"Error: {str(e)}",
                        prompt=prompt,
                        model=self.model_path,
                        tokens_generated=0,
                        generation_time=0.0
                    ))
            return results

        @app.get("/")
        async def root():
            """Root endpoint with API info."""
            return {
                "name": "Auto-FineTune-Ops Inference API",
                "version": "1.0.0",
                "model": self.model_path,
                "endpoints": {
                    "/health": "Health check",
                    "/generate": "Generate text (POST)",
                    "/generate/batch": "Batch generation (POST)"
                }
            }

        self.app = app
        return app

    def run(self, reload: bool = False):
        """
        Start the FastAPI server.

        Loads the model and builds the app first if either is missing,
        then blocks in ``uvicorn.run``.

        Args:
            reload: Enable auto-reload for development. The server is
                started from an in-memory app object, which uvicorn
                cannot reload, so a warning is printed and the server
                starts without reload.
        """
        import uvicorn

        console.print("\n" + "="*60)
        console.print("[bold magenta]🚀 DEPLOYMENT SERVER[/]")
        console.print("="*60)

        # Load model if not already loaded
        if self.model is None:
            self.load_model()

        # Create app if not already created
        if self.app is None:
            self.create_app()

        if reload:
            # uvicorn only supports reload when given an import string;
            # passing reload=True with an app object makes it exit.
            console.print(
                "[yellow]⚠️ Auto-reload requires launching uvicorn with an "
                "import string; starting without reload.[/]"
            )

        console.print(f"\n[bold green]Starting server at http://{self.host}:{self.port}[/]")
        console.print("[dim]Press Ctrl+C to stop[/]\n")

        uvicorn.run(
            self.app,
            host=self.host,
            port=self.port
        )
def main():
    """Parse the command line and launch the deployment server.

    Recognised options: ``--model`` (required), ``--host``, ``--port``
    and the ``--reload`` switch. The parsed values are handed to
    ``DeploymentServer`` and its ``run()`` method is invoked.
    """
    import argparse

    # Flag name paired with the keyword arguments for add_argument().
    option_table = (
        ("--model", {"required": True, "help": "Path to fine-tuned model"}),
        ("--host", {"default": "0.0.0.0", "help": "Server host"}),
        ("--port", {"type": int, "default": 8000, "help": "Server port"}),
        ("--reload", {"action": "store_true", "help": "Enable auto-reload"}),
    )

    cli = argparse.ArgumentParser(description="Deploy fine-tuned model as API")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    deployment = DeploymentServer(
        model_path=options.model,
        host=options.host,
        port=options.port,
    )
    deployment.run(reload=options.reload)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()