Commit · 28fa644
Parent(s): eda2ff2
Small refactor, moving endpoints to the routes.py file. Also added a streaming endpoint and from_pretrained initialization.
- main/main.py   +4 -166
- main/routes.py +365 -0
main/main.py
CHANGED
@@ -1,13 +1,9 @@
-from fastapi import FastAPI
+from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from typing import Optional, Union
-import torch
 import logging
-from pathlib import Path
-from litgpt.api import LLM
 import os
 import uvicorn
+from routes import router

 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -30,166 +26,8 @@ app.add_middleware(
     allow_headers=["*"],
 )

-#
-
-
-class InitializeRequest(BaseModel):
-    """
-    Configuration for model initialization including model path
-    """
-    mode: str = "cpu"
-    precision: Optional[str] = None
-    quantize: Optional[str] = None
-    gpu_count: Union[str, int] = "auto"
-    model_path: str
-
-class GenerateRequest(BaseModel):
-    prompt: str
-    max_new_tokens: int = 50
-    temperature: float = 1.0
-    top_k: Optional[int] = None
-    top_p: float = 1.0
-    return_as_token_ids: bool = False
-    stream: bool = False
-
-@app.get("/")
-async def root():
-    """Root endpoint to verify service is running"""
-    return {
-        "status": "running",
-        "service": "LLM Engine",
-        "endpoints": {
-            "initialize": "/initialize",
-            "generate": "/generate",
-            "health": "/health"
-        }
-    }
-
-@app.post("/initialize")
-async def initialize_model(request: InitializeRequest):
-    """
-    Initialize the LLM model with specified configuration.
-    """
-    global llm_instance
-
-    try:
-        # Get the project root directory (where main.py is located)
-        project_root = Path(__file__).parent
-        checkpoints_dir = project_root / "checkpoints"
-        logger.info(f"Checkpoint dir is: {checkpoints_dir}")
-
-        # For LitGPT downloaded models, path includes organization
-        if "/" in request.model_path:
-            # e.g., "mistralai/Mistral-7B-Instruct-v0.3"
-            org, model_name = request.model_path.split("/")
-            model_path = str(checkpoints_dir / org / model_name)
-        else:
-            # Fallback for direct model paths
-            model_path = str(checkpoints_dir / request.model_path)
-
-        logger.info(f"Using model path: {model_path}")
-
-        # Load the model
-        llm_instance = LLM.load(
-            model=model_path,
-            distribute=None if request.precision or request.quantize else "auto"
-        )
-
-        # If manual distribution is needed
-        if request.precision or request.quantize:
-            llm_instance.distribute(
-                accelerator="cuda" if request.mode == "gpu" else "cpu",
-                devices=request.gpu_count,
-                precision=request.precision,
-                quantize=request.quantize
-            )
-
-        logger.info(
-            f"Model initialized successfully with config:\n"
-            f"Mode: {request.mode}\n"
-            f"Precision: {request.precision}\n"
-            f"Quantize: {request.quantize}\n"
-            f"GPU Count: {request.gpu_count}\n"
-            f"Model Path: {model_path}\n"
-            f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
-            f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
-        )
-
-        return {"success": True, "message": "Model initialized successfully"}
-
-    except Exception as e:
-        logger.error(f"Error initializing model: {str(e)}")
-        # Print detailed memory statistics on failure
-        logger.error(f"GPU Memory Stats:\n"
-                     f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
-                     f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
-                     f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
-        raise HTTPException(status_code=500, detail=f"Error initializing model: {str(e)}")
-
-@app.post("/generate")
-async def generate(request: GenerateRequest):
-    """
-    Generate text using the initialized model.
-    """
-    global llm_instance
-
-    if llm_instance is None:
-        raise HTTPException(status_code=400, detail="Model not initialized. Call /initialize first.")
-
-    try:
-        if request.stream:
-            raise HTTPException(
-                status_code=400,
-                detail="Streaming is not currently supported through the API"
-            )
-
-        generated_text = llm_instance.generate(
-            prompt=request.prompt,
-            max_new_tokens=request.max_new_tokens,
-            temperature=request.temperature,
-            top_k=request.top_k,
-            top_p=request.top_p,
-            return_as_token_ids=request.return_as_token_ids,
-            stream=False  # Force stream to False for now
-        )
-
-        response = {
-            "generated_text": generated_text if not request.return_as_token_ids else generated_text.tolist(),
-            "metadata": {
-                "prompt": request.prompt,
-                "max_new_tokens": request.max_new_tokens,
-                "temperature": request.temperature,
-                "top_k": request.top_k,
-                "top_p": request.top_p
-            }
-        }
-
-        return response
-
-    except Exception as e:
-        logger.error(f"Error generating text: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
-
-@app.get("/health")
-async def health_check():
-    """
-    Check if the service is running and model is loaded.
-    """
-    global llm_instance
-
-    status = {
-        "status": "healthy",
-        "model_loaded": llm_instance is not None,
-    }
-
-    if llm_instance is not None:
-        logger.info(f"llm_instance is: {llm_instance}")
-        status["model_info"] = {
-            "model_path": llm_instance.config.name,
-            "device": str(next(llm_instance.model.parameters()).device)
-        }
-
-    return status
+# Include the router from routes.py
+app.include_router(router)

 def main():
     # Load environment variables or configuration here
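After this change, main.py is reduced to app setup and wiring. For context, a minimal sketch of what the resulting file presumably looks like; the CORS arguments other than allow_headers and the uvicorn host/port are assumptions, since the diff only shows fragments of that code.

# Hypothetical post-refactor main.py, assembled from the context lines in the diff above.
# CORS settings besides allow_headers and the uvicorn host/port are assumptions.
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import logging
import os
import uvicorn
from routes import router

# Set up logging
logging.basicConfig(level=logging.INFO)

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],   # assumption: not visible in this diff
    allow_methods=["*"],   # assumption: not visible in this diff
    allow_headers=["*"],
)

# Include the router from routes.py
app.include_router(router)

def main():
    # Load environment variables or configuration here
    port = int(os.environ.get("PORT", 8000))  # assumption: port handling not shown
    uvicorn.run(app, host="0.0.0.0", port=port)

if __name__ == "__main__":
    main()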
main/routes.py
ADDED
@@ -0,0 +1,365 @@
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+from typing import Optional, Union, AsyncGenerator
+import torch
+import logging
+from pathlib import Path
+from litgpt.api import LLM
+import json
+import asyncio
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+# Create router instance
+router = APIRouter()
+
+# Global variable to store the LLM instance
+llm_instance = None
+
+class InitializeRequest(BaseModel):
+    """
+    Configuration for model initialization including model path
+    """
+    mode: str = "cpu"
+    precision: Optional[str] = None
+    quantize: Optional[str] = None
+    gpu_count: Union[str, int] = "auto"
+    model_path: str
+
+class GenerateRequest(BaseModel):
+    prompt: str
+    max_new_tokens: int = 50
+    temperature: float = 1.0
+    top_k: Optional[int] = None
+    top_p: float = 1.0
+    return_as_token_ids: bool = False
+    stream: bool = False
+
+# A Pydantic model for the streaming generation request
+class StreamGenerateRequest(BaseModel):
+    prompt: str
+    max_new_tokens: int = 50
+    temperature: float = 1.0
+    top_k: Optional[int] = None
+    top_p: float = 1.0
+
+class InitializeCustomRequest(BaseModel):
+    """
+    Configuration for custom model initialization using from_pretrained
+    """
+    mode: str = "cpu"
+    precision: Optional[str] = None
+    quantize: Optional[str] = None
+    gpu_count: Union[str, int] = "auto"
+    folder_path: str  # Path to the model folder relative to checkpoints
+    model_filename: str  # Name of the model file (e.g., "lit_model.pth")
+    config_filename: str = "config.json"  # Default config filename
+    tokenizer_filename: Optional[str] = "tokenizer.json"  # Optional tokenizer filename
+
+
+@router.post("/initialize/custom")
+async def initialize_custom_model(request: InitializeCustomRequest):
+    """
+    Initialize a custom model using from_pretrained method.
+    This is for models that are already downloaded and stored in the checkpoints directory.
+    """
+    global llm_instance
+
+    try:
+        # Get the project root directory and construct paths
+        project_root = Path(__file__).parent
+        checkpoints_dir = project_root / "checkpoints"
+        model_dir = checkpoints_dir / request.folder_path
+
+        logger.info(f"Loading custom model from directory: {model_dir}")
+
+        # Verify that all required files exist
+        model_path = model_dir / request.model_filename
+        config_path = model_dir / request.config_filename
+
+        if not model_path.exists():
+            raise HTTPException(
+                status_code=400,
+                detail=f"Model file not found: {request.model_filename}"
+            )
+
+        if not config_path.exists():
+            raise HTTPException(
+                status_code=400,
+                detail=f"Config file not found: {request.config_filename}"
+            )
+
+        # Check for tokenizer if specified
+        tokenizer_path = None
+        if request.tokenizer_filename:
+            tokenizer_path = model_dir / request.tokenizer_filename
+            if not tokenizer_path.exists():
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Tokenizer file not found: {request.tokenizer_filename}"
+                )
+
+        # Load the model using from_pretrained
+        llm_instance = LLM.from_pretrained(
+            path=str(model_dir),
+            model_file=request.model_filename,
+            config_file=request.config_filename,
+            tokenizer_file=request.tokenizer_filename if request.tokenizer_filename else None,
+            distribute=None if request.precision or request.quantize else "auto"
+        )
+
+        # If manual distribution is needed
+        if request.precision or request.quantize:
+            llm_instance.distribute(
+                accelerator="cuda" if request.mode == "gpu" else "cpu",
+                devices=request.gpu_count,
+                precision=request.precision,
+                quantize=request.quantize
+            )
+
+        # Log success and memory stats
+        logger.info(
+            f"Custom model initialized successfully with config:\n"
+            f"Mode: {request.mode}\n"
+            f"Precision: {request.precision}\n"
+            f"Quantize: {request.quantize}\n"
+            f"GPU Count: {request.gpu_count}\n"
+            f"Model Directory: {model_dir}\n"
+            f"Model File: {request.model_filename}\n"
+            f"Config File: {request.config_filename}\n"
+            f"Tokenizer File: {request.tokenizer_filename}\n"
+            f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
+            f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+        )
+
+        return {
+            "success": True,
+            "message": "Custom model initialized successfully",
+            "model_info": {
+                "folder": str(model_dir),
+                "model_file": request.model_filename,
+                "config_file": request.config_filename,
+                "tokenizer_file": request.tokenizer_filename
+            }
+        }
+
+    except Exception as e:
+        logger.error(f"Error initializing custom model: {str(e)}")
+        # Print detailed memory statistics on failure
+        logger.error(f"GPU Memory Stats:\n"
+                     f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
+                     f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
+                     f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
+        raise HTTPException(status_code=500, detail=f"Error initializing custom model: {str(e)}")
+
+
+# Endpoint for streaming generation
+@router.post("/generate/stream")
+async def generate_stream(request: StreamGenerateRequest):
+    """
+    Generate text using the initialized model with streaming response.
+    Returns a StreamingResponse that yields JSON-formatted chunks of text.
+    """
+    global llm_instance
+
+    if llm_instance is None:
+        raise HTTPException(
+            status_code=400,
+            detail="Model not initialized. Call /initialize first."
+        )
+
+    async def event_generator() -> AsyncGenerator[str, None]:
+        try:
+            # Start the generation with streaming enabled
+            async for token in llm_instance.generate(
+                prompt=request.prompt,
+                max_new_tokens=request.max_new_tokens,
+                temperature=request.temperature,
+                top_k=request.top_k,
+                top_p=request.top_p,
+                stream=True  # Enable streaming
+            ):
+                # Create a JSON response for each token
+                chunk = {
+                    "token": token,
+                    "metadata": {
+                        "prompt": request.prompt,
+                        "is_finished": False
+                    }
+                }
+                # Format as SSE data
+                yield f"data: {json.dumps(chunk)}\n\n"
+
+                # Small delay to prevent overwhelming the client
+                await asyncio.sleep(0.01)
+
+            # Send final message indicating completion
+            final_chunk = {
+                "token": "",
+                "metadata": {
+                    "prompt": request.prompt,
+                    "is_finished": True
+                }
+            }
+            yield f"data: {json.dumps(final_chunk)}\n\n"
+
+        except Exception as e:
+            logger.error(f"Error in stream generation: {str(e)}")
+            error_chunk = {
+                "error": str(e),
+                "metadata": {
+                    "prompt": request.prompt,
+                    "is_finished": True
+                }
+            }
+            yield f"data: {json.dumps(error_chunk)}\n\n"
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers={
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive',
+        }
+    )
+
+@router.get("/")
+async def root():
+    """Root endpoint to verify service is running"""
+    return {
+        "status": "running",
+        "service": "LLM Engine",
+        "endpoints": {
+            "initialize": "/initialize",
+            "generate": "/generate",
+            "health": "/health"
+        }
+    }
+
+@router.post("/initialize")
+async def initialize_model(request: InitializeRequest):
+    """
+    Initialize the LLM model with specified configuration.
+    """
+    global llm_instance
+
+    try:
+        # Get the project root directory (where main.py is located)
+        project_root = Path(__file__).parent
+        checkpoints_dir = project_root / "checkpoints"
+        logger.info(f"Checkpoint dir is: {checkpoints_dir}")
+
+        # For LitGPT downloaded models, path includes organization
+        if "/" in request.model_path:
+            # e.g., "mistralai/Mistral-7B-Instruct-v0.3"
+            org, model_name = request.model_path.split("/")
+            model_path = str(checkpoints_dir / org / model_name)
+        else:
+            # Fallback for direct model paths
+            model_path = str(checkpoints_dir / request.model_path)
+
+        logger.info(f"Using model path: {model_path}")
+
+        # Load the model
+        llm_instance = LLM.load(
+            model=model_path,
+            distribute=None if request.precision or request.quantize else "auto"
+        )
+
+        # If manual distribution is needed
+        if request.precision or request.quantize:
+            llm_instance.distribute(
+                accelerator="cuda" if request.mode == "gpu" else "cpu",
+                devices=request.gpu_count,
+                precision=request.precision,
+                quantize=request.quantize
+            )
+
+        logger.info(
+            f"Model initialized successfully with config:\n"
+            f"Mode: {request.mode}\n"
+            f"Precision: {request.precision}\n"
+            f"Quantize: {request.quantize}\n"
+            f"GPU Count: {request.gpu_count}\n"
+            f"Model Path: {model_path}\n"
+            f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
+            f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+        )
+
+        return {"success": True, "message": "Model initialized successfully"}
+
+    except Exception as e:
+        logger.error(f"Error initializing model: {str(e)}")
+        # Print detailed memory statistics on failure
+        logger.error(f"GPU Memory Stats:\n"
+                     f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
+                     f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
+                     f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
+        raise HTTPException(status_code=500, detail=f"Error initializing model: {str(e)}")
+
+@router.post("/generate")
+async def generate(request: GenerateRequest):
+    """
+    Generate text using the initialized model.
+    """
+    global llm_instance
+
+    if llm_instance is None:
+        raise HTTPException(status_code=400, detail="Model not initialized. Call /initialize first.")
+
+    try:
+        if request.stream:
+            raise HTTPException(
+                status_code=400,
+                detail="Streaming is not currently supported through the API"
+            )
+
+        generated_text = llm_instance.generate(
+            prompt=request.prompt,
+            max_new_tokens=request.max_new_tokens,
+            temperature=request.temperature,
+            top_k=request.top_k,
+            top_p=request.top_p,
+            return_as_token_ids=request.return_as_token_ids,
+            stream=False  # Force stream to False for now
+        )
+
+        response = {
+            "generated_text": generated_text if not request.return_as_token_ids else generated_text.tolist(),
+            "metadata": {
+                "prompt": request.prompt,
+                "max_new_tokens": request.max_new_tokens,
+                "temperature": request.temperature,
+                "top_k": request.top_k,
+                "top_p": request.top_p
+            }
+        }
+
+        return response
+
+    except Exception as e:
+        logger.error(f"Error generating text: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
+
+@router.get("/health")
+async def health_check():
+    """
+    Check if the service is running and model is loaded.
+    """
+    global llm_instance
+
+    status = {
+        "status": "healthy",
+        "model_loaded": llm_instance is not None,
+    }
+
+    if llm_instance is not None:
+        logger.info(f"llm_instance is: {llm_instance}")
+        status["model_info"] = {
+            "model_path": llm_instance.config.name,
+            "device": str(next(llm_instance.model.parameters()).device)
+        }
+
+    return status
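To make the two new endpoints concrete, here is a rough client-side sketch of how the from_pretrained initializer and the SSE streaming endpoint added in this commit might be called. The request fields and the "data: {...}" chunk format come from routes.py above; the base URL, port, and the checkpoint file names in the payload are assumptions.

# Hypothetical client for the new endpoints; base URL, port and file names are assumptions.
import json
import requests

BASE_URL = "http://localhost:8000"  # assumption: serving host/port not shown in the diff

# 1) Initialize a custom checkpoint via the new from_pretrained endpoint.
init_payload = {
    "mode": "gpu",
    "folder_path": "my-custom-model",   # assumption: example folder under checkpoints/
    "model_filename": "lit_model.pth",
    "config_filename": "config.json",
    "tokenizer_filename": "tokenizer.json",
}
resp = requests.post(f"{BASE_URL}/initialize/custom", json=init_payload)
resp.raise_for_status()
print(resp.json()["message"])

# 2) Stream tokens from the new SSE endpoint and print them as they arrive.
gen_payload = {"prompt": "Hello, world!", "max_new_tokens": 50, "temperature": 0.8}
with requests.post(f"{BASE_URL}/generate/stream", json=gen_payload, stream=True) as r:
    r.raise_for_status()
    for line in r.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip the blank separator lines between SSE events
        chunk = json.loads(line[len("data: "):])
        if chunk.get("error"):
            raise RuntimeError(chunk["error"])
        if chunk["metadata"]["is_finished"]:
            break
        print(chunk["token"], end="", flush=True)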