Spaces:

TeamGenKI
/

LLMServer

Paused

App Files Community

AurelioAguirre committed on Dec 2, 2024

Commit

cfaa883

1 Parent(s): 19b1be5

Refactor v2

Browse files

Files changed (7) hide show

.gitignore +1 -0
Dockerfile +18 -41
main/__init__.py +0 -0
main/env_template +0 -55
main/main.py +0 -61
main/routes.py +0 -419
requirements.txt +35 -45

.gitignore CHANGED Viewed

@@ -42,3 +42,4 @@ wheels/
 # Logs
 *.log
 logs/

 # Logs
 *.log
 logs/
+.cache/

Dockerfile CHANGED Viewed

@@ -1,56 +1,33 @@
-# Use Python 3.10 as base image for better compatibility with ML libraries
-FROM python:3.10-slim
 # Set working directory
-WORKDIR /app
-# Install required system dependencies
-RUN apt-get update && \
-    apt-get install -y \
     git \
-    wget \
-    && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
-# Create and set permissions for directories
-RUN mkdir -p /app/.cache/huggingface && \
-    chmod 777 /app/.cache/huggingface && \
-    mkdir -p /app/.git && \
-    chmod 777 /app/.git
-# Set environment variables
-ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/hub
-ENV HF_HOME=/app/.cache/huggingface
-ENV GIT_CONFIG_GLOBAL=/app/.git/config
 # Copy requirements first to leverage Docker cache
 COPY requirements.txt .
 # Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
-# Create checkpoints directory with proper permissions
-RUN mkdir -p /app/main/checkpoints && \
-    chmod 777 /app/main/checkpoints
-# Download model using litgpt command line with proper secret mounting
-RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
-    export HF_TOKEN=$(cat /run/secrets/HF_TOKEN) && \
-    echo "Starting model download..." && \
-    litgpt download mistralai/Mistral-7B-Instruct-v0.3 \
-        --access_token ${HF_TOKEN} \
-        --checkpoint_dir /app/main/checkpoints || { echo "Download failed with status $?"; exit 1; }
-# Copy the rest of the application
-COPY . .
-# Set environment variables for the application
-ENV LLM_ENGINE_HOST=0.0.0.0
-ENV LLM_ENGINE_PORT=7860
-ENV MODEL_PATH=/app/main/checkpoints/mistralai/Mistral-7B-Instruct-v0.3
-# Expose port 7860 for Hugging Face Spaces
-EXPOSE 7860
 # Command to run the application
-CMD ["python", "-m", "main.main"]

+# Start from NVIDIA CUDA base image
+FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
 # Set working directory
+WORKDIR /code
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.12 \
+    python3-pip \
     git \
     && rm -rf /var/lib/apt/lists/*
 # Copy requirements first to leverage Docker cache
 COPY requirements.txt .
 # Install Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
+# Copy the application code
+COPY ./app /code/app
+COPY ./utils /code/utils
+# Set environment variables
+ENV PYTHONPATH=/code
+ENV TRANSFORMERS_CACHE=/code/app/.cache
+ENV CUDA_VISIBLE_DEVICES=0
+# Expose the port the app runs on
+EXPOSE 8000
 # Command to run the application
+CMD ["python3", "-m", "app.main"]

main/__init__.py DELETED Viewed

File without changes

main/env_template DELETED Viewed

@@ -1,55 +0,0 @@
-# Service URLs Configuration
-LLM_ENGINE_URL=http://localhost:8001
-RAG_ENGINE_URL=http://localhost:8002
-# LLM Engine Server Configuration
-LLM_ENGINE_HOST=0.0.0.0
-LLM_ENGINE_PORT=8001
-# RAG Engine Server Configuration (if running locally)
-RAG_ENGINE_HOST=0.0.0.0
-RAG_ENGINE_PORT=8002
-# Base Paths Configuration
-BAS_MODEL_PATH=/path/to/your/model
-BAS_RESOURCES=/path/to/resources
-# CUDA Memory Management
-PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,garbage_collection_threshold:0.8,expandable_segments:True
-# Other memory-related settings
-CUDA_LAUNCH_BLOCKING=0
-CUDA_VISIBLE_DEVICES=0
-# Logging Configuration
-LOG_LEVEL=INFO  # DEBUG, INFO, WARNING, ERROR, CRITICAL
-# GPU Configuration (optional)
-# CUDA_VISIBLE_DEVICES=0,1  # Specify which GPUs to use
-# Memory Configuration (optional)
-# MAX_GPU_MEMORY=16Gi  # Maximum GPU memory to use
-# MAX_CPU_MEMORY=32Gi  # Maximum CPU memory to use
-# Security (if needed)
-# API_KEY=your-api-key-here
-# SSL_CERT_PATH=/path/to/cert
-# SSL_KEY_PATH=/path/to/key
-# Development Settings
-# DEBUG=True  # Enable debug mode
-# RELOAD=False  # Enable auto-reload for development
-# Model Default Parameters (optional)
-# DEFAULT_MAX_NEW_TOKENS=50
-# DEFAULT_TEMPERATURE=1.0
-# DEFAULT_TOP_K=50
-# DEFAULT_TOP_P=1.0
-# Cache Settings (optional)
-# CACHE_DIR=/path/to/cache
-# MAX_CACHE_SIZE=10Gi
-# Monitoring (optional)
-# ENABLE_METRICS=True
-# PROMETHEUS_PORT=9090

main/main.py DELETED Viewed

@@ -1,61 +0,0 @@
-from fastapi import FastAPI
-from fastapi.middleware.cors import CORSMiddleware
-import logging
-import os
-import uvicorn
-from .routes import router
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Initialize FastAPI with simplified configuration
-app = FastAPI(
-    title="LLM Engine Service",
-    docs_url="/docs",
-    redoc_url="/redoc",
-    openapi_url="/openapi.json"
-)
-# Add CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Include the router from routes.py
-app.include_router(router)
-def main():
-    # Load environment variables or configuration here
-    host = os.getenv("LLM_ENGINE_HOST", "0.0.0.0")
-    port = int(os.getenv("LLM_ENGINE_PORT", "7860"))  # Default to 7860 for Spaces
-    # Log startup information
-    logger.info(f"Starting LLM Engine service on {host}:{port}, or: ")
-    logger.info("Available endpoints:")
-    logger.info("  - /")
-    logger.info("  - /health")
-    logger.info("  - /models")
-    logger.info("  - /initialize")
-    logger.info("  - /generate")
-    logger.info("  - /generate/stream")
-    logger.info("  - /download")
-    logger.info("  - /convert")
-    logger.info("  - /docs")
-    logger.info("  - /redoc")
-    logger.info("  - /openapi.json")
-    # Start the server
-    uvicorn.run(
-        app,
-        host=host,
-        port=port,
-        log_level="info"
-    )
-if __name__ == "__main__":
-    main()

main/routes.py DELETED Viewed

@@ -1,419 +0,0 @@
-from fastapi import APIRouter, HTTPException
-from fastapi.responses import StreamingResponse
-from pydantic import BaseModel, Field
-from typing import Optional, Union, AsyncGenerator, List
-import torch
-import logging
-from pathlib import Path
-from litgpt.api import LLM
-from litgpt.scripts.download import download_from_hub
-from litgpt.scripts.convert_hf_checkpoint import convert_hf_checkpoint
-import json
-import asyncio
-# Set up logging
-logger = logging.getLogger(__name__)
-# Create router instance
-router = APIRouter()
-# Global variable to store the LLM instance
-llm_instance = None
-class InitializeRequest(BaseModel):
-    """Configuration for model initialization including model path"""
-    mode: str = Field(default="cpu", description="Execution mode ('cpu' or 'gpu')")
-    precision: Optional[str] = Field(None, description="Precision format (e.g., 'bf16-true', 'bf16-mixed')")
-    quantize: Optional[str] = Field(None, description="Quantization format (e.g., 'bnb.nf4')")
-    gpu_count: Union[str, int] = Field(default="auto", description="Number of GPUs to use or 'auto'")
-    model_path: str = Field(..., description="Path to the model relative to checkpoints directory")
-class GenerateRequest(BaseModel):
-    """Request parameters for text generation"""
-    prompt: str = Field(..., description="Input text prompt for generation")
-    max_new_tokens: int = Field(default=50, description="Maximum number of tokens to generate")
-    temperature: float = Field(default=1.0, description="Sampling temperature")
-    top_k: Optional[int] = Field(None, description="Top-k sampling parameter")
-    top_p: float = Field(default=1.0, description="Top-p sampling parameter")
-    return_as_token_ids: bool = Field(default=False, description="Whether to return token IDs instead of text")
-    stream: bool = Field(default=False, description="Whether to stream the response")
-class StreamGenerateRequest(BaseModel):
-    """Request parameters for streaming text generation"""
-    prompt: str = Field(..., description="Input text prompt for generation")
-    max_new_tokens: int = Field(default=50, description="Maximum number of tokens to generate")
-    temperature: float = Field(default=1.0, description="Sampling temperature")
-    top_k: Optional[int] = Field(None, description="Top-k sampling parameter")
-    top_p: float = Field(default=1.0, description="Top-p sampling parameter")
-class DownloadModelRequest(BaseModel):
-    """Request to download a model from HuggingFace"""
-    repo_id: str = Field(
-        ...,
-        description="HuggingFace repository ID (e.g., 'huihui-ai/Llama-3.2-3B-Instruct-abliterated')"
-    )
-    model_name: str = Field(
-        ...,
-        description="Model architecture name (e.g., 'Llama-3.2-3B-Instruct')"
-    )
-    access_token: Optional[str] = Field(
-        None,
-        description="HuggingFace access token for private models"
-    )
-class ConvertModelRequest(BaseModel):
-    """Request to convert a downloaded model"""
-    folder_path: str = Field(
-        ...,
-        description="Path relative to checkpoints where model was downloaded"
-    )
-    model_name: str = Field(
-        ...,
-        description="Model architecture name for conversion"
-    )
-class ModelResponse(BaseModel):
-    """Model information response"""
-    name: str = Field(..., description="Full model name including organization")
-    path: str = Field(..., description="Relative path in checkpoints directory")
-    downloaded: bool = Field(..., description="Whether the model files are downloaded")
-    converted: bool = Field(..., description="Whether the model is converted to LitGPT format")
-    has_safetensors: bool = Field(..., description="Whether safetensors files are present")
-    files: List[str] = Field(..., description="List of files in model directory")
-class ModelsListResponse(BaseModel):
-    """Response for listing models"""
-    models: List[ModelResponse] = Field(..., description="List of available models")
-@router.post(
-    "/download",
-    response_model=dict,
-    summary="Download a model from HuggingFace Hub",
-    description="Downloads a model from HuggingFace to the LLM Engine's checkpoints directory",
-    response_description="Download status and location information"
-)
-async def download_model(request: DownloadModelRequest):
-    """
-    Download a model from HuggingFace Hub.
-    - Downloads model files to the checkpoints directory
-    - Creates necessary subdirectories
-    - Handles authentication for private models
-    Returns:
-        A JSON object containing download status and path information
-    """
-    try:
-        # Get the project root directory and construct paths
-        project_root = Path(__file__).parent.parent
-        checkpoints_dir = project_root / "checkpoints"
-        logger.info(f"Downloading model {request.repo_id} to {checkpoints_dir}")
-        download_from_hub(
-            repo_id=request.repo_id,
-            model_name=request.model_name,
-            access_token=request.access_token,
-            checkpoint_dir=checkpoints_dir,
-            tokenizer_only=False
-        )
-        return {
-            "status": "success",
-            "message": f"Model downloaded to {checkpoints_dir / request.repo_id}",
-            "path": str(request.repo_id)
-        }
-    except Exception as e:
-        logger.error(f"Error downloading model: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error downloading model: {str(e)}")
-@router.post(
-    "/convert",
-    response_model=dict,
-    summary="Convert a model to LitGPT format",
-    description="Converts a downloaded model to the LitGPT format required for inference",
-    response_description="Conversion status and location information"
-)
-async def convert_model(request: ConvertModelRequest):
-    """
-    Convert a downloaded model to LitGPT format.
-    - Converts model files to LitGPT's format
-    - Creates lit_model.pth file
-    - Maintains original files
-    Returns:
-        A JSON object containing conversion status and path information
-    """
-    try:
-        project_root = Path(__file__).parent.parent
-        checkpoints_dir = project_root / "checkpoints"
-        model_dir = checkpoints_dir / request.folder_path
-        if not model_dir.exists():
-            raise HTTPException(
-                status_code=404,
-                detail=f"Model directory not found: {request.folder_path}"
-            )
-        logger.info(f"Converting model in {model_dir}")
-        convert_hf_checkpoint(
-            checkpoint_dir=model_dir,
-            model_name=request.model_name
-        )
-        return {
-            "status": "success",
-            "message": f"Model converted successfully",
-            "path": str(request.folder_path)
-        }
-    except Exception as e:
-        logger.error(f"Error converting model: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error converting model: {str(e)}")
-@router.get(
-    "/models",
-    response_model=ModelsListResponse,
-    summary="List available models",
-    description="Lists all models in the checkpoints directory with their status",
-    response_description="List of models with their details and status"
-)
-async def list_models():
-    """
-    List all models in the checkpoints directory.
-    Returns:
-        A JSON object containing:
-        - List of models
-        - Each model's download status
-        - Each model's conversion status
-        - Available files for each model
-    """
-    try:
-        project_root = Path(__file__).parent.parent
-        checkpoints_dir = project_root / "checkpoints"
-        models = []
-        if checkpoints_dir.exists():
-            for org_dir in checkpoints_dir.iterdir():
-                if org_dir.is_dir():
-                    for model_dir in org_dir.iterdir():
-                        if model_dir.is_dir():
-                            files = [f.name for f in model_dir.iterdir()]
-                            has_safetensors = any(f.endswith('.safetensors') for f in files)
-                            has_lit_model = 'lit_model.pth' in files
-                            model_info = ModelResponse(
-                                name=f"{org_dir.name}/{model_dir.name}",
-                                path=str(model_dir.relative_to(checkpoints_dir)),
-                                downloaded=True,
-                                converted=has_lit_model,
-                                has_safetensors=has_safetensors,
-                                files=files
-                            )
-                            models.append(model_info)
-        return ModelsListResponse(models=models)
-    except Exception as e:
-        logger.error(f"Error listing models: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error listing models: {str(e)}")
-@router.post("/initialize")
-async def initialize_model(request: InitializeRequest):
-    """
-    Initialize the LLM model with specified configuration.
-    """
-    global llm_instance
-    try:
-        # Get the project root directory (where main.py is located)
-        project_root = Path(__file__).parent.parent
-        checkpoints_dir = project_root / "checkpoints"
-        logger.info(f"Checkpoint dir is: {checkpoints_dir}")
-        # For LitGPT downloaded models, path includes organization
-        if "/" in request.model_path:
-            # e.g., "mistralai/Mistral-7B-Instruct-v0.3"
-            org, model_name = request.model_path.split("/")
-            model_path = str(checkpoints_dir / org / model_name)
-        else:
-            # Fallback for direct model paths
-            model_path = str(checkpoints_dir / request.model_path)
-        logger.info(f"Using model path: {model_path}")
-        # Load the model
-        llm_instance = LLM.load(
-            model=model_path,
-            distribute=None if request.precision or request.quantize else "auto"
-        )
-        # If manual distribution is needed
-        logger.info("Distributing model")
-        if request.precision or request.quantize:
-            llm_instance.distribute(
-                accelerator="cuda" if request.mode == "gpu" else "cpu",
-                devices=request.gpu_count,
-                precision=request.precision,
-                quantize=request.quantize
-            )
-        logger.info(
-            f"Model initialized successfully with config:\n"
-            f"Mode: {request.mode}\n"
-            f"Precision: {request.precision}\n"
-            f"Quantize: {request.quantize}\n"
-            f"GPU Count: {request.gpu_count}\n"
-            f"Model Path: {model_path}\n"
-            f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
-            f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
-        )
-        return {"success": True, "message": "Model initialized successfully"}
-    except Exception as e:
-        logger.error(f"Error initializing model: {str(e)}")
-        # Print detailed memory statistics on failure
-        logger.error(f"GPU Memory Stats:\n"
-                     f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
-                     f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
-                     f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
-        raise HTTPException(status_code=500, detail=f"Error initializing model: {str(e)}")
-@router.post("/generate")
-async def generate(request: GenerateRequest):
-    """
-    Generate text using the initialized model.
-    """
-    global llm_instance
-    if llm_instance is None:
-        raise HTTPException(status_code=400, detail="Model not initialized. Call /initialize first.")
-    try:
-        if request.stream:
-            raise HTTPException(
-                status_code=400,
-                detail="Streaming is not currently supported through the API"
-            )
-        generated_text = llm_instance.generate(
-            prompt=request.prompt,
-            max_new_tokens=request.max_new_tokens,
-            temperature=request.temperature,
-            top_k=request.top_k,
-            top_p=request.top_p,
-            return_as_token_ids=request.return_as_token_ids,
-            stream=False  # Force stream to False for now
-        )
-        response = {
-            "generated_text": generated_text if not request.return_as_token_ids else generated_text.tolist(),
-            "metadata": {
-                "prompt": request.prompt,
-                "max_new_tokens": request.max_new_tokens,
-                "temperature": request.temperature,
-                "top_k": request.top_k,
-                "top_p": request.top_p
-            }
-        }
-        return response
-    except Exception as e:
-        logger.error(f"Error generating text: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
-@router.post("/generate/stream")
-async def generate_stream(request: StreamGenerateRequest):
-    """
-    Generate text using the initialized model with streaming response.
-    Returns a StreamingResponse that yields JSON-formatted chunks of text.
-    """
-    global llm_instance
-    if llm_instance is None:
-        raise HTTPException(
-            status_code=400,
-            detail="Model not initialized. Call /initialize first."
-        )
-    async def event_generator() -> AsyncGenerator[str, None]:
-        try:
-            # Start the generation with streaming enabled
-            for token in llm_instance.generate(
-                    prompt=request.prompt,
-                    max_new_tokens=request.max_new_tokens,
-                    temperature=request.temperature,
-                    top_k=request.top_k,
-                    top_p=request.top_p,
-                    stream=True  # Enable streaming
-            ):
-                # Create a JSON response for each token
-                chunk = {
-                    "token": token,
-                    "metadata": {
-                        "prompt": request.prompt,
-                        "is_finished": False
-                    }
-                }
-                # Format as SSE data
-                yield f"data: {json.dumps(chunk)}\n\n"
-                # Small delay to prevent overwhelming the client
-                await asyncio.sleep(0.01)
-            # Send final message indicating completion
-            final_chunk = {
-                "token": "",
-                "metadata": {
-                    "prompt": request.prompt,
-                    "is_finished": True
-                }
-            }
-            yield f"data: {json.dumps(final_chunk)}\n\n"
-        except Exception as e:
-            logger.error(f"Error in stream generation: {str(e)}")
-            error_chunk = {
-                "error": str(e),
-                "metadata": {
-                    "prompt": request.prompt,
-                    "is_finished": True
-                }
-            }
-            yield f"data: {json.dumps(error_chunk)}\n\n"
-    return StreamingResponse(
-        event_generator(),
-        media_type="text/event-stream",
-        headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-        }
-    )
-@router.get("/health")
-async def health_check():
-    """
-    Check if the service is running and model is loaded.
-    Returns status information including model details if loaded.
-    """
-    global llm_instance
-    status = {
-        "status": "healthy",
-        "model_loaded": llm_instance is not None,
-    }
-    if llm_instance is not None:
-        logger.info(f"llm_instance is: {llm_instance}")
-        status["model_info"] = {
-            "model_path": llm_instance.config.name,
-            "device": str(next(llm_instance.model.parameters()).device)
-        }
-    return status

requirements.txt CHANGED Viewed

@@ -1,67 +1,57 @@
-aiohappyeyeballs==2.4.3
-aiohttp==3.10.10
-aiosignal==1.3.1
 annotated-types==0.7.0
 anyio==4.6.2.post1
-attrs==24.2.0
 bitsandbytes==0.44.1
 certifi==2024.8.30
 charset-normalizer==3.4.0
 click==8.1.7
-docstring_parser==0.16
-fastapi==0.109.0
 filelock==3.16.1
-frozenlist==1.5.0
 fsspec==2024.10.0
 h11==0.14.0
-huggingface-hub==0.23.5
 idna==3.10
-importlib_resources==6.4.5
 Jinja2==3.1.4
-jsonargparse==4.32.1
-lightning==2.4.0
-lightning-utilities==0.11.8
-litgpt==0.5.3
 MarkupSafe==3.0.2
 mpmath==1.3.0
-multidict==6.1.0
 networkx==3.4.2
-numpy==1.26.4
-nvidia-cublas-cu12==12.1.3.1
-nvidia-cuda-cupti-cu12==12.1.105
-nvidia-cuda-nvrtc-cu12==12.1.105
-nvidia-cuda-runtime-cu12==12.1.105
 nvidia-cudnn-cu12==9.1.0.70
-nvidia-cufft-cu12==11.0.2.54
-nvidia-curand-cu12==10.3.2.106
-nvidia-cusolver-cu12==11.4.5.107
-nvidia-cusparse-cu12==12.1.0.106
-nvidia-nccl-cu12==2.20.5
-nvidia-nvjitlink-cu12==12.6.77
-nvidia-nvtx-cu12==12.1.105
-packaging==24.1
-propcache==0.2.0
-pydantic==2.5.3
-pydantic_core==2.14.6
-python-dotenv==1.0.0
-pytorch-lightning==2.4.0
 PyYAML==6.0.2
-regex==2024.9.11
 requests==2.32.3
 safetensors==0.4.5
-setuptools==75.3.0
 sniffio==1.3.1
-starlette==0.35.1
-sympy==1.13.3
 tokenizers==0.20.3
-torch==2.4.1
-torchmetrics==1.5.1
-tqdm==4.66.6
-transformers==4.46.2
-triton==3.0.0
-typeshed_client==2.7.0
 typing_extensions==4.12.2
 urllib3==2.2.3
-uvicorn==0.27.0
-WhatIsMyIP==2024.2.20
-yarl==1.17.1

+accelerate==1.1.1
 annotated-types==0.7.0
 anyio==4.6.2.post1
 bitsandbytes==0.44.1
 certifi==2024.8.30
 charset-normalizer==3.4.0
 click==8.1.7
+fastapi==0.115.5
 filelock==3.16.1
 fsspec==2024.10.0
 h11==0.14.0
+huggingface-hub==0.26.3
 idna==3.10
+inquirerpy==0.3.4
 Jinja2==3.1.4
 MarkupSafe==3.0.2
 mpmath==1.3.0
 networkx==3.4.2
+numpy==2.1.3
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
 nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+packaging==24.2
+pfzy==0.3.4
+prompt_toolkit==3.0.48
+psutil==6.1.0
+pydantic==2.10.2
+pydantic_core==2.27.1
+python-dotenv==1.0.1
 PyYAML==6.0.2
+regex==2024.11.6
 requests==2.32.3
+router==0.1
 safetensors==0.4.5
+setuptools==75.6.0
 sniffio==1.3.1
+starlette==0.41.3
+sympy==1.13.1
 tokenizers==0.20.3
+torch==2.5.1
+tqdm==4.67.1
+transformers==4.46.3
+triton==3.1.0
 typing_extensions==4.12.2
 urllib3==2.2.3
+utils==1.0.2
+uvicorn==0.32.1
+wcwidth==0.2.13