Commit ·
212188a
1
Parent(s): db2ebae
Refactor: Clean modular architecture with app/ structure
Browse files- Replace old unified files with new modular app/ structure
- Update Dockerfile for optimized HF Spaces deployment
- Add requirements/ directory with base and optimization packages
- Update README with current API documentation
- Remove deprecated unified files
Changes:
- New app/ module: config.py, main.py, model.py, optimization.py
- New requirements structure: base.txt, optimization.txt
- Optimized Dockerfile with flash-linear-attention support
- Clean FastAPI endpoints: /, /health, /info, /generate
- Dockerfile +27 -28
- Dockerfile.unified +0 -44
- README.md +56 -70
- README_deploy.md +105 -0
- app.py +0 -243
- app/__init__.py +7 -0
- app/config.py +92 -0
- app/main.py +183 -0
- app/model.py +183 -0
- app/optimization.py +129 -0
- app_config_unified.py +0 -375
- app_unified.py +0 -243
- requirements.txt +0 -22
- requirements/base.txt +27 -0
- requirements/optimization.txt +34 -0
- requirements_unified.txt +0 -22
Dockerfile
CHANGED
|
@@ -1,44 +1,43 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
| 3 |
|
| 4 |
-
|
| 5 |
-
WORKDIR /app
|
| 6 |
|
| 7 |
-
# Set
|
| 8 |
-
ENV
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Install system dependencies
|
| 11 |
RUN apt-get update && apt-get install -y \
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
wget \
|
| 15 |
git \
|
| 16 |
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
|
| 21 |
-
# Copy
|
| 22 |
-
COPY
|
|
|
|
| 23 |
|
| 24 |
# Install Python dependencies
|
| 25 |
-
|
|
|
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
ENV PYTHONPATH=/app
|
| 32 |
-
ENV HF_HOME=/app/cache
|
| 33 |
-
|
| 34 |
-
# Create cache directory with proper permissions and fix user issues
|
| 35 |
-
RUN mkdir -p /app/cache && chmod 777 /app/cache
|
| 36 |
-
RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
|
| 37 |
-
RUN chown -R appuser:appuser /app
|
| 38 |
-
|
| 39 |
-
# Expose port (HuggingFace Spaces uses 7860)
|
| 40 |
EXPOSE 7860
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
|
|
|
| 1 |
+
# Dragon-3B on HuggingFace Spaces
|
| 2 |
+
# Optimized for T4/L4 GPU with flash-linear-attention
|
| 3 |
|
| 4 |
+
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
|
|
|
|
| 5 |
|
| 6 |
+
# Set environment variables
|
| 7 |
+
ENV DEBIAN_FRONTEND=noninteractive \
|
| 8 |
+
PYTHONUNBUFFERED=1 \
|
| 9 |
+
HF_HOME=/data/cache \
|
| 10 |
+
PORT=7860
|
| 11 |
|
| 12 |
# Install system dependencies
|
| 13 |
RUN apt-get update && apt-get install -y \
|
| 14 |
+
python3.10 \
|
| 15 |
+
python3-pip \
|
|
|
|
| 16 |
git \
|
| 17 |
&& rm -rf /var/lib/apt/lists/*
|
| 18 |
|
| 19 |
+
# Set working directory
|
| 20 |
+
WORKDIR /app
|
| 21 |
|
| 22 |
+
# Copy application code
|
| 23 |
+
COPY ./app ./app
|
| 24 |
+
COPY ./requirements ./requirements
|
| 25 |
|
| 26 |
# Install Python dependencies
|
| 27 |
+
# Install base first
|
| 28 |
+
RUN pip3 install --no-cache-dir -r requirements/base.txt
|
| 29 |
|
| 30 |
+
# Install optimizations (this will take longer but gives 3-4x speedup)
|
| 31 |
+
# Comment out this line for faster builds (at cost of performance)
|
| 32 |
+
RUN pip3 install --no-cache-dir -r requirements/optimization.txt
|
| 33 |
|
| 34 |
+
# Expose port
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
EXPOSE 7860
|
| 36 |
|
| 37 |
+
# Health check
|
| 38 |
+
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
| 39 |
+
CMD python3 -c "import requests; requests.get('http://localhost:7860/health')"
|
| 40 |
+
|
| 41 |
+
# Run application
|
| 42 |
+
CMD ["python3", "-m", "app.main"]
|
| 43 |
|
Dockerfile.unified
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
# HuggingFace Spaces optimized Dockerfile using unified deployment system
|
| 2 |
-
FROM python:3.10-slim
|
| 3 |
-
|
| 4 |
-
# Set working directory
|
| 5 |
-
WORKDIR /app
|
| 6 |
-
|
| 7 |
-
# Set HuggingFace Spaces as deployment platform
|
| 8 |
-
ENV DEPLOYMENT_PLATFORM=hf
|
| 9 |
-
|
| 10 |
-
# Install system dependencies
|
| 11 |
-
RUN apt-get update && apt-get install -y \
|
| 12 |
-
build-essential \
|
| 13 |
-
curl \
|
| 14 |
-
wget \
|
| 15 |
-
git \
|
| 16 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
-
|
| 18 |
-
# Upgrade pip first
|
| 19 |
-
RUN pip install --upgrade pip
|
| 20 |
-
|
| 21 |
-
# Copy requirements first for better caching
|
| 22 |
-
COPY requirements.txt .
|
| 23 |
-
|
| 24 |
-
# Install Python dependencies
|
| 25 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 26 |
-
|
| 27 |
-
# Copy application files
|
| 28 |
-
COPY . .
|
| 29 |
-
|
| 30 |
-
# Set environment variables
|
| 31 |
-
ENV PYTHONPATH=/app
|
| 32 |
-
ENV HF_HOME=/app/cache
|
| 33 |
-
|
| 34 |
-
# Create cache directory with proper permissions and fix user issues
|
| 35 |
-
RUN mkdir -p /app/cache && chmod 777 /app/cache
|
| 36 |
-
RUN groupadd -g 1000 appuser && useradd -r -u 1000 -g appuser appuser
|
| 37 |
-
RUN chown -R appuser:appuser /app
|
| 38 |
-
|
| 39 |
-
# Expose port (HuggingFace Spaces uses 7860)
|
| 40 |
-
EXPOSE 7860
|
| 41 |
-
|
| 42 |
-
# Run the unified application with HuggingFace Spaces optimizations
|
| 43 |
-
CMD ["uvicorn", "app_unified:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "info"]
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,102 +1,88 @@
|
|
| 1 |
---
|
| 2 |
-
title: Dragon-3B
|
| 3 |
emoji: 🐉
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
-
sdk_version: "4.44.0"
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
-
# 🐉 Dragon-3B
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
-
## 🚀
|
| 18 |
|
| 19 |
-
- **
|
| 20 |
-
- **T4 GPU Optimized**:
|
| 21 |
-
- **FastAPI REST API**:
|
| 22 |
-
- **
|
| 23 |
-
- **
|
| 24 |
|
| 25 |
-
## 🔧
|
| 26 |
|
| 27 |
-
**
|
| 28 |
|
| 29 |
-
|
| 30 |
-
1. Go to
|
| 31 |
-
2. Select
|
| 32 |
-
3. Save
|
| 33 |
-
4. The Space will restart with GPU support
|
| 34 |
|
| 35 |
-
##
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
- `GET /model/info` - Model and optimization details
|
| 40 |
-
- `GET /platform/info` - Platform capabilities
|
| 41 |
-
- `POST /inference` - Run inference (optimized for T4 GPU)
|
| 42 |
-
- `GET /performance/benchmark` - Performance benchmark
|
| 43 |
|
| 44 |
-
##
|
|
|
|
| 45 |
|
| 46 |
-
``
|
| 47 |
-
|
| 48 |
-
curl https://jeanbaptdzd-dragon-3b-inference.hf.space/health
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
| 52 |
-
-H "Content-Type: application/json" \
|
| 53 |
-
-d '{"prompt": "The future of AI is", "max_new_tokens": 100}'
|
| 54 |
-
```
|
| 55 |
-
|
| 56 |
-
## 📊 **Performance Expectations**
|
| 57 |
-
|
| 58 |
-
| Hardware | Inference Speed | Memory Usage | Cold Start |
|
| 59 |
-
|----------|----------------|--------------|------------|
|
| 60 |
-
| **T4 GPU** | 20-40 tok/s | ~7GB | ~30s |
|
| 61 |
-
| **A10G GPU** | 30-60 tok/s | ~7GB | ~25s |
|
| 62 |
-
|
| 63 |
-
## 🔧 **Environment Variables**
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
| 69 |
```
|
| 70 |
|
| 71 |
-
##
|
| 72 |
|
| 73 |
-
|
| 74 |
-
-
|
| 75 |
-
- [Scaleway Deployment](../scw_deployment/README.md) - Enterprise deployment
|
| 76 |
|
| 77 |
-
##
|
| 78 |
|
| 79 |
-
|
| 80 |
-
-
|
| 81 |
-
|
| 82 |
-
-
|
|
|
|
|
|
|
| 83 |
|
| 84 |
-
##
|
| 85 |
-
- Space might be sleeping (wait 30-60 seconds for cold start)
|
| 86 |
-
- Check Space logs for build errors
|
| 87 |
-
- Ensure hardware is properly configured
|
| 88 |
|
| 89 |
-
|
| 90 |
-
-
|
| 91 |
-
-
|
| 92 |
-
-
|
|
|
|
| 93 |
|
| 94 |
-
##
|
| 95 |
|
| 96 |
-
-
|
| 97 |
-
-
|
| 98 |
-
-
|
|
|
|
| 99 |
|
| 100 |
-
|
| 101 |
|
| 102 |
-
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Dragon-3B Inference API
|
| 3 |
emoji: 🐉
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# 🐉 Dragon-3B Inference API
|
| 12 |
|
| 13 |
+
FastAPI REST server for Dragon-3B-Base-alpha (Gated DeltaNet architecture) optimized for HuggingFace Spaces with T4 GPU.
|
| 14 |
|
| 15 |
+
## 🚀 Features
|
| 16 |
|
| 17 |
+
- **Clean Architecture**: Modular codebase with `app/` structure
|
| 18 |
+
- **T4 GPU Optimized**: Configured for HF Spaces T4 GPU (25-35 tok/s base, 80-100 tok/s with flash-attn)
|
| 19 |
+
- **FastAPI REST API**: Interactive docs at `/docs`
|
| 20 |
+
- **Health Monitoring**: `/health` and `/info` endpoints
|
| 21 |
+
- **Automatic Optimizations**: Detects and uses flash-linear-attention if available
|
| 22 |
|
| 23 |
+
## 🔧 Hardware Requirements
|
| 24 |
|
| 25 |
+
**Required**: T4 GPU (or better)
|
| 26 |
|
| 27 |
+
To configure GPU in your Space:
|
| 28 |
+
1. Go to Space Settings
|
| 29 |
+
2. Select "T4 small" or better
|
| 30 |
+
3. Save and rebuild
|
|
|
|
| 31 |
|
| 32 |
+
## 📡 API Endpoints
|
| 33 |
|
| 34 |
+
### `GET /`
|
| 35 |
+
Basic API information
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
### `GET /health`
|
| 38 |
+
Health check with model status
|
| 39 |
|
| 40 |
+
### `GET /info`
|
| 41 |
+
Detailed model and system information
|
|
|
|
| 42 |
|
| 43 |
+
### `POST /generate`
|
| 44 |
+
Generate text from a prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
```json
|
| 47 |
+
{
|
| 48 |
+
"prompt": "The future of AI is",
|
| 49 |
+
"max_new_tokens": 150,
|
| 50 |
+
"temperature": 0.7,
|
| 51 |
+
"top_p": 0.9
|
| 52 |
+
}
|
| 53 |
```
|
| 54 |
|
| 55 |
+
## 🔑 Configuration
|
| 56 |
|
| 57 |
+
Set in Space Settings → Repository secrets:
|
| 58 |
+
- `HF_TOKEN` - Your HuggingFace token (required to download the model)
|
|
|
|
| 59 |
|
| 60 |
+
## 📊 Performance
|
| 61 |
|
| 62 |
+
| Hardware | Speed (tokens/sec) | Notes |
|
| 63 |
+
|----------|-------------------|-------|
|
| 64 |
+
| T4 GPU | 25-35 | Base (without flash-attn) |
|
| 65 |
+
| T4 GPU | 80-100 | With flash-linear-attention |
|
| 66 |
+
| L4 GPU | 35-45 | Base |
|
| 67 |
+
| L4 GPU | 100-120 | With flash-linear-attention |
|
| 68 |
|
| 69 |
+
## 🛠️ Development
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
+
This Space uses:
|
| 72 |
+
- PyTorch >= 2.0
|
| 73 |
+
- transformers >= 4.57
|
| 74 |
+
- FastAPI + Uvicorn
|
| 75 |
+
- Optional: flash-linear-attention (3-4x speedup)
|
| 76 |
|
| 77 |
+
## 📝 Notes
|
| 78 |
|
| 79 |
+
- First request is slow (model loading ~30-60s)
|
| 80 |
+
- Subsequent requests are fast
|
| 81 |
+
- Space sleeps after 48h inactivity (free tier)
|
| 82 |
+
- Includes flash-linear-attention compilation (slower build, faster inference)
|
| 83 |
|
| 84 |
+
## 🔗 Links
|
| 85 |
|
| 86 |
+
- [Model Card](https://huggingface.co/DragonLLM/Dragon-3B-Base-alpha)
|
| 87 |
+
- [Project Repository](https://github.com/jeanbapt/Lingua-fin)
|
| 88 |
+
- [HF Spaces Docs](https://huggingface.co/docs/hub/spaces)
|
README_deploy.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dragon-3B on HuggingFace Spaces
|
| 2 |
+
|
| 3 |
+
Deploy Dragon-3B as a FastAPI application on HuggingFace Spaces.
|
| 4 |
+
|
| 5 |
+
## 🚀 Quick Deploy
|
| 6 |
+
|
| 7 |
+
### 1. Create HuggingFace Space
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
# Create a new Space at https://huggingface.co/new-space
|
| 11 |
+
# Select: Docker, SDK: docker
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
### 2. Set Up Git
|
| 15 |
+
|
| 16 |
+
```bash
|
| 17 |
+
# Clone your space
|
| 18 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE
|
| 19 |
+
cd YOUR_SPACE
|
| 20 |
+
|
| 21 |
+
# Copy files
|
| 22 |
+
cp -r /path/to/Lingua-fin/app ./
|
| 23 |
+
cp /path/to/Lingua-fin/deployments/hf_space/* ./
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### 3. Configure Secrets
|
| 27 |
+
|
| 28 |
+
In your Space settings, add:
|
| 29 |
+
- `HF_TOKEN` - Your HuggingFace token
|
| 30 |
+
|
| 31 |
+
### 4. Push and Deploy
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
git add .
|
| 35 |
+
git commit -m "Initial Dragon-3B deployment"
|
| 36 |
+
git push
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
Space will build and deploy automatically!
|
| 40 |
+
|
| 41 |
+
## ⚙️ Configuration
|
| 42 |
+
|
| 43 |
+
### Hardware Requirements
|
| 44 |
+
|
| 45 |
+
**Minimum**: CPU Basic (2 cores, 16GB RAM)
|
| 46 |
+
**Recommended**: T4 GPU (16GB VRAM) or better
|
| 47 |
+
|
| 48 |
+
Select hardware in Space settings.
|
| 49 |
+
|
| 50 |
+
### Environment Variables
|
| 51 |
+
|
| 52 |
+
Set in Space settings → Variables:
|
| 53 |
+
|
| 54 |
+
- `HF_TOKEN` (required) - HuggingFace token
|
| 55 |
+
- `PORT` - Port number (default: 7860 for Spaces)
|
| 56 |
+
- `LOG_LEVEL` - Logging level (INFO/DEBUG)
|
| 57 |
+
|
| 58 |
+
## 📊 Expected Performance
|
| 59 |
+
|
| 60 |
+
| Hardware | Setup | Tokens/sec | Use Case |
|
| 61 |
+
|----------|-------|------------|----------|
|
| 62 |
+
| CPU Basic | Minimal | 2-5 | Testing only |
|
| 63 |
+
| T4 GPU | Minimal | 25-35 | Demos |
|
| 64 |
+
| T4 GPU | Optimized | 35-45 | Production |
|
| 65 |
+
| L4 GPU | Optimized | 80-100 | High performance |
|
| 66 |
+
|
| 67 |
+
**Minimal** = Base requirements only
|
| 68 |
+
**Optimized** = With flash-linear-attention
|
| 69 |
+
|
| 70 |
+
## 🔧 Optimization
|
| 71 |
+
|
| 72 |
+
The Dockerfile includes `flash-linear-attention` by default for 3-4x speedup.
|
| 73 |
+
|
| 74 |
+
To disable (faster builds, slower inference):
|
| 75 |
+
- Remove from `requirements.txt`
|
| 76 |
+
- Comment out in `Dockerfile`
|
| 77 |
+
|
| 78 |
+
## 🧪 Testing
|
| 79 |
+
|
| 80 |
+
Once deployed, test at:
|
| 81 |
+
```
|
| 82 |
+
https://YOUR_USERNAME-YOUR_SPACE.hf.space/docs
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
Try the `/generate` endpoint with:
|
| 86 |
+
```json
|
| 87 |
+
{
|
| 88 |
+
"prompt": "The future of AI is",
|
| 89 |
+
"max_new_tokens": 150,
|
| 90 |
+
"temperature": 0.7
|
| 91 |
+
}
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
## 📝 Notes
|
| 95 |
+
|
| 96 |
+
- First request will be slow (model loading)
|
| 97 |
+
- Subsequent requests are fast
|
| 98 |
+
- Space sleeps after 48h inactivity (free tier)
|
| 99 |
+
- GPU Spaces have usage limits on free tier
|
| 100 |
+
|
| 101 |
+
## 🔗 Links
|
| 102 |
+
|
| 103 |
+
- [HuggingFace Spaces Docs](https://huggingface.co/docs/hub/spaces)
|
| 104 |
+
- [Docker SDK Reference](https://huggingface.co/docs/hub/spaces-sdks-docker)
|
| 105 |
+
|
app.py
DELETED
|
@@ -1,243 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
HuggingFace Spaces optimized FastAPI application using unified deployment system
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import os
|
| 6 |
-
import time
|
| 7 |
-
import logging
|
| 8 |
-
from fastapi import FastAPI, Request, HTTPException, status
|
| 9 |
-
from fastapi.responses import JSONResponse
|
| 10 |
-
from contextlib import asynccontextmanager
|
| 11 |
-
from typing import Dict, Any
|
| 12 |
-
|
| 13 |
-
# Set HuggingFace Spaces as the deployment platform
|
| 14 |
-
os.environ["DEPLOYMENT_PLATFORM"] = "hf"
|
| 15 |
-
|
| 16 |
-
# Import the unified configuration
|
| 17 |
-
from app_config_unified import DRAGON_CONFIG, load_dragon_model, run_inference, get_model_info, cleanup_model_memory
|
| 18 |
-
|
| 19 |
-
# Configure logging
|
| 20 |
-
logging.basicConfig(level=logging.INFO)
|
| 21 |
-
logger = logging.getLogger(__name__)
|
| 22 |
-
|
| 23 |
-
# Global variable to track startup time
|
| 24 |
-
startup_time = time.time()
|
| 25 |
-
|
| 26 |
-
@asynccontextmanager
|
| 27 |
-
async def lifespan(app: FastAPI):
|
| 28 |
-
"""
|
| 29 |
-
Context manager for managing the lifespan of the FastAPI application.
|
| 30 |
-
Handles model loading and cleanup.
|
| 31 |
-
"""
|
| 32 |
-
logger.info("🚀 Starting up FastAPI application on HuggingFace Spaces...")
|
| 33 |
-
logger.info(f"🏗️ Deployment platform: {DRAGON_CONFIG['platform']}")
|
| 34 |
-
logger.info(f"🏗️ Platform name: {DRAGON_CONFIG['platform_config']['name']}")
|
| 35 |
-
|
| 36 |
-
if not load_dragon_model(DRAGON_CONFIG):
|
| 37 |
-
logger.error("❌ Failed to load Dragon model during startup.")
|
| 38 |
-
# Depending on desired behavior, you might want to raise an exception here
|
| 39 |
-
# to prevent the app from starting if the model is critical.
|
| 40 |
-
# For now, we'll allow the app to start but inference will fail.
|
| 41 |
-
else:
|
| 42 |
-
logger.info("✅ Dragon model loaded successfully during startup.")
|
| 43 |
-
yield
|
| 44 |
-
logger.info("👋 Shutting down FastAPI application...")
|
| 45 |
-
cleanup_model_memory()
|
| 46 |
-
logger.info("✅ Application shutdown complete.")
|
| 47 |
-
|
| 48 |
-
# Create FastAPI app with HuggingFace Spaces specific title and description
|
| 49 |
-
app = FastAPI(
|
| 50 |
-
title="Dragon-3B-Base-alpha Inference API (HuggingFace Spaces)",
|
| 51 |
-
version="1.0.0",
|
| 52 |
-
description="FastAPI endpoint for Dragon-3B-Base-alpha model optimized for HuggingFace Spaces deployment with T4 GPU support.",
|
| 53 |
-
lifespan=lifespan
|
| 54 |
-
)
|
| 55 |
-
|
| 56 |
-
@app.get("/", summary="Root endpoint", tags=["General"])
|
| 57 |
-
async def read_root():
|
| 58 |
-
"""
|
| 59 |
-
Returns basic API information including HuggingFace Spaces specific details.
|
| 60 |
-
"""
|
| 61 |
-
return {
|
| 62 |
-
"message": "Welcome to the Dragon-3B-Base-alpha Inference API (HuggingFace Spaces)!",
|
| 63 |
-
"version": app.version,
|
| 64 |
-
"model_name": DRAGON_CONFIG["display_name"],
|
| 65 |
-
"platform": DRAGON_CONFIG["platform"],
|
| 66 |
-
"platform_name": DRAGON_CONFIG["platform_config"]["name"],
|
| 67 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 68 |
-
"docs_url": "/docs",
|
| 69 |
-
"redoc_url": "/redoc",
|
| 70 |
-
"optimizations": {
|
| 71 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 72 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 73 |
-
"gpu_optimized": True,
|
| 74 |
-
"note": "Optimized for HuggingFace Spaces T4 GPU with standard attention"
|
| 75 |
-
}
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
@app.get("/health", summary="Health check", tags=["Monitoring"])
|
| 79 |
-
async def health_check():
|
| 80 |
-
"""
|
| 81 |
-
Performs a health check on the API and model with HuggingFace Spaces specific information.
|
| 82 |
-
"""
|
| 83 |
-
model_info = get_model_info()
|
| 84 |
-
uptime = time.time() - startup_time
|
| 85 |
-
return {
|
| 86 |
-
"status": "healthy",
|
| 87 |
-
"model_loaded": model_info["model_loaded"],
|
| 88 |
-
"model_name": model_info["model_name"],
|
| 89 |
-
"platform": model_info["platform"],
|
| 90 |
-
"platform_name": model_info["platform_name"],
|
| 91 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 92 |
-
"gpu_available": model_info["gpu_info"]["gpu_available"],
|
| 93 |
-
"gpu_info": model_info["gpu_info"],
|
| 94 |
-
"optimizations": model_info["optimizations"],
|
| 95 |
-
"uptime": uptime,
|
| 96 |
-
"space_info": {
|
| 97 |
-
"deployment_type": "HuggingFace Spaces",
|
| 98 |
-
"gpu_type": "T4",
|
| 99 |
-
"memory_limit": "16GB",
|
| 100 |
-
"auto_sleep": True
|
| 101 |
-
}
|
| 102 |
-
}
|
| 103 |
-
|
| 104 |
-
@app.get("/model/info", summary="Get model information", tags=["Model"])
|
| 105 |
-
async def model_info():
|
| 106 |
-
"""
|
| 107 |
-
Returns detailed information about the loaded model and HuggingFace Spaces optimizations.
|
| 108 |
-
"""
|
| 109 |
-
info = get_model_info()
|
| 110 |
-
info["space_specific"] = {
|
| 111 |
-
"hardware": "T4 GPU",
|
| 112 |
-
"memory_limit": "16GB",
|
| 113 |
-
"auto_sleep_enabled": True,
|
| 114 |
-
"cold_start_time": "~30 seconds",
|
| 115 |
-
"optimization_level": "Basic (no flash-attention)"
|
| 116 |
-
}
|
| 117 |
-
return info
|
| 118 |
-
|
| 119 |
-
@app.get("/platform/info", summary="Get platform information", tags=["Platform"])
|
| 120 |
-
async def platform_info():
|
| 121 |
-
"""
|
| 122 |
-
Returns detailed information about HuggingFace Spaces deployment and its capabilities.
|
| 123 |
-
"""
|
| 124 |
-
return {
|
| 125 |
-
"platform": DRAGON_CONFIG["platform"],
|
| 126 |
-
"platform_name": DRAGON_CONFIG["platform_config"]["name"],
|
| 127 |
-
"deployment_type": "HuggingFace Spaces",
|
| 128 |
-
"capabilities": {
|
| 129 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 130 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 131 |
-
"gpu_acceleration": True,
|
| 132 |
-
"cuda_support": True,
|
| 133 |
-
"auto_scaling": False,
|
| 134 |
-
"persistent_storage": False
|
| 135 |
-
},
|
| 136 |
-
"hardware": {
|
| 137 |
-
"gpu_type": "T4",
|
| 138 |
-
"gpu_memory": "16GB",
|
| 139 |
-
"cpu_cores": "2-4",
|
| 140 |
-
"ram": "16GB"
|
| 141 |
-
},
|
| 142 |
-
"limitations": {
|
| 143 |
-
"auto_sleep": "Spaces sleep after 48 hours of inactivity",
|
| 144 |
-
"cold_start": "~30 second startup time after sleep",
|
| 145 |
-
"flash_attention": "Not supported due to build environment limitations",
|
| 146 |
-
"custom_dependencies": "Limited to basic PyTorch and transformers"
|
| 147 |
-
},
|
| 148 |
-
"performance_notes": {
|
| 149 |
-
"inference_speed": "20-50 tokens/second",
|
| 150 |
-
"memory_usage": "~7GB GPU memory",
|
| 151 |
-
"optimization_level": "Basic (standard attention implementation)"
|
| 152 |
-
}
|
| 153 |
-
}
|
| 154 |
-
|
| 155 |
-
@app.post("/inference", summary="Run inference", tags=["Inference"])
|
| 156 |
-
async def inference_endpoint(request: Request, prompt: str, max_new_tokens: int = 150, temperature: float = 0.6):
|
| 157 |
-
"""
|
| 158 |
-
Runs inference on the Dragon model with HuggingFace Spaces optimizations.
|
| 159 |
-
"""
|
| 160 |
-
if not get_model_info()["model_loaded"]:
|
| 161 |
-
raise HTTPException(
|
| 162 |
-
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
| 163 |
-
detail="Model not loaded. Please check /health endpoint."
|
| 164 |
-
)
|
| 165 |
-
|
| 166 |
-
result = run_inference(prompt, max_new_tokens, temperature)
|
| 167 |
-
|
| 168 |
-
if result["success"]:
|
| 169 |
-
return JSONResponse(content={
|
| 170 |
-
"success": True,
|
| 171 |
-
"response": result["response"],
|
| 172 |
-
"model_name": result["model_name"],
|
| 173 |
-
"platform": result["platform"],
|
| 174 |
-
"inference_time": result["inference_time"],
|
| 175 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 176 |
-
"optimizations_used": {
|
| 177 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 178 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 179 |
-
"note": "Using standard attention implementation optimized for T4 GPU"
|
| 180 |
-
}
|
| 181 |
-
})
|
| 182 |
-
else:
|
| 183 |
-
raise HTTPException(
|
| 184 |
-
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 185 |
-
detail=result["error"]
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
-
@app.get("/performance/benchmark", summary="Performance benchmark", tags=["Performance"])
|
| 189 |
-
async def performance_benchmark():
|
| 190 |
-
"""
|
| 191 |
-
Runs a simple performance benchmark optimized for HuggingFace Spaces T4 GPU.
|
| 192 |
-
"""
|
| 193 |
-
if not get_model_info()["model_loaded"]:
|
| 194 |
-
raise HTTPException(
|
| 195 |
-
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
| 196 |
-
detail="Model not loaded. Please check /health endpoint."
|
| 197 |
-
)
|
| 198 |
-
|
| 199 |
-
# Simple benchmark with a standard prompt
|
| 200 |
-
test_prompt = "The future of artificial intelligence is"
|
| 201 |
-
benchmark_results = []
|
| 202 |
-
|
| 203 |
-
for i in range(3): # Run 3 iterations
|
| 204 |
-
result = run_inference(test_prompt, max_new_tokens=50, temperature=0.7)
|
| 205 |
-
if result["success"]:
|
| 206 |
-
benchmark_results.append({
|
| 207 |
-
"iteration": i + 1,
|
| 208 |
-
"inference_time": result["inference_time"],
|
| 209 |
-
"tokens_generated": 50,
|
| 210 |
-
"tokens_per_second": 50 / result["inference_time"] if result["inference_time"] > 0 else 0
|
| 211 |
-
})
|
| 212 |
-
|
| 213 |
-
if benchmark_results:
|
| 214 |
-
avg_time = sum(r["inference_time"] for r in benchmark_results) / len(benchmark_results)
|
| 215 |
-
avg_tps = sum(r["tokens_per_second"] for r in benchmark_results) / len(benchmark_results)
|
| 216 |
-
|
| 217 |
-
return {
|
| 218 |
-
"platform": DRAGON_CONFIG["platform"],
|
| 219 |
-
"platform_name": DRAGON_CONFIG["platform_config"]["name"],
|
| 220 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 221 |
-
"benchmark_results": benchmark_results,
|
| 222 |
-
"average_inference_time": avg_time,
|
| 223 |
-
"average_tokens_per_second": avg_tps,
|
| 224 |
-
"optimizations": {
|
| 225 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 226 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 227 |
-
"note": "Standard attention implementation on T4 GPU"
|
| 228 |
-
},
|
| 229 |
-
"performance_rating": "Good for demos and testing, suitable for moderate workloads"
|
| 230 |
-
}
|
| 231 |
-
else:
|
| 232 |
-
raise HTTPException(
|
| 233 |
-
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 234 |
-
detail="Benchmark failed - no successful inference runs"
|
| 235 |
-
)
|
| 236 |
-
|
| 237 |
-
# Example of how to run the app locally (for development)
|
| 238 |
-
if __name__ == "__main__":
|
| 239 |
-
import uvicorn
|
| 240 |
-
|
| 241 |
-
# HuggingFace Spaces uses port 7860
|
| 242 |
-
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dragon-3B FastAPI Application
|
| 3 |
+
Clean, simple, production-ready inference API.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
__version__ = "2.0.0"
|
| 7 |
+
|
app/config.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration management using environment variables.
|
| 3 |
+
Simple, explicit, no magic platform detection.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Optional
|
| 9 |
+
import torch
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class ModelConfig:
|
| 14 |
+
"""Model configuration."""
|
| 15 |
+
model_id: str = "DragonLLM/Dragon-3B-Base-alpha"
|
| 16 |
+
hf_token: Optional[str] = None
|
| 17 |
+
torch_dtype: str = "bfloat16" # or "float16", "float32"
|
| 18 |
+
device_map: str = "auto" # or "cuda", "cpu", "mps"
|
| 19 |
+
trust_remote_code: bool = True
|
| 20 |
+
low_cpu_mem_usage: bool = True
|
| 21 |
+
|
| 22 |
+
def __post_init__(self):
|
| 23 |
+
"""Get token from environment if not provided."""
|
| 24 |
+
if self.hf_token is None:
|
| 25 |
+
# Just need HF_TOKEN to download the model from HuggingFace
|
| 26 |
+
self.hf_token = os.getenv("HF_TOKEN")
|
| 27 |
+
|
| 28 |
+
if not self.hf_token:
|
| 29 |
+
raise ValueError(
|
| 30 |
+
"HuggingFace token required to download Dragon-3B model. "
|
| 31 |
+
"Set HF_TOKEN environment variable"
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
|
| 36 |
+
class GenerationConfig:
|
| 37 |
+
"""Text generation parameters."""
|
| 38 |
+
max_new_tokens: int = 150
|
| 39 |
+
temperature: float = 0.7
|
| 40 |
+
top_p: float = 0.9
|
| 41 |
+
do_sample: bool = True
|
| 42 |
+
repetition_penalty: float = 1.05
|
| 43 |
+
|
| 44 |
+
def to_dict(self):
|
| 45 |
+
"""Convert to dictionary for model.generate()."""
|
| 46 |
+
return {
|
| 47 |
+
"max_new_tokens": self.max_new_tokens,
|
| 48 |
+
"temperature": self.temperature,
|
| 49 |
+
"top_p": self.top_p,
|
| 50 |
+
"do_sample": self.do_sample,
|
| 51 |
+
"repetition_penalty": self.repetition_penalty,
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass
|
| 56 |
+
class AppConfig:
|
| 57 |
+
"""Application configuration."""
|
| 58 |
+
host: str = "0.0.0.0"
|
| 59 |
+
port: int = int(os.getenv("PORT", "8000"))
|
| 60 |
+
log_level: str = os.getenv("LOG_LEVEL", "INFO")
|
| 61 |
+
|
| 62 |
+
# Model paths
|
| 63 |
+
cache_dir: Optional[str] = os.getenv("HF_HOME")
|
| 64 |
+
|
| 65 |
+
# API settings
|
| 66 |
+
max_concurrent_requests: int = int(os.getenv("MAX_CONCURRENT", "1"))
|
| 67 |
+
|
| 68 |
+
# Optimizations
|
| 69 |
+
use_flash_linear_attention: bool = True # Auto-detect if available
|
| 70 |
+
use_quantization: bool = os.getenv("USE_QUANTIZATION", "false").lower() == "true"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def get_torch_dtype(dtype_str: str):
|
| 74 |
+
"""Convert string to torch dtype."""
|
| 75 |
+
dtype_map = {
|
| 76 |
+
"bfloat16": torch.bfloat16,
|
| 77 |
+
"float16": torch.float16,
|
| 78 |
+
"float32": torch.float32,
|
| 79 |
+
"auto": torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
|
| 80 |
+
}
|
| 81 |
+
return dtype_map.get(dtype_str, torch.bfloat16)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def detect_device():
|
| 85 |
+
"""Auto-detect best available device."""
|
| 86 |
+
if torch.cuda.is_available():
|
| 87 |
+
return "cuda"
|
| 88 |
+
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
| 89 |
+
return "mps"
|
| 90 |
+
else:
|
| 91 |
+
return "cpu"
|
| 92 |
+
|
app/main.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dragon-3B FastAPI Application
|
| 3 |
+
Clean, simple, production-ready.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python -m app.main
|
| 7 |
+
# or
|
| 8 |
+
uvicorn app.main:app --host 0.0.0.0 --port 8000
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from contextlib import asynccontextmanager
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
from fastapi import FastAPI, HTTPException
|
| 16 |
+
from pydantic import BaseModel, Field
|
| 17 |
+
|
| 18 |
+
from .config import ModelConfig, GenerationConfig, AppConfig
|
| 19 |
+
from .model import DragonModel
|
| 20 |
+
|
| 21 |
+
# Configure logging
|
| 22 |
+
logging.basicConfig(
|
| 23 |
+
level=logging.INFO,
|
| 24 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 25 |
+
)
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
# Global model instance
|
| 29 |
+
dragon_model: Optional[DragonModel] = None
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@asynccontextmanager
|
| 33 |
+
async def lifespan(app: FastAPI):
|
| 34 |
+
"""Manage model lifecycle - load on startup, unload on shutdown."""
|
| 35 |
+
global dragon_model
|
| 36 |
+
|
| 37 |
+
logger.info("🚀 Starting Dragon-3B API...")
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
# Load configuration
|
| 41 |
+
model_config = ModelConfig()
|
| 42 |
+
app_config = AppConfig()
|
| 43 |
+
|
| 44 |
+
# Initialize and load model
|
| 45 |
+
dragon_model = DragonModel(model_config)
|
| 46 |
+
dragon_model.load(
|
| 47 |
+
cache_dir=app_config.cache_dir,
|
| 48 |
+
use_quantization=app_config.use_quantization
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
logger.info("✅ Dragon-3B API ready!")
|
| 52 |
+
|
| 53 |
+
except Exception as e:
|
| 54 |
+
logger.error(f"❌ Failed to load model: {e}")
|
| 55 |
+
raise
|
| 56 |
+
|
| 57 |
+
yield
|
| 58 |
+
|
| 59 |
+
# Cleanup
|
| 60 |
+
if dragon_model:
|
| 61 |
+
dragon_model.unload()
|
| 62 |
+
logger.info("👋 Dragon-3B API shutdown complete")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Create FastAPI app
|
| 66 |
+
app = FastAPI(
|
| 67 |
+
title="Dragon-3B Inference API",
|
| 68 |
+
description="Clean FastAPI inference server for Dragon-3B-Base-alpha (Gated DeltaNet)",
|
| 69 |
+
version="2.0.0",
|
| 70 |
+
lifespan=lifespan
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Request/Response models
|
| 75 |
+
class GenerateRequest(BaseModel):
|
| 76 |
+
"""Request model for text generation."""
|
| 77 |
+
prompt: str = Field(..., description="Input text prompt", min_length=1)
|
| 78 |
+
max_new_tokens: Optional[int] = Field(150, description="Maximum tokens to generate", ge=1, le=2048)
|
| 79 |
+
temperature: Optional[float] = Field(0.7, description="Sampling temperature", ge=0.0, le=2.0)
|
| 80 |
+
top_p: Optional[float] = Field(0.9, description="Nucleus sampling threshold", ge=0.0, le=1.0)
|
| 81 |
+
|
| 82 |
+
class Config:
|
| 83 |
+
json_schema_extra = {
|
| 84 |
+
"example": {
|
| 85 |
+
"prompt": "The future of artificial intelligence is",
|
| 86 |
+
"max_new_tokens": 150,
|
| 87 |
+
"temperature": 0.7,
|
| 88 |
+
"top_p": 0.9
|
| 89 |
+
}
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class GenerateResponse(BaseModel):
|
| 94 |
+
"""Response model for text generation."""
|
| 95 |
+
generated_text: str
|
| 96 |
+
prompt: str
|
| 97 |
+
generation_time: float
|
| 98 |
+
num_tokens: int
|
| 99 |
+
tokens_per_sec: float
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# API Endpoints
|
| 103 |
+
|
| 104 |
+
@app.get("/")
|
| 105 |
+
async def root():
|
| 106 |
+
"""Root endpoint with API information."""
|
| 107 |
+
return {
|
| 108 |
+
"name": "Dragon-3B Inference API",
|
| 109 |
+
"version": "2.0.0",
|
| 110 |
+
"model": "DragonLLM/Dragon-3B-Base-alpha",
|
| 111 |
+
"architecture": "Gated DeltaNet (Linear Attention)",
|
| 112 |
+
"docs": "/docs",
|
| 113 |
+
"health": "/health"
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@app.get("/health")
|
| 118 |
+
async def health():
|
| 119 |
+
"""Health check endpoint."""
|
| 120 |
+
if dragon_model is None or not dragon_model._loaded:
|
| 121 |
+
raise HTTPException(status_code=503, detail="Model not loaded")
|
| 122 |
+
|
| 123 |
+
info = dragon_model.get_info()
|
| 124 |
+
|
| 125 |
+
return {
|
| 126 |
+
"status": "healthy",
|
| 127 |
+
"model": info["model_id"],
|
| 128 |
+
"device": info["device"]["device"],
|
| 129 |
+
"optimizations": info["optimizations"]
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
@app.get("/info")
|
| 134 |
+
async def info():
|
| 135 |
+
"""Detailed model and system information."""
|
| 136 |
+
if dragon_model is None:
|
| 137 |
+
raise HTTPException(status_code=503, detail="Model not loaded")
|
| 138 |
+
|
| 139 |
+
return dragon_model.get_info()
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
@app.post("/generate", response_model=GenerateResponse)
|
| 143 |
+
async def generate(request: GenerateRequest):
|
| 144 |
+
"""
|
| 145 |
+
Generate text from a prompt.
|
| 146 |
+
|
| 147 |
+
This endpoint uses the Dragon-3B model optimized with Gated DeltaNet
|
| 148 |
+
(linear attention). For best performance, ensure flash-linear-attention
|
| 149 |
+
is installed.
|
| 150 |
+
"""
|
| 151 |
+
if dragon_model is None or not dragon_model._loaded:
|
| 152 |
+
raise HTTPException(status_code=503, detail="Model not loaded")
|
| 153 |
+
|
| 154 |
+
try:
|
| 155 |
+
# Create generation config from request
|
| 156 |
+
gen_config = GenerationConfig(
|
| 157 |
+
max_new_tokens=request.max_new_tokens,
|
| 158 |
+
temperature=request.temperature,
|
| 159 |
+
top_p=request.top_p
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
# Generate
|
| 163 |
+
result = dragon_model.generate(request.prompt, gen_config)
|
| 164 |
+
|
| 165 |
+
return GenerateResponse(**result)
|
| 166 |
+
|
| 167 |
+
except Exception as e:
|
| 168 |
+
logger.error(f"Generation failed: {e}")
|
| 169 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# Run with uvicorn
|
| 173 |
+
if __name__ == "__main__":
|
| 174 |
+
import uvicorn
|
| 175 |
+
config = AppConfig()
|
| 176 |
+
|
| 177 |
+
uvicorn.run(
|
| 178 |
+
"app.main:app",
|
| 179 |
+
host=config.host,
|
| 180 |
+
port=config.port,
|
| 181 |
+
log_level=config.log_level.lower()
|
| 182 |
+
)
|
| 183 |
+
|
app/model.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model loading and inference.
|
| 3 |
+
Clean, simple, no global state.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import time
|
| 8 |
+
from typing import Optional, Dict, Any
|
| 9 |
+
import torch
|
| 10 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 11 |
+
|
| 12 |
+
from .config import ModelConfig, GenerationConfig, get_torch_dtype
|
| 13 |
+
from .optimization import detect_optimizations, get_model_kwargs, get_device_info
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DragonModel:
|
| 19 |
+
"""
|
| 20 |
+
Dragon-3B model wrapper for inference.
|
| 21 |
+
Handles loading, optimization detection, and text generation.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, config: ModelConfig):
|
| 25 |
+
"""
|
| 26 |
+
Initialize model (doesn't load yet - call load() explicitly).
|
| 27 |
+
|
| 28 |
+
Args:
|
| 29 |
+
config: Model configuration
|
| 30 |
+
"""
|
| 31 |
+
self.config = config
|
| 32 |
+
self.model = None
|
| 33 |
+
self.tokenizer = None
|
| 34 |
+
self.optimizations = {}
|
| 35 |
+
self.device_info = {}
|
| 36 |
+
self._loaded = False
|
| 37 |
+
|
| 38 |
+
def load(self, cache_dir: Optional[str] = None, use_quantization: bool = False):
|
| 39 |
+
"""
|
| 40 |
+
Load model and tokenizer.
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
cache_dir: Optional cache directory
|
| 44 |
+
use_quantization: Whether to use 8-bit quantization
|
| 45 |
+
"""
|
| 46 |
+
if self._loaded:
|
| 47 |
+
logger.info("Model already loaded")
|
| 48 |
+
return
|
| 49 |
+
|
| 50 |
+
logger.info(f"🐉 Loading {self.config.model_id}...")
|
| 51 |
+
|
| 52 |
+
# Detect available optimizations
|
| 53 |
+
self.optimizations = detect_optimizations()
|
| 54 |
+
self.device_info = get_device_info()
|
| 55 |
+
|
| 56 |
+
# Log device info
|
| 57 |
+
if self.device_info["device"] == "cuda":
|
| 58 |
+
logger.info(
|
| 59 |
+
f"⚡ GPU: {self.device_info['gpu_name']} "
|
| 60 |
+
f"({self.device_info['gpu_memory_gb']:.1f} GB)"
|
| 61 |
+
)
|
| 62 |
+
elif self.device_info["device"] == "mps":
|
| 63 |
+
logger.info("⚡ Device: Apple Silicon (MPS)")
|
| 64 |
+
else:
|
| 65 |
+
logger.info("⚡ Device: CPU")
|
| 66 |
+
|
| 67 |
+
# Get torch dtype
|
| 68 |
+
torch_dtype = get_torch_dtype(self.config.torch_dtype)
|
| 69 |
+
logger.info(f"🔧 Using dtype: {torch_dtype}")
|
| 70 |
+
|
| 71 |
+
# Get model kwargs with optimizations
|
| 72 |
+
model_kwargs = get_model_kwargs(
|
| 73 |
+
torch_dtype=torch_dtype,
|
| 74 |
+
device_map=self.config.device_map,
|
| 75 |
+
use_quantization=use_quantization,
|
| 76 |
+
cache_dir=cache_dir
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
# Load tokenizer
|
| 80 |
+
logger.info("📚 Loading tokenizer...")
|
| 81 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 82 |
+
self.config.model_id,
|
| 83 |
+
token=self.config.hf_token,
|
| 84 |
+
trust_remote_code=True,
|
| 85 |
+
cache_dir=cache_dir
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
# Load model
|
| 89 |
+
logger.info("🚀 Loading model...")
|
| 90 |
+
start_time = time.time()
|
| 91 |
+
|
| 92 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
| 93 |
+
self.config.model_id,
|
| 94 |
+
token=self.config.hf_token,
|
| 95 |
+
**model_kwargs
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
load_time = time.time() - start_time
|
| 99 |
+
logger.info(f"✅ Model loaded in {load_time:.2f}s")
|
| 100 |
+
|
| 101 |
+
self._loaded = True
|
| 102 |
+
self.model.eval() # Set to eval mode
|
| 103 |
+
|
| 104 |
+
def generate(
|
| 105 |
+
self,
|
| 106 |
+
prompt: str,
|
| 107 |
+
generation_config: Optional[GenerationConfig] = None
|
| 108 |
+
) -> Dict[str, Any]:
|
| 109 |
+
"""
|
| 110 |
+
Generate text from prompt.
|
| 111 |
+
|
| 112 |
+
Args:
|
| 113 |
+
prompt: Input text prompt
|
| 114 |
+
generation_config: Generation parameters (uses defaults if None)
|
| 115 |
+
|
| 116 |
+
Returns:
|
| 117 |
+
Dictionary with generated text, timing, and metadata
|
| 118 |
+
"""
|
| 119 |
+
if not self._loaded:
|
| 120 |
+
raise RuntimeError("Model not loaded. Call load() first.")
|
| 121 |
+
|
| 122 |
+
if generation_config is None:
|
| 123 |
+
generation_config = GenerationConfig()
|
| 124 |
+
|
| 125 |
+
# Tokenize
|
| 126 |
+
inputs = self.tokenizer(prompt, return_tensors="pt")
|
| 127 |
+
|
| 128 |
+
# Move to device
|
| 129 |
+
if self.model.device.type != "cpu":
|
| 130 |
+
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
|
| 131 |
+
|
| 132 |
+
# Generate
|
| 133 |
+
start_time = time.time()
|
| 134 |
+
|
| 135 |
+
with torch.no_grad():
|
| 136 |
+
outputs = self.model.generate(
|
| 137 |
+
**inputs,
|
| 138 |
+
**generation_config.to_dict(),
|
| 139 |
+
pad_token_id=self.tokenizer.eos_token_id
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
generation_time = time.time() - start_time
|
| 143 |
+
|
| 144 |
+
# Decode
|
| 145 |
+
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 146 |
+
|
| 147 |
+
# Remove prompt from output
|
| 148 |
+
if generated_text.startswith(prompt):
|
| 149 |
+
generated_text = generated_text[len(prompt):].strip()
|
| 150 |
+
|
| 151 |
+
# Calculate tokens/sec
|
| 152 |
+
num_tokens = len(outputs[0]) - len(inputs["input_ids"][0])
|
| 153 |
+
tokens_per_sec = num_tokens / generation_time if generation_time > 0 else 0
|
| 154 |
+
|
| 155 |
+
return {
|
| 156 |
+
"generated_text": generated_text,
|
| 157 |
+
"prompt": prompt,
|
| 158 |
+
"generation_time": generation_time,
|
| 159 |
+
"num_tokens": num_tokens,
|
| 160 |
+
"tokens_per_sec": tokens_per_sec,
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
def get_info(self) -> Dict[str, Any]:
|
| 164 |
+
"""Get model and system information."""
|
| 165 |
+
return {
|
| 166 |
+
"model_id": self.config.model_id,
|
| 167 |
+
"loaded": self._loaded,
|
| 168 |
+
"device": self.device_info,
|
| 169 |
+
"optimizations": self.optimizations,
|
| 170 |
+
"torch_dtype": str(self.config.torch_dtype),
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
def unload(self):
|
| 174 |
+
"""Unload model from memory."""
|
| 175 |
+
if self.model is not None:
|
| 176 |
+
del self.model
|
| 177 |
+
if self.tokenizer is not None:
|
| 178 |
+
del self.tokenizer
|
| 179 |
+
|
| 180 |
+
torch.cuda.empty_cache()
|
| 181 |
+
self._loaded = False
|
| 182 |
+
logger.info("🧹 Model unloaded")
|
| 183 |
+
|
app/optimization.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Auto-detect and configure performance optimizations.
|
| 3 |
+
Uses flash-linear-attention when available (3-4x speedup).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Dict, Any, Optional
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def detect_optimizations() -> Dict[str, bool]:
|
| 13 |
+
"""
|
| 14 |
+
Detect available optimization packages.
|
| 15 |
+
|
| 16 |
+
Returns:
|
| 17 |
+
Dictionary of optimization name -> available (bool)
|
| 18 |
+
"""
|
| 19 |
+
optimizations = {
|
| 20 |
+
"flash_linear_attention": False,
|
| 21 |
+
"flash_attention": False,
|
| 22 |
+
"causal_conv1d": False,
|
| 23 |
+
"bitsandbytes": False
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
# Check flash-linear-attention (CRITICAL for Dragon-3B)
|
| 27 |
+
try:
|
| 28 |
+
import flash_linear_attention
|
| 29 |
+
optimizations["flash_linear_attention"] = True
|
| 30 |
+
logger.info("✅ flash-linear-attention detected (3-4x speedup enabled)")
|
| 31 |
+
except ImportError:
|
| 32 |
+
logger.warning(
|
| 33 |
+
"⚠️ flash-linear-attention not found - install for 3-4x speedup:\n"
|
| 34 |
+
" pip install flash-linear-attention"
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# Check flash-attention (minimal benefit for Dragon-3B, but check anyway)
|
| 38 |
+
try:
|
| 39 |
+
import flash_attn
|
| 40 |
+
optimizations["flash_attention"] = True
|
| 41 |
+
logger.info("✅ flash-attention detected")
|
| 42 |
+
except ImportError:
|
| 43 |
+
pass
|
| 44 |
+
|
| 45 |
+
# Check causal-conv1d (helpful, ~20% gain)
|
| 46 |
+
try:
|
| 47 |
+
import causal_conv1d
|
| 48 |
+
optimizations["causal_conv1d"] = True
|
| 49 |
+
logger.info("✅ causal-conv1d detected")
|
| 50 |
+
except ImportError:
|
| 51 |
+
pass
|
| 52 |
+
|
| 53 |
+
# Check bitsandbytes for quantization
|
| 54 |
+
try:
|
| 55 |
+
import bitsandbytes
|
| 56 |
+
optimizations["bitsandbytes"] = True
|
| 57 |
+
logger.info("✅ bitsandbytes detected (quantization available)")
|
| 58 |
+
except ImportError:
|
| 59 |
+
pass
|
| 60 |
+
|
| 61 |
+
return optimizations
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def get_model_kwargs(
|
| 65 |
+
torch_dtype,
|
| 66 |
+
device_map: str,
|
| 67 |
+
use_quantization: bool = False,
|
| 68 |
+
cache_dir: Optional[str] = None
|
| 69 |
+
) -> Dict[str, Any]:
|
| 70 |
+
"""
|
| 71 |
+
Get model loading kwargs with optimizations.
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
torch_dtype: Torch dtype for model weights
|
| 75 |
+
device_map: Device placement strategy
|
| 76 |
+
use_quantization: Whether to use 8-bit quantization
|
| 77 |
+
cache_dir: Cache directory for model files
|
| 78 |
+
|
| 79 |
+
Returns:
|
| 80 |
+
Dictionary of kwargs for AutoModelForCausalLM.from_pretrained()
|
| 81 |
+
"""
|
| 82 |
+
import torch
|
| 83 |
+
|
| 84 |
+
kwargs = {
|
| 85 |
+
"torch_dtype": torch_dtype,
|
| 86 |
+
"device_map": device_map,
|
| 87 |
+
"trust_remote_code": True,
|
| 88 |
+
"low_cpu_mem_usage": True,
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
if cache_dir:
|
| 92 |
+
kwargs["cache_dir"] = cache_dir
|
| 93 |
+
|
| 94 |
+
# Add quantization config if requested and available
|
| 95 |
+
if use_quantization and torch.cuda.is_available():
|
| 96 |
+
try:
|
| 97 |
+
from transformers import BitsAndBytesConfig
|
| 98 |
+
kwargs["quantization_config"] = BitsAndBytesConfig(
|
| 99 |
+
load_in_8bit=True,
|
| 100 |
+
llm_int8_threshold=6.0,
|
| 101 |
+
)
|
| 102 |
+
logger.info("✅ Using 8-bit quantization")
|
| 103 |
+
except Exception as e:
|
| 104 |
+
logger.warning(f"⚠️ Quantization requested but failed: {e}")
|
| 105 |
+
|
| 106 |
+
return kwargs
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def get_device_info() -> Dict[str, Any]:
|
| 110 |
+
"""Get information about available compute devices."""
|
| 111 |
+
import torch
|
| 112 |
+
|
| 113 |
+
info = {
|
| 114 |
+
"cuda_available": torch.cuda.is_available(),
|
| 115 |
+
"mps_available": hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(),
|
| 116 |
+
"device": "cpu"
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
if info["cuda_available"]:
|
| 120 |
+
info["device"] = "cuda"
|
| 121 |
+
info["gpu_name"] = torch.cuda.get_device_name(0)
|
| 122 |
+
info["gpu_memory_gb"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)
|
| 123 |
+
info["cuda_version"] = torch.version.cuda
|
| 124 |
+
elif info["mps_available"]:
|
| 125 |
+
info["device"] = "mps"
|
| 126 |
+
info["gpu_name"] = "Apple Silicon"
|
| 127 |
+
|
| 128 |
+
return info
|
| 129 |
+
|
app_config_unified.py
DELETED
|
@@ -1,375 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Unified Dragon-3B configuration that adapts to different deployment platforms
|
| 3 |
-
Supports HuggingFace Spaces, Koyeb, and Scaleway with platform-specific optimizations
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
import torch
|
| 8 |
-
import gc
|
| 9 |
-
import time
|
| 10 |
-
import logging
|
| 11 |
-
from typing import Dict, Any, Optional
|
| 12 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
|
| 13 |
-
from huggingface_hub import login
|
| 14 |
-
|
| 15 |
-
logger = logging.getLogger(__name__)
|
| 16 |
-
|
| 17 |
-
# Global variables for model and tokenizer
|
| 18 |
-
model = None
|
| 19 |
-
tokenizer = None
|
| 20 |
-
pipe = None
|
| 21 |
-
model_loaded = False
|
| 22 |
-
current_model_name = None
|
| 23 |
-
|
| 24 |
-
# Detect deployment platform
|
| 25 |
-
DEPLOYMENT_PLATFORM = os.getenv("DEPLOYMENT_PLATFORM", "hf").lower()
|
| 26 |
-
|
| 27 |
-
# Platform-specific configurations
|
| 28 |
-
PLATFORM_CONFIGS = {
|
| 29 |
-
"hf": {
|
| 30 |
-
"name": "HuggingFace Spaces",
|
| 31 |
-
"supports_flash_attention": False,
|
| 32 |
-
"supports_quantization": False,
|
| 33 |
-
"default_dtype": "bfloat16",
|
| 34 |
-
"default_device_map": "auto",
|
| 35 |
-
"max_memory_usage": 0.8
|
| 36 |
-
},
|
| 37 |
-
"koyeb": {
|
| 38 |
-
"name": "Koyeb",
|
| 39 |
-
"supports_flash_attention": True,
|
| 40 |
-
"supports_quantization": True,
|
| 41 |
-
"default_dtype": "bfloat16",
|
| 42 |
-
"default_device_map": "auto",
|
| 43 |
-
"max_memory_usage": 0.9
|
| 44 |
-
},
|
| 45 |
-
"scw": {
|
| 46 |
-
"name": "Scaleway",
|
| 47 |
-
"supports_flash_attention": True,
|
| 48 |
-
"supports_quantization": True,
|
| 49 |
-
"default_dtype": "bfloat16",
|
| 50 |
-
"default_device_map": "auto",
|
| 51 |
-
"max_memory_usage": 0.9
|
| 52 |
-
}
|
| 53 |
-
}
|
| 54 |
-
|
| 55 |
-
# Get current platform configuration
|
| 56 |
-
current_platform = PLATFORM_CONFIGS.get(DEPLOYMENT_PLATFORM, PLATFORM_CONFIGS["hf"])
|
| 57 |
-
|
| 58 |
-
# Dragon configuration with platform-specific optimizations
|
| 59 |
-
DRAGON_CONFIG = {
|
| 60 |
-
"model_id": "DragonLLM/Dragon-3B-Base-alpha",
|
| 61 |
-
"display_name": f"Dragon-3B-Base-alpha ({current_platform['name']})",
|
| 62 |
-
"architecture": "DragonForCausalLM",
|
| 63 |
-
"platform": DEPLOYMENT_PLATFORM,
|
| 64 |
-
"platform_config": current_platform,
|
| 65 |
-
"tokenizer": {
|
| 66 |
-
"eos_token": "<|endoftext|>",
|
| 67 |
-
"bos_token": "<|beginoftext|>",
|
| 68 |
-
"pad_token": "<|pad|>",
|
| 69 |
-
"unk_token": "<|unk|>",
|
| 70 |
-
"eos_token_id": 0,
|
| 71 |
-
"bos_token_id": 0,
|
| 72 |
-
"pad_token_id": 0,
|
| 73 |
-
"eot_token_id": 0,
|
| 74 |
-
"vocab_size": 196736,
|
| 75 |
-
"model_max_length": 8192
|
| 76 |
-
},
|
| 77 |
-
"generation": {
|
| 78 |
-
"eos_tokens": [0],
|
| 79 |
-
"bos_token_id": 0,
|
| 80 |
-
"temperature": 0.6,
|
| 81 |
-
"top_p": 0.9,
|
| 82 |
-
"max_new_tokens": 150,
|
| 83 |
-
"repetition_penalty": 1.05,
|
| 84 |
-
"no_repeat_ngram_size": 2,
|
| 85 |
-
"early_stopping": False,
|
| 86 |
-
"min_length": 50,
|
| 87 |
-
"do_sample": True,
|
| 88 |
-
"use_cache": True,
|
| 89 |
-
"pad_token_id": 0
|
| 90 |
-
}
|
| 91 |
-
}
|
| 92 |
-
|
| 93 |
-
def cleanup_model_memory():
|
| 94 |
-
"""Cleans up model from GPU memory."""
|
| 95 |
-
global model, tokenizer, pipe, model_loaded, current_model_name
|
| 96 |
-
if model is not None:
|
| 97 |
-
logger.info("🧹 Cleaning up model memory...")
|
| 98 |
-
del model
|
| 99 |
-
if tokenizer is not None:
|
| 100 |
-
del tokenizer
|
| 101 |
-
if pipe is not None:
|
| 102 |
-
del pipe
|
| 103 |
-
torch.cuda.empty_cache()
|
| 104 |
-
gc.collect()
|
| 105 |
-
model = None
|
| 106 |
-
tokenizer = None
|
| 107 |
-
pipe = None
|
| 108 |
-
model_loaded = False
|
| 109 |
-
current_model_name = None
|
| 110 |
-
logger.info("✅ Model memory cleaned")
|
| 111 |
-
|
| 112 |
-
def get_optimal_dtype():
|
| 113 |
-
"""Get optimal dtype based on platform and hardware."""
|
| 114 |
-
if torch.cuda.is_available():
|
| 115 |
-
if current_platform["default_dtype"] == "bfloat16" and torch.cuda.is_bf16_supported():
|
| 116 |
-
return torch.bfloat16
|
| 117 |
-
elif torch.cuda.is_fp16_supported():
|
| 118 |
-
return torch.float16
|
| 119 |
-
else:
|
| 120 |
-
return torch.float32
|
| 121 |
-
else:
|
| 122 |
-
return torch.float32
|
| 123 |
-
|
| 124 |
-
def get_attention_implementation():
|
| 125 |
-
"""Get optimal attention implementation based on platform support."""
|
| 126 |
-
if not torch.cuda.is_available():
|
| 127 |
-
return "eager"
|
| 128 |
-
|
| 129 |
-
if current_platform["supports_flash_attention"]:
|
| 130 |
-
try:
|
| 131 |
-
# Try to import flash attention
|
| 132 |
-
import flash_attn
|
| 133 |
-
return "flash_attention_2"
|
| 134 |
-
except ImportError:
|
| 135 |
-
logger.warning("⚠️ Flash attention not available, using eager attention")
|
| 136 |
-
return "eager"
|
| 137 |
-
else:
|
| 138 |
-
return "eager"
|
| 139 |
-
|
| 140 |
-
def get_quantization_config():
|
| 141 |
-
"""Get quantization configuration based on platform support."""
|
| 142 |
-
if not current_platform["supports_quantization"] or not torch.cuda.is_available():
|
| 143 |
-
return None
|
| 144 |
-
|
| 145 |
-
try:
|
| 146 |
-
return BitsAndBytesConfig(
|
| 147 |
-
load_in_8bit=True,
|
| 148 |
-
llm_int8_threshold=6.0,
|
| 149 |
-
llm_int8_skip_modules=["lm_head", "embed_tokens"],
|
| 150 |
-
llm_int8_has_fp16_weight=False,
|
| 151 |
-
)
|
| 152 |
-
except Exception as e:
|
| 153 |
-
logger.warning(f"⚠️ Quantization not available: {e}")
|
| 154 |
-
return None
|
| 155 |
-
|
| 156 |
-
def load_dragon_model(model_config: Dict[str, Any]) -> bool:
|
| 157 |
-
"""Loads the Dragon model with platform-specific optimizations."""
|
| 158 |
-
global model, tokenizer, pipe, model_loaded, current_model_name
|
| 159 |
-
|
| 160 |
-
if model_loaded and current_model_name == model_config["display_name"]:
|
| 161 |
-
logger.info(f"✅ Model '{current_model_name}' already loaded.")
|
| 162 |
-
return True
|
| 163 |
-
|
| 164 |
-
cleanup_model_memory()
|
| 165 |
-
|
| 166 |
-
hf_token_dragon = os.getenv("HF_TOKEN_DRAGON")
|
| 167 |
-
model_id = model_config["model_id"]
|
| 168 |
-
|
| 169 |
-
if not hf_token_dragon:
|
| 170 |
-
logger.error("❌ HF_TOKEN_DRAGON not found in environment")
|
| 171 |
-
return False
|
| 172 |
-
|
| 173 |
-
try:
|
| 174 |
-
logger.info(f"🐉 Initializing {model_config['display_name']} model...")
|
| 175 |
-
logger.info(f"🏗️ Platform: {current_platform['name']}")
|
| 176 |
-
login(token=hf_token_dragon, add_to_git_credential=False)
|
| 177 |
-
logger.info("✅ Authenticated with HuggingFace")
|
| 178 |
-
|
| 179 |
-
# Check CUDA availability and GPU info
|
| 180 |
-
if torch.cuda.is_available():
|
| 181 |
-
gpu_name = torch.cuda.get_device_name(0)
|
| 182 |
-
gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
|
| 183 |
-
logger.info(f"🚀 Loading {model_id} with CUDA support...")
|
| 184 |
-
logger.info(f"⚡ GPU: {gpu_name} ({gpu_memory:.1f} GB VRAM)")
|
| 185 |
-
|
| 186 |
-
torch_dtype = get_optimal_dtype()
|
| 187 |
-
device_map = current_platform["default_device_map"]
|
| 188 |
-
quantization_config = get_quantization_config()
|
| 189 |
-
attn_implementation = get_attention_implementation()
|
| 190 |
-
|
| 191 |
-
logger.info(f"🔧 Using dtype: {torch_dtype}")
|
| 192 |
-
logger.info(f"🔧 Using attention: {attn_implementation}")
|
| 193 |
-
logger.info(f"🔧 Using quantization: {'Yes' if quantization_config else 'No'}")
|
| 194 |
-
else:
|
| 195 |
-
torch_dtype = torch.float32
|
| 196 |
-
device_map = None
|
| 197 |
-
quantization_config = None
|
| 198 |
-
attn_implementation = "eager"
|
| 199 |
-
logger.warning("⚠️ CUDA not available, falling back to CPU with float32")
|
| 200 |
-
|
| 201 |
-
hf_home = os.getenv("HF_HOME")
|
| 202 |
-
if hf_home:
|
| 203 |
-
logger.info(f"📁 Using HF_HOME cache: {hf_home}")
|
| 204 |
-
else:
|
| 205 |
-
logger.info("📁 Using default HF cache location")
|
| 206 |
-
|
| 207 |
-
# Load tokenizer
|
| 208 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
| 209 |
-
model_id,
|
| 210 |
-
token=hf_token_dragon,
|
| 211 |
-
trust_remote_code=True,
|
| 212 |
-
cache_dir=hf_home if hf_home else None
|
| 213 |
-
)
|
| 214 |
-
|
| 215 |
-
# Load model with platform-specific optimizations
|
| 216 |
-
logger.info("🚀 Loading model with platform-specific optimizations...")
|
| 217 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 218 |
-
model_id,
|
| 219 |
-
token=hf_token_dragon,
|
| 220 |
-
dtype=torch_dtype,
|
| 221 |
-
device_map=device_map,
|
| 222 |
-
trust_remote_code=True,
|
| 223 |
-
low_cpu_mem_usage=True,
|
| 224 |
-
cache_dir=hf_home if hf_home else None,
|
| 225 |
-
attn_implementation=attn_implementation,
|
| 226 |
-
quantization_config=quantization_config
|
| 227 |
-
)
|
| 228 |
-
logger.info("✅ Model loaded successfully!")
|
| 229 |
-
|
| 230 |
-
# Create pipeline
|
| 231 |
-
if device_map == "auto":
|
| 232 |
-
pipe = pipeline(
|
| 233 |
-
"text-generation",
|
| 234 |
-
model=model,
|
| 235 |
-
tokenizer=tokenizer,
|
| 236 |
-
dtype=torch_dtype
|
| 237 |
-
)
|
| 238 |
-
else:
|
| 239 |
-
pipe = pipeline(
|
| 240 |
-
"text-generation",
|
| 241 |
-
model=model,
|
| 242 |
-
tokenizer=tokenizer,
|
| 243 |
-
dtype=torch_dtype,
|
| 244 |
-
device=-1
|
| 245 |
-
)
|
| 246 |
-
|
| 247 |
-
model_loaded = True
|
| 248 |
-
current_model_name = model_config["display_name"]
|
| 249 |
-
device_name = "CUDA" if torch.cuda.is_available() else "CPU"
|
| 250 |
-
logger.info(f"✅ Dragon model loaded successfully with {device_name} on {current_platform['name']}!")
|
| 251 |
-
return True
|
| 252 |
-
|
| 253 |
-
except Exception as e:
|
| 254 |
-
logger.error(f"❌ Failed to load model: {e}")
|
| 255 |
-
cleanup_model_memory()
|
| 256 |
-
return False
|
| 257 |
-
|
| 258 |
-
def run_inference(prompt: str, max_new_tokens: int = 150, temperature: float = 0.6) -> Dict[str, Any]:
|
| 259 |
-
"""Run inference with the loaded model."""
|
| 260 |
-
global pipe, model, tokenizer, model_loaded, current_model_name
|
| 261 |
-
|
| 262 |
-
if not model_loaded or pipe is None:
|
| 263 |
-
return {
|
| 264 |
-
"success": False,
|
| 265 |
-
"response": None,
|
| 266 |
-
"error": "Model not loaded",
|
| 267 |
-
"model_name": current_model_name,
|
| 268 |
-
"inference_time": 0.0
|
| 269 |
-
}
|
| 270 |
-
|
| 271 |
-
try:
|
| 272 |
-
model.eval()
|
| 273 |
-
start_time = time.time()
|
| 274 |
-
|
| 275 |
-
# Generate response with optimized parameters
|
| 276 |
-
generation_params = {
|
| 277 |
-
"max_new_tokens": max_new_tokens,
|
| 278 |
-
"temperature": temperature,
|
| 279 |
-
"do_sample": DRAGON_CONFIG["generation"]["do_sample"],
|
| 280 |
-
"top_p": DRAGON_CONFIG["generation"]["top_p"],
|
| 281 |
-
"repetition_penalty": DRAGON_CONFIG["generation"]["repetition_penalty"],
|
| 282 |
-
"no_repeat_ngram_size": DRAGON_CONFIG["generation"]["no_repeat_ngram_size"],
|
| 283 |
-
"early_stopping": DRAGON_CONFIG["generation"]["early_stopping"],
|
| 284 |
-
"min_length": DRAGON_CONFIG["generation"]["min_length"],
|
| 285 |
-
"use_cache": DRAGON_CONFIG["generation"]["use_cache"],
|
| 286 |
-
"pad_token_id": DRAGON_CONFIG["generation"]["pad_token_id"]
|
| 287 |
-
}
|
| 288 |
-
|
| 289 |
-
# Ensure prompt is a string
|
| 290 |
-
if not isinstance(prompt, str):
|
| 291 |
-
raise ValueError("Prompt must be a string.")
|
| 292 |
-
|
| 293 |
-
# Run inference
|
| 294 |
-
output = pipe(prompt, **generation_params)
|
| 295 |
-
generated_text = output[0]['generated_text']
|
| 296 |
-
|
| 297 |
-
# Post-process to remove the input prompt from the generated text
|
| 298 |
-
if generated_text.startswith(prompt):
|
| 299 |
-
generated_text = generated_text[len(prompt):].strip()
|
| 300 |
-
|
| 301 |
-
end_time = time.time()
|
| 302 |
-
inference_time = end_time - start_time
|
| 303 |
-
|
| 304 |
-
logger.info(f"✅ Inference successful for prompt: '{prompt[:50]}...'")
|
| 305 |
-
logger.info(f"⏱️ Inference time: {inference_time:.2f} seconds")
|
| 306 |
-
|
| 307 |
-
return {
|
| 308 |
-
"success": True,
|
| 309 |
-
"response": generated_text,
|
| 310 |
-
"error": None,
|
| 311 |
-
"model_name": current_model_name,
|
| 312 |
-
"inference_time": inference_time,
|
| 313 |
-
"platform": DEPLOYMENT_PLATFORM
|
| 314 |
-
}
|
| 315 |
-
|
| 316 |
-
except Exception as e:
|
| 317 |
-
logger.error(f"❌ Inference failed: {e}")
|
| 318 |
-
return {
|
| 319 |
-
"success": False,
|
| 320 |
-
"response": None,
|
| 321 |
-
"error": str(e),
|
| 322 |
-
"model_name": current_model_name,
|
| 323 |
-
"inference_time": time.time() - start_time if 'start_time' in locals() else 0.0,
|
| 324 |
-
"platform": DEPLOYMENT_PLATFORM
|
| 325 |
-
}
|
| 326 |
-
|
| 327 |
-
def get_model_info() -> Dict[str, Any]:
|
| 328 |
-
"""Returns information about the loaded model and platform."""
|
| 329 |
-
global model, tokenizer, model_loaded, current_model_name
|
| 330 |
-
gpu_info = {
|
| 331 |
-
"gpu_available": torch.cuda.is_available(),
|
| 332 |
-
"gpu_name": None,
|
| 333 |
-
"gpu_memory_total": None,
|
| 334 |
-
"gpu_memory_allocated": None,
|
| 335 |
-
"gpu_memory_reserved": None,
|
| 336 |
-
"gpu_memory_free": None,
|
| 337 |
-
"cuda_version": None,
|
| 338 |
-
"flash_attention_enabled": False
|
| 339 |
-
}
|
| 340 |
-
|
| 341 |
-
if torch.cuda.is_available():
|
| 342 |
-
gpu_info["gpu_name"] = torch.cuda.get_device_name(0)
|
| 343 |
-
gpu_info["cuda_version"] = torch.version.cuda
|
| 344 |
-
total_memory = torch.cuda.get_device_properties(0).total_memory
|
| 345 |
-
allocated_memory = torch.cuda.memory_allocated(0)
|
| 346 |
-
reserved_memory = torch.cuda.memory_reserved(0)
|
| 347 |
-
free_memory = total_memory - allocated_memory
|
| 348 |
-
|
| 349 |
-
gpu_info["gpu_memory_total"] = f"{total_memory / (1024**3):.2f} GB"
|
| 350 |
-
gpu_info["gpu_memory_allocated"] = f"{allocated_memory / (1024**3):.2f} GB"
|
| 351 |
-
gpu_info["gpu_memory_reserved"] = f"{reserved_memory / (1024**3):.2f} GB"
|
| 352 |
-
gpu_info["gpu_memory_free"] = f"{free_memory / (1024**3):.2f} GB"
|
| 353 |
-
|
| 354 |
-
# Check if flash attention is enabled
|
| 355 |
-
if model is not None and hasattr(model.config, 'attn_implementation'):
|
| 356 |
-
gpu_info["flash_attention_enabled"] = model.config.attn_implementation == "flash_attention_2"
|
| 357 |
-
|
| 358 |
-
return {
|
| 359 |
-
"model_loaded": model_loaded,
|
| 360 |
-
"model_name": current_model_name,
|
| 361 |
-
"model_id": DRAGON_CONFIG["model_id"],
|
| 362 |
-
"architecture": DRAGON_CONFIG["architecture"],
|
| 363 |
-
"platform": DEPLOYMENT_PLATFORM,
|
| 364 |
-
"platform_name": current_platform["name"],
|
| 365 |
-
"platform_config": current_platform,
|
| 366 |
-
"tokenizer_config": DRAGON_CONFIG["tokenizer"],
|
| 367 |
-
"generation_config": DRAGON_CONFIG["generation"],
|
| 368 |
-
"gpu_info": gpu_info,
|
| 369 |
-
"optimizations": {
|
| 370 |
-
"flash_attention": gpu_info.get("flash_attention_enabled", False),
|
| 371 |
-
"quantization": "8-bit" if torch.cuda.is_available() and current_platform["supports_quantization"] else "none",
|
| 372 |
-
"dtype": str(get_optimal_dtype()),
|
| 373 |
-
"attention_implementation": get_attention_implementation()
|
| 374 |
-
}
|
| 375 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app_unified.py
DELETED
|
@@ -1,243 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
HuggingFace Spaces optimized FastAPI application using unified deployment system
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import os
|
| 6 |
-
import time
|
| 7 |
-
import logging
|
| 8 |
-
from fastapi import FastAPI, Request, HTTPException, status
|
| 9 |
-
from fastapi.responses import JSONResponse
|
| 10 |
-
from contextlib import asynccontextmanager
|
| 11 |
-
from typing import Dict, Any
|
| 12 |
-
|
| 13 |
-
# Set HuggingFace Spaces as the deployment platform
|
| 14 |
-
os.environ["DEPLOYMENT_PLATFORM"] = "hf"
|
| 15 |
-
|
| 16 |
-
# Import the unified configuration
|
| 17 |
-
from app_config_unified import DRAGON_CONFIG, load_dragon_model, run_inference, get_model_info, cleanup_model_memory
|
| 18 |
-
|
| 19 |
-
# Configure logging
|
| 20 |
-
logging.basicConfig(level=logging.INFO)
|
| 21 |
-
logger = logging.getLogger(__name__)
|
| 22 |
-
|
| 23 |
-
# Global variable to track startup time
|
| 24 |
-
startup_time = time.time()
|
| 25 |
-
|
| 26 |
-
@asynccontextmanager
|
| 27 |
-
async def lifespan(app: FastAPI):
|
| 28 |
-
"""
|
| 29 |
-
Context manager for managing the lifespan of the FastAPI application.
|
| 30 |
-
Handles model loading and cleanup.
|
| 31 |
-
"""
|
| 32 |
-
logger.info("🚀 Starting up FastAPI application on HuggingFace Spaces...")
|
| 33 |
-
logger.info(f"🏗️ Deployment platform: {DRAGON_CONFIG['platform']}")
|
| 34 |
-
logger.info(f"🏗️ Platform name: {DRAGON_CONFIG['platform_config']['name']}")
|
| 35 |
-
|
| 36 |
-
if not load_dragon_model(DRAGON_CONFIG):
|
| 37 |
-
logger.error("❌ Failed to load Dragon model during startup.")
|
| 38 |
-
# Depending on desired behavior, you might want to raise an exception here
|
| 39 |
-
# to prevent the app from starting if the model is critical.
|
| 40 |
-
# For now, we'll allow the app to start but inference will fail.
|
| 41 |
-
else:
|
| 42 |
-
logger.info("✅ Dragon model loaded successfully during startup.")
|
| 43 |
-
yield
|
| 44 |
-
logger.info("👋 Shutting down FastAPI application...")
|
| 45 |
-
cleanup_model_memory()
|
| 46 |
-
logger.info("✅ Application shutdown complete.")
|
| 47 |
-
|
| 48 |
-
# Create FastAPI app with HuggingFace Spaces specific title and description
|
| 49 |
-
app = FastAPI(
|
| 50 |
-
title="Dragon-3B-Base-alpha Inference API (HuggingFace Spaces)",
|
| 51 |
-
version="1.0.0",
|
| 52 |
-
description="FastAPI endpoint for Dragon-3B-Base-alpha model optimized for HuggingFace Spaces deployment with T4 GPU support.",
|
| 53 |
-
lifespan=lifespan
|
| 54 |
-
)
|
| 55 |
-
|
| 56 |
-
@app.get("/", summary="Root endpoint", tags=["General"])
|
| 57 |
-
async def read_root():
|
| 58 |
-
"""
|
| 59 |
-
Returns basic API information including HuggingFace Spaces specific details.
|
| 60 |
-
"""
|
| 61 |
-
return {
|
| 62 |
-
"message": "Welcome to the Dragon-3B-Base-alpha Inference API (HuggingFace Spaces)!",
|
| 63 |
-
"version": app.version,
|
| 64 |
-
"model_name": DRAGON_CONFIG["display_name"],
|
| 65 |
-
"platform": DRAGON_CONFIG["platform"],
|
| 66 |
-
"platform_name": DRAGON_CONFIG["platform_config"]["name"],
|
| 67 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 68 |
-
"docs_url": "/docs",
|
| 69 |
-
"redoc_url": "/redoc",
|
| 70 |
-
"optimizations": {
|
| 71 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 72 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 73 |
-
"gpu_optimized": True,
|
| 74 |
-
"note": "Optimized for HuggingFace Spaces T4 GPU with standard attention"
|
| 75 |
-
}
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
@app.get("/health", summary="Health check", tags=["Monitoring"])
|
| 79 |
-
async def health_check():
|
| 80 |
-
"""
|
| 81 |
-
Performs a health check on the API and model with HuggingFace Spaces specific information.
|
| 82 |
-
"""
|
| 83 |
-
model_info = get_model_info()
|
| 84 |
-
uptime = time.time() - startup_time
|
| 85 |
-
return {
|
| 86 |
-
"status": "healthy",
|
| 87 |
-
"model_loaded": model_info["model_loaded"],
|
| 88 |
-
"model_name": model_info["model_name"],
|
| 89 |
-
"platform": model_info["platform"],
|
| 90 |
-
"platform_name": model_info["platform_name"],
|
| 91 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 92 |
-
"gpu_available": model_info["gpu_info"]["gpu_available"],
|
| 93 |
-
"gpu_info": model_info["gpu_info"],
|
| 94 |
-
"optimizations": model_info["optimizations"],
|
| 95 |
-
"uptime": uptime,
|
| 96 |
-
"space_info": {
|
| 97 |
-
"deployment_type": "HuggingFace Spaces",
|
| 98 |
-
"gpu_type": "T4",
|
| 99 |
-
"memory_limit": "16GB",
|
| 100 |
-
"auto_sleep": True
|
| 101 |
-
}
|
| 102 |
-
}
|
| 103 |
-
|
| 104 |
-
@app.get("/model/info", summary="Get model information", tags=["Model"])
|
| 105 |
-
async def model_info():
|
| 106 |
-
"""
|
| 107 |
-
Returns detailed information about the loaded model and HuggingFace Spaces optimizations.
|
| 108 |
-
"""
|
| 109 |
-
info = get_model_info()
|
| 110 |
-
info["space_specific"] = {
|
| 111 |
-
"hardware": "T4 GPU",
|
| 112 |
-
"memory_limit": "16GB",
|
| 113 |
-
"auto_sleep_enabled": True,
|
| 114 |
-
"cold_start_time": "~30 seconds",
|
| 115 |
-
"optimization_level": "Basic (no flash-attention)"
|
| 116 |
-
}
|
| 117 |
-
return info
|
| 118 |
-
|
| 119 |
-
@app.get("/platform/info", summary="Get platform information", tags=["Platform"])
|
| 120 |
-
async def platform_info():
|
| 121 |
-
"""
|
| 122 |
-
Returns detailed information about HuggingFace Spaces deployment and its capabilities.
|
| 123 |
-
"""
|
| 124 |
-
return {
|
| 125 |
-
"platform": DRAGON_CONFIG["platform"],
|
| 126 |
-
"platform_name": DRAGON_CONFIG["platform_config"]["name"],
|
| 127 |
-
"deployment_type": "HuggingFace Spaces",
|
| 128 |
-
"capabilities": {
|
| 129 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 130 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 131 |
-
"gpu_acceleration": True,
|
| 132 |
-
"cuda_support": True,
|
| 133 |
-
"auto_scaling": False,
|
| 134 |
-
"persistent_storage": False
|
| 135 |
-
},
|
| 136 |
-
"hardware": {
|
| 137 |
-
"gpu_type": "T4",
|
| 138 |
-
"gpu_memory": "16GB",
|
| 139 |
-
"cpu_cores": "2-4",
|
| 140 |
-
"ram": "16GB"
|
| 141 |
-
},
|
| 142 |
-
"limitations": {
|
| 143 |
-
"auto_sleep": "Spaces sleep after 48 hours of inactivity",
|
| 144 |
-
"cold_start": "~30 second startup time after sleep",
|
| 145 |
-
"flash_attention": "Not supported due to build environment limitations",
|
| 146 |
-
"custom_dependencies": "Limited to basic PyTorch and transformers"
|
| 147 |
-
},
|
| 148 |
-
"performance_notes": {
|
| 149 |
-
"inference_speed": "20-50 tokens/second",
|
| 150 |
-
"memory_usage": "~7GB GPU memory",
|
| 151 |
-
"optimization_level": "Basic (standard attention implementation)"
|
| 152 |
-
}
|
| 153 |
-
}
|
| 154 |
-
|
| 155 |
-
@app.post("/inference", summary="Run inference", tags=["Inference"])
|
| 156 |
-
async def inference_endpoint(request: Request, prompt: str, max_new_tokens: int = 150, temperature: float = 0.6):
|
| 157 |
-
"""
|
| 158 |
-
Runs inference on the Dragon model with HuggingFace Spaces optimizations.
|
| 159 |
-
"""
|
| 160 |
-
if not get_model_info()["model_loaded"]:
|
| 161 |
-
raise HTTPException(
|
| 162 |
-
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
| 163 |
-
detail="Model not loaded. Please check /health endpoint."
|
| 164 |
-
)
|
| 165 |
-
|
| 166 |
-
result = run_inference(prompt, max_new_tokens, temperature)
|
| 167 |
-
|
| 168 |
-
if result["success"]:
|
| 169 |
-
return JSONResponse(content={
|
| 170 |
-
"success": True,
|
| 171 |
-
"response": result["response"],
|
| 172 |
-
"model_name": result["model_name"],
|
| 173 |
-
"platform": result["platform"],
|
| 174 |
-
"inference_time": result["inference_time"],
|
| 175 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 176 |
-
"optimizations_used": {
|
| 177 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 178 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 179 |
-
"note": "Using standard attention implementation optimized for T4 GPU"
|
| 180 |
-
}
|
| 181 |
-
})
|
| 182 |
-
else:
|
| 183 |
-
raise HTTPException(
|
| 184 |
-
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 185 |
-
detail=result["error"]
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
-
@app.get("/performance/benchmark", summary="Performance benchmark", tags=["Performance"])
|
| 189 |
-
async def performance_benchmark():
|
| 190 |
-
"""
|
| 191 |
-
Runs a simple performance benchmark optimized for HuggingFace Spaces T4 GPU.
|
| 192 |
-
"""
|
| 193 |
-
if not get_model_info()["model_loaded"]:
|
| 194 |
-
raise HTTPException(
|
| 195 |
-
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
| 196 |
-
detail="Model not loaded. Please check /health endpoint."
|
| 197 |
-
)
|
| 198 |
-
|
| 199 |
-
# Simple benchmark with a standard prompt
|
| 200 |
-
test_prompt = "The future of artificial intelligence is"
|
| 201 |
-
benchmark_results = []
|
| 202 |
-
|
| 203 |
-
for i in range(3): # Run 3 iterations
|
| 204 |
-
result = run_inference(test_prompt, max_new_tokens=50, temperature=0.7)
|
| 205 |
-
if result["success"]:
|
| 206 |
-
benchmark_results.append({
|
| 207 |
-
"iteration": i + 1,
|
| 208 |
-
"inference_time": result["inference_time"],
|
| 209 |
-
"tokens_generated": 50,
|
| 210 |
-
"tokens_per_second": 50 / result["inference_time"] if result["inference_time"] > 0 else 0
|
| 211 |
-
})
|
| 212 |
-
|
| 213 |
-
if benchmark_results:
|
| 214 |
-
avg_time = sum(r["inference_time"] for r in benchmark_results) / len(benchmark_results)
|
| 215 |
-
avg_tps = sum(r["tokens_per_second"] for r in benchmark_results) / len(benchmark_results)
|
| 216 |
-
|
| 217 |
-
return {
|
| 218 |
-
"platform": DRAGON_CONFIG["platform"],
|
| 219 |
-
"platform_name": DRAGON_CONFIG["platform_config"]["name"],
|
| 220 |
-
"hardware": "T4 GPU (HuggingFace Spaces)",
|
| 221 |
-
"benchmark_results": benchmark_results,
|
| 222 |
-
"average_inference_time": avg_time,
|
| 223 |
-
"average_tokens_per_second": avg_tps,
|
| 224 |
-
"optimizations": {
|
| 225 |
-
"flash_attention": DRAGON_CONFIG["platform_config"]["supports_flash_attention"],
|
| 226 |
-
"quantization": DRAGON_CONFIG["platform_config"]["supports_quantization"],
|
| 227 |
-
"note": "Standard attention implementation on T4 GPU"
|
| 228 |
-
},
|
| 229 |
-
"performance_rating": "Good for demos and testing, suitable for moderate workloads"
|
| 230 |
-
}
|
| 231 |
-
else:
|
| 232 |
-
raise HTTPException(
|
| 233 |
-
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
| 234 |
-
detail="Benchmark failed - no successful inference runs"
|
| 235 |
-
)
|
| 236 |
-
|
| 237 |
-
# Example of how to run the app locally (for development)
|
| 238 |
-
if __name__ == "__main__":
|
| 239 |
-
import uvicorn
|
| 240 |
-
|
| 241 |
-
# HuggingFace Spaces uses port 7860
|
| 242 |
-
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
# Dragon-3B Unified Requirements
|
| 2 |
-
# Core dependencies
|
| 3 |
-
torch>=2.0.0
|
| 4 |
-
transformers>=4.57.0
|
| 5 |
-
fastapi>=0.104.0
|
| 6 |
-
uvicorn[standard]>=0.24.0
|
| 7 |
-
huggingface-hub>=0.18.0
|
| 8 |
-
accelerate>=0.20.0
|
| 9 |
-
tokenizers>=0.14.0
|
| 10 |
-
einops>=0.6.0
|
| 11 |
-
protobuf
|
| 12 |
-
numpy
|
| 13 |
-
requests>=2.31.0
|
| 14 |
-
urllib3>=2.5.0
|
| 15 |
-
pyyaml
|
| 16 |
-
|
| 17 |
-
# Performance optimizations (installed conditionally based on platform)
|
| 18 |
-
# flash-attn>=2.5.0
|
| 19 |
-
# flash-linear-attention>=0.1.0
|
| 20 |
-
# causal-conv1d>=1.0.0
|
| 21 |
-
# flex-head-fa>=0.1.0
|
| 22 |
-
# bitsandbytes>=0.41.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements/base.txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dragon-3B Core Requirements
|
| 2 |
+
# Minimal dependencies for inference
|
| 3 |
+
|
| 4 |
+
# Deep learning framework
|
| 5 |
+
torch>=2.0.0 # For flash-linear-attention: need >= 2.5
|
| 6 |
+
|
| 7 |
+
# Transformers and model loading
|
| 8 |
+
transformers>=4.57.0 # For flash-linear-attention: need >= 4.45
|
| 9 |
+
huggingface-hub>=0.18.0
|
| 10 |
+
accelerate>=0.20.0
|
| 11 |
+
tokenizers>=0.14.0
|
| 12 |
+
|
| 13 |
+
# Model architecture dependencies
|
| 14 |
+
einops>=0.6.0 # Required by Dragon architecture
|
| 15 |
+
protobuf # Required by transformers
|
| 16 |
+
|
| 17 |
+
# Basic utilities
|
| 18 |
+
numpy
|
| 19 |
+
|
| 20 |
+
# FastAPI and server
|
| 21 |
+
fastapi>=0.104.0
|
| 22 |
+
uvicorn[standard]>=0.24.0
|
| 23 |
+
pydantic>=2.0.0
|
| 24 |
+
|
| 25 |
+
# Installation:
|
| 26 |
+
# pip install -r requirements/base.txt
|
| 27 |
+
|
requirements/optimization.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dragon-3B Performance Optimizations
|
| 2 |
+
# Install these for 3-6x speedup
|
| 3 |
+
|
| 4 |
+
# CRITICAL: flash-linear-attention (3-4x speedup)
|
| 5 |
+
# Dragon-3B uses Gated DeltaNet (linear attention), this is MANDATORY for performance
|
| 6 |
+
#
|
| 7 |
+
# ⚠️ REQUIREMENTS (check before installing):
|
| 8 |
+
# - PyTorch >= 2.5
|
| 9 |
+
# - Triton >= 3.0
|
| 10 |
+
# - transformers >= 4.45
|
| 11 |
+
# - Compiles CUDA/Triton kernels (takes 30-40 min first time)
|
| 12 |
+
flash-linear-attention>=0.1.0
|
| 13 |
+
|
| 14 |
+
# Helpful: causal convolution (~20% additional gain)
|
| 15 |
+
causal-conv1d>=1.0.0
|
| 16 |
+
|
| 17 |
+
# Optional: 8-bit quantization (for memory-constrained environments)
|
| 18 |
+
bitsandbytes>=0.41.0
|
| 19 |
+
|
| 20 |
+
# Installation:
|
| 21 |
+
# 1. Check versions first:
|
| 22 |
+
# python -c "import torch; print(f'PyTorch: {torch.__version__}')"
|
| 23 |
+
# python -c "import transformers; print(f'Transformers: {transformers.__version__}')"
|
| 24 |
+
#
|
| 25 |
+
# 2. Upgrade if needed:
|
| 26 |
+
# pip install 'torch>=2.5' 'transformers>=4.45' 'triton>=3.0'
|
| 27 |
+
#
|
| 28 |
+
# 3. Install optimizations:
|
| 29 |
+
# pip install -r requirements/base.txt
|
| 30 |
+
# pip install -r requirements/optimization.txt --no-build-isolation
|
| 31 |
+
#
|
| 32 |
+
# Note: flash-linear-attention requires compilation (30-40 min)
|
| 33 |
+
# For Colab: Skip this, model works fine without it (25-35 tok/s vs 80-100 tok/s)
|
| 34 |
+
|
requirements_unified.txt
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
# Dragon-3B Unified Requirements
|
| 2 |
-
# Core dependencies
|
| 3 |
-
torch>=2.0.0
|
| 4 |
-
transformers>=4.57.0
|
| 5 |
-
fastapi>=0.104.0
|
| 6 |
-
uvicorn[standard]>=0.24.0
|
| 7 |
-
huggingface-hub>=0.18.0
|
| 8 |
-
accelerate>=0.20.0
|
| 9 |
-
tokenizers>=0.14.0
|
| 10 |
-
einops>=0.6.0
|
| 11 |
-
protobuf
|
| 12 |
-
numpy
|
| 13 |
-
requests>=2.31.0
|
| 14 |
-
urllib3>=2.5.0
|
| 15 |
-
pyyaml
|
| 16 |
-
|
| 17 |
-
# Performance optimizations (installed conditionally based on platform)
|
| 18 |
-
# flash-attn>=2.5.0
|
| 19 |
-
# flash-linear-attention>=0.1.0
|
| 20 |
-
# causal-conv1d>=1.0.0
|
| 21 |
-
# flex-head-fa>=0.1.0
|
| 22 |
-
# bitsandbytes>=0.41.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|