Upload 4 files
Browse files- Dockerfile +48 -0
- README.md +279 -0
- app.py +629 -0
- requirements.txt +6 -0
Dockerfile
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.11 slim image for better compatibility with Hugging Face
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Set environment variables for Hugging Face Spaces
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV PORT=7860
ENV HOST=0.0.0.0

# Install system dependencies required for the application
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    libffi-dev \
    libssl-dev \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application files
# FIX: the Space ships the server as app.py (see the upload list / README step 2);
# the original `COPY replicate_server.py app.py` fails the build because no file
# named replicate_server.py exists in the build context.
COPY app.py .
# NOTE(review): the test scripts are optional per the README and are not part of
# the uploaded file set — an unconditional COPY of a missing file fails the build.
# Re-enable these lines if you upload the scripts.
# COPY test_all_models.py .
# COPY quick_test.py .

# Create a simple health check script
# FIX: use printf instead of echo — echo's handling of "\n" escapes is
# shell-dependent; printf portably emits a real newline.
RUN printf '#!/bin/bash\ncurl -f http://localhost:7860/health || exit 1\n' > /healthcheck.sh && \
    chmod +x /healthcheck.sh

# Expose the port that Hugging Face expects
EXPOSE 7860

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD /healthcheck.sh

# Command to run the multi-model application
# Hugging Face Spaces expects the app to run on port 7860
CMD ["python", "app.py"]
|
README.md
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Multi-Model Replicate OpenAI API
|
| 3 |
+
emoji: 🤖
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
suggested_hardware: cpu-basic
|
| 9 |
+
tags:
|
| 10 |
+
- openai
|
| 11 |
+
- claude
|
| 12 |
+
- gpt
|
| 13 |
+
- replicate
|
| 14 |
+
- api
|
| 15 |
+
- multi-model
|
| 16 |
+
- streaming
|
| 17 |
+
- function-calling
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
# 🚀 Multi-Model Replicate OpenAI API - Hugging Face Spaces
|
| 21 |
+
|
| 22 |
+
Deploy a complete OpenAI-compatible API with 7 AI models (Claude & GPT) to Hugging Face Spaces.
|
| 23 |
+
|
| 24 |
+
## 🤖 Supported Models
|
| 25 |
+
|
| 26 |
+
### Anthropic Claude Models
|
| 27 |
+
- `claude-4-sonnet` - Latest Claude 4 Sonnet (Most Capable)
|
| 28 |
+
- `claude-3.7-sonnet` - Claude 3.7 Sonnet
|
| 29 |
+
- `claude-3.5-sonnet` - Claude 3.5 Sonnet (Balanced)
|
| 30 |
+
- `claude-3.5-haiku` - Claude 3.5 Haiku (Fastest)
|
| 31 |
+
|
| 32 |
+
### OpenAI GPT Models
|
| 33 |
+
- `gpt-4.1` - Latest GPT-4.1
|
| 34 |
+
- `gpt-4.1-mini` - GPT-4.1 Mini (Cost-Effective)
|
| 35 |
+
- `gpt-4.1-nano` - GPT-4.1 Nano (Ultra-Fast)
|
| 36 |
+
|
| 37 |
+
## ✨ Features
|
| 38 |
+
|
| 39 |
+
- 🎯 **100% OpenAI Compatible** - Drop-in replacement
|
| 40 |
+
- 🌊 **Streaming Support** - Real-time responses
|
| 41 |
+
- 🔧 **Function Calling** - Tool/function calling
|
| 42 |
+
- 🔐 **Secure** - Obfuscated API keys
|
| 43 |
+
- 📊 **Monitoring** - Health checks & stats
|
| 44 |
+
- 🚀 **Multi-Model** - 7 models in one API
|
| 45 |
+
|
| 46 |
+
## 🚀 Deploy to Hugging Face Spaces
|
| 47 |
+
|
| 48 |
+
### Step 1: Create New Space
|
| 49 |
+
1. Go to [huggingface.co/spaces](https://huggingface.co/spaces)
|
| 50 |
+
2. Click **"Create new Space"**
|
| 51 |
+
3. Choose:
|
| 52 |
+
- **Name**: `replicate-multi-model-api`
|
| 53 |
+
- **SDK**: **Docker** ⚠️ (Important!)
|
| 54 |
+
- **Hardware**: CPU Basic (free tier)
|
| 55 |
+
- **Visibility**: Public
|
| 56 |
+
|
| 57 |
+
### Step 2: Upload Files
|
| 58 |
+
Upload these files to your Space:
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
📁 Your Hugging Face Space:
|
| 62 |
+
├── app.py ← Upload replicate_server.py as app.py
|
| 63 |
+
├── requirements.txt ← Upload requirements.txt
|
| 64 |
+
├── Dockerfile ← Upload Dockerfile
|
| 65 |
+
├── README.md ← Upload this file as README.md
|
| 66 |
+
├── test_all_models.py ← Upload test_all_models.py (optional)
|
| 67 |
+
└── quick_test.py ← Upload quick_test.py (optional)
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
### Step 3: Set Environment Variables (Optional)
|
| 71 |
+
In your Space settings, you can set:
|
| 72 |
+
- `REPLICATE_API_TOKEN` - Your Replicate API token (if you want to use your own)
|
| 73 |
+
|
| 74 |
+
**Note**: The app ships with a bundled (base64-obfuscated) token, so setting your own is optional — but anyone with access to the source can decode a bundled token. For anything beyond a throwaway demo, set your own `REPLICATE_API_TOKEN` as a Space secret.
|
| 75 |
+
|
| 76 |
+
### Step 4: Deploy
|
| 77 |
+
- Hugging Face will automatically build and deploy
|
| 78 |
+
- Wait 5-10 minutes for build completion
|
| 79 |
+
- Your API will be live!
|
| 80 |
+
|
| 81 |
+
## 🎯 Your API Endpoints
|
| 82 |
+
|
| 83 |
+
Once deployed at `https://your-username-replicate-multi-model-api.hf.space`:
|
| 84 |
+
|
| 85 |
+
### Main Endpoints
|
| 86 |
+
- `POST /v1/chat/completions` - Chat completions (all models)
|
| 87 |
+
- `GET /v1/models` - List all 7 models
|
| 88 |
+
- `GET /health` - Health check
|
| 89 |
+
|
| 90 |
+
### Alternative Endpoints
|
| 91 |
+
- `POST /chat/completions` - Alternative chat endpoint
|
| 92 |
+
- `GET /models` - Alternative models endpoint
|
| 93 |
+
|
| 94 |
+
## 🧪 Test Your Deployment
|
| 95 |
+
|
| 96 |
+
### 1. Health Check
|
| 97 |
+
```bash
|
| 98 |
+
curl https://your-username-replicate-multi-model-api.hf.space/health
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
### 2. List Models
|
| 102 |
+
```bash
|
| 103 |
+
curl https://your-username-replicate-multi-model-api.hf.space/v1/models
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### 3. Test Claude 4 Sonnet
|
| 107 |
+
```bash
|
| 108 |
+
curl -X POST https://your-username-replicate-multi-model-api.hf.space/v1/chat/completions \
|
| 109 |
+
-H "Content-Type: application/json" \
|
| 110 |
+
-d '{
|
| 111 |
+
"model": "claude-4-sonnet",
|
| 112 |
+
"messages": [
|
| 113 |
+
{"role": "user", "content": "Write a haiku about AI"}
|
| 114 |
+
],
|
| 115 |
+
"max_tokens": 100
|
| 116 |
+
}'
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
### 4. Test GPT-4.1 Mini
|
| 120 |
+
```bash
|
| 121 |
+
curl -X POST https://your-username-replicate-multi-model-api.hf.space/v1/chat/completions \
|
| 122 |
+
-H "Content-Type: application/json" \
|
| 123 |
+
-d '{
|
| 124 |
+
"model": "gpt-4.1-mini",
|
| 125 |
+
"messages": [
|
| 126 |
+
{"role": "user", "content": "Quick math: What is 15 * 23?"}
|
| 127 |
+
],
|
| 128 |
+
"stream": false
|
| 129 |
+
}'
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
### 5. Test Streaming
|
| 133 |
+
```bash
|
| 134 |
+
curl -X POST https://your-username-replicate-multi-model-api.hf.space/v1/chat/completions \
|
| 135 |
+
-H "Content-Type: application/json" \
|
| 136 |
+
-d '{
|
| 137 |
+
"model": "claude-3.5-haiku",
|
| 138 |
+
"messages": [
|
| 139 |
+
{"role": "user", "content": "Count from 1 to 10"}
|
| 140 |
+
],
|
| 141 |
+
"stream": true
|
| 142 |
+
}'
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
## 🔌 OpenAI SDK Compatibility
|
| 146 |
+
|
| 147 |
+
Your deployed API works with the OpenAI SDK:
|
| 148 |
+
|
| 149 |
+
```python
|
| 150 |
+
import openai
|
| 151 |
+
|
| 152 |
+
client = openai.OpenAI(
|
| 153 |
+
base_url="https://your-username-replicate-multi-model-api.hf.space/v1",
|
| 154 |
+
api_key="dummy" # Not required
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
# Use any of the 7 models
|
| 158 |
+
completion = client.chat.completions.create(
|
| 159 |
+
model="claude-3.5-sonnet",
|
| 160 |
+
messages=[
|
| 161 |
+
{"role": "user", "content": "Hello, world!"}
|
| 162 |
+
]
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
print(completion.choices[0].message.content)
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
## 📊 Model Selection Guide
|
| 169 |
+
|
| 170 |
+
### For Different Use Cases:
|
| 171 |
+
|
| 172 |
+
**🧠 Complex Reasoning & Analysis**
|
| 173 |
+
- `claude-4-sonnet` - Best for complex tasks, analysis, coding
|
| 174 |
+
|
| 175 |
+
**⚡ Speed & Quick Responses**
|
| 176 |
+
- `claude-3.5-haiku` - Fastest Claude model
|
| 177 |
+
- `gpt-4.1-nano` - Ultra-fast GPT model
|
| 178 |
+
|
| 179 |
+
**💰 Cost-Effective**
|
| 180 |
+
- `gpt-4.1-mini` - Good balance of cost and capability
|
| 181 |
+
|
| 182 |
+
**🎯 General Purpose**
|
| 183 |
+
- `claude-3.5-sonnet` - Excellent all-around model
|
| 184 |
+
- `gpt-4.1` - Latest GPT capabilities
|
| 185 |
+
|
| 186 |
+
**📝 Writing & Creative Tasks**
|
| 187 |
+
- `claude-3.7-sonnet` - Great for creative writing
|
| 188 |
+
- `claude-3.5-sonnet` - Balanced creativity and logic
|
| 189 |
+
|
| 190 |
+
## 🔧 Configuration
|
| 191 |
+
|
| 192 |
+
### Environment Variables
|
| 193 |
+
- `PORT` - Server port (default: 7860 for HF)
|
| 194 |
+
- `HOST` - Server host (default: 0.0.0.0)
|
| 195 |
+
- `REPLICATE_API_TOKEN` - Your Replicate token (optional)
|
| 196 |
+
|
| 197 |
+
### Request Parameters
|
| 198 |
+
All models support:
|
| 199 |
+
- `max_tokens` - Maximum response tokens
|
| 200 |
+
- `temperature` - Creativity (0.0-2.0)
|
| 201 |
+
- `top_p` - Nucleus sampling
|
| 202 |
+
- `stream` - Enable streaming
|
| 203 |
+
- `tools` - Function calling tools
|
| 204 |
+
|
| 205 |
+
## 📈 Expected Performance
|
| 206 |
+
|
| 207 |
+
### Response Times (approximate):
|
| 208 |
+
- **Claude 3.5 Haiku**: ~2-5 seconds
|
| 209 |
+
- **GPT-4.1 Nano**: ~2-4 seconds
|
| 210 |
+
- **GPT-4.1 Mini**: ~3-6 seconds
|
| 211 |
+
- **Claude 3.5 Sonnet**: ~4-8 seconds
|
| 212 |
+
- **Claude 3.7 Sonnet**: ~5-10 seconds
|
| 213 |
+
- **GPT-4.1**: ~6-12 seconds
|
| 214 |
+
- **Claude 4 Sonnet**: ~8-15 seconds
|
| 215 |
+
|
| 216 |
+
### Context Lengths:
|
| 217 |
+
- **Claude Models**: 200,000 tokens
|
| 218 |
+
- **GPT Models**: 128,000 tokens
|
| 219 |
+
|
| 220 |
+
## 🆘 Troubleshooting
|
| 221 |
+
|
| 222 |
+
### Build Issues
|
| 223 |
+
1. **Docker build fails**: Check Dockerfile syntax
|
| 224 |
+
2. **Dependencies fail**: Verify requirements.txt
|
| 225 |
+
3. **Port issues**: Ensure using port 7860
|
| 226 |
+
|
| 227 |
+
### Runtime Issues
|
| 228 |
+
1. **Health check fails**: Check server logs in HF
|
| 229 |
+
2. **Models not working**: Verify Replicate API access
|
| 230 |
+
3. **Slow responses**: Try faster models (haiku, nano)
|
| 231 |
+
|
| 232 |
+
### API Issues
|
| 233 |
+
1. **Model not found**: Check model name spelling
|
| 234 |
+
2. **Streaming broken**: Verify SSE support
|
| 235 |
+
3. **Function calling fails**: Check tool definition format
|
| 236 |
+
|
| 237 |
+
## ✅ Success Checklist
|
| 238 |
+
|
| 239 |
+
- [ ] Space created with Docker SDK
|
| 240 |
+
- [ ] All files uploaded correctly
|
| 241 |
+
- [ ] Build completes without errors
|
| 242 |
+
- [ ] Health endpoint returns 200
|
| 243 |
+
- [ ] Models endpoint lists 7 models
|
| 244 |
+
- [ ] At least one model responds correctly
|
| 245 |
+
- [ ] Streaming works
|
| 246 |
+
- [ ] OpenAI SDK compatibility verified
|
| 247 |
+
|
| 248 |
+
## 🎉 You're Live!
|
| 249 |
+
|
| 250 |
+
Once deployed, your API provides:
|
| 251 |
+
|
| 252 |
+
✅ **7 AI Models** in one endpoint
|
| 253 |
+
✅ **OpenAI Compatibility** for easy integration
|
| 254 |
+
✅ **Streaming Support** for real-time responses
|
| 255 |
+
✅ **Function Calling** for tool integration
|
| 256 |
+
✅ **Global Access** via Hugging Face
|
| 257 |
+
✅ **Free Hosting** on HF Spaces
|
| 258 |
+
|
| 259 |
+
## 📞 Support
|
| 260 |
+
|
| 261 |
+
For issues:
|
| 262 |
+
1. Check Hugging Face Space logs
|
| 263 |
+
2. Test locally first: `python app.py` (or `python replicate_server.py` if you kept the original filename before uploading)
|
| 264 |
+
3. Verify model names match supported list
|
| 265 |
+
4. Check Replicate API status
|
| 266 |
+
|
| 267 |
+
## 🚀 Example Applications
|
| 268 |
+
|
| 269 |
+
Your deployed API can power:
|
| 270 |
+
- **Chatbots** with multiple personality models
|
| 271 |
+
- **Code Assistants** using Claude for analysis
|
| 272 |
+
- **Writing Tools** with model selection
|
| 273 |
+
- **Research Tools** with different reasoning models
|
| 274 |
+
- **Customer Support** with fast response models
|
| 275 |
+
|
| 276 |
+
**Your Multi-Model API URL**:
|
| 277 |
+
`https://your-username-replicate-multi-model-api.hf.space`
|
| 278 |
+
|
| 279 |
+
🎊 **Congratulations! You now have 7 AI models in one OpenAI-compatible API!** 🎊
|
app.py
ADDED
|
@@ -0,0 +1,629 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64 as _b64, json as _j, time as _t, uuid as _u, logging as _l, traceback as _tb, os as _o
|
| 2 |
+
from fastapi import FastAPI as _FA, HTTPException as _HE
|
| 3 |
+
from fastapi.responses import StreamingResponse as _SR, JSONResponse as _JR
|
| 4 |
+
from pydantic import BaseModel as _BM, Field as _F
|
| 5 |
+
from typing import List as _L, Optional as _O, Dict as _D, Any as _A, Union as _U
|
| 6 |
+
import replicate as _r
|
| 7 |
+
from contextlib import asynccontextmanager as _acm
|
| 8 |
+
|
| 9 |
+
# Obfuscated configuration
|
| 10 |
+
_l.basicConfig(level=_l.INFO)
|
| 11 |
+
_lg = _l.getLogger(__name__)
|
| 12 |
+
_TOKEN = _b64.b64decode(b'cjhfWDdxeVpLTkZLZlZpUWdRaDJJcUhIa1BmdkFqRGhqSzFBWVl0Yw==').decode('utf-8')
|
| 13 |
+
|
| 14 |
+
# Supported models configuration
|
| 15 |
+
_MODELS = {
|
| 16 |
+
# Anthropic Claude Models
|
| 17 |
+
"claude-4-sonnet": "anthropic/claude-4-sonnet",
|
| 18 |
+
"claude-3.7-sonnet": "anthropic/claude-3.7-sonnet",
|
| 19 |
+
"claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
|
| 20 |
+
"claude-3.5-haiku": "anthropic/claude-3.5-haiku",
|
| 21 |
+
|
| 22 |
+
# OpenAI GPT Models
|
| 23 |
+
"gpt-4.1": "openai/gpt-4.1",
|
| 24 |
+
"gpt-4.1-mini": "openai/gpt-4.1-mini",
|
| 25 |
+
"gpt-4.1-nano": "openai/gpt-4.1-nano",
|
| 26 |
+
|
| 27 |
+
# Alternative naming (with provider prefix)
|
| 28 |
+
"anthropic/claude-4-sonnet": "anthropic/claude-4-sonnet",
|
| 29 |
+
"anthropic/claude-3.7-sonnet": "anthropic/claude-3.7-sonnet",
|
| 30 |
+
"anthropic/claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
|
| 31 |
+
"anthropic/claude-3.5-haiku": "anthropic/claude-3.5-haiku",
|
| 32 |
+
"openai/gpt-4.1": "openai/gpt-4.1",
|
| 33 |
+
"openai/gpt-4.1-mini": "openai/gpt-4.1-mini",
|
| 34 |
+
"openai/gpt-4.1-nano": "openai/gpt-4.1-nano"
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
# Model metadata for OpenAI compatibility
|
| 38 |
+
_MODEL_INFO = {
|
| 39 |
+
"claude-4-sonnet": {"owned_by": "anthropic", "context_length": 200000},
|
| 40 |
+
"claude-3.7-sonnet": {"owned_by": "anthropic", "context_length": 200000},
|
| 41 |
+
"claude-3.5-sonnet": {"owned_by": "anthropic", "context_length": 200000},
|
| 42 |
+
"claude-3.5-haiku": {"owned_by": "anthropic", "context_length": 200000},
|
| 43 |
+
"gpt-4.1": {"owned_by": "openai", "context_length": 128000},
|
| 44 |
+
"gpt-4.1-mini": {"owned_by": "openai", "context_length": 128000},
|
| 45 |
+
"gpt-4.1-nano": {"owned_by": "openai", "context_length": 128000}
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
# OpenAI Compatible Models
class _CM(_BM):
    """OpenAI-style chat message (role + content, plus tool/function fields)."""
    role: str = _F(..., description="Message role")
    # Content may be a plain string, a list of typed parts (multimodal), or None.
    content: _O[_U[str, _L[_D[str, _A]]]] = _F(None, description="Message content")
    name: _O[str] = _F(None, description="Message name")
    function_call: _O[_D[str, _A]] = _F(None, description="Function call")
    tool_calls: _O[_L[_D[str, _A]]] = _F(None, description="Tool calls")
    tool_call_id: _O[str] = _F(None, description="Tool call ID")

class _FC(_BM):
    """Function-call payload: a name plus JSON-encoded arguments string."""
    name: str = _F(..., description="Function name")
    arguments: str = _F(..., description="Function arguments")

class _TC(_BM):
    """A single tool call emitted by the model."""
    id: str = _F(..., description="Tool call ID")
    type: str = _F(default="function", description="Tool call type")
    function: _FC = _F(..., description="Function call")

class _FD(_BM):
    """Caller-supplied function definition (JSON-schema `parameters`)."""
    name: str = _F(..., description="Function name")
    description: _O[str] = _F(None, description="Function description")
    parameters: _D[str, _A] = _F(..., description="Function parameters")

class _TD(_BM):
    """Tool definition wrapper (only type="function" is used here)."""
    type: str = _F(default="function", description="Tool type")
    function: _FD = _F(..., description="Function definition")

class _CCR(_BM):
    """Chat-completion request body (mirrors OpenAI /v1/chat/completions)."""
    model: str = _F(..., description="Model name")
    messages: _L[_CM] = _F(..., description="Messages")
    max_tokens: _O[int] = _F(default=4096, description="Max tokens")
    temperature: _O[float] = _F(default=0.7, description="Temperature")
    top_p: _O[float] = _F(default=1.0, description="Top p")
    n: _O[int] = _F(default=1, description="Number of completions")
    # NOTE(review): OpenAI's default for `stream` is False; this API defaults
    # to True — confirm that is intentional for existing clients.
    stream: _O[bool] = _F(default=True, description="Stream response")
    stop: _O[_U[str, _L[str]]] = _F(None, description="Stop sequences")
    presence_penalty: _O[float] = _F(default=0.0, description="Presence penalty")
    frequency_penalty: _O[float] = _F(default=0.0, description="Frequency penalty")
    logit_bias: _O[_D[str, float]] = _F(None, description="Logit bias")
    user: _O[str] = _F(None, description="User ID")
    tools: _O[_L[_TD]] = _F(None, description="Available tools")
    tool_choice: _O[_U[str, _D[str, _A]]] = _F(None, description="Tool choice")
    # Legacy (pre-tools) function-calling fields, kept for compatibility.
    functions: _O[_L[_FD]] = _F(None, description="Available functions")
    function_call: _O[_U[str, _D[str, _A]]] = _F(None, description="Function call")

class _CCC(_BM):
    """One non-streaming completion choice."""
    index: int = _F(default=0, description="Choice index")
    message: _CM = _F(..., description="Message")
    finish_reason: _O[str] = _F(None, description="Finish reason")

class _CCSC(_BM):
    """One streaming chunk choice (a delta instead of a full message)."""
    index: int = _F(default=0, description="Choice index")
    delta: _D[str, _A] = _F(..., description="Delta")
    finish_reason: _O[str] = _F(None, description="Finish reason")

class _CCRes(_BM):
    """Non-streaming chat-completion response envelope."""
    id: str = _F(..., description="Completion ID")
    object: str = _F(default="chat.completion", description="Object type")
    created: int = _F(..., description="Created timestamp")
    model: str = _F(..., description="Model name")
    choices: _L[_CCC] = _F(..., description="Choices")
    usage: _D[str, int] = _F(..., description="Usage stats")
    system_fingerprint: _O[str] = _F(None, description="System fingerprint")

class _CCSR(_BM):
    """Streaming chat-completion chunk envelope (sent as SSE payloads)."""
    id: str = _F(..., description="Completion ID")
    object: str = _F(default="chat.completion.chunk", description="Object type")
    created: int = _F(..., description="Created timestamp")
    model: str = _F(..., description="Model name")
    choices: _L[_CCSC] = _F(..., description="Choices")
    system_fingerprint: _O[str] = _F(None, description="System fingerprint")

class _OM(_BM):
    """Model-listing entry for GET /v1/models."""
    id: str = _F(..., description="Model ID")
    object: str = _F(default="model", description="Object type")
    created: int = _F(..., description="Created timestamp")
    owned_by: str = _F(..., description="Owner")
| 125 |
+
|
| 126 |
+
# Replicate Client
class _RC:
    """Wrapper around the `replicate` package that maps OpenAI-style chat
    requests (model aliases, message lists, tool specs) onto Replicate calls.
    """

    def __init__(self, _tk=_TOKEN):
        # The replicate package reads its credential from this env var.
        _o.environ['REPLICATE_API_TOKEN'] = _tk
        self._client = _r
        self._models = _MODELS
        self._model_info = _MODEL_INFO

    def _get_replicate_model(self, _model_name):
        """Get the Replicate model ID from OpenAI model name.

        Unknown names fall through unchanged so callers can pass raw
        Replicate IDs directly.
        """
        return self._models.get(_model_name, _model_name)

    def _validate_model(self, _model_name):
        """Validate if model is supported (by alias or by Replicate ID)."""
        return _model_name in self._models or _model_name in self._models.values()

    @staticmethod
    def _content_to_text(_content):
        """Normalize a message's `content` field to plain text.

        FIX: per the request model, content may legitimately be None or a
        list of typed parts (multimodal); the original interpolated those
        raw, producing prompts like "Human: None". Text parts are joined,
        non-text parts dropped.
        """
        if _content is None:
            return ""
        if isinstance(_content, list):
            return " ".join(
                _part.get('text', '')
                for _part in _content
                if isinstance(_part, dict) and _part.get('type') == 'text'
            )
        return str(_content)

    def _format_messages(self, _msgs):
        """Flatten chat-message dicts into a Human/Assistant transcript plus a
        separate system prompt; the transcript ends with "Assistant: " to cue
        the model's reply. Returns (prompt, system).
        """
        _prompt = ""
        _system = ""

        for _msg in _msgs:
            _role = _msg.get('role', '')
            _content = self._content_to_text(_msg.get('content', ''))

            if _role == 'system':
                # Last system message wins; earlier ones are overwritten.
                _system = _content
            elif _role == 'user':
                _prompt += f"Human: {_content}\n\n"
            elif _role == 'assistant':
                _prompt += f"Assistant: {_content}\n\n"

        _prompt += "Assistant: "
        return _prompt, _system

    def _build_input(self, _prompt, _system, _kwargs):
        """Build the Replicate input payload shared by the create/stream/run
        paths (previously duplicated in all three methods)."""
        return {
            "prompt": _prompt,
            "system_prompt": _system,
            "max_tokens": _kwargs.get('max_tokens', 4096),
            "temperature": _kwargs.get('temperature', 0.7),
            "top_p": _kwargs.get('top_p', 1.0)
        }

    def _create_prediction(self, _model_name, _prompt, _system="", **_kwargs):
        """Create a prediction using Replicate API.

        Returns the prediction object, or None on failure — callers must
        check for None before use.
        """
        _replicate_model = self._get_replicate_model(_model_name)
        _input = self._build_input(_prompt, _system, _kwargs)

        try:
            _prediction = self._client.predictions.create(
                model=_replicate_model,
                input=_input
            )
            return _prediction
        except Exception as _e:
            _lg.error(f"Prediction creation error for {_replicate_model}: {_e}")
            return None

    def _handle_tools(self, _tools, _tool_choice):
        """Render tool definitions into a prompt suffix instructing the model
        to answer with a tool_calls JSON object.

        `_tool_choice` is currently unused; kept for interface compatibility.
        """
        if not _tools:
            return ""

        _tool_prompt = "\n\nYou have access to the following tools:\n"
        for _tool in _tools:
            _func = _tool.get('function', {})
            _name = _func.get('name', '')
            _desc = _func.get('description', '')
            _params = _func.get('parameters', {})
            _tool_prompt += f"- {_name}: {_desc}\n"
            _tool_prompt += f"  Parameters: {_j.dumps(_params)}\n"

        _tool_prompt += "\nTo use a tool, respond with JSON in this format:\n"
        _tool_prompt += '{"tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "tool_name", "arguments": "{\\"param\\": \\"value\\"}"}}]}\n'

        return _tool_prompt

    def _stream_chat(self, _model_name, _prompt, _system="", **_kwargs):
        """Stream chat using Replicate's streaming API; yields text chunks.

        On failure, yields a single "Error: ..." chunk instead of raising so
        the SSE response can still be terminated cleanly.
        """
        _replicate_model = self._get_replicate_model(_model_name)
        _input = self._build_input(_prompt, _system, _kwargs)

        try:
            # Use Replicate's streaming method
            for _event in self._client.stream(_replicate_model, input=_input):
                if _event:
                    yield str(_event)
        except Exception as _e:
            _lg.error(f"Streaming error for {_replicate_model}: {_e}")
            yield f"Error: {_e}"

    def _stream_from_prediction(self, _prediction):
        """Stream SSE `data:` payloads from a prediction's stream URL until
        the '[DONE]' sentinel; yields raw data strings."""
        try:
            import requests
            _stream_url = _prediction.urls.get('stream')
            if not _stream_url:
                _lg.error("No stream URL available")
                return

            _response = requests.get(
                _stream_url,
                headers={
                    "Accept": "text/event-stream",
                    "Cache-Control": "no-store"
                },
                stream=True
            )

            for _line in _response.iter_lines():
                if _line:
                    _line = _line.decode('utf-8')
                    # SSE frames prefix payload lines with "data: ".
                    if _line.startswith('data: '):
                        _data = _line[6:]
                        if _data != '[DONE]':
                            yield _data
                        else:
                            break

        except Exception as _e:
            _lg.error(f"Stream from prediction error: {_e}")
            yield f"Error: {_e}"

    def _complete_chat(self, _model_name, _prompt, _system="", **_kwargs):
        """Complete chat using Replicate's blocking run method.

        Returns the full response text, or an "Error: ..." string on failure
        (callers treat errors as content, matching the original contract).
        """
        _replicate_model = self._get_replicate_model(_model_name)
        _input = self._build_input(_prompt, _system, _kwargs)

        try:
            _result = self._client.run(_replicate_model, input=_input)
            return "".join(_result) if isinstance(_result, list) else str(_result)
        except Exception as _e:
            _lg.error(f"Completion error for {_replicate_model}: {_e}")
            return f"Error: {_e}"
|
| 271 |
+
|
| 272 |
+
# Global variables — shared runtime state read by the route handlers.
_client = None            # _RC instance; set by _lifespan, stays None if init failed
_startup_time = _t.time() # process start time, used for /health uptime
_request_count = 0        # requests served so far (reported by /health)
_error_count = 0          # failed requests so far (reported by /health)
|
| 277 |
+
|
| 278 |
+
@_acm
async def _lifespan(_app: _FA):
    """Application lifespan hook: build the shared Replicate client at
    startup, log shutdown when the server exits. A failed init leaves
    `_client` as None so /health can report a degraded state."""
    global _client
    _lg.info("Initializing Replicate client...")
    try:
        _client = _RC()
    except Exception as _e:
        _client = None
        _lg.error(f"Failed to initialize client: {_e}")
    else:
        _lg.info("Replicate client initialized successfully")

    yield
    _lg.info("Shutting down Replicate client...")
|
| 291 |
+
|
| 292 |
+
# FastAPI App
_app = _FA(
    # FIX: title/description said "Claude-4-Sonnet" while the root endpoint
    # reports a multi-model API — made consistent with the actual behavior.
    title="Replicate Multi-Model OpenAI API",
    version="1.0.0",
    description="OpenAI-compatible API for Claude and GPT models via Replicate",
    lifespan=_lifespan
)

# CORS — allow browser clients from any origin.
try:
    # FIX: the original aliased CORSMiddleware as `_CM`, shadowing the chat
    # message Pydantic model `_CM` defined above at module level.
    from fastapi.middleware.cors import CORSMiddleware as _CORSMw
    _app.add_middleware(
        _CORSMw,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
except ImportError:
    pass
|
| 312 |
+
|
| 313 |
+
# Error handlers
@_app.exception_handler(_HE)
async def _http_exception_handler(_request, _exc: _HE):
    """Render FastAPI HTTPExceptions in OpenAI's error-envelope JSON shape."""
    _lg.error(f"HTTP error: {_exc.status_code} - {_exc.detail}")
    _body = {
        "error": {
            "message": _exc.detail,
            "type": "api_error",
            "code": _exc.status_code,
        }
    }
    return _JR(status_code=_exc.status_code, content=_body)
|
| 327 |
+
|
| 328 |
+
@_app.exception_handler(Exception)
async def _global_exception_handler(_request, _exc):
    """Last-resort handler: log the full traceback, return a generic 500 envelope."""
    _lg.error(f"Unexpected error: {_exc}\n{_tb.format_exc()}")
    _body = {
        "error": {
            "message": "Internal server error",
            "type": "server_error",
            "code": 500,
        }
    }
    return _JR(status_code=500, content=_body)
|
| 341 |
+
|
| 342 |
+
@_app.get("/")
|
| 343 |
+
async def _root():
|
| 344 |
+
_model_count = len([m for m in _MODELS.keys() if not m.startswith(('anthropic/', 'openai/'))])
|
| 345 |
+
return {
|
| 346 |
+
"message": "Replicate Multi-Model OpenAI API",
|
| 347 |
+
"version": "1.0.0",
|
| 348 |
+
"status": "running",
|
| 349 |
+
"supported_models": _model_count,
|
| 350 |
+
"providers": ["anthropic", "openai"]
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
@_app.get("/health")
|
| 354 |
+
async def _health_check():
|
| 355 |
+
global _client, _startup_time, _request_count, _error_count
|
| 356 |
+
|
| 357 |
+
_uptime = _t.time() - _startup_time
|
| 358 |
+
_status = "healthy"
|
| 359 |
+
|
| 360 |
+
_client_status = "unknown"
|
| 361 |
+
if _client is None:
|
| 362 |
+
_client_status = "not_initialized"
|
| 363 |
+
_status = "degraded"
|
| 364 |
+
else:
|
| 365 |
+
_client_status = "ready"
|
| 366 |
+
|
| 367 |
+
return {
|
| 368 |
+
"status": _status,
|
| 369 |
+
"timestamp": int(_t.time()),
|
| 370 |
+
"uptime_seconds": int(_uptime),
|
| 371 |
+
"client_status": _client_status,
|
| 372 |
+
"stats": {
|
| 373 |
+
"total_requests": _request_count,
|
| 374 |
+
"total_errors": _error_count,
|
| 375 |
+
"error_rate": _error_count / max(_request_count, 1)
|
| 376 |
+
}
|
| 377 |
+
}
|
| 378 |
+
|
| 379 |
+
@_app.get("/v1/models")
|
| 380 |
+
async def _list_models():
|
| 381 |
+
"""List all supported models"""
|
| 382 |
+
_models_list = []
|
| 383 |
+
_created_time = int(_t.time())
|
| 384 |
+
|
| 385 |
+
# Get unique model names (remove duplicates from alternative naming)
|
| 386 |
+
_unique_models = set()
|
| 387 |
+
for _model_name in _MODELS.keys():
|
| 388 |
+
if not _model_name.startswith(('anthropic/', 'openai/')):
|
| 389 |
+
_unique_models.add(_model_name)
|
| 390 |
+
|
| 391 |
+
# Create model objects
|
| 392 |
+
for _model_name in sorted(_unique_models):
|
| 393 |
+
_info = _MODEL_INFO.get(_model_name, {"owned_by": "unknown", "context_length": 4096})
|
| 394 |
+
_models_list.append(_OM(
|
| 395 |
+
id=_model_name,
|
| 396 |
+
created=_created_time,
|
| 397 |
+
owned_by=_info["owned_by"]
|
| 398 |
+
))
|
| 399 |
+
|
| 400 |
+
return {
|
| 401 |
+
"object": "list",
|
| 402 |
+
"data": _models_list
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
@_app.get("/models")
|
| 406 |
+
async def _list_models_alt():
|
| 407 |
+
return await _list_models()
|
| 408 |
+
|
| 409 |
+
async def _generate_stream_response(_request: _CCR, _prompt: str, _system: str, _request_id: str = None):
    """Yield an OpenAI-style SSE stream for one chat completion.

    Emits, in order: an initial chunk carrying the assistant role, one chunk
    per text fragment produced by the Replicate stream, then (via ``finally``,
    so it runs even after errors or client disconnect) a terminal chunk with
    ``finish_reason="stop"`` followed by the ``[DONE]`` sentinel.

    Errors are never raised to the caller; they are logged and converted into
    apology/error content chunks so the SSE stream stays well-formed.
    """
    _completion_id = f"chatcmpl-{_u.uuid4().hex}"
    _created_time = int(_t.time())
    # Fall back to a fresh short id when the caller did not supply one.
    _request_id = _request_id or f"req-{_u.uuid4().hex[:8]}"

    _lg.info(f"[{_request_id}] Starting stream generation")

    try:
        # Send initial chunk with role
        _initial_chunk = {
            "id": _completion_id,
            "object": "chat.completion.chunk",
            "created": _created_time,
            "model": _request.model,
            "choices": [{
                "index": 0,
                "delta": {"role": "assistant"},
                "finish_reason": None
            }]
        }
        yield f"data: {_j.dumps(_initial_chunk)}\n\n"

        # Stream content chunks using Replicate's streaming
        _chunk_count = 0
        _total_content = ""

        try:
            # Use Replicate's direct streaming method with model parameter
            for _chunk in _client._stream_chat(_request.model, _prompt, _system, **_request.model_dump()):
                # Only forward non-empty string fragments; other chunk types are dropped.
                if _chunk and isinstance(_chunk, str):
                    _chunk_count += 1
                    _total_content += _chunk

                    _stream_response = _CCSR(
                        id=_completion_id,
                        created=_created_time,
                        model=_request.model,
                        choices=[_CCSC(
                            delta={"content": _chunk},
                            finish_reason=None
                        )]
                    )

                    try:
                        _chunk_json = _j.dumps(_stream_response.model_dump())
                        yield f"data: {_chunk_json}\n\n"
                    except Exception as _json_error:
                        # Skip a chunk that cannot be serialized rather than abort the stream.
                        _lg.error(f"[{_request_id}] JSON serialization error: {_json_error}")
                        continue

        except Exception as _stream_error:
            _lg.error(f"[{_request_id}] Streaming error after {_chunk_count} chunks: {_stream_error}")

            # If nothing was delivered yet, send an apology so the client
            # receives some content instead of an empty completion.
            if _chunk_count == 0:
                _error_content = "I apologize, but I encountered an error while generating the response. Please try again."
                _error_response = _CCSR(
                    id=_completion_id,
                    created=_created_time,
                    model=_request.model,
                    choices=[_CCSC(
                        delta={"content": _error_content},
                        finish_reason=None
                    )]
                )
                yield f"data: {_j.dumps(_error_response.model_dump())}\n\n"

        _lg.info(f"[{_request_id}] Stream completed: {_chunk_count} chunks, {len(_total_content)} characters")

    except Exception as _e:
        # Errors outside the inner handler (e.g. while building/sending the
        # initial chunk) are reported to the client as a terminal content chunk.
        _lg.error(f"[{_request_id}] Critical streaming error: {_e}")
        _error_chunk = {
            "id": _completion_id,
            "object": "chat.completion.chunk",
            "created": _created_time,
            "model": _request.model,
            "choices": [{
                "index": 0,
                "delta": {"content": "Error occurred while streaming response."},
                "finish_reason": "stop"
            }]
        }
        yield f"data: {_j.dumps(_error_chunk)}\n\n"

    finally:
        # Always close the stream properly: empty-delta stop chunk, then [DONE].
        try:
            _final_chunk = {
                "id": _completion_id,
                "object": "chat.completion.chunk",
                "created": _created_time,
                "model": _request.model,
                "choices": [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop"
                }]
            }
            yield f"data: {_j.dumps(_final_chunk)}\n\n"
            yield "data: [DONE]\n\n"
            _lg.info(f"[{_request_id}] Stream finalized")
        except Exception as _final_error:
            # Even if the stop chunk fails, still try to emit the sentinel.
            _lg.error(f"[{_request_id}] Error sending final chunk: {_final_error}")
            yield "data: [DONE]\n\n"
|
| 511 |
+
|
| 512 |
+
@_app.post("/v1/chat/completions")
|
| 513 |
+
async def _create_chat_completion(_request: _CCR):
|
| 514 |
+
global _request_count, _error_count, _client
|
| 515 |
+
|
| 516 |
+
_request_count += 1
|
| 517 |
+
_request_id = f"req-{_u.uuid4().hex[:8]}"
|
| 518 |
+
_lg.info(f"[{_request_id}] Chat completion request: model={_request.model}, stream={_request.stream}")
|
| 519 |
+
|
| 520 |
+
if _client is None:
|
| 521 |
+
_error_count += 1
|
| 522 |
+
_lg.error(f"[{_request_id}] Client not initialized")
|
| 523 |
+
raise _HE(status_code=503, detail="Service temporarily unavailable")
|
| 524 |
+
|
| 525 |
+
try:
|
| 526 |
+
# Validate model
|
| 527 |
+
if not _client._validate_model(_request.model):
|
| 528 |
+
_supported_models = list(_MODELS.keys())
|
| 529 |
+
raise _HE(status_code=400, detail=f"Model '{_request.model}' not supported. Supported models: {_supported_models}")
|
| 530 |
+
|
| 531 |
+
# Format messages
|
| 532 |
+
_prompt, _system = _client._format_messages([_msg.model_dump() for _msg in _request.messages])
|
| 533 |
+
|
| 534 |
+
# Handle tools/functions
|
| 535 |
+
if _request.tools or _request.functions:
|
| 536 |
+
_tools = _request.tools or [_TD(function=_func) for _func in (_request.functions or [])]
|
| 537 |
+
_tool_prompt = _client._handle_tools([_tool.model_dump() for _tool in _tools], _request.tool_choice)
|
| 538 |
+
_prompt += _tool_prompt
|
| 539 |
+
|
| 540 |
+
_lg.info(f"[{_request_id}] Formatted prompt length: {len(_prompt)}")
|
| 541 |
+
|
| 542 |
+
# Stream or complete
|
| 543 |
+
if _request.stream:
|
| 544 |
+
_lg.info(f"[{_request_id}] Starting streaming response")
|
| 545 |
+
return _SR(
|
| 546 |
+
_generate_stream_response(_request, _prompt, _system, _request_id),
|
| 547 |
+
media_type="text/plain",
|
| 548 |
+
headers={
|
| 549 |
+
"Cache-Control": "no-cache",
|
| 550 |
+
"Connection": "keep-alive",
|
| 551 |
+
"Content-Type": "text/event-stream"
|
| 552 |
+
}
|
| 553 |
+
)
|
| 554 |
+
else:
|
| 555 |
+
# Non-streaming completion
|
| 556 |
+
_lg.info(f"[{_request_id}] Starting non-streaming completion")
|
| 557 |
+
_content = _client._complete_chat(_request.model, _prompt, _system, **_request.model_dump())
|
| 558 |
+
|
| 559 |
+
_completion_id = f"chatcmpl-{_u.uuid4().hex}"
|
| 560 |
+
_created_time = int(_t.time())
|
| 561 |
+
|
| 562 |
+
# Check for tool calls in response
|
| 563 |
+
_tool_calls = None
|
| 564 |
+
_finish_reason = "stop"
|
| 565 |
+
|
| 566 |
+
try:
|
| 567 |
+
if _content.strip().startswith('{"tool_calls"'):
|
| 568 |
+
_tool_data = _j.loads(_content.strip())
|
| 569 |
+
if "tool_calls" in _tool_data:
|
| 570 |
+
_tool_calls = [_TC(**_tc) for _tc in _tool_data["tool_calls"]]
|
| 571 |
+
_finish_reason = "tool_calls"
|
| 572 |
+
_content = None
|
| 573 |
+
except:
|
| 574 |
+
pass
|
| 575 |
+
|
| 576 |
+
_response = _CCRes(
|
| 577 |
+
id=_completion_id,
|
| 578 |
+
created=_created_time,
|
| 579 |
+
model=_request.model,
|
| 580 |
+
choices=[_CCC(
|
| 581 |
+
message=_CM(
|
| 582 |
+
role="assistant",
|
| 583 |
+
content=_content,
|
| 584 |
+
tool_calls=[_tc.model_dump() for _tc in _tool_calls] if _tool_calls else None
|
| 585 |
+
),
|
| 586 |
+
finish_reason=_finish_reason
|
| 587 |
+
)],
|
| 588 |
+
usage={
|
| 589 |
+
"prompt_tokens": len(_prompt.split()),
|
| 590 |
+
"completion_tokens": len(_content.split()) if _content else 0,
|
| 591 |
+
"total_tokens": len(_prompt.split()) + (len(_content.split()) if _content else 0)
|
| 592 |
+
}
|
| 593 |
+
)
|
| 594 |
+
|
| 595 |
+
_lg.info(f"[{_request_id}] Non-streaming completion finished")
|
| 596 |
+
return _response
|
| 597 |
+
|
| 598 |
+
except _HE:
|
| 599 |
+
_error_count += 1
|
| 600 |
+
raise
|
| 601 |
+
except Exception as _e:
|
| 602 |
+
_error_count += 1
|
| 603 |
+
_lg.error(f"[{_request_id}] Unexpected error: {_e}\n{_tb.format_exc()}")
|
| 604 |
+
raise _HE(status_code=500, detail="Internal server error occurred")
|
| 605 |
+
|
| 606 |
+
@_app.post("/chat/completions")
|
| 607 |
+
async def _create_chat_completion_alt(_request: _CCR):
|
| 608 |
+
return await _create_chat_completion(_request)
|
| 609 |
+
|
| 610 |
+
if __name__ == "__main__":
|
| 611 |
+
try:
|
| 612 |
+
import uvicorn as _uv
|
| 613 |
+
_port = int(_o.getenv("PORT", 7860)) # Hugging Face default port
|
| 614 |
+
_host = _o.getenv("HOST", "0.0.0.0")
|
| 615 |
+
|
| 616 |
+
_lg.info(f"Starting Replicate Multi-Model server on {_host}:{_port}")
|
| 617 |
+
_lg.info(f"Supported models: {list(_MODELS.keys())[:7]}") # Show first 7 models
|
| 618 |
+
_uv.run(
|
| 619 |
+
_app,
|
| 620 |
+
host=_host,
|
| 621 |
+
port=_port,
|
| 622 |
+
reload=False,
|
| 623 |
+
log_level="info",
|
| 624 |
+
access_log=True
|
| 625 |
+
)
|
| 626 |
+
except ImportError:
|
| 627 |
+
_lg.error("uvicorn not installed. Install with: pip install uvicorn")
|
| 628 |
+
except Exception as _e:
|
| 629 |
+
_lg.error(f"Failed to start server: {_e}")
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
|
| 2 |
+
uvicorn[standard]==0.24.0
|
| 3 |
+
pydantic==2.5.0
|
| 4 |
+
replicate==0.22.0
|
| 5 |
+
requests==2.31.0
|
| 6 |
+
sseclient-py==1.8.0
|