Spaces:
Configuration error
Configuration error
Update README.md
Browse files
README.md
CHANGED
|
@@ -1,244 +1,213 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
1. **Setup**
|
| 144 |
-
You need `CMake 3.14+` installed, or install with `brew install cmake` (on macOS) or standard package managers on Linux.
|
| 145 |
-
|
| 146 |
-
2. **Build from Source**
|
| 147 |
-
```bash
|
| 148 |
-
git clone https://github.com/your-org/cactus.git
|
| 149 |
-
cd cactus
|
| 150 |
-
mkdir build && cd build
|
| 151 |
-
cmake .. -DCMAKE_BUILD_TYPE=Release
|
| 152 |
-
make -j$(nproc)
|
| 153 |
-
```
|
| 154 |
-
|
| 155 |
-
3. **CMake Integration**
|
| 156 |
-
Add to your `CMakeLists.txt`:
|
| 157 |
-
|
| 158 |
-
```cmake
|
| 159 |
-
# Add Cactus as subdirectory
|
| 160 |
-
add_subdirectory(cactus)
|
| 161 |
-
|
| 162 |
-
# Link to your target
|
| 163 |
-
target_link_libraries(your_target cactus)
|
| 164 |
-
target_include_directories(your_target PRIVATE cactus)
|
| 165 |
-
|
| 166 |
-
# Requires C++17 or higher
|
| 167 |
-
```
|
| 168 |
-
|
| 169 |
-
4. **Basic Text Completion**
|
| 170 |
-
```cpp
|
| 171 |
-
#include "cactus/cactus.h"
|
| 172 |
-
#include <iostream>
|
| 173 |
-
|
| 174 |
-
int main() {
|
| 175 |
-
cactus::cactus_context context;
|
| 176 |
-
|
| 177 |
-
// Configure parameters
|
| 178 |
-
common_params params;
|
| 179 |
-
params.model.path = "model.gguf";
|
| 180 |
-
params.n_ctx = 2048;
|
| 181 |
-
params.n_threads = 4;
|
| 182 |
-
params.n_gpu_layers = 99; // Use GPU acceleration
|
| 183 |
-
|
| 184 |
-
// Load model
|
| 185 |
-
if (!context.loadModel(params)) {
|
| 186 |
-
std::cerr << "Failed to load model" << std::endl;
|
| 187 |
-
return 1;
|
| 188 |
}
|
| 189 |
-
|
| 190 |
-
// Set prompt
|
| 191 |
-
context.params.prompt = "Hello, how are you?";
|
| 192 |
-
context.params.n_predict = 100;
|
| 193 |
-
|
| 194 |
-
// Initialize sampling
|
| 195 |
-
if (!context.initSampling()) {
|
| 196 |
-
std::cerr << "Failed to initialize sampling" << std::endl;
|
| 197 |
-
return 1;
|
| 198 |
-
}
|
| 199 |
-
|
| 200 |
-
// Generate response
|
| 201 |
-
context.beginCompletion();
|
| 202 |
-
context.loadPrompt();
|
| 203 |
-
|
| 204 |
-
while (context.has_next_token && !context.is_interrupted) {
|
| 205 |
-
auto token_output = context.doCompletion();
|
| 206 |
-
if (token_output.tok == -1) break;
|
| 207 |
-
}
|
| 208 |
-
|
| 209 |
-
std::cout << "Response: " << context.generated_text << std::endl;
|
| 210 |
-
return 0;
|
| 211 |
}
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<img src="assets/banner.jpg" alt="Logo" style="border-radius: 30px; width: 100%;">
|
| 2 |
+
|
| 3 |
+
Energy-efficient kernels & inference engine for phones.
|
| 4 |
+
|
| 5 |
+
## Why Cactus?
|
| 6 |
+
- Phones run on battery, GPUs drain energy and heat the devices.
|
| 7 |
+
- 70% of phones today don't ship NPUs, which most frameworks optimise for.
|
| 8 |
+
- Cactus is optimised for old and new ARM CPUs first, with NPU/DSP/ISP coming.
|
| 9 |
+
- Fast on all phones with less battery drain and heating.
|
| 10 |
+
|
| 11 |
+
## Performance (CPU only)
|
| 12 |
+
|
| 13 |
+
- Speed for various sizes can be estimated proportionally
|
| 14 |
+
- INT4 will give 30% gains when merged
|
| 15 |
+
- GPUs yield gains but drain battery, will be passed on for NPUs
|
| 16 |
+
|
| 17 |
+
| Device | Qwen3-INT8-600m (toks/sec) |
|
| 18 |
+
|:------------------------------|:------------------------:|
|
| 19 |
+
| iPhone 17 Pro | 74 |
|
| 20 |
+
| Galaxy S25 Ultra / 16 Pro | 58 |
|
| 21 |
+
| iPhone 16 / Galaxy S25 / Nothing 3 | 52 |
|
| 22 |
+
| iPhone 15 Pro | 48 |
|
| 23 |
+
| iPhone 14 Pro / OnePlus 13 5G | 47 |
|
| 24 |
+
| Galaxy S24 Ultra / iPhone 15 | 42 |
|
| 25 |
+
| OnePlus Open / Galaxy S23 | 41 |
|
| 26 |
+
| iPhone 13 Pro / OnePlus 12 | 38 |
|
| 27 |
+
| iPhone 13 mini / Redmi K70 Ultra / Xiaomi 13 / OnePlus 11 | 27 |
|
| 28 |
+
| Pixel 6a / Nothing 3a / iPhone X / Galaxy S21 | 16 |
|
| 29 |
+
|
| 30 |
+
## File Size Comparison
|
| 31 |
+
|
| 32 |
+
| Format | Size (Qwen3-0.6B-INT8) |
|
| 33 |
+
|--------|------------------------|
|
| 34 |
+
| Cactus | 370-420 MB |
|
| 35 |
+
| ONNX/TFLite/MLX | 600 MB |
|
| 36 |
+
| GGUF | 800 MB |
|
| 37 |
+
| Executorch | 944 MB |
|
| 38 |
+
|
| 39 |
+
## Battery drain
|
| 40 |
+
|
| 41 |
+
- Newer devices have bigger batteries
|
| 42 |
+
- NPUs are designed for less drain (2-10x)
|
| 43 |
+
- Apple Intelligence drains 0.6 percent/min on iPhone 16 Pro Max
|
| 44 |
+
|
| 45 |
+
| Device | Qwen3-INT8-600m (percent/min) |
|
| 46 |
+
|:------------------------------|:------------------------:|
|
| 47 |
+
| OnePlus 13 5G | 0.33 |
|
| 48 |
+
| Redmi K70 Ultra / OnePlus 12 | 0.41 |
|
| 49 |
+
| Galaxy S25 Ultra / iPhone 17 Pro / Nothing 3 | 0.44 |
|
| 50 |
+
| Galaxy S24 Ultra / Nothing 3a / Pixel 6a | 0.48 |
|
| 51 |
+
| iPhone 16 Pro Max / Xiaomi 13 | 0.50 |
|
| 52 |
+
|
| 53 |
+
## Design
|
| 54 |
+
```
|
| 55 |
+
┌─────────────────┐
|
| 56 |
+
│ Cactus FFI │ ←── OpenAI compatible C API for integration
|
| 57 |
+
└─────────────────┘
|
| 58 |
+
│
|
| 59 |
+
┌─────────────────┐
|
| 60 |
+
│ Cactus Engine │ ←── High-level transformer engine
|
| 61 |
+
└─────────────────┘
|
| 62 |
+
│
|
| 63 |
+
┌─────────────────┐
|
| 64 |
+
│ Cactus Graph │ ←── Unified zero-copy computation graph
|
| 65 |
+
└─────────────────┘
|
| 66 |
+
│
|
| 67 |
+
┌─────────────────┐
|
| 68 |
+
│ Cactus Kernels │ ←── Low-level ARM-specific SIMD operations
|
| 69 |
+
└─────────────────┘
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## Cactus Graph & Kernels
|
| 73 |
+
Cactus Graph is a general numerical computing framework that runs on Cactus Kernels.
|
| 74 |
+
Great for implementing custom models and scientific computing, like JAX for phones.
|
| 75 |
+
|
| 76 |
+
```cpp
|
| 77 |
+
#include "cactus.h"
|
| 78 |
+
|
| 79 |
+
CactusGraph graph;
|
| 80 |
+
|
| 81 |
+
auto a = graph.input({2, 3}, Precision::FP16);
|
| 82 |
+
auto b = graph.input({3, 4}, Precision::INT8);
|
| 83 |
+
|
| 84 |
+
auto x1 = graph.matmul(a, b, false);
|
| 85 |
+
auto x2 = graph.transpose(x1);
|
| 86 |
+
auto result = graph.matmul(b, x2, true);
|
| 87 |
+
|
| 88 |
+
float a_data[6] = {1.1f, 2.3f, 3.4f, 4.2f, 5.7f, 6.8f};
|
| 89 |
+
float b_data[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
|
| 90 |
+
|
| 91 |
+
graph.set_input(a, a_data, Precision::FP16);
|
| 92 |
+
graph.set_input(b, b_data, Precision::INT8);
|
| 93 |
+
graph.execute();
|
| 94 |
+
|
| 95 |
+
void* output_data = graph.get_output(result);
|
| 96 |
+
graph.hard_reset();
|
| 97 |
+
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
## Cactus Engine & APIs
|
| 101 |
+
Cactus Engine is a transformer inference engine built on top of Cactus Graphs.
|
| 102 |
+
It is abstracted via Cactus Foreign Function Interface APIs.
|
| 103 |
+
Header files are self-documenting but documentation contributions are welcome.
|
| 104 |
+
|
| 105 |
+
```cpp
|
| 106 |
+
#include "cactus.h"
|
| 107 |
+
|
| 108 |
+
const char* model_path = "path/to/weight/folder";
|
| 109 |
+
cactus_model_t model = cactus_init(model_path, 2048);
|
| 110 |
+
|
| 111 |
+
const char* messages = R"([
|
| 112 |
+
{"role": "system", "content": "You are a helpful assistant."},
|
| 113 |
+
{"role": "user", "content": "/nothink My name is Henry Ndubuaku"}
|
| 114 |
+
])";
|
| 115 |
+
|
| 116 |
+
const char* options = R"({
|
| 117 |
+
"max_tokens": 50,
|
| 118 |
+
"stop_sequences": ["<|im_end|>"]
|
| 119 |
+
})";
|
| 120 |
+
|
| 121 |
+
char response[1024];
|
| 122 |
+
int result = cactus_complete(model, messages, response, sizeof(response), options, nullptr, nullptr, nullptr);
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
With tool support:
|
| 126 |
+
```cpp
|
| 127 |
+
const char* tools = R"([
|
| 128 |
+
{
|
| 129 |
+
"function": {
|
| 130 |
+
"name": "get_weather",
|
| 131 |
+
"description": "Get weather for a location",
|
| 132 |
+
"parameters": {
|
| 133 |
+
"properties": {
|
| 134 |
+
"location": {
|
| 135 |
+
"type": "string",
|
| 136 |
+
"description": "City name",
|
| 137 |
+
"required": true
|
| 138 |
+
}
|
| 139 |
+
},
|
| 140 |
+
"required": ["location"]
|
| 141 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
}
|
| 144 |
+
])";
|
| 145 |
+
|
| 146 |
+
int result = cactus_complete(model, messages, response, sizeof(response), options, tools, nullptr, nullptr);
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
## Using Cactus in your apps
|
| 150 |
+
Cactus SDKs run 500k+ weekly inference tasks in production today, try them!
|
| 151 |
+
|
| 152 |
+
<a href="https://github.com/cactus-compute/cactus-flutter" target="_blank">
|
| 153 |
+
<img alt="Flutter" src="https://img.shields.io/badge/Flutter-grey.svg?style=for-the-badge&logo=Flutter&logoColor=white">
|
| 154 |
+
</a> <a href="https://github.com/cactus-compute/cactus-react" target="_blank">
|
| 155 |
+
<img alt="React Native" src="https://img.shields.io/badge/React%20Native-grey.svg?style=for-the-badge&logo=react&logoColor=%2361DAFB">
|
| 156 |
+
</a> <a href="https://github.com/cactus-compute/cactus-kotlin" target="_blank">
|
| 157 |
+
<img alt="Kotlin" src="https://img.shields.io/badge/Kotlin_MP-grey.svg?style=for-the-badge&logo=kotlin&logoColor=white">
|
| 158 |
+
</a>
|
| 159 |
+
|
| 160 |
+
## Getting started
|
| 161 |
+
<a href="https://cactuscompute.com/docs" target="_blank">
|
| 162 |
+
<img alt="Documentation" src="https://img.shields.io/badge/Documentation-4A90E2?style=for-the-badge&logo=gitbook&logoColor=white">
|
| 163 |
+
</a> <a href="https://discord.gg/bNurx3AXTJ" target="_blank">
|
| 164 |
+
<img alt="Discord" src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white">
|
| 165 |
+
</a>
|
| 166 |
+
|
| 167 |
+
## Demo
|
| 168 |
+
<a href="https://apps.apple.com/gb/app/cactus-chat/id6744444212" target="_blank">
|
| 169 |
+
<img alt="Download iOS App" src="https://img.shields.io/badge/Try_iOS_Demo-grey?style=for-the-badge&logo=apple&logoColor=white">
|
| 170 |
+
</a> <a href="https://play.google.com/store/apps/details?id=com.rshemetsubuser.myapp&pcampaignid=web_share" target="_blank">
|
| 171 |
+
<img alt="Download Android App" src="https://img.shields.io/badge/Try_Android_Demo-grey?style=for-the-badge&logo=android&logoColor=white">
|
| 172 |
+
</a>
|
| 173 |
+
|
| 174 |
+
## Using this repo
|
| 175 |
+
You can run this code directly on M-series MacBooks since they are ARM-based.
|
| 176 |
+
Vanilla M3 CPU-only can run Qwen3-600m-INT8 at 60-70 toks/sec, just run the following:
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
./tests/run.sh # chmod +x first time
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
## Generating weights from HuggingFace
|
| 183 |
+
Use any of the following model sizes (270m, 350m, 360m, 600m, 750m, 1B, 1.2B, 1.7B activated params):
|
| 184 |
+
```bash
|
| 185 |
+
# Language models
|
| 186 |
+
python3 tools/convert_hf.py google/gemma-3-270m-it weights/gemma3-270m/ --precision INT8
|
| 187 |
+
python3 tools/convert_hf.py LiquidAI/LFM2-350M weights/lfm2-350m/ --precision INT8
|
| 188 |
+
python3 tools/convert_hf.py HuggingFaceTB/SmolLM2-360m-Instruct weights/smollm2-360m/ --precision INT8
|
| 189 |
+
python3 tools/convert_hf.py Qwen/Qwen3-0.6B weights/qwen3-600m/ --precision INT8
|
| 190 |
+
python3 tools/convert_hf.py LiquidAI/LFM2-700M weights/lfm2-700m/ --precision INT8
|
| 191 |
+
python3 tools/convert_hf.py google/gemma-3-1b-it weights/gemma3-1b/ --precision INT8
|
| 192 |
+
python3 tools/convert_hf.py LiquidAI/LFM2-1.2B weights/lfm2-1.2B/ --precision INT8
|
| 193 |
+
python3 tools/convert_hf.py Qwen/Qwen3-1.7B weights/qwen3-1.7B/ --precision INT8
|
| 194 |
+
|
| 195 |
+
# Embedding models
|
| 196 |
+
python3 tools/convert_hf.py Qwen/Qwen3-Embedding-0.6B weights/qwen3-embed-600m/ --precision INT8
|
| 197 |
+
python3 tools/convert_hf.py nomic-ai/nomic-embed-text-v2-moe weights/nomic/ --precision INT8
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
Simply replace the weight path in `tests/test_engine.cpp` with your choice.
|
| 201 |
+
|
| 202 |
+
## Roadmap:
|
| 203 |
+
- Llama, LFM, SmolVLM, Whisper, Kitten, Neuphonic
|
| 204 |
+
- Python tools for porting any Torch/JAX to cactus
|
| 205 |
+
- GPTQ & NPU/DSP/ISP for high-end phones
|
| 206 |
+
|
| 207 |
+
## Limitations
|
| 208 |
+
While Cactus can be used on all Apple devices including MacBooks, for desktop/AMD/Intel/Nvidia hardware generally,
|
| 209 |
+
please use HuggingFace, Llama.cpp, Ollama, vLLM, MLX. They're built for those, support x86, and are all great!
|
| 210 |
+
|
| 211 |
+
## Contributing
|
| 212 |
+
|
| 213 |
+
We welcome contributions! Please see our [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|