Fahad-S committed on Oct 7, 2025

Commit

712dbf0

verified ·

1 Parent(s): 1e8e70c

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +37 -0
ml-stable-diffusion/mlx/.circleci/config.yml +579 -0
ml-stable-diffusion/mlx/.clang-format +87 -0
ml-stable-diffusion/mlx/.github/ISSUE_TEMPLATE/bug_report.md +28 -0
ml-stable-diffusion/mlx/.github/pull_request_template.md +12 -0
ml-stable-diffusion/mlx/.github/workflows/pull_request.yml +20 -0
ml-stable-diffusion/mlx/.gitignore +88 -0
ml-stable-diffusion/mlx/.pre-commit-config.yaml +21 -0
ml-stable-diffusion/mlx/ACKNOWLEDGMENTS.md +268 -0
ml-stable-diffusion/mlx/CITATION.cff +24 -0
ml-stable-diffusion/mlx/CMakeLists.txt +353 -0
ml-stable-diffusion/mlx/CODE_OF_CONDUCT.md +132 -0
ml-stable-diffusion/mlx/CONTRIBUTING.md +38 -0
ml-stable-diffusion/mlx/LICENSE +21 -0
ml-stable-diffusion/mlx/MANIFEST.in +6 -0
ml-stable-diffusion/mlx/README.md +121 -0
ml-stable-diffusion/mlx/benchmarks/cpp/CMakeLists.txt +11 -0
ml-stable-diffusion/mlx/benchmarks/cpp/autograd.cpp +39 -0
ml-stable-diffusion/mlx/benchmarks/cpp/compare_devices.cpp +27 -0
ml-stable-diffusion/mlx/benchmarks/cpp/irregular_strides.cpp +201 -0
ml-stable-diffusion/mlx/benchmarks/cpp/single_ops.cpp +288 -0
ml-stable-diffusion/mlx/benchmarks/cpp/time_utils.h +39 -0
ml-stable-diffusion/mlx/benchmarks/numpy/single_ops.py +39 -0
ml-stable-diffusion/mlx/benchmarks/numpy/time_utils.py +20 -0
ml-stable-diffusion/mlx/benchmarks/python/batch_matmul_bench.py +62 -0
ml-stable-diffusion/mlx/benchmarks/python/blas/bench_gemm.py +191 -0
ml-stable-diffusion/mlx/benchmarks/python/blas/bench_gemv.py +221 -0
ml-stable-diffusion/mlx/benchmarks/python/comparative/README.md +15 -0
ml-stable-diffusion/mlx/benchmarks/python/comparative/bench_mlx.py +519 -0
ml-stable-diffusion/mlx/benchmarks/python/comparative/bench_torch.py +482 -0
ml-stable-diffusion/mlx/benchmarks/python/comparative/compare.py +284 -0
ml-stable-diffusion/mlx/benchmarks/python/compile_bench.py +107 -0
ml-stable-diffusion/mlx/benchmarks/python/conv1d_bench.py +123 -0
ml-stable-diffusion/mlx/benchmarks/python/conv2d_bench_cpu.py +127 -0
ml-stable-diffusion/mlx/benchmarks/python/conv2d_train_bench_cpu.py +143 -0
ml-stable-diffusion/mlx/benchmarks/python/conv2d_transpose_bench_cpu.py +129 -0
ml-stable-diffusion/mlx/benchmarks/python/conv3d_bench_cpu.py +110 -0
ml-stable-diffusion/mlx/benchmarks/python/conv3d_train_bench_cpu.py +143 -0
ml-stable-diffusion/mlx/benchmarks/python/conv3d_transpose_bench_cpu.py +116 -0
ml-stable-diffusion/mlx/benchmarks/python/conv_bench.py +135 -0
ml-stable-diffusion/mlx/benchmarks/python/conv_transpose_bench.py +135 -0
ml-stable-diffusion/mlx/benchmarks/python/conv_unaligned_bench.py +107 -0
ml-stable-diffusion/mlx/benchmarks/python/distributed_bench.py +66 -0
ml-stable-diffusion/mlx/benchmarks/python/einsum_bench.py +84 -0
ml-stable-diffusion/mlx/benchmarks/python/fft_bench.py +118 -0
ml-stable-diffusion/mlx/benchmarks/python/gather_bench.py +52 -0
ml-stable-diffusion/mlx/benchmarks/python/gather_mm_bench.py +74 -0
ml-stable-diffusion/mlx/benchmarks/python/gather_qmm_bench.py +84 -0
ml-stable-diffusion/mlx/benchmarks/python/hadamard_bench.py +70 -0
ml-stable-diffusion/mlx/benchmarks/python/layer_norm_bench.py +82 -0

.gitattributes CHANGED Viewed

@@ -120,3 +120,40 @@ ml-stable-diffusion/assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_
 ml-stable-diffusion/assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_float16_original.png filter=lfs diff=lfs merge=lfs -text
 ml-stable-diffusion/assets/palette6_cpuandne_readmereel.png filter=lfs diff=lfs merge=lfs -text
 ml-stable-diffusion/assets/readme_reel.png filter=lfs diff=lfs merge=lfs -text

 ml-stable-diffusion/assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_float16_original.png filter=lfs diff=lfs merge=lfs -text
 ml-stable-diffusion/assets/palette6_cpuandne_readmereel.png filter=lfs diff=lfs merge=lfs -text
 ml-stable-diffusion/assets/readme_reel.png filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/arg_reduce.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/binary.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/compiled.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/compiled_preamble.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/conv.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/copy.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/fft.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/indexing.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/jit_compiler.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/masked_mm.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/matmul.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/primitives.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/quantized.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/reduce.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/scan.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/select.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/sort.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/cpu/unary.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/backend/no_gpu/primitives.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/compile.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/distributed/mpi/mpi.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/distributed/ring/ring.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/einsum.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/export.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/fast.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/io/gguf.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/io/load.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/io/safetensors.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/linalg.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/primitives.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/random.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/CMakeFiles/mlx.dir/mlx/transforms.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/python/src/CMakeFiles/nanobind-static.dir/proj/cvl/users/x_fahkh2/caches/pip-build-env-nyl54h73/overlay/lib/python3.10/site-packages/nanobind/src/nb_type.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/python/src/CMakeFiles/nanobind-static.dir/proj/cvl/users/x_fahkh2/caches/pip-build-env-vont0ixn/overlay/lib/python3.10/site-packages/nanobind/src/nb_type.cpp.o filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/build/temp.linux-x86_64-cpython-310/mlx.core/python/src/libnanobind-static.a filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/docs/src/_static/metal_debugger/capture.png filter=lfs diff=lfs merge=lfs -text
+ml-stable-diffusion/mlx/docs/src/_static/metal_debugger/schema.png filter=lfs diff=lfs merge=lfs -text

ml-stable-diffusion/mlx/.circleci/config.yml ADDED Viewed

	@@ -0,0 +1,579 @@

+version: 2.1
+orbs:
+  apple: ml-explore/pr-approval@0.1.0
+parameters:
+  nightly_build:
+    type: boolean
+    default: false
+  test_release:
+    type: boolean
+    default: false
+jobs:
+  build_documentation:
+    parameters:
+      upload-docs:
+        type: boolean
+        default: false
+    macos:
+      xcode: "26.0.0"
+    resource_class: m4pro.medium
+    steps:
+      - checkout
+      - run:
+          name: Install
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            brew install python@3.9
+            brew install doxygen
+            python3.9 -m venv env
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install --upgrade cmake
+            pip install -r docs/requirements.txt
+            pip install . -v
+      - when:
+          condition:
+            not: << parameters.upload-docs >>
+          steps:
+            - run:
+               name: Build documentation
+               command: |
+                 source env/bin/activate
+                 cd docs && doxygen && make html O=-W
+      - when:
+          condition: << parameters.upload-docs >>
+          steps:
+            - add_ssh_keys:
+                fingerprints:
+                  - "SHA256:OhcVVMovbT0pkgMeiVRyxMnjV9R2t+hKBsNcuxq9h+0"
+            - run:
+               name: Upload documentation
+               command: |
+                 source env/bin/activate
+                 git config user.email "mlx@group.apple.com"
+                 git config user.name "CircleCI Docs"
+                 git checkout gh-pages
+                 git rebase main
+                 cd docs
+                 git rm -rf build/html
+                 doxygen && make html O=-W
+                 git add -f build/html
+                 git commit -m "rebase"
+                 git push -f origin gh-pages
+  linux_build_and_test:
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Run style checks
+          command: |
+            pip install pre-commit
+            pre-commit run --all
+            if ! git diff --quiet; then echo 'Style checks failed, please install pre-commit and run pre-commit run --all and push the change'; exit 1; fi
+      - run:
+          name: Install dependencies
+          command: |
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Generate package stubs
+          command: |
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            python -m unittest discover python/tests -v
+            mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
+            # Copy the ring test's stderr into stderr.log (while still printing it)
+            # so it can be scanned for warnings below.
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            # Fail the step if any [WARN] line was captured. Test grep's exit status
+            # directly: wrapping it in $(...) would try to execute grep's output as a
+            # command, which makes the condition false even when a match exists.
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            mkdir -p build && cd build
+            cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
+            make -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests
+  mac_build_and_test:
+    parameters:
+      xcode_version:
+        type: string
+        default: "26.0.0"
+      macosx_deployment_target:
+        type: string
+        default: ""
+    macos:
+      xcode: << parameters.xcode_version >>
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
+    resource_class: m4pro.medium
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
+              brew install openmpi uv
+      - run:
+          name: Install Python package
+          command: |
+            uv venv --python 3.9
+            uv pip install \
+              nanobind==2.4.0 \
+              cmake \
+              numpy \
+              torch \
+              tensorflow \
+              unittest-xml-reporting
+            DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
+              uv pip install -e . -v
+      - run:
+          name: Generate package stubs
+          command: |
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
+            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
+            mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
+            # Copy the ring test's stderr into stderr.log (while still printing it)
+            # so it can be scanned for warnings below.
+            mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
+            # Fail the step if any [WARN] line was captured. Test grep's exit status
+            # directly: wrapping it in $(...) would try to execute grep's output as a
+            # command, which makes the condition false even when a match exists.
+            if grep -q "\[WARN\]" stderr.log; then echo "Distributed ring test failed"; exit 1; fi
+      - run:
+          name: Build example extension
+          command: |
+            source .venv/bin/activate
+            cd examples/extensions
+            uv pip install -r requirements.txt
+            uv run --no-project setup.py build_ext --inplace
+            uv run --no-project python test.py
+      - store_test_results:
+          path: test-results
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
+      - run:
+          name: Run CPP tests
+          command: |
+            DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./build/tests/tests
+      - run:
+          name: Build small binary
+          command: |
+            source .venv/bin/activate
+            cd build/
+            cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
+              -DBUILD_SHARED_LIBS=ON \
+              -DMLX_BUILD_CPU=OFF \
+              -DMLX_BUILD_SAFETENSORS=OFF \
+              -DMLX_BUILD_GGUF=OFF \
+              -DMLX_METAL_JIT=ON
+            make -j `sysctl -n hw.ncpu`
+      - run:
+          name: Run Python tests with JIT
+          command: |
+            CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
+              uv pip install -e . -v
+            LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
+              METAL_DEBUG_ERROR_MODE=0 \
+              uv run --no-project python -m xmlrunner discover \
+                -v python/tests \
+                -o test-results/gpu_jit
+  cuda_build_and_test:
+    parameters:
+      image_date:
+        type: string
+        default: "2023.11.1"
+    machine:
+      image: "linux-cuda-12:<< parameters.image_date >>"
+      resource_class: gpu.nvidia.small.gen2
+    steps:
+      - checkout
+      - restore_cache:
+          keys:
+            - cuda-<< parameters.image_date >>-{{ arch }}-
+      - run:
+          name: Install dependencies
+          command: |
+            sudo apt-get update
+            sudo apt-get install libcudnn9-dev-cuda-12
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install libnccl2 libnccl-dev
+            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
+            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
+            rm -rf ccache-4.11.3-linux-x86_64
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Set CCache size
+          command: ccache --max-size 1G
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
+            uv pip install cmake
+            DEBUG=1 CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              uv pip install -e ".[dev]" -v
+      - run:
+          name: Run Python tests
+          command: |
+            source .venv/bin/activate
+            LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
+            LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+      - run:
+          name: Build CPP only
+          command: |
+            source .venv/bin/activate
+            cmake . -B build \
+              -DMLX_BUILD_CUDA=ON \
+              -DCMAKE_CUDA_COMPILER=`which nvcc` \
+              -DCMAKE_BUILD_TYPE=DEBUG
+            cmake --build build -j `nproc`
+      - run:
+          name: Run CPP tests
+          command: ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+      - run:
+          name: CCache report
+          command: |
+            ccache --show-stats
+            ccache --zero-stats
+            ccache --cleanup
+      - save_cache:
+          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
+          paths:
+            - /home/circleci/.cache/ccache
+  build_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.9"
+      xcode_version:
+        type: string
+        default: "26.0.0"
+      build_env:
+        type: string
+        default: ""
+      macosx_deployment_target:
+        type: string
+        default: ""
+    macos:
+      xcode: << parameters.xcode_version >>
+    resource_class: m4pro.medium
+    environment:
+      MACOSX_DEPLOYMENT_TARGET: << parameters.macosx_deployment_target >>
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          command: |
+            xcodebuild -downloadComponent MetalToolchain
+            mkdir -p ~/miniconda3
+            curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
+            bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+            rm ~/miniconda3/miniconda.sh
+            source ~/miniconda3/bin/activate
+            conda init --all
+            conda create -n env python=<< parameters.python_version >> -y
+            conda activate env
+            pip install --upgrade cmake
+            pip install nanobind==2.4.0
+            pip install --upgrade setuptools
+            pip install numpy
+            pip install twine
+            pip install build
+      - run:
+          name: Install Python package
+          command: |
+            conda activate env
+            env -u MACOSX_DEPLOYMENT_TARGET DEV_RELEASE=1 \
+              pip install . -v
+      - run:
+          name: Generate package stubs
+          command: |
+            conda activate env
+            pip install typing_extensions
+            python setup.py generate_stubs
+      - run:
+          name: Build Python package
+          command: |
+            conda activate env
+            python setup.py clean --all
+            << parameters.build_env >> MLX_BUILD_STAGE=1 python -m build -w
+      - when:
+          condition:
+            equal: ["3.9", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  conda activate env
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 python -m build -w
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  conda activate env
+                  twine upload dist/*
+      - store_artifacts:
+          path: dist/
+  build_linux_release:
+    parameters:
+      python_version:
+        type: string
+        default: "3.9"
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            PYTHON=python<< parameters.python_version >>
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            sudo apt-get update
+            TZ=Etc/UTC sudo apt-get -y install tzdata
+            sudo add-apt-repository -y ppa:deadsnakes/ppa
+            sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
+            sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
+            $PYTHON -m venv env
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install --upgrade cmake
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            << parameters.build_env >> pip install ".[dev]" -v
+            pip install typing_extensions
+            python setup.py generate_stubs
+            python setup.py clean --all
+            MLX_BUILD_STAGE=1 << parameters.build_env >> python -m build -w
+            bash python/scripts/repair_linux.sh
+      - when:
+          condition:
+            equal: ["3.9", << parameters.python_version >>]
+          steps:
+            - run:
+                name: Build common package
+                command: |
+                  source env/bin/activate
+                  python setup.py clean --all
+                  << parameters.build_env >> MLX_BUILD_STAGE=2 \
+                    python -m build -w
+                  auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_x86_64
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload packages
+                command: |
+                  source env/bin/activate
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+  build_cuda_release:
+    parameters:
+      build_env:
+        type: string
+        default: ""
+    machine:
+      image: ubuntu-2204:current
+      resource_class: xlarge
+    steps:
+      - checkout
+      - run:
+          name: Build wheel
+          command: |
+            export DEBIAN_FRONTEND=noninteractive
+            export NEEDRESTART_MODE=a
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+            sudo dpkg -i cuda-keyring_1.1-1_all.deb
+            sudo apt-get update
+            sudo apt-get install cuda-toolkit-12-9 libcudnn9-dev-cuda-12
+            sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
+            sudo apt-get install zip
+            pip install auditwheel
+            pip install patchelf
+            pip install build
+            pip install twine
+            export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
+            export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+            << parameters.build_env >> MLX_BUILD_STAGE=2 \
+              CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
+              python -m build -w
+            bash python/scripts/repair_cuda.sh
+      - when:
+          condition: << parameters.build_env >>
+          steps:
+            - run:
+                name: Upload package
+                command: |
+                  twine upload wheelhouse/*.whl
+      - store_artifacts:
+          path: wheelhouse/
+workflows:
+  build_and_test:
+    when:
+      and:
+        - matches:
+            pattern: "^(?!pull/)[-\\w]+$"
+            value: << pipeline.git.branch >>
+        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.test_release >>
+    jobs:
+      - mac_build_and_test:
+          matrix:
+            parameters:
+              macosx_deployment_target: ["13.5", "15.0"]
+      - linux_build_and_test
+      - cuda_build_and_test:
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
+      - build_documentation
+  build_pypi_release:
+    when:
+      and:
+        - not: << pipeline.parameters.nightly_build >>
+        - not: << pipeline.parameters.test_release >>
+    jobs:
+      - build_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              build_env: ["PYPI_RELEASE=1"]
+              xcode_version: ["26.0.0"]
+      - build_documentation:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          upload-docs: true
+      - build_linux_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              build_env: ["PYPI_RELEASE=1"]
+      - build_cuda_release:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
+          matrix:
+            parameters:
+              build_env: ["PYPI_RELEASE=1"]
+  prb:
+    when:
+      matches:
+        pattern: "^pull/\\d+(/head)?$"
+        value: << pipeline.git.branch >>
+    jobs:
+      - hold:
+          type: approval
+      - apple/authenticate:
+          context: pr-approval
+      - mac_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              macosx_deployment_target: ["13.5", "15.0"]
+      - linux_build_and_test:
+          requires: [ hold ]
+      - cuda_build_and_test:
+          requires: [ hold ]
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
+  nightly_build:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.nightly_build >>
+    jobs:
+      - build_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              xcode_version: ["26.0.0"]
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+      - build_cuda_release
+  build_dev_release:
+    when:
+      and:
+        - equal: [ main, << pipeline.git.branch >> ]
+        - << pipeline.parameters.test_release >>
+    jobs:
+      - build_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              macosx_deployment_target: ["13.5", "14.0", "15.0"]
+              build_env: ["DEV_RELEASE=1"]
+              xcode_version: ["26.0.0"]
+      - build_linux_release:
+          matrix:
+            parameters:
+              python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+              build_env: ["DEV_RELEASE=1"]
+      - build_cuda_release:
+          matrix:
+            parameters:
+              build_env: ["DEV_RELEASE=1"]

ml-stable-diffusion/mlx/.clang-format ADDED Viewed

	@@ -0,0 +1,87 @@

+---
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ForEachMacros:   [ FOR_EACH, FOR_EACH_R, FOR_EACH_RANGE, ]
+IncludeCategories:
+  - Regex:           '^<.*\.h(pp)?>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
+...

ml-stable-diffusion/mlx/.github/ISSUE_TEMPLATE/bug_report.md ADDED Viewed

	@@ -0,0 +1,28 @@

+---
+name: Bug report
+about: Create a report about an issue you've encountered
+title: "[BUG] "
+labels: ''
+assignees: ''
+---
+**Describe the bug**
+A clear and concise description of what the bug is.
+**To Reproduce**
+Include code snippet
+```python
+```
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+**Desktop (please complete the following information):**
+ - OS Version: [e.g. macOS 14.1.2]
+ - MLX Version: [e.g. 0.7.0]
+**Additional context**
+Add any other context about the problem here.

ml-stable-diffusion/mlx/.github/pull_request_template.md ADDED Viewed

	@@ -0,0 +1,12 @@

+## Proposed changes
+Please include a description of the problem or feature this PR is addressing. If there is a corresponding issue, include the issue #.
+## Checklist
+Put an `x` in the boxes that apply.
+- [ ] I have read the [CONTRIBUTING](https://github.com/ml-explore/mlx/blob/main/CONTRIBUTING.md) document
+- [ ] I have run `pre-commit run --all-files` to format my code / installed pre-commit prior to committing changes
+- [ ] I have added tests that prove my fix is effective or that my feature works
+- [ ] I have updated the necessary documentation (if needed)

ml-stable-diffusion/mlx/.github/workflows/pull_request.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+on:
+  pull_request:
+    branches:
+      - main
+jobs:
+  check_lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pre-commit black isort clang-format
+      - name: Run lint
+        run: |
+          pre-commit run --all-files

ml-stable-diffusion/mlx/.gitignore ADDED Viewed

	@@ -0,0 +1,88 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# tensor files
+*.safe
+*.safetensors
+# Metal libraries
+*.metallib
+venv/
+# Distribution / packaging
+python/mlx/core
+python/mlx/share
+python/mlx/include
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+uv.lock
+# vim
+*.swp
+# Ignore build dir
+build/
+# Prerequisites
+*.d
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+# Precompiled Headers
+*.gch
+*.pch
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+# Fortran module files
+*.mod
+*.smod
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+# Executables
+*.exe
+*.out
+*.app
+# Debug symbols
+*.pdb
+# VSCode
+.vscode/
+.DS_Store
+# Jetbrains
+.cache

ml-stable-diffusion/mlx/.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,21 @@

+repos:
+-   repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v19.1.7
+    hooks:
+    -   id: clang-format
+# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
+-   repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 25.1.0
+    hooks:
+    -   id: black
+-   repo: https://github.com/pycqa/isort
+    rev: 6.0.0
+    hooks:
+    -   id: isort
+        args:
+            - --profile=black
+- repo: https://github.com/cheshirekow/cmake-format-precommit
+  rev: v0.6.13
+  hooks:
+    - id: cmake-format

ml-stable-diffusion/mlx/ACKNOWLEDGMENTS.md ADDED Viewed

	@@ -0,0 +1,268 @@

+# Individual Contributors
+If you wish to be acknowledged for your contributions, please list your name
+with a short description of your contribution(s) below. For example:
+- Jane Smith: Added the `foo` and `bar` ops.
+MLX was developed with contributions from the following individuals:
+- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`. Added `orthogonal` initializer.
+- Juarez Bochi: Fixed bug in cross attention.
+- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
+- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.
+- Gabrijel Boduljak: Added `mlx.core.linalg`, implemented `norm` method and `InstanceNorm` layer. Implemented pooling layers and ``Upsample``.
+- Hinrik Snær Guðmundsson: Added `atleast_1d`, `atleast_2d`, `atleast_3d` ops.
+- Luca Arnaboldi: Added `Ceil` and `Floor` ops; implemented pickling, copy and deepcopy for mlx arrays.
+- Brian Keene & Atila Orhon, with Argmax Inc.: Added `fast.scaled_dot_product_attention`
+- AmirHossein Razlighi: Added chaining support for some of the ops in `nn.Module`. Comparison works for non array objects in `mlx.core.array`. Exception handling for invalid operations in `mlx.core.array`.
+- Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
+- Paul Paczuski: Improved stability of BCE loss calculation
+- Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
+- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-schulz)` optimizer, and the `ReLU²` activation function.
+<a href="https://github.com/ml-explore/mlx/graphs/contributors">
+  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
+</a>
+# Organizations
+MLX has received contributions from the following companies:
+- NVIDIA Corporation & Affiliates
+# Third-Party Software
+MLX leverages several third-party software packages, listed here together with
+their licenses copied verbatim.
+## PocketFFT
+Copyright (C) 2010-2018 Max-Planck-Society
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+## metal-cpp
+                              Apache License
+                        Version 2.0, January 2004
+                    http://www.apache.org/licenses/
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+1. Definitions.
+  "License" shall mean the terms and conditions for use, reproduction,
+  and distribution as defined by Sections 1 through 9 of this document.
+  "Licensor" shall mean the copyright owner or entity authorized by
+  the copyright owner that is granting the License.
+  "Legal Entity" shall mean the union of the acting entity and all
+  other entities that control, are controlled by, or are under common
+  control with that entity. For the purposes of this definition,
+  "control" means (i) the power, direct or indirect, to cause the
+  direction or management of such entity, whether by contract or
+  otherwise, or (ii) ownership of fifty percent (50%) or more of the
+  outstanding shares, or (iii) beneficial ownership of such entity.
+  "You" (or "Your") shall mean an individual or Legal Entity
+  exercising permissions granted by this License.
+  "Source" form shall mean the preferred form for making modifications,
+  including but not limited to software source code, documentation
+  source, and configuration files.
+  "Object" form shall mean any form resulting from mechanical
+  transformation or translation of a Source form, including but
+  not limited to compiled object code, generated documentation,
+  and conversions to other media types.
+  "Work" shall mean the work of authorship, whether in Source or
+  Object form, made available under the License, as indicated by a
+  copyright notice that is included in or attached to the work
+  (an example is provided in the Appendix below).
+  "Derivative Works" shall mean any work, whether in Source or Object
+  form, that is based on (or derived from) the Work and for which the
+  editorial revisions, annotations, elaborations, or other modifications
+  represent, as a whole, an original work of authorship. For the purposes
+  of this License, Derivative Works shall not include works that remain
+  separable from, or merely link (or bind by name) to the interfaces of,
+  the Work and Derivative Works thereof.
+  "Contribution" shall mean any work of authorship, including
+  the original version of the Work and any modifications or additions
+  to that Work or Derivative Works thereof, that is intentionally
+  submitted to Licensor for inclusion in the Work by the copyright owner
+  or by an individual or Legal Entity authorized to submit on behalf of
+  the copyright owner. For the purposes of this definition, "submitted"
+  means any form of electronic, verbal, or written communication sent
+  to the Licensor or its representatives, including but not limited to
+  communication on electronic mailing lists, source code control systems,
+  and issue tracking systems that are managed by, or on behalf of, the
+  Licensor for the purpose of discussing and improving the Work, but
+  excluding communication that is conspicuously marked or otherwise
+  designated in writing by the copyright owner as "Not a Contribution."
+  "Contributor" shall mean Licensor and any individual or Legal Entity
+  on behalf of whom a Contribution has been received by Licensor and
+  subsequently incorporated within the Work.
+2. Grant of Copyright License. Subject to the terms and conditions of
+  this License, each Contributor hereby grants to You a perpetual,
+  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+  copyright license to reproduce, prepare Derivative Works of,
+  publicly display, publicly perform, sublicense, and distribute the
+  Work and such Derivative Works in Source or Object form.
+3. Grant of Patent License. Subject to the terms and conditions of
+  this License, each Contributor hereby grants to You a perpetual,
+  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+  (except as stated in this section) patent license to make, have made,
+  use, offer to sell, sell, import, and otherwise transfer the Work,
+  where such license applies only to those patent claims licensable
+  by such Contributor that are necessarily infringed by their
+  Contribution(s) alone or by combination of their Contribution(s)
+  with the Work to which such Contribution(s) was submitted. If You
+  institute patent litigation against any entity (including a
+  cross-claim or counterclaim in a lawsuit) alleging that the Work
+  or a Contribution incorporated within the Work constitutes direct
+  or contributory patent infringement, then any patent licenses
+  granted to You under this License for that Work shall terminate
+  as of the date such litigation is filed.
+4. Redistribution. You may reproduce and distribute copies of the
+  Work or Derivative Works thereof in any medium, with or without
+  modifications, and in Source or Object form, provided that You
+  meet the following conditions:
+  (a) You must give any other recipients of the Work or
+      Derivative Works a copy of this License; and
+  (b) You must cause any modified files to carry prominent notices
+      stating that You changed the files; and
+  (c) You must retain, in the Source form of any Derivative Works
+      that You distribute, all copyright, patent, trademark, and
+      attribution notices from the Source form of the Work,
+      excluding those notices that do not pertain to any part of
+      the Derivative Works; and
+  (d) If the Work includes a "NOTICE" text file as part of its
+      distribution, then any Derivative Works that You distribute must
+      include a readable copy of the attribution notices contained
+      within such NOTICE file, excluding those notices that do not
+      pertain to any part of the Derivative Works, in at least one
+      of the following places: within a NOTICE text file distributed
+      as part of the Derivative Works; within the Source form or
+      documentation, if provided along with the Derivative Works; or,
+      within a display generated by the Derivative Works, if and
+      wherever such third-party notices normally appear. The contents
+      of the NOTICE file are for informational purposes only and
+      do not modify the License. You may add Your own attribution
+      notices within Derivative Works that You distribute, alongside
+      or as an addendum to the NOTICE text from the Work, provided
+      that such additional attribution notices cannot be construed
+      as modifying the License.
+  You may add Your own copyright statement to Your modifications and
+  may provide additional or different license terms and conditions
+  for use, reproduction, or distribution of Your modifications, or
+  for any such Derivative Works as a whole, provided Your use,
+  reproduction, and distribution of the Work otherwise complies with
+  the conditions stated in this License.
+5. Submission of Contributions. Unless You explicitly state otherwise,
+  any Contribution intentionally submitted for inclusion in the Work
+  by You to the Licensor shall be under the terms and conditions of
+  this License, without any additional terms or conditions.
+  Notwithstanding the above, nothing herein shall supersede or modify
+  the terms of any separate license agreement you may have executed
+  with Licensor regarding such Contributions.
+6. Trademarks. This License does not grant permission to use the trade
+  names, trademarks, service marks, or product names of the Licensor,
+  except as required for reasonable and customary use in describing the
+  origin of the Work and reproducing the content of the NOTICE file.
+7. Disclaimer of Warranty. Unless required by applicable law or
+  agreed to in writing, Licensor provides the Work (and each
+  Contributor provides its Contributions) on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+  implied, including, without limitation, any warranties or conditions
+  of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+  PARTICULAR PURPOSE. You are solely responsible for determining the
+  appropriateness of using or redistributing the Work and assume any
+  risks associated with Your exercise of permissions under this License.
+8. Limitation of Liability. In no event and under no legal theory,
+  whether in tort (including negligence), contract, or otherwise,
+  unless required by applicable law (such as deliberate and grossly
+  negligent acts) or agreed to in writing, shall any Contributor be
+  liable to You for damages, including any direct, indirect, special,
+  incidental, or consequential damages of any character arising as a
+  result of this License or out of the use or inability to use the
+  Work (including but not limited to damages for loss of goodwill,
+  work stoppage, computer failure or malfunction, or any and all
+  other commercial damages or losses), even if such Contributor
+  has been advised of the possibility of such damages.
+9. Accepting Warranty or Additional Liability. While redistributing
+  the Work or Derivative Works thereof, You may choose to offer,
+  and charge a fee for, acceptance of support, warranty, indemnity,
+  or other liability obligations and/or rights consistent with this
+  License. However, in accepting such obligations, You may act only
+  on Your own behalf and on Your sole responsibility, not on behalf
+  of any other Contributor, and only if You agree to indemnify,
+  defend, and hold each Contributor harmless for any liability
+  incurred by, or claims asserted against, such Contributor by reason
+  of your accepting any such warranty or additional liability.
+END OF TERMS AND CONDITIONS
+APPENDIX: How to apply the Apache License to your work.
+  To apply the Apache License to your work, attach the following
+  boilerplate notice, with the fields enclosed by brackets "[]"
+  replaced with your own identifying information. (Don't include
+  the brackets!)  The text should be enclosed in the appropriate
+  comment syntax for the file format. We also recommend that a
+  file or class name and description of purpose be included on the
+  same "printed page" as the copyright notice for easier
+  identification within third-party archives.
+Copyright © 2023 Apple Inc.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

ml-stable-diffusion/mlx/CITATION.cff ADDED Viewed

	@@ -0,0 +1,24 @@

+cff-version: 1.2.0
+title: mlx
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Awni
+    family-names: Hannun
+    affiliation: Apple
+  - given-names: Jagrit
+    family-names: Digani
+    affiliation: Apple
+  - given-names: Angelos
+    family-names: Katharopoulos
+    affiliation: Apple
+  - given-names: Ronan
+    family-names: Collobert
+    affiliation: Apple
+repository-code: 'https://github.com/ml-explore'
+abstract: >-
+  MLX: efficient and flexible machine learning on Apple
+  silicon
+license: MIT

ml-stable-diffusion/mlx/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,353 @@

+cmake_minimum_required(VERSION 3.25)
+if(NOT MLX_VERSION)
+  file(STRINGS "mlx/version.h" _mlx_h_version REGEX "^#define MLX_VERSION_.*$")
+  string(REGEX MATCH "#define MLX_VERSION_MAJOR ([0-9]+)" _ "${_mlx_h_version}")
+  set(_major ${CMAKE_MATCH_1})
+  string(REGEX MATCH "#define MLX_VERSION_MINOR ([0-9]+)" _ "${_mlx_h_version}")
+  set(_minor ${CMAKE_MATCH_1})
+  string(REGEX MATCH "#define MLX_VERSION_PATCH ([0-9]+)" _ "${_mlx_h_version}")
+  set(_patch ${CMAKE_MATCH_1})
+  set(MLX_PROJECT_VERSION "${_major}.${_minor}.${_patch}")
+  set(MLX_VERSION ${MLX_PROJECT_VERSION})
+else()
+  string(REGEX REPLACE "^([0-9]+\.[0-9]+\.[0-9]+).*" "\\1" MLX_PROJECT_VERSION
+                       ${MLX_VERSION})
+endif()
+project(
+  mlx
+  LANGUAGES C CXX
+  VERSION ${MLX_PROJECT_VERSION})
+# ----------------------------- Setup -----------------------------
+set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_INSTALL_MESSAGE NEVER)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+# ----------------------------- Configuration -----------------------------
+option(MLX_BUILD_TESTS "Build tests for mlx" ON)
+option(MLX_BUILD_EXAMPLES "Build examples for mlx" ON)
+option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
+option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
+option(MLX_BUILD_METAL "Build metal backend" ON)
+option(MLX_BUILD_CPU "Build cpu backend" ON)
+option(MLX_BUILD_CUDA "Build cuda backend" OFF)
+option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
+option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
+option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
+option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
+option(MLX_BUILD_BLAS_FROM_SOURCE "Build OpenBLAS from source code" OFF)
+option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
+option(MLX_USE_CCACHE "Use CCache for compilation cache when available" ON)
+option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
+option(USE_SYSTEM_FMT "Use system's provided fmt library" OFF)
+# --------------------- Processor tests -------------------------
+message(
+  STATUS
+    "Building MLX for ${CMAKE_SYSTEM_PROCESSOR} processor on ${CMAKE_SYSTEM_NAME}"
+)
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+  if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
+    if(NOT MLX_ENABLE_X64_MAC)
+      message(
+        FATAL_ERROR
+          "Building for x86_64 on macOS is not supported."
+          " If you are on an Apple silicon system, check the build"
+          " documentation for possible fixes: "
+          "https://ml-explore.github.io/mlx/build/html/install.html#build-from-source"
+      )
+    else()
+      set(MLX_BUILD_METAL OFF)
+      message(WARNING "Building for x86_64 arch is not officially supported.")
+    endif()
+  endif()
+else()
+  set(MLX_BUILD_METAL OFF)
+endif()
+if(MLX_USE_CCACHE)
+  find_program(CCACHE_PROGRAM ccache)
+  if(CCACHE_PROGRAM)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  endif()
+endif()
+# ----------------------------- Lib -----------------------------
+include(FetchContent)
+# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
+cmake_policy(SET CMP0135 NEW)
+add_library(mlx)
+if(MLX_BUILD_CUDA)
+  enable_language(CUDA)
+endif()
+if(MLX_BUILD_METAL)
+  find_library(METAL_LIB Metal)
+  find_library(FOUNDATION_LIB Foundation)
+  find_library(QUARTZ_LIB QuartzCore)
+  if(METAL_LIB)
+    message(STATUS "Metal found ${METAL_LIB}")
+  else()
+    message(
+      FATAL_ERROR
+        "Metal not found. Set MLX_BUILD_METAL=OFF to build without GPU")
+  endif()
+  if(MLX_METAL_DEBUG)
+    add_compile_definitions(MLX_METAL_DEBUG)
+  endif()
+  # Throw an error if xcrun not found
+  execute_process(
+    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
+    OUTPUT_VARIABLE MACOS_SDK_VERSION
+    OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)
+  if(${MACOS_SDK_VERSION} LESS 14.0)
+    message(
+      FATAL_ERROR
+        "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON")
+  endif()
+  message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")
+  set(METAL_CPP_URL
+      https://developer.apple.com/metal/cpp/files/metal-cpp_macOS15_iOS18.zip)
+  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
+    set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
+  endif()
+  execute_process(
+    COMMAND
+      zsh "-c"
+      "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
+    OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
+  FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
+  FetchContent_MakeAvailable(metal_cpp)
+  target_include_directories(
+    mlx PUBLIC $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
+               $<INSTALL_INTERFACE:include/metal_cpp>)
+  target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
+endif()
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  # With newer clang/gcc versions the following libs are implicitly linked, but
+  # when building on old distributions they need to be explicitly listed.
+  target_link_libraries(mlx PRIVATE dl pthread)
+endif()
+if(WIN32)
+  if(MSVC)
+    # GGUF does not build with MSVC.
+    set(MLX_BUILD_GGUF OFF)
+    # There is no prebuilt OpenBLAS distribution for MSVC.
+    set(MLX_BUILD_BLAS_FROM_SOURCE ON)
+  endif()
+  # Windows implementation of dlfcn.h APIs.
+  FetchContent_Declare(
+    dlfcn-win32
+    GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
+    GIT_TAG v1.4.1
+    EXCLUDE_FROM_ALL)
+  block()
+  set(BUILD_SHARED_LIBS OFF)
+  FetchContent_MakeAvailable(dlfcn-win32)
+  endblock()
+  target_include_directories(mlx PRIVATE "${dlfcn-win32_SOURCE_DIR}/src")
+  target_link_libraries(mlx PRIVATE dl)
+endif()
+if(MLX_BUILD_CPU)
+  find_library(ACCELERATE_LIBRARY Accelerate)
+  if(ACCELERATE_LIBRARY)
+    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
+    set(MLX_BUILD_ACCELERATE ON)
+  else()
+    message(STATUS "Accelerate not found, using default backend.")
+    set(MLX_BUILD_ACCELERATE OFF)
+  endif()
+  if(MLX_BUILD_ACCELERATE)
+    target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
+    add_compile_definitions(MLX_USE_ACCELERATE)
+    add_compile_definitions(ACCELERATE_NEW_LAPACK)
+  elseif(MLX_BUILD_BLAS_FROM_SOURCE)
+    # Download and build OpenBLAS from source code.
+    FetchContent_Declare(
+      openblas
+      GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
+      GIT_TAG v0.3.28
+      EXCLUDE_FROM_ALL)
+    set(BUILD_STATIC_LIBS ON) # link statically
+    set(NOFORTRAN ON) # msvc has no fortran compiler
+    FetchContent_MakeAvailable(openblas)
+    target_link_libraries(mlx PRIVATE openblas)
+    target_include_directories(
+      mlx PRIVATE "${openblas_SOURCE_DIR}/lapack-netlib/LAPACKE/include"
+                  "${CMAKE_BINARY_DIR}/generated" "${CMAKE_BINARY_DIR}")
+  else()
+    if(${CMAKE_HOST_APPLE})
+      # The blas shipped in macOS SDK is not supported, search homebrew for
+      # openblas instead.
+      set(BLA_VENDOR OpenBLAS)
+      set(LAPACK_ROOT
+          "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
+    endif()
+    # Search and link with lapack.
+    find_package(LAPACK REQUIRED)
+    if(NOT LAPACK_FOUND)
+      message(FATAL_ERROR "Must have LAPACK installed")
+    endif()
+    find_path(LAPACK_INCLUDE_DIRS lapacke.h /usr/include /usr/local/include
+              /usr/local/opt/openblas/include)
+    message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
+    message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
+    target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
+    target_link_libraries(mlx PRIVATE ${LAPACK_LIBRARIES})
+    # List blas after lapack otherwise we may accidentally include an old
+    # version of lapack.h from the include dirs of blas.
+    find_package(BLAS REQUIRED)
+    if(NOT BLAS_FOUND)
+      message(FATAL_ERROR "Must have BLAS installed")
+    endif()
+    # TODO find a cleaner way to do this
+    find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include /usr/local/include
+              $ENV{BLAS_HOME}/include)
+    message(STATUS "Blas lib " ${BLAS_LIBRARIES})
+    message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
+    target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
+    target_link_libraries(mlx PRIVATE ${BLAS_LIBRARIES})
+  endif()
+else()
+  set(MLX_BUILD_ACCELERATE OFF)
+endif()
+message(STATUS "Downloading json")
+FetchContent_Declare(
+  json
+  URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
+FetchContent_MakeAvailable(json)
+target_include_directories(
+  mlx PRIVATE $<BUILD_INTERFACE:${json_SOURCE_DIR}/single_include/nlohmann>)
+add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)
+target_include_directories(
+  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+             $<INSTALL_INTERFACE:include>)
+# Do not add mlx_EXPORTS define for shared library.
+set_target_properties(mlx PROPERTIES DEFINE_SYMBOL "")
+if(USE_SYSTEM_FMT)
+  find_package(fmt REQUIRED)
+else()
+  FetchContent_Declare(
+    fmt
+    GIT_REPOSITORY https://github.com/fmtlib/fmt.git
+    GIT_TAG 10.2.1
+    EXCLUDE_FROM_ALL)
+  FetchContent_MakeAvailable(fmt)
+endif()
+target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)
+if(MLX_BUILD_PYTHON_BINDINGS)
+  message(STATUS "Building Python bindings.")
+  find_package(
+    Python 3.8
+    COMPONENTS Interpreter Development.Module
+    REQUIRED)
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE nanobind_ROOT)
+  find_package(nanobind CONFIG REQUIRED)
+  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/python/src)
+endif()
+if(MLX_BUILD_TESTS)
+  include(CTest)
+  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tests)
+endif()
+if(MLX_BUILD_EXAMPLES)
+  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples/cpp)
+endif()
+if(MLX_BUILD_BENCHMARKS)
+  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/benchmarks/cpp)
+endif()
+# ----------------------------- Installation -----------------------------
+include(GNUInstallDirs)
+# Install library
+install(
+  TARGETS mlx
+  EXPORT MLXTargets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  INCLUDES
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+# Install headers
+install(
+  DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/mlx
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  COMPONENT headers
+  FILES_MATCHING
+  PATTERN "*.h"
+  PATTERN "backend/metal/kernels.h" EXCLUDE)
+# Install metal dependencies
+if(MLX_BUILD_METAL)
+  # Install metal cpp
+  install(
+    DIRECTORY ${metal_cpp_SOURCE_DIR}/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/metal_cpp
+    COMPONENT metal_cpp_source)
+endif()
+# Install cmake config
+set(MLX_CMAKE_BUILD_CONFIG ${CMAKE_BINARY_DIR}/MLXConfig.cmake)
+set(MLX_CMAKE_BUILD_VERSION_CONFIG ${CMAKE_BINARY_DIR}/MLXConfigVersion.cmake)
+set(MLX_CMAKE_INSTALL_MODULE_DIR share/cmake/MLX)
+install(
+  EXPORT MLXTargets
+  FILE MLXTargets.cmake
+  DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  ${MLX_CMAKE_BUILD_VERSION_CONFIG}
+  COMPATIBILITY SameMajorVersion
+  VERSION ${MLX_VERSION})
+configure_package_config_file(
+  ${CMAKE_CURRENT_LIST_DIR}/mlx.pc.in ${MLX_CMAKE_BUILD_CONFIG}
+  INSTALL_DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO
+  PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR
+            MLX_CMAKE_INSTALL_MODULE_DIR)
+install(FILES ${MLX_CMAKE_BUILD_CONFIG} ${MLX_CMAKE_BUILD_VERSION_CONFIG}
+        DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})
+install(DIRECTORY ${CMAKE_MODULE_PATH}/
+        DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})

ml-stable-diffusion/mlx/CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,132 @@

+# Contributor Covenant Code of Conduct
+## Our Pledge
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+## Our Standards
+Examples of behavior that contributes to a positive environment for our
+community include:
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+Examples of unacceptable behavior include:
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Enforcement Responsibilities
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+## Scope
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+[opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com).
+All complaints will be reviewed and investigated promptly and fairly.
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+## Enforcement Guidelines
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+### 1. Correction
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+### 2. Warning
+**Community Impact**: A violation through a single incident or series of
+actions.
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+### 3. Temporary Ban
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+### 4. Permanent Ban
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations

ml-stable-diffusion/mlx/CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,38 @@

+# Contributing to MLX
+We want to make contributing to this project as easy and transparent as
+possible.
+## Pull Requests
+1. Fork and submit pull requests to the repo.
+2. If you've added code that should be tested, add tests.
+3. If a change is likely to impact efficiency, run some of the benchmarks before
+   and after the change. Examples of benchmarks can be found in `benchmarks/python/`.
+4. If you've changed APIs, update the documentation.
+5. Every PR should have passing tests and at least one review.
+6. For code formatting install `pre-commit` using something like `pip install pre-commit` and run `pre-commit install`.
+   This should install hooks for running `black` and `clang-format` to ensure
+   consistent style for C++ and Python code.
+   You can also run the formatters manually as follows:
+   ```shell
+   clang-format -i file.cpp
+   ```
+   ```shell
+   black file.py
+   ```
+   or run `pre-commit run --all-files` to check all files in the repo.
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+## License
+By contributing to MLX, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.

ml-stable-diffusion/mlx/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright © 2023 Apple Inc.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

ml-stable-diffusion/mlx/MANIFEST.in ADDED Viewed

	@@ -0,0 +1,6 @@

+include CMakeLists.txt
+include mlx.pc.in
+recursive-include mlx/ *
+include cmake/*
+include python/src/*
+include python/mlx/py.typed # support type hinting as in PEP-561

ml-stable-diffusion/mlx/README.md ADDED Viewed

	@@ -0,0 +1,121 @@

+# MLX
+[**Quickstart**](#quickstart) | [**Installation**](#installation) |
+[**Documentation**](https://ml-explore.github.io/mlx/build/html/index.html) |
+[**Examples**](#examples)
+[![CircleCI](https://circleci.com/gh/ml-explore/mlx.svg?style=svg)](https://circleci.com/gh/ml-explore/mlx)
+MLX is an array framework for machine learning on Apple silicon,
+brought to you by Apple machine learning research.
+Some key features of MLX include:
+ - **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
+   also has fully featured C++, [C](https://github.com/ml-explore/mlx-c), and
+   [Swift](https://github.com/ml-explore/mlx-swift/) APIs, which closely mirror
+   the Python API. MLX has higher-level packages like `mlx.nn` and
+   `mlx.optimizers` with APIs that closely follow PyTorch to simplify building
+   more complex models.
+ - **Composable function transformations**: MLX supports composable function
+   transformations for automatic differentiation, automatic vectorization,
+   and computation graph optimization.
+ - **Lazy computation**: Computations in MLX are lazy. Arrays are only
+   materialized when needed.
+ - **Dynamic graph construction**: Computation graphs in MLX are constructed
+   dynamically. Changing the shapes of function arguments does not trigger
+   slow compilations, and debugging is simple and intuitive.
+ - **Multi-device**: Operations can run on any of the supported devices
+   (currently the CPU and the GPU).
+ - **Unified memory**: A notable difference between MLX and other frameworks
+   is the *unified memory model*. Arrays in MLX live in shared memory.
+   Operations on MLX arrays can be performed on any of the supported
+   device types without transferring data.
+MLX is designed by machine learning researchers for machine learning
+researchers. The framework is intended to be user-friendly, but still efficient
+to train and deploy models. The design of the framework itself is also
+conceptually simple. We intend to make it easy for researchers to extend and
+improve MLX with the goal of quickly exploring new ideas.
+The design of MLX is inspired by frameworks like
+[NumPy](https://numpy.org/doc/stable/index.html),
+[PyTorch](https://pytorch.org/), [Jax](https://github.com/google/jax), and
+[ArrayFire](https://arrayfire.org/).
+## Examples
+The [MLX examples repo](https://github.com/ml-explore/mlx-examples) has a
+variety of examples, including:
+- [Transformer language model](https://github.com/ml-explore/mlx-examples/tree/main/transformer_lm) training.
+- Large-scale text generation with
+  [LLaMA](https://github.com/ml-explore/mlx-examples/tree/main/llms/llama) and
+  finetuning with [LoRA](https://github.com/ml-explore/mlx-examples/tree/main/lora).
+- Generating images with [Stable Diffusion](https://github.com/ml-explore/mlx-examples/tree/main/stable_diffusion).
+- Speech recognition with [OpenAI's Whisper](https://github.com/ml-explore/mlx-examples/tree/main/whisper).
+## Quickstart
+See the [quick start
+guide](https://ml-explore.github.io/mlx/build/html/usage/quick_start.html)
+in the documentation.
+## Installation
+MLX is available on [PyPI](https://pypi.org/project/mlx/). To install MLX on
+macOS, run:
+```bash
+pip install mlx
+```
+To install the CUDA backend on Linux, run:
+```bash
+pip install mlx[cuda]
+```
+To install a CPU-only Linux package, run:
+```bash
+pip install mlx[cpu]
+```
+Check out the
+[documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
+for more information on building the C++ and Python APIs from source.
+## Contributing
+Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
+on contributing to MLX. See the
+[docs](https://ml-explore.github.io/mlx/build/html/install.html) for more
+information on building from source, and running tests.
+We are grateful for all of [our
+contributors](https://github.com/ml-explore/mlx/tree/main/ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
+to MLX and wish to be acknowledged, please add your name to the list in your
+pull request.
+## Citing MLX
+The MLX software suite was initially developed with equal contribution by Awni
+Hannun, Jagrit Digani, Angelos Katharopoulos, and Ronan Collobert. If you find
+MLX useful in your research and wish to cite it, please use the following
+BibTeX entry:
+```
+@software{mlx2023,
+  author = {Awni Hannun and Jagrit Digani and Angelos Katharopoulos and Ronan Collobert},
+  title = {{MLX}: Efficient and flexible machine learning on Apple silicon},
+  url = {https://github.com/ml-explore},
+  version = {0.0},
+  year = {2023},
+}
+```

ml-stable-diffusion/mlx/benchmarks/cpp/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+# Define one benchmark executable built from a single source file and link it
+# against the `mlx` target.
+#
+# SRCFILE: source file to compile. The executable target is named after the
+# file with its directory and extension removed (NAME_WE).
+function(build_benchmark SRCFILE)
+  get_filename_component(src_name ${SRCFILE} NAME_WE)
+  set(target "${src_name}")
+  add_executable(${target} ${SRCFILE})
+  target_link_libraries(${target} PRIVATE mlx)
+endfunction(build_benchmark)
+build_benchmark(single_ops.cpp)
+build_benchmark(irregular_strides.cpp)
+build_benchmark(compare_devices.cpp)
+build_benchmark(autograd.cpp)

ml-stable-diffusion/mlx/benchmarks/cpp/autograd.cpp ADDED Viewed

	@@ -0,0 +1,39 @@

+// Copyright © 2023 Apple Inc.
+#include <iostream>
+#include "mlx/mlx.h"
+#include "time_utils.h"
+namespace mx = mlx::core;
+// Benchmarks two ways of obtaining a function's value and gradient: calling
+// the function and its mx::grad transform separately, versus one call to the
+// mx::value_and_grad transform.
+void time_value_and_grad() {
+  auto x = mx::ones({200, 1000});
+  // Evaluate the input before any timing starts.
+  mx::eval(x);
+  // Function under test: 20 chained log(exp(x)) applications, then a sum.
+  auto fn = [](mx::array x) {
+    for (int i = 0; i < 20; ++i) {
+      x = mx::log(mx::exp(x));
+    }
+    return mx::sum(x);
+  };
+  auto grad_fn = mx::grad(fn);
+  // Case 1: value and gradient built by two independent calls.
+  auto independent_value_and_grad = [&]() {
+    auto value = fn(x);
+    auto dfdx = grad_fn(x);
+    return std::vector<mx::array>{value, dfdx};
+  };
+  TIME(independent_value_and_grad);
+  auto value_and_grad_fn = mx::value_and_grad(fn);
+  // Case 2: value and gradient returned together by a single call.
+  auto combined_value_and_grad = [&]() {
+    auto [value, dfdx] = value_and_grad_fn(x);
+    return std::vector<mx::array>{value, dfdx};
+  };
+  TIME(combined_value_and_grad);
+}
+int main() {
+  std::cout << "Benchmarks for " << mx::default_device() << std::endl;
+  time_value_and_grad();
+}

ml-stable-diffusion/mlx/benchmarks/cpp/compare_devices.cpp ADDED Viewed

	@@ -0,0 +1,27 @@

+// Copyright © 2023 Apple Inc.
+#include <iostream>
+#include "mlx/mlx.h"
+#include "time_utils.h"
+namespace mx = mlx::core;
+// Times mx::add with an explicit CPU device and an explicit GPU device on
+// 1-D inputs whose length grows by powers of ten.
+void time_add_op() {
+  // sizes = {1, 10, 100, ..., 10^9}: one initial element plus nine pushes.
+  std::vector<int> sizes(1, 1);
+  for (int i = 0; i < 9; ++i) {
+    sizes.push_back(10 * sizes.back());
+  }
+  // The default device is switched to the CPU before the inputs are created;
+  // the device used by each timed add is passed explicitly below.
+  set_default_device(mx::Device::cpu);
+  for (auto size : sizes) {
+    auto a = mx::random::uniform({size});
+    auto b = mx::random::uniform({size});
+    // Evaluate the inputs before timing the add itself.
+    mx::eval(a, b);
+    std::cout << "Size " << size << std::endl;
+    TIMEM("cpu", mx::add, a, b, mx::Device::cpu);
+    TIMEM("gpu", mx::add, a, b, mx::Device::gpu);
+  }
+}
+int main() {
+  time_add_op();
+}

ml-stable-diffusion/mlx/benchmarks/cpp/irregular_strides.cpp ADDED Viewed

	@@ -0,0 +1,201 @@

+// Copyright © 2023 Apple Inc.
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include "mlx/mlx.h"
+#include "time_utils.h"
+namespace mx = mlx::core;
+void time_irregular_binary_ops_1D() {
+  auto device = mx::default_device();
+  int size = 1000000;
+  int step = 2;
+  auto a = mx::random::uniform({size});
+  auto b = mx::random::uniform({size});
+  mx::eval(a, b);
+  a = slice(a, {0}, {size}, {step});
+  b = slice(b, {0}, {size}, {step});
+  TIMEM("1D strided", mx::add, a, b, device);
+}
+void time_irregular_binary_ops_2D() {
+  auto device = mx::default_device();
+  int size = 2048;
+  auto a = mx::random::uniform({size, size});
+  auto b = mx::random::uniform({size, size});
+  mx::eval(a, b);
+  TIMEM("2D regular", mx::add, a, b, device);
+  b = mx::transpose(b);
+  mx::eval(b);
+  TIMEM("2D mx::transpose", mx::add, a, b, device);
+  b = mx::random::uniform({size});
+  mx::eval(b);
+  TIMEM("2D broadcast dim 0", mx::add, a, b, device);
+  b = mx::reshape(b, {size, 1});
+  mx::eval(b);
+  TIMEM("2D broadcast dim 1", mx::add, a, b, device);
+}
+void time_irregular_binary_ops_3D() {
+  auto device = mx::default_device();
+  int d0 = 32;
+  int d1 = 512;
+  int d2 = 512;
+  auto a = mx::random::uniform({d0, d1, d2});
+  auto b = mx::random::uniform({d0, d1, d2});
+  TIMEM("3D regular", mx::add, a, b, device);
+  b = mx::transpose(b, {0, 2, 1});
+  TIMEM("3D mx::transpose", mx::add, a, b, device);
+  b = mx::random::uniform({d1, d2});
+  TIMEM("3D broadcast dim 0", mx::add, a, b, device);
+  b = mx::random::uniform({d0, 1, d2});
+  TIMEM("3D broadcast dim 1", mx::add, a, b, device);
+  b = mx::random::uniform({d0, d1, 1});
+  TIMEM("3D broadcast dim 2", mx::add, a, b, device);
+  b = mx::random::uniform({d2});
+  TIMEM("3D broadcast dims 0, 1", mx::add, a, b, device);
+  b = mx::random::uniform({d1, 1});
+  TIMEM("3D broadcast dims 0, 2", mx::add, a, b, device);
+  b = mx::random::uniform({d0, 1, 1});
+  TIMEM("3D broadcast dims 1, 2", mx::add, a, b, device);
+}
+// Times mx::add on 4-D operands: two same-shape inputs, a transposed second
+// input, and a second input `b` that has size 1 along every combination of
+// one, two, or three axes (i < j < k) so that it broadcasts against `a`.
+void time_irregular_binary_ops_4D() {
+  auto device = mx::default_device();
+  std::vector<int> shape = {8, 8, 512, 512};
+  auto a = mx::random::uniform(shape);
+  auto b = mx::random::uniform(shape);
+  TIMEM("4D regular", mx::add, a, b, device);
+  b = mx::transpose(b, {0, 1, 3, 2});
+  TIMEM("4D mx::transpose", mx::add, a, b, device);
+  std::string om = "4D broadcast dims ";
+  // Hoisted so the loop conditions compare int with int.
+  const int ndim = static_cast<int>(shape.size());
+  for (int i = 0; i < ndim; ++i) {
+    shape[i] = 1;
+    b = mx::random::uniform(shape);
+    std::ostringstream msg_i;
+    msg_i << om << i;
+    TIMEM(msg_i.str(), mx::add, a, b, device);
+    for (int j = i + 1; j < ndim; ++j) {
+      shape[j] = 1;
+      std::ostringstream msg_ij;
+      msg_ij << om << i << ", " << j;
+      b = mx::random::uniform(shape);
+      TIMEM(msg_ij.str(), mx::add, a, b, device);
+      for (int k = j + 1; k < ndim; ++k) {
+        shape[k] = 1;
+        std::ostringstream msg_ijk;
+        msg_ijk << om << i << ", " << j << ", " << k;
+        b = mx::random::uniform(shape);
+        TIMEM(msg_ijk.str(), mx::add, a, b, device);
+        shape[k] = a.shape(k);
+      }
+      // Axis j is restored only after the k loop: restoring it earlier would
+      // make the "i, j, k" cases broadcast along i and k only, which does not
+      // match their label and repeats a two-axis case.
+      shape[j] = a.shape(j);
+    }
+    shape[i] = a.shape(i);
+  }
+}
+// Times mx::reshape of a {d, d, d} array (d = 2 * size) into
+// {8 * size, size, size} for a freshly generated input, transposed inputs,
+// and inputs produced by mx::broadcast_to.
+void time_irregular_reshape() {
+  auto device = mx::default_device();
+  std::vector<int> shape;
+  // `shape` is captured by reference, so the value assigned below is the one
+  // in effect whenever the timing helper invokes reshape_fn.
+  auto reshape_fn = [&shape, device](const mx::array& a) {
+    return mx::reshape(a, shape, device);
+  };
+  int size = 64;
+  int d = 2 * size;
+  auto a = mx::random::uniform({d, d, d});
+  // Target shape shared by every case; it has as many elements as {d, d, d}.
+  shape = {8 * size, size, size};
+  TIMEM("3D contiguous", reshape_fn, a);
+  a = mx::transpose(a);
+  TIMEM("3D mx::transpose", reshape_fn, a);
+  // This permutation is applied on top of the transpose above, not to the
+  // original array.
+  a = mx::transpose(a, {1, 2, 0});
+  TIMEM("3D mx::transpose dims 1 2", reshape_fn, a);
+  a = mx::broadcast_to(mx::random::uniform({d, d}), {d, d, d});
+  TIMEM("3D broadcast dim 0", reshape_fn, a);
+  a = mx::broadcast_to(mx::random::uniform({d, 1, d}), {d, d, d});
+  TIMEM("3D broadcast dim 1", reshape_fn, a);
+  a = mx::broadcast_to(mx::random::uniform({d, d, 1}), {d, d, d});
+  TIMEM("3D broadcast dim 2", reshape_fn, a);
+  a = mx::broadcast_to(mx::random::uniform({d}), {d, d, d});
+  TIMEM("3D broadcast dims 0, 1", reshape_fn, a);
+  a = mx::broadcast_to(mx::random::uniform({d, 1}), {d, d, d});
+  TIMEM("3D broadcast dims 0, 2", reshape_fn, a);
+  a = mx::broadcast_to(mx::random::uniform({d, 1, 1}), {d, d, d});
+  TIMEM("3D broadcast dims 1, 2", reshape_fn, a);
+  // A {1, 1, 1} source is broadcast along all three axes; the label uses the
+  // same 0-based axis numbering as the cases above.
+  a = mx::broadcast_to(mx::random::uniform({1, 1, 1}), {d, d, d});
+  TIMEM("3D broadcast dims 0, 1, 2", reshape_fn, a);
+}
+void time_irregular_astype_1D() {
+  auto device = mx::default_device();
+  int size = 1000000;
+  int step = 2;
+  auto a = mx::random::uniform({size});
+  a = slice(a, {0}, {size}, {step});
+  TIMEM("1D strided", mx::astype, a, mx::int32, device);
+}
+void time_irregular_astype_2D() {
+  auto device = mx::default_device();
+  int size = 2048;
+  std::vector<int> shape = {size, size};
+  auto a = mx::random::uniform(shape);
+  TIMEM("2D regular", mx::astype, a, mx::int32, device);
+  a = mx::transpose(a);
+  TIMEM("2D mx::transpose", mx::astype, a, mx::int32, device);
+  a = mx::broadcast_to(mx::random::uniform({size}), shape);
+  TIMEM("2D broadcast dim 0", mx::astype, a, mx::int32, device);
+  a = mx::broadcast_to(mx::random::uniform({size, 1}), shape);
+  TIMEM("2D broadcast dim 1", mx::astype, a, mx::int32, device);
+}
+// Runs every irregular-stride benchmark in this file. When a command-line
+// argument is given, the default device is set first: "gpu" selects the GPU
+// and any other value selects the CPU. With no argument the default device
+// is left unchanged.
+int main(int argc, char** argv) {
+  if (argc > 1) {
+    // strcmp returns 0 on an exact match, so use_gpu is true only for "gpu".
+    bool use_gpu = !strcmp(argv[1], "gpu");
+    set_default_device(use_gpu ? mx::Device::gpu : mx::Device::cpu);
+  }
+  std::cout << "Benchmarks for " << mx::default_device() << std::endl;
+  time_irregular_binary_ops_1D();
+  time_irregular_binary_ops_2D();
+  time_irregular_binary_ops_3D();
+  time_irregular_binary_ops_4D();
+  time_irregular_reshape();
+  time_irregular_astype_1D();
+  time_irregular_astype_2D();
+}

ml-stable-diffusion/mlx/benchmarks/cpp/single_ops.cpp ADDED Viewed

	@@ -0,0 +1,288 @@

+// Copyright © 2023 Apple Inc.
+#include "mlx/mlx.h"
+#include "time_utils.h"
+namespace mx = mlx::core;
+void time_creation_ops() {
+  int M = 2000;
+  int N = 500;
+  auto shape = {M, N};
+  auto full_fp32 = [&]() { return mx::full(shape, 3.3f); };
+  TIME(full_fp32);
+  auto zeros_fp32 = [&]() { return mx::zeros(shape, mx::float32); };
+  TIME(zeros_fp32);
+  auto ones_fp32 = [&]() { return mx::ones(shape, mx::float32); };
+  TIME(ones_fp32);
+  auto arange_fp32 = [&]() { return mx::arange(0.0, 10.0, 1e-4); };
+  TIME(arange_fp32);
+}
+void time_type_conversions() {
+  int M = 2000;
+  int N = 500;
+  auto shape = {M, N};
+  auto device = mx::default_device();
+  auto a = mx::zeros(shape, mx::float32);
+  mx::eval(a);
+  TIMEM("mx::float32 to mx::int32", mx::astype, a, mx::int32, device);
+  TIMEM("mx::float32 to mx::uint32", mx::astype, a, mx::uint32, device);
+  a = mx::zeros(shape, mx::int32);
+  mx::eval(a);
+  TIMEM("mx::int32 to mx::float32", mx::astype, a, mx::float32, device);
+  a = mx::zeros(shape, mx::bool_);
+  mx::eval(a);
+  TIMEM("bool to mx::float32", mx::astype, a, mx::float32, device);
+  TIMEM("bool to mx::int32", mx::astype, a, mx::int32, device);
+  TIMEM("bool to mx::uint32", mx::astype, a, mx::uint32, device);
+}
+void time_random_generation() {
+  int M = 2000;
+  int N = 500;
+  auto uniform = [&]() { return mx::random::uniform({M, N}, mx::float32); };
+  TIME(uniform);
+  auto normal = [&]() { return mx::random::normal({M, N}, mx::float32); };
+  TIME(normal);
+}
+void time_unary_ops() {
+  int M = 2000;
+  int N = 500;
+  auto device = mx::default_device();
+  auto a = mx::random::normal({M, N});
+  mx::eval(a);
+  TIME(mlx::core::abs, a, device);
+  TIME(mx::negative, a, device);
+  TIME(mx::sign, a, device);
+  TIME(mx::square, a, device);
+  TIME(mlx::core::sqrt, a, device);
+  TIME(mx::rsqrt, a, device);
+  TIME(mlx::core::exp, a, device);
+  a = mx::random::uniform({M, N});
+  TIME(mlx::core::log, a, device);
+}
+void time_binary_ops() {
+  int M = 1000, N = 100, K = 10;
+  auto condition = mx::random::randint(0, 2, {M, N, K});
+  auto a = mx::random::uniform({M, N, K});
+  auto b = mx::random::uniform({M, N, K});
+  auto device = mx::default_device();
+  mx::eval(a, b);
+  TIME(mx::add, a, b, device);
+  TIME(mx::subtract, a, b, device);
+  TIME(mx::multiply, a, b, device);
+  TIME(mx::divide, a, b, device);
+  TIME(mx::maximum, a, b, device);
+  TIME(mx::minimum, a, b, device);
+  TIME(mx::where, condition, a, b, device);
+  condition = mx::array({true});
+  b = mx::random::uniform({1});
+  mx::eval(b);
+  TIMEM("scalar", mx::add, a, b, device);
+  TIMEM("vector-scalar", mx::subtract, a, b, device);
+  TIMEM("scalar-vector", mx::subtract, b, a, device);
+  TIMEM("scalar", mx::multiply, a, b, device);
+  TIMEM("vector-scalar", mx::divide, a, b, device);
+  TIMEM("scalar-vector", mx::divide, b, a, device);
+  TIMEM("scalar-vector", mx::where, condition, a, b, device);
+  condition = mx::broadcast_to(mx::array({true}), {1000, 100});
+  a = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
+  b = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
+  mx::eval(a, b);
+  TIMEM("scalar-scalar broadcast", mx::add, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::subtract, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::multiply, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::divide, a, b, device);
+  TIMEM("scalar-scalar broadcast", mx::where, condition, a, b, device);
+}
+void time_strided_ops() {
+  int M = 50, N = 50, O = 50, P = 50;
+  auto a = mx::random::uniform({M, N, O, P});
+  auto b = mx::random::uniform({M, N, O, P});
+  auto device = mx::default_device();
+  mx::eval(a, b);
+  TIMEM("non-strided", mx::add, a, b, device);
+  a = mx::transpose(a, {1, 0, 2, 3});
+  b = mx::transpose(b, {3, 2, 0, 1});
+  mx::eval(a, b);
+  TIMEM("strided", mx::add, a, b, device);
+}
+void time_comparisons() {
+  int M = 1000, N = 100, K = 10;
+  auto a = mx::random::uniform({M, N, K});
+  auto b = mx::random::uniform({M, N, K});
+  auto device = mx::default_device();
+  mx::eval(a, b);
+  TIME(mx::equal, a, b, device);
+  TIME(mx::greater, a, b, device);
+  TIME(mx::greater_equal, a, b, device);
+  TIME(mx::less, a, b, device);
+  TIME(mx::less_equal, a, b, device);
+}
+void time_matvec() {
+  int M = 2000, N = 200;
+  auto a = mx::random::uniform({M, N});
+  auto b = mx::random::uniform({N});
+  auto c = mx::random::uniform({M});
+  mx::eval(a, b, c);
+  auto matvec = [&]() { return mx::matmul(a, b); };
+  TIME(matvec);
+  auto matvec_transpose = [&]() { return mx::matmul(mx::transpose(a), c); };
+  TIME(matvec_transpose);
+}
+void time_matmul() {
+  int M = 1000, N = 1000, K = 1000;
+  auto a = mx::random::uniform({M, K});
+  auto b = mx::random::uniform({K, N});
+  auto device = mx::default_device();
+  mx::eval(a, b);
+  TIME(mx::matmul, a, b, device);
+  auto transpose_matmul = [&]() { return mx::matmul(mx::transpose(a), b); };
+  TIME(transpose_matmul);
+}
+void time_reductions() {
+  auto a = mx::random::normal({10000, 1000});
+  mx::eval(a);
+  auto sum_all = [&a]() { return mx::sum(a, false); };
+  TIME(sum_all);
+  auto sum_along_0 = [&a]() { return mx::sum(a, 0, false); };
+  TIME(sum_along_0);
+  auto sum_along_1 = [&a]() { return mx::sum(a, 1, false); };
+  TIME(sum_along_1);
+  auto prod_all = [&a]() { return mx::prod(a, false); };
+  TIME(prod_all);
+  auto all_true = [&a]() { return mx::all(a, false); };
+  TIME(all_true);
+  auto all_along_0 = [&a]() { return mx::all(a, 0, false); };
+  TIME(all_along_0);
+  auto all_along_1 = [&a]() { return mx::all(a, 1, false); };
+  TIME(all_along_1);
+  auto any_true = [&a]() { return mx::any(a, false); };
+  TIME(any_true);
+  auto argmin_along_0 = [&a]() { return mx::argmin(a, 0, false); };
+  TIME(argmin_along_0);
+  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
+  TIME(argmin_along_1);
+  auto indices = mx::array({1});
+  auto updates = mx::reshape(mx::array({NAN}), {1, 1, 1});
+  std::vector<int> axes{0};
+  auto b = scatter(a, {indices}, updates, axes);
+  mx::eval(b);
+  auto max_along_0 = [&b]() { return mx::max(b, 0, false); };
+  TIME(max_along_0);
+  auto max_along_1 = [&b]() { return mx::max(b, 1, false); };
+  TIME(max_along_1);
+  auto min_along_0 = [&b]() { return mx::min(b, 0, false); };
+  TIME(min_along_0);
+  auto min_along_1 = [&b]() { return mx::min(b, 1, false); };
+  TIME(min_along_1);
+}
+void time_gather_scatter() {
+  auto a = mx::random::normal({1000, 768});
+  mx::eval(a);
+  auto indices = mx::random::randint(0, 1000, {256});
+  mx::eval(indices);
+  auto embedding_lookup = [&a, &indices]() { return mx::take(a, indices, 0); };
+  TIME(embedding_lookup);
+  indices = mx::random::randint(0, 768 * 1000, {256 * 768});
+  mx::eval(indices);
+  auto single_element_lookup = [&a, &indices]() {
+    return mx::take(a, indices);
+  };
+  TIME(single_element_lookup);
+  indices = mx::random::randint(0, 1000, {256});
+  auto updates = mx::random::normal({256, 1, 768});
+  mx::eval(indices, updates);
+  auto embedding_update = [&a, &indices, &updates]() {
+    return scatter(a, indices, updates, 0);
+  };
+  TIME(embedding_update);
+  auto embedding_add = [&a, &indices, &updates]() {
+    return scatter_add(a, indices, updates, 0);
+  };
+  TIME(embedding_add);
+  a = mx::reshape(a, {-1});
+  indices = mx::random::randint(0, 768 * 1000, {768 * 256});
+  updates = mx::random::normal({256 * 768, 1});
+  mx::eval(a, indices, updates);
+  auto single_element_update = [&a, &indices, &updates]() {
+    return scatter(a, indices, updates, 0);
+  };
+  TIME(single_element_update);
+  auto single_element_add = [&a, &indices, &updates]() {
+    return scatter_add(a, indices, updates, 0);
+  };
+  TIME(single_element_add);
+}
+void time_divmod() {
+  auto a = mx::random::normal({1000});
+  auto b = mx::random::normal({1000});
+  mx::eval({a, b});
+  auto divmod_fused = [&a, &b]() { return mx::divmod(a, b); };
+  TIME(divmod_fused);
+  auto divmod_separate = [&a, &b]() {
+    return std::vector<mx::array>{mx::floor_divide(a, b), mx::remainder(a, b)};
+  };
+  TIME(divmod_separate);
+}
+int main() {
+  std::cout << "Benchmarks for " << mx::default_device() << std::endl;
+  time_creation_ops();
+  time_type_conversions();
+  time_unary_ops();
+  time_binary_ops();
+  time_strided_ops();
+  time_random_generation();
+  time_comparisons();
+  time_matvec();
+  time_matmul();
+  time_reductions();
+  time_gather_scatter();
+  time_divmod();
+}

ml-stable-diffusion/mlx/benchmarks/cpp/time_utils.h ADDED Viewed

	@@ -0,0 +1,39 @@

+// Copyright © 2023 Apple Inc.
+#pragma once
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include "mlx/mlx.h"
+#define milliseconds(x) \
+  (std::chrono::duration_cast<std::chrono::nanoseconds>(x).count() / 1e6)
+#define time_now() std::chrono::high_resolution_clock::now()
+#define TIME(FUNC, ...)                                                        \
+  std::cout << "Timing " << #FUNC << " ... " << std::flush                     \
+            << std::setprecision(5) << time_fn(FUNC, ##__VA_ARGS__) << " msec" \
+            << std::endl;
+#define TIMEM(MSG, FUNC, ...)                                      \
+  std::cout << "Timing " << "(" << MSG << ") " << #FUNC << " ... " \
+            << std::flush << std::setprecision(5)                  \
+            << time_fn(FUNC, ##__VA_ARGS__) << " msec" << std::endl;
+// Returns the average wall-clock time, in milliseconds, of evaluating
+// fn(args...) over 100 timed iterations that follow 5 untimed warm-up
+// iterations. The result of each call is passed to eval().
+//
+// The arguments are reused on every iteration, so they are passed to `fn` as
+// lvalues instead of being std::forward-ed: forwarding an rvalue argument
+// would allow the first call to move from it and leave every later call
+// operating on a moved-from object.
+template <typename F, typename... Args>
+double time_fn(F fn, Args&&... args) {
+  // warmup
+  for (int i = 0; i < 5; ++i) {
+    eval(fn(args...));
+  }
+  const int num_iters = 100;
+  auto start = time_now();
+  for (int i = 0; i < num_iters; i++) {
+    eval(fn(args...));
+  }
+  auto end = time_now();
+  return milliseconds(end - start) / static_cast<double>(num_iters);
+}

ml-stable-diffusion/mlx/benchmarks/numpy/single_ops.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# Copyright © 2023 Apple Inc.
+import numpy as np
+from time_utils import time_fn
+def time_add():
+    a = np.ones((100, 100, 10), dtype=np.float32)
+    b = np.ones((100, 100, 10), dtype=np.float32)
+    time_fn(np.add, a, b)
+def time_matmul():
+    a = np.random.rand(1000, 500).astype(np.float32)
+    b = np.random.rand(500, 1000).astype(np.float32)
+    time_fn(np.matmul, a, b)
+def time_exp():
+    a = np.random.randn(1000, 100).astype(np.float32)
+    time_fn(np.exp, a)
+def time_take():
+    a = np.random.rand(10000, 500)
+    ids = np.random.randint(0, 10000, (20, 10))
+    ids = [idx.reshape(-1) for idx in np.split(ids, 20)]
+    def random_take():
+        return [np.take(a, idx, 0) for idx in ids]
+    time_fn(random_take)
+if __name__ == "__main__":
+    time_add()
+    time_matmul()
+    time_exp()
+    time_take()

ml-stable-diffusion/mlx/benchmarks/numpy/time_utils.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# Copyright © 2023 Apple Inc.
+import time
+def time_fn(fn, *args):
+    print(f"Timing {fn.__name__} ...", end=" ")
+    # warmup
+    for _ in range(5):
+        fn(*args)
+    num_iters = 100
+    tic = time.perf_counter()
+    for _ in range(num_iters):
+        x = fn(*args)
+    toc = time.perf_counter()
+    msec = 1e3 * (toc - tic) / num_iters
+    print(f"{msec:.5f} msec")

ml-stable-diffusion/mlx/benchmarks/python/batch_matmul_bench.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# Copyright © 2023 Apple Inc.
+import argparse
+import mlx.core as mx
+from time_utils import time_fn
+B = 8  # leading dimension of the batched operands, shape (B, T, D)
+T = 1024  # second dimension of the batched operands
+D = 512  # last dimension of the operands; the weight `b` is (D, D)
+def time_batch_matmul():
+    mx.random.seed(3)
+    a = mx.random.uniform(shape=(B, T, D))
+    b = mx.random.uniform(shape=(D, D))
+    c = mx.random.uniform(shape=(B, T, D))
+    mx.eval(a, b, c)
+    time_fn(mx.matmul, a, b)
+    def batch_vjp_first():
+        return mx.vjp(mx.matmul, [a, b], [c])[1][0]
+    time_fn(batch_vjp_first)
+    def batch_vjp_second():
+        return mx.vjp(mx.matmul, [a, b], [c])[1][1]
+    time_fn(batch_vjp_second)
+def time_unbatch_matmul():
+    mx.random.seed(3)
+    a = mx.random.uniform(shape=(B * T, D))
+    b = mx.random.uniform(shape=(D, D))
+    c = mx.random.uniform(shape=(B * T, D))
+    mx.eval(a, b, c)
+    time_fn(mx.matmul, a, b)
+    def unbatch_vjp_first():
+        return mx.matmul(c, mx.transpose(b))
+    time_fn(unbatch_vjp_first)
+    def unbatch_vjp_second():
+        return mx.matmul(mx.transpose(a), c)
+    time_fn(unbatch_vjp_second)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("MLX benchmarks.")
+    parser.add_argument("--gpu", action="store_true", help="Use the Metal back-end.")
+    args = parser.parse_args()
+    if args.gpu:
+        mx.set_default_device(mx.gpu)
+    else:
+        mx.set_default_device(mx.cpu)
+    time_batch_matmul()
+    time_unbatch_matmul()

ml-stable-diffusion/mlx/benchmarks/python/blas/bench_gemm.py ADDED Viewed

	@@ -0,0 +1,191 @@

+# Copyright © 2023 Apple Inc.
+import argparse
+import math
+import os
+import subprocess
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
+device_name = device_name.decode("utf-8").strip("\n")
+N_warmup = 8
+N_iter_bench = 80
+N_iter_func = 5
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def gemm_nn_mlx(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a @ b
+        ys.append(y)
+    mx.eval(ys)
+    return ys
+def gemm_nt_mlx(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a @ b.transpose((0, 2, 1))
+        ys.append(y)
+    mx.eval(ys)
+    return ys
+def gemm_tn_mlx(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a.transpose((0, 2, 1)) @ b
+        ys.append(y)
+    mx.eval(ys)
+    return ys
+def gemm_tt_mlx(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a.transpose((0, 2, 1)) @ b.transpose((0, 2, 1))
+        ys.append(y)
+    mx.eval(ys)
+    return ys
+@torch.no_grad()
+def gemm_nn_torch(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a @ b
+        ys.append(y)
+    torch.mps.synchronize()
+    return ys
+@torch.no_grad()
+def gemm_nt_torch(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a @ b.transpose(-1, -2)
+        ys.append(y)
+    torch.mps.synchronize()
+    return ys
+@torch.no_grad()
+def gemm_tn_torch(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a.transpose(-1, -2) @ b
+        ys.append(y)
+    torch.mps.synchronize()
+    return ys
+@torch.no_grad()
+def gemm_tt_torch(a, b):
+    ys = []
+    for i in range(N_iter_func):
+        y = a.transpose(-1, -2) @ b.transpose(-1, -2)
+        ys.append(y)
+    torch.mps.synchronize()
+    return ys
+def bench_shape(B, M, N, K, np_dtype, transpose="nn"):
+    shape_a = (B, M, K) if transpose[0] == "n" else (B, K, M)
+    shape_b = (B, K, N) if transpose[1] == "n" else (B, N, K)
+    a_np = np.random.normal(0.0, 1.0 / math.sqrt(M + K), shape_a).astype(np_dtype)
+    b_np = np.random.normal(0.0, 1.0 / math.sqrt(N + K), shape_b).astype(np_dtype)
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    a_pt = torch.from_numpy(a_np).to("mps")
+    b_pt = torch.from_numpy(b_np).to("mps")
+    torch.mps.synchronize()
+    f_mx = {
+        "nn": gemm_nn_mlx,
+        "nt": gemm_nt_mlx,
+        "tn": gemm_tn_mlx,
+        "tt": gemm_tt_mlx,
+    }[transpose]
+    f_pt = {
+        "nn": gemm_nn_torch,
+        "nt": gemm_nt_torch,
+        "tn": gemm_tn_torch,
+        "tt": gemm_tt_torch,
+    }[transpose]
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    t_a = (0, 1, 2) if transpose[0] == "n" else (0, 2, 1)
+    t_b = (0, 1, 2) if transpose[1] == "n" else (0, 2, 1)
+    c_mlx = a_mx.transpose(t_a) @ b_mx.transpose(t_b)
+    c_npy = a_np.transpose(t_a).astype(np_dtype) @ b_np.transpose(t_b).astype(np_dtype)
+    atol = 1e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(c_mlx, c_npy.astype(np_dtype), atol=atol):
+        print(
+            f"Failed at {(B, M, N, K)} [transpose = {transpose}] with max(|a - b|) = {np.max(np.abs(c_npy - c_mlx))}"
+        )
+    return time_mlx, time_torch
+def get_gflop_count(B, M, N, K):
+    return float(2.0 * N_iter_bench * N_iter_func * B * M * N * K) / float(1024.0**3)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run gemm benchmarks")
+    dtypes = ("float32", "float16", "complex64")
+    transposes = ("nn", "nt", "tn")
+    shapes = (
+        (16, 234, 768, 3072),
+        (1, 64, 64, 25344),
+        (16, 1024, 1024, 1024),
+        (1, 1024, 1024, 2048),
+        (4, 1024, 1024, 4096),
+        (4, 1024, 4096, 1024),
+        (1, 4096, 4096, 4096),
+    )
+    for dtype in dtypes:
+        for transpose in transposes:
+            for B, M, N, K in shapes:
+                np_dtype = getattr(np, dtype)
+                time_mlx, time_torch = bench_shape(B, M, N, K, np_dtype, transpose)
+                gflop_count = get_gflop_count(B, M, N, K)
+                gflops_mx = gflop_count / (time_mlx)
+                gflops_pt = gflop_count / (time_torch)
+                diff = gflops_mx / gflops_pt - 1.0
+                print(
+                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100.0 * diff:+5.2f}%"
+                )
+                if gflops_pt >= 2.0 * gflops_mx:
+                    print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/blas/bench_gemv.py ADDED Viewed

	@@ -0,0 +1,221 @@

+# Copyright © 2023 Apple Inc.
+import argparse
+import os
+import subprocess
+import time
+import matplotlib.pyplot as plt
+import mlx.core as mx
+import numpy as np
+import torch
+results_dir = "./results"
+if not os.path.isdir(results_dir):
+    os.mkdir(results_dir)
+device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
+device_name = device_name.decode("utf-8").strip("\n")
+N_warmup = 5
+N_iter_bench = 50
+N_iter_func = 20
+out_vec_sizes = [128, 512, 2048, 4096]
+in_vec_sizes = [128, 512, 2048, 4096]
+benchmark_vector_lens = []
+benchmark_vector_lens += [(i + 1) * 4096 for i in range(8)][::2]
+benchmark_vector_lens += [(i + 1) * 4095 for i in range(8)][::2]
+benchmark_vector_lens += [(i + 1) * 4097 for i in range(8)][::2]
+benchmark_vector_lens += [64, 128, 512, 1024, 2048, 11008, 32000]
+benchmark_vector_lens.sort()
+def bench(f, m, v):
+    for i in range(N_warmup):
+        f(m, v)
+    torch.mps.synchronize()
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(m, v)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def gemv_mlx(m, v):
+    ys = []
+    for i in range(N_iter_func):
+        y = m @ v
+        ys.append(y)
+    mx.eval(ys)
+    return ys
+def gemv_t_mlx(m, v):
+    ys = []
+    for i in range(N_iter_func):
+        y = v @ m
+        ys.append(y)
+    mx.eval(ys)
+    return ys
+@torch.no_grad()
+def gemv_torch(m, v):
+    ys = []
+    for i in range(N_iter_func):
+        y = m @ v
+        ys.append(y)
+    torch.mps.synchronize()
+    return ys
+@torch.no_grad()
+def gemv_t_torch(m, v):
+    ys = []
+    for i in range(N_iter_func):
+        y = v @ m
+        ys.append(y)
+    torch.mps.synchronize()
+    return ys
+def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False):
+    shape_mat = (in_vec_len, out_vec_len) if transpose else (out_vec_len, in_vec_len)
+    shape_vec = (1, in_vec_len) if transpose else (in_vec_len, 1)
+    mat_npy = np.random.normal(0.0, 2.0 / in_vec_len, shape_mat).astype(np_dtype)
+    vec_npy = np.random.normal(0.0, 2.0 / in_vec_len, shape_vec).astype(np_dtype)
+    mat_mlx = mx.array(mat_npy)
+    vec_mlx = mx.array(vec_npy)
+    mat_trc = torch.from_numpy(mat_npy).to("mps")
+    vec_trc = torch.from_numpy(vec_npy).to("mps")
+    torch.mps.synchronize()
+    time_torch = (
+        bench(gemv_t_torch, mat_trc, vec_trc)
+        if transpose
+        else bench(gemv_torch, mat_trc, vec_trc)
+    )
+    time_mlx = (
+        bench(gemv_t_mlx, mat_mlx, vec_mlx)
+        if transpose
+        else bench(gemv_mlx, mat_mlx, vec_mlx)
+    )
+    c_mlx = (
+        np.asarray(vec_mlx @ mat_mlx) if transpose else np.asarray(mat_mlx @ vec_mlx)
+    )
+    c_npy = (vec_npy @ mat_npy) if transpose else (mat_npy @ vec_npy)
+    if not np.allclose(c_mlx, c_npy, atol=2e-5):
+        print(
+            f"Failed at {shape_mat} [transpose = {transpose}] with max(|a - b|) = {np.max(np.abs(c_npy - c_mlx))}"
+        )
+    return time_mlx, time_torch
+def get_gflop_count(in_vec_len, out_vec_len):
+    return float(2.0 * N_iter_bench * N_iter_func * in_vec_len * out_vec_len) / float(
+        1024**3
+    )
+def get_gbyte_size(in_vec_len, out_vec_len, np_dtype):
+    n_elem = in_vec_len * out_vec_len + in_vec_len + out_vec_len
+    item_size = 4 if np_dtype == np.float32 else 2
+    return float(N_iter_bench * N_iter_func * n_elem * item_size) / float(1024**3)
+def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose):
+    np_dtype = getattr(np, dtype)
+    mlx_gb_s = []
+    mlx_gflops = []
+    pyt_gb_s = []
+    pyt_gflops = []
+    for out_vec_len in out_vector_lens:
+        gflop_count = get_gflop_count(in_vec_len, out_vec_len)
+        gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype)
+        time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose)
+        mlx_gb_s.append(gbyte_size / time_mlx)
+        pyt_gb_s.append(gbyte_size / time_torch)
+        mlx_gflops.append(gflop_count / time_mlx)
+        pyt_gflops.append(gflop_count / time_torch)
+    if transpose:
+        title = f"gemv_t ([1, {in_vec_len}] [{in_vec_len}, out_vec_len]) | {dtype}"
+    else:
+        title = f"gemv ([out_vec_len, {in_vec_len}] X [{in_vec_len}, 1] ) | {dtype}"
+    ax.plot(out_vector_lens, mlx_gb_s, "tab:blue", label="MLX")
+    ax.plot(out_vector_lens, pyt_gb_s, "tab:red", label="Torch")
+    ax.set_title(title)
+    ax.set(xlabel="out_vector_len", ylabel="Performance (GB/s)")
+    ax.legend()
+def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):
+    np_dtype = getattr(np, dtype)
+    mlx_gb_s = []
+    mlx_gflops = []
+    pyt_gb_s = []
+    pyt_gflops = []
+    for in_vec_len in in_vector_lens:
+        gflop_count = get_gflop_count(in_vec_len, out_vec_len)
+        gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype)
+        time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose)
+        mlx_gb_s.append(gbyte_size / time_mlx)
+        pyt_gb_s.append(gbyte_size / time_torch)
+        mlx_gflops.append(gflop_count / time_mlx)
+        pyt_gflops.append(gflop_count / time_torch)
+    if transpose:
+        title = f"([1, in_vec_len] [in_vec_len, {out_vec_len}])"
+    else:
+        title = f"([{out_vec_len}, in_vec_len] X [in_vec_len, 1] )"
+    ax.plot(in_vector_lens, mlx_gb_s, "tab:blue", label="MLX")
+    ax.plot(in_vector_lens, pyt_gb_s, "tab:red", label="Torch")
+    ax.set_title(title)
+    ax.set(xlabel="in_vector_len", ylabel="Performance (GB/s)")
+    ax.legend()
+for transpose in (False, True):
+    for dtype in ("float32", "float16", "complex64"):
+        fig, axs = plt.subplots(
+            len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained"
+        )
+        for i, in_vec_len in enumerate(in_vec_sizes):
+            bench_with_in_len(
+                axs[i][0], in_vec_len, benchmark_vector_lens, dtype, transpose
+            )
+        for i, out_vec_len in enumerate(out_vec_sizes):
+            bench_with_out_len(
+                axs[i][1], out_vec_len, benchmark_vector_lens, dtype, transpose
+            )
+        op_name = "gemv_t" if transpose else "gemv"
+        fig.suptitle(f"{device_name}: {dtype} {op_name}")
+        fig.savefig(
+            os.path.join(
+                results_dir, f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf"
+            )
+        )
+        plt.close(fig)

ml-stable-diffusion/mlx/benchmarks/python/comparative/README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+Microbenchmarks comparing MLX to PyTorch
+========================================
+Implement the same microbenchmarks in MLX and PyTorch to compare and make a
+list of the biggest possible performance improvements and/or regressions.
+For instance, run `python bench_mlx.py sum_axis --size 8x1024x128 --axis 2 --cpu`
+to measure the time it takes to sum across the 3rd axis of an 8x1024x128
+tensor on the CPU.
+`compare.py` runs several benchmarks and compares the speed-up or lack thereof
+in comparison to PyTorch.
+Each bench script can be run with `--print-pid` to print the PID and wait for a
+key in order to ease attaching a debugger.

ml-stable-diffusion/mlx/benchmarks/python/comparative/bench_mlx.py ADDED Viewed

	@@ -0,0 +1,519 @@

+# Copyright © 2023 Apple Inc.
+import argparse
+import math
+import os
+import time
+from functools import partial
+import mlx.core as mx
+import mlx.nn as nn
+def int_or_list(x):
+    try:
+        return int(x)
+    except ValueError:
+        return [int(xi) for xi in x.split(",")]
+def none_or_list(x):
+    if x == "":
+        return None
+    else:
+        return [int(xi) for xi in x.split(",")]
+def dtype_from_str(x):
+    if x == "":
+        return mx.float32
+    else:
+        dt = getattr(mx, x)
+        if not isinstance(dt, mx.Dtype):
+            raise ValueError(f"{x} is not an mlx dtype")
+        return dt
+def bench(f, *args):
+    for i in range(10):
+        f(*args)
+    s = time.time()
+    for i in range(100):
+        f(*args)
+    e = time.time()
+    return e - s
+def matmul_square(x):
+    y = x
+    for i in range(10):
+        y = y @ x
+    mx.eval(y)
+    return y
+def matmul(x, y):
+    ys = []
+    for i in range(10):
+        ys.append(x @ y)
+    mx.eval(ys)
+def _quant_matmul(x, w, s, b, transpose, group_size, bits):
+    ys = []
+    for i in range(10):
+        ys.append(
+            mx.quantized_matmul(
+                x, w, s, b, transpose=transpose, group_size=group_size, bits=bits
+            )
+        )
+    mx.eval(ys)
+quant_matmul = {
+    "quant_matmul_32_2": partial(_quant_matmul, transpose=False, group_size=32, bits=2),
+    "quant_matmul_32_4": partial(_quant_matmul, transpose=False, group_size=32, bits=4),
+    "quant_matmul_32_8": partial(_quant_matmul, transpose=False, group_size=32, bits=8),
+    "quant_matmul_64_2": partial(_quant_matmul, transpose=False, group_size=64, bits=2),
+    "quant_matmul_64_4": partial(_quant_matmul, transpose=False, group_size=64, bits=4),
+    "quant_matmul_64_8": partial(_quant_matmul, transpose=False, group_size=64, bits=8),
+    "quant_matmul_128_2": partial(
+        _quant_matmul, transpose=False, group_size=128, bits=2
+    ),
+    "quant_matmul_128_4": partial(
+        _quant_matmul, transpose=False, group_size=128, bits=4
+    ),
+    "quant_matmul_128_8": partial(
+        _quant_matmul, transpose=False, group_size=128, bits=8
+    ),
+    "quant_matmul_t_32_2": partial(
+        _quant_matmul, transpose=True, group_size=32, bits=2
+    ),
+    "quant_matmul_t_32_4": partial(
+        _quant_matmul, transpose=True, group_size=32, bits=4
+    ),
+    "quant_matmul_t_32_8": partial(
+        _quant_matmul, transpose=True, group_size=32, bits=8
+    ),
+    "quant_matmul_t_64_2": partial(
+        _quant_matmul, transpose=True, group_size=64, bits=2
+    ),
+    "quant_matmul_t_64_4": partial(
+        _quant_matmul, transpose=True, group_size=64, bits=4
+    ),
+    "quant_matmul_t_64_8": partial(
+        _quant_matmul, transpose=True, group_size=64, bits=8
+    ),
+    "quant_matmul_t_128_2": partial(
+        _quant_matmul, transpose=True, group_size=128, bits=2
+    ),
+    "quant_matmul_t_128_4": partial(
+        _quant_matmul, transpose=True, group_size=128, bits=4
+    ),
+    "quant_matmul_t_128_8": partial(
+        _quant_matmul, transpose=True, group_size=128, bits=8
+    ),
+}
+def conv1d(x, y):
+    ys = []
+    for i in range(10):
+        ys.append(mx.conv1d(x, y))
+    mx.eval(ys)
+def conv2d(x, y):
+    ys = []
+    for i in range(10):
+        ys.append(mx.conv2d(x, y))
+    mx.eval(ys)
+def binary(op, x, y):
+    for i in range(100):
+        y = getattr(mx, op)(x, y)
+    mx.eval(y)
+def reduction(op, axis, x):
+    ys = []
+    for i in range(100):
+        ys.append(getattr(mx, op)(x, axis=axis))
+    mx.eval(ys)
+def sum_and_add(axis, x, y):
+    z = x.sum(axis=axis, keepdims=True)
+    for i in range(50):
+        z = (z + y).sum(axis=axis, keepdims=True)
+    mx.eval(z)
+def softmax(axis, x):
+    ys = []
+    for i in range(100):
+        ex = mx.exp(x - mx.max(x, axis=axis, keepdims=True))
+        y = ex / mx.sum(ex, axis=axis, keepdims=True)
+        ys.append(y)
+    mx.eval(ys)
+def softmax_fused(axis, x):
+    ys = []
+    for i in range(100):
+        y = mx.softmax(x, axis=axis)
+        ys.append(y)
+    mx.eval(ys)
+def relu(x):
+    y = x
+    for i in range(100):
+        y = nn.relu(y)
+    mx.eval(y)
+def leaky_relu(x: mx.array):
+    y = x
+    for i in range(100):
+        y = nn.leaky_relu(y)
+    mx.eval(y)
+def prelu(x: mx.array):
+    y = x
+    for i in range(100):
+        y = nn.prelu(y, mx.ones(1))
+    mx.eval(y)
+def softplus(x: mx.array):
+    y = x
+    for i in range(100):
+        y = nn.softplus(y)
+    mx.eval(y)
+def mish(x: mx.array):
+    y = x
+    for i in range(100):
+        y = nn.mish(y)
+    mx.eval(y)
+def leaky_relu(x):
+    y = x
+    for i in range(100):
+        y = nn.leaky_relu(y)
+    mx.eval(y)
+def elu(x):
+    y = x
+    for i in range(100):
+        y = nn.elu(y)
+    mx.eval(y)
+def relu6(x):
+    y = x
+    for i in range(100):
+        y = nn.relu6(y)
+    mx.eval(y)
+def softplus(x):
+    y = x
+    for i in range(100):
+        y = nn.softplus(y)
+    mx.eval(y)
+def celu(x):
+    y = x
+    for i in range(100):
+        y = nn.celu(y)
+    mx.eval(y)
+def log_sigmoid(x):
+    y = x
+    for i in range(100):
+        y = nn.log_sigmoid(y)
+    mx.eval(y)
+def scalar_mult(x):
+    y = x
+    for i in range(100):
+        y = y * (1.0 / (1 + i))
+    mx.eval(y)
+def cross_entropy(targets, x):
+    ys = []
+    for i in range(100):
+        y = mx.logsumexp(x, axis=-1, keepdims=True) - mx.take_along_axis(
+            x, mx.reshape(targets, (-1, 1)), axis=-1
+        )
+        ys.append(mx.mean(y))
+    mx.eval(ys)
+def logsumexp(axis, x):
+    ys = []
+    for i in range(100):
+        ys.append(mx.logsumexp(x, axis=axis))
+    mx.eval(ys)
+def linear(w, b, x):
+    ys = []
+    for i in range(10):
+        ys.append(x @ mx.transpose(w, (1, 0)) + b)
+    mx.eval(ys)
+def linear_fused(w, b, x):
+    ys = []
+    for i in range(10):
+        ys.append(mx.addmm(b, x, mx.transpose(w, (1, 0))))
+    mx.eval(ys)
+def rope(x):
+    *_, N, D = x.shape
+    ys = []
+    for i in range(10):
+        shape = x.shape
+        x = mx.reshape(x, (-1, N, D))
+        positions = mx.arange(N)
+        freqs = mx.exp(mx.arange(0.0, D // 2) / math.log(10000 / (D // 2 - 1)))
+        theta = mx.reshape(positions, (-1, 1)) * mx.reshape(freqs, (1, -1))
+        costheta = mx.cos(theta)
+        sintheta = mx.sin(theta)
+        x1 = x[..., ::2]
+        x2 = x[..., 1::2]
+        rx1 = x1 * costheta - x2 * sintheta
+        rx2 = x1 * sintheta + x2 * costheta
+        y = mx.concatenate([rx1[..., None], rx2[..., None]], axis=-1)
+        y = mx.reshape(y, (-1, N, D))
+        ys.append(y)
+    mx.eval(ys)
+def concatenate(axis, x, y):
+    ys = []
+    for i in range(10):
+        ys.append(mx.concatenate([x, y], axis=axis))
+    mx.eval(ys)
+def cumsum(axis, x):
+    ys = []
+    for i in range(10):
+        ys.append(mx.cumsum(x, axis))
+    mx.eval(ys)
+def sort(axis, x):
+    ys = []
+    for i in range(10):
+        ys.append(mx.sort(x, axis))
+    mx.eval(ys)
+def topk(axis, x):
+    k = x.shape[axis] // 3
+    ys = []
+    for i in range(10):
+        ys.append(mx.topk(x, k, axis))
+    mx.eval(ys)
+def step_function(x):
+    y = x
+    for i in range(100):
+        y = nn.step(x)
+    mx.eval(y)
+def selu(x):
+    y = x
+    for i in range(100):
+        y = nn.selu(x)
+    mx.eval(y)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("benchmark", help="Choose the benchmark to run")
+    parser.add_argument(
+        "--size",
+        default=[(1024, 1024)],
+        type=lambda x: list(map(int, x.split("x"))),
+        help="Set the matrix size",
+        action="append",
+    )
+    parser.add_argument(
+        "--axis",
+        default=[1],
+        type=int_or_list,
+        help="Set a reduction axis",
+        action="append",
+    )
+    parser.add_argument(
+        "--transpose",
+        type=none_or_list,
+        default=[],
+        help="Permute the matrix",
+        action="append",
+    )
+    parser.add_argument(
+        "--print-pid", action="store_true", help="Print the PID and pause"
+    )
+    parser.add_argument("--cpu", action="store_true", help="Use the CPU")
+    parser.add_argument(
+        "--fused", action="store_true", help="Use fused functions where possible"
+    )
+    parser.add_argument("--dtype", type=dtype_from_str, default=[], action="append")
+    args = parser.parse_args()
+    if len(args.size) > 1:
+        args.size.pop(0)
+    if len(args.axis) > 1:
+        args.axis.pop(0)
+    if args.cpu:
+        mx.set_default_device(mx.cpu)
+    else:
+        mx.set_default_device(mx.gpu)
+    types = args.dtype
+    if not types:
+        types = [mx.float32]
+    if len(types) < len(args.size):
+        types = types + [types[0]] * (len(args.size) - len(types))
+    xs = []
+    for size, dtype in zip(args.size, types):
+        xs.append(mx.random.normal(size).astype(dtype))
+    for i, t in enumerate(args.transpose):
+        if t is None:
+            continue
+        xs[i] = mx.transpose(xs[i], t)
+    mx.eval(xs)
+    x = xs[0]
+    axis = args.axis[0]
+    if args.print_pid:
+        print(os.getpid())
+        input("Press enter to run")
+    if args.benchmark == "matmul_square":
+        print(bench(matmul_square, x))
+    elif args.benchmark == "matmul":
+        print(bench(matmul, *xs))
+    elif args.benchmark.startswith("quant_matmul"):
+        print(bench(quant_matmul[args.benchmark], *xs))
+    elif args.benchmark == "linear":
+        if args.fused:
+            print(bench(linear_fused, *xs))
+        else:
+            print(bench(linear, *xs))
+    elif args.benchmark == "sum_axis":
+        print(bench(reduction, "sum", axis, x))
+    elif args.benchmark == "sum_all":
+        print(bench(reduction, "sum", None, x))
+    elif args.benchmark == "argmax":
+        print(bench(reduction, "argmax", axis, x))
+    elif args.benchmark == "add":
+        print(bench(binary, "add", *xs))
+    elif args.benchmark == "mul":
+        print(bench(binary, "multiply", *xs))
+    elif args.benchmark == "softmax":
+        if args.fused:
+            print(bench(softmax_fused, axis, x))
+        else:
+            print(bench(softmax, axis, x))
+    elif args.benchmark == "relu":
+        print(bench(relu, x))
+    elif args.benchmark == "elu":
+        print(bench(elu, x))
+    elif args.benchmark == "relu6":
+        print(bench(relu6, x))
+    elif args.benchmark == "celu":
+        print(bench(celu, x))
+    elif args.benchmark == "log_sigmoid":
+        print(bench(log_sigmoid, x))
+    elif args.benchmark == "leaky_relu":
+        print(bench(leaky_relu, x))
+    elif args.benchmark == "prelu":
+        print(bench(prelu, x))
+    elif args.benchmark == "softplus":
+        print(bench(softplus, x))
+    elif args.benchmark == "mish":
+        print(bench(mish, x))
+    elif args.benchmark == "scalar_mul":
+        print(bench(scalar_mult, x))
+    elif args.benchmark == "cross_entropy":
+        if len(size) != 2:
+            raise ValueError("Error: [cross_entropy] benchmark requires a 2 dim size")
+        targets = mx.zeros((len(x),), dtype=mx.uint32)
+        print(bench(cross_entropy, targets, x))
+    elif args.benchmark == "logsumexp":
+        print(bench(logsumexp, axis, x))
+    elif args.benchmark == "rope":
+        print(bench(rope, x))
+    elif args.benchmark == "concatenate":
+        print(bench(concatenate, axis, *xs))
+    elif args.benchmark == "cumsum":
+        print(bench(cumsum, axis, *xs))
+    elif args.benchmark == "conv1d":
+        print(bench(conv1d, *xs))
+    elif args.benchmark == "conv2d":
+        print(bench(conv2d, *xs))
+    elif args.benchmark == "sort":
+        print(bench(sort, axis, x))
+    elif args.benchmark == "topk":
+        print(bench(topk, axis, x))
+    elif args.benchmark == "step":
+        print(bench(step_function, x))
+    elif args.benchmark == "selu":
+        print(bench(selu, x))
+    elif args.benchmark == "sum_and_add":
+        print(bench(sum_and_add, axis, *xs))
+    else:
+        raise ValueError("Unknown benchmark")

ml-stable-diffusion/mlx/benchmarks/python/comparative/bench_torch.py ADDED Viewed

	@@ -0,0 +1,482 @@

+# Copyright © 2023 Apple Inc.
+import argparse
+import os
+import time
+import torch
+import torch.cuda
+import torch.mps
+def int_or_list(x):
+    try:
+        return int(x)
+    except ValueError:
+        return [int(xi) for xi in x.split(",")]
+def none_or_list(x):
+    if x == "":
+        return None
+    else:
+        return [int(xi) for xi in x.split(",")]
+def dtype_from_str(x):
+    if x == "":
+        return torch.float32
+    else:
+        dt = getattr(torch, x)
+        if not isinstance(dt, torch.dtype):
+            raise ValueError(f"{x} is not a torch dtype")
+        return dt
+def bench(f, *args):
+    for i in range(10):
+        f(*args)
+    s = time.time()
+    for i in range(100):
+        f(*args)
+    e = time.time()
+    return e - s
+def sync_if_needed(x):
+    if x.device == torch.device("mps"):
+        torch.mps.synchronize()
+    elif x.device == torch.device("cuda"):
+        torch.cuda.synchronize()
+@torch.no_grad()
+def matmul_square(x):
+    y = x
+    for i in range(10):
+        y = y @ x
+    sync_if_needed(x)
+@torch.no_grad()
+def matmul(x, y):
+    ys = []
+    for i in range(10):
+        ys.append(x @ y)
+    sync_if_needed(x)
+@torch.no_grad()
+def conv1d(x, y):
+    x = torch.transpose(x, -1, -2)
+    y = torch.transpose(y, -1, -2)
+    ys = []
+    for i in range(10):
+        ys.append(torch.nn.functional.conv1d(x, y))
+    sync_if_needed(x)
+@torch.no_grad()
+def conv2d(x, y):
+    x = torch.permute(x, (0, 3, 1, 2))
+    y = torch.permute(y, (0, 3, 1, 2))
+    ys = []
+    for i in range(10):
+        ys.append(torch.nn.functional.conv2d(x, y))
+    sync_if_needed(x)
+@torch.no_grad()
+def binary(op, x, y):
+    for i in range(100):
+        y = getattr(torch, op)(x, y)
+    sync_if_needed(x)
+@torch.no_grad()
+def reduction(op, axis, x):
+    ys = []
+    for i in range(100):
+        ys.append(getattr(x, op)(axis))
+    sync_if_needed(x)
+@torch.no_grad()
+def sum_and_add(axis, x, y):
+    z = x.sum(axis=axis, keepdims=True)
+    for i in range(50):
+        z = (z + y).sum(axis=axis, keepdims=True)
+    sync_if_needed(x)
+@torch.no_grad()
+def softmax(axis, x):
+    ys = []
+    for i in range(100):
+        ex = torch.exp(x - torch.max(x, dim=axis, keepdims=True).values)
+        y = ex / torch.sum(ex, dim=axis, keepdims=True)
+        ys.append(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def softmax_fused(axis, x):
+    ys = []
+    for i in range(100):
+        ys.append(torch.nn.functional.softmax(x, dim=axis))
+    sync_if_needed(x)
+@torch.no_grad()
+def relu(x):
+    y = x
+    for i in range(100):
+        y = torch.nn.functional.relu(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def leaky_relu(x):
+    y = x
+    for i in range(100):
+        y = torch.nn.functional.leaky_relu(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def elu(x):
+    y = x
+    for i in range(100):
+        y = torch.nn.functional.elu(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def celu(x):
+    y = x
+    for i in range(100):
+        y = torch.nn.functional.celu(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def relu6(x):
+    y = x
+    for i in range(100):
+        y = torch.nn.functional.relu6(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def softplus(x):
+    y = x
+    for i in range(100):
+        y = torch.nn.functional.softplus(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def log_sigmoid(x):
+    y = x
+    for i in range(100):
+        y = torch.nn.functional.logsigmoid(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def prelu(x: torch.Tensor) -> torch.Tensor:
+    y = x
+    for _ in range(100):
+        y = torch.nn.functional.prelu(y, torch.ones(1).to(y.device))
+    sync_if_needed(x)
+@torch.no_grad()
+def mish(x: torch.Tensor) -> torch.Tensor:
+    y = x
+    for _ in range(100):
+        y = torch.nn.functional.mish(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def scalar_mult(x):
+    y = x
+    for i in range(100):
+        y = y * (1.0 / (1 + i))
+    sync_if_needed(x)
+@torch.no_grad()
+def cross_entropy(targets, x):
+    ys = []
+    for i in range(100):
+        ys.append(torch.nn.functional.cross_entropy(x, targets))
+    sync_if_needed(x)
+@torch.no_grad()
+def logsumexp(axis, x):
+    ys = []
+    for i in range(100):
+        ys.append(torch.logsumexp(x, dim=axis))
+    sync_if_needed(x)
+@torch.no_grad()
+def linear_fused(w, b, x):
+    ys = []
+    for i in range(10):
+        ys.append(torch.nn.functional.linear(x, w, b))
+    sync_if_needed(x)
+@torch.no_grad()
+def linear(w, b, x):
+    ys = []
+    for i in range(10):
+        ys.append((x @ torch.transpose(w, -2, -1)) + b)
+    sync_if_needed(x)
+@torch.no_grad()
+def rope(x):
+    *_, N, D = x.shape
+    ys = []
+    for i in range(10):
+        x = x.view(-1, N, D)
+        positions = torch.arange(N, device=x.device)
+        freqs = 10000 ** torch.linspace(0, 1, D // 2, device=x.device)
+        theta = positions[:, None] * freqs[None]
+        costheta = torch.cos(theta)
+        sintheta = torch.sin(theta)
+        x1 = x[..., ::2]
+        x2 = x[..., 1::2]
+        rx1 = x1 * costheta - x2 * sintheta
+        rx2 = x1 * sintheta + x2 * costheta
+        y = torch.cat([rx1[..., None], rx2[..., None]], dim=-1)
+        y = y.reshape(-1, N, D)
+        ys.append(y)
+    sync_if_needed(x)
+@torch.no_grad()
+def concatenate(axis, x, y):
+    ys = []
+    for i in range(10):
+        ys.append(torch.cat([x, y], dim=axis))
+    sync_if_needed(x)
+@torch.no_grad()
+def cumsum(axis, x):
+    """Take the cumulative sum of ``x`` along ``axis`` 10 times.
+    All results stay referenced until ``sync_if_needed(x)`` has been called.
+    """
+    sums = [x.cumsum(axis) for _ in range(10)]
+    sync_if_needed(x)
+@torch.no_grad()
+def sort(axis, x):
+    """Sort ``x`` along ``axis`` 10 times, keeping only the sorted values.
+    All results stay referenced until ``sync_if_needed(x)`` has been called.
+    """
+    ordered = []
+    for _ in range(10):
+        values, _indices = torch.sort(x, dim=axis)
+        ordered.append(values)
+    sync_if_needed(x)
+@torch.no_grad()
+def topk(axis, x):
+    """Take the top ``k`` values of ``x`` along ``axis`` 10 times, where ``k``
+    is one third (integer division) of the size of that axis. All results
+    stay referenced until ``sync_if_needed(x)`` has been called.
+    """
+    k = x.shape[axis] // 3
+    best = []
+    for _ in range(10):
+        values, _indices = torch.topk(x, k, dim=axis)
+        best.append(values)
+    sync_if_needed(x)
+@torch.no_grad()
+def step_function(x):
+    """Apply a 0/1 step (0 where the value is negative, 1 elsewhere) 100 times
+    in sequence, then call ``sync_if_needed(x)``. Nothing is returned.
+    """
+    state = x
+    for _ in range(100):
+        is_negative = state < 0
+        state = torch.where(is_negative, 0, 1)
+    sync_if_needed(x)
+@torch.no_grad()
+def selu(x):
+    """Feed ``x`` through 100 chained SELU calls, then call ``sync_if_needed(x)``.
+    Nothing is returned.
+    """
+    apply_selu = torch.nn.functional.selu
+    activation = x
+    for _ in range(100):
+        activation = apply_selu(activation)
+    sync_if_needed(x)
+if __name__ == "__main__":
+    # Command-line driver: build the input tensors described by the flags,
+    # then time the single benchmark named by the positional argument and
+    # print whatever ``bench`` returns.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("benchmark", help="Choose the benchmark to run")
+    parser.add_argument(
+        "--size",
+        default=[(1024, 1024)],
+        type=lambda x: list(map(int, x.split("x"))),
+        help="Set the matrix size",
+        action="append",
+    )
+    parser.add_argument(
+        "--axis",
+        default=[1],
+        type=int_or_list,
+        help="Set a reduction axis",
+        action="append",
+    )
+    parser.add_argument(
+        "--transpose",
+        type=none_or_list,
+        default=[],
+        help="Permute the matrix",
+        action="append",
+    )
+    parser.add_argument(
+        "--print-pid", action="store_true", help="Print the PID and pause"
+    )
+    parser.add_argument("--cpu", action="store_true", help="Use the CPU")
+    parser.add_argument(
+        "--fused", action="store_true", help="Use fused functions where possible"
+    )
+    parser.add_argument("--dtype", type=dtype_from_str, default=[], action="append")
+    args = parser.parse_args()
+    # With ``action="append"`` user values are added after the default entry,
+    # so drop the default as soon as one explicit value is present.
+    if len(args.size) > 1:
+        args.size.pop(0)
+    if len(args.axis) > 1:
+        args.axis.pop(0)
+    torch.set_num_threads(1)
+    # Device priority: --cpu overrides CUDA, which overrides the "mps" default.
+    device = "mps"
+    if torch.cuda.is_available():
+        device = "cuda"
+    if args.cpu:
+        device = "cpu"
+    types = args.dtype
+    if not types:
+        types = [torch.float32]
+    # Pad the dtype list with its first entry so every size has a dtype.
+    if len(types) < len(args.size):
+        types = types + [types[0]] * (len(args.size) - len(types))
+    xs = []
+    for size, dtype in zip(args.size, types):
+        xs.append(torch.randn(*size).to(device).to(dtype))
+    # The i-th --transpose applies to the i-th tensor; ``None`` leaves it as is.
+    for i, t in enumerate(args.transpose):
+        if t is None:
+            continue
+        xs[i] = xs[i].permute(*t)
+    x = xs[0]
+    axis = args.axis[0]
+    if args.print_pid:
+        print(os.getpid())
+        input("Press enter to run")
+    if args.benchmark == "matmul_square":
+        print(bench(matmul_square, x))
+    elif args.benchmark == "matmul":
+        print(bench(matmul, *xs))
+    elif args.benchmark == "linear":
+        if args.fused:
+            print(bench(linear_fused, *xs))
+        else:
+            print(bench(linear, *xs))
+    elif args.benchmark == "sum_axis":
+        print(bench(reduction, "sum", axis, x))
+    elif args.benchmark == "sum_all":
+        print(bench(reduction, "sum", None, x))
+    elif args.benchmark == "argmax":
+        print(bench(reduction, "argmax", axis, x))
+    elif args.benchmark == "add":
+        print(bench(binary, "add", *xs))
+    elif args.benchmark == "mul":
+        print(bench(binary, "mul", *xs))
+    elif args.benchmark == "softmax":
+        if args.fused:
+            print(bench(softmax_fused, axis, x))
+        else:
+            print(bench(softmax, axis, x))
+    elif args.benchmark == "relu":
+        print(bench(relu, x))
+    elif args.benchmark == "leaky_relu":
+        print(bench(leaky_relu, x))
+    elif args.benchmark == "elu":
+        print(bench(elu, x))
+    elif args.benchmark == "relu6":
+        print(bench(relu6, x))
+    elif args.benchmark == "softplus":
+        print(bench(softplus, x))
+    elif args.benchmark == "celu":
+        print(bench(celu, x))
+    elif args.benchmark == "log_sigmoid":
+        print(bench(log_sigmoid, x))
+    elif args.benchmark == "prelu":
+        print(bench(prelu, x))
+    elif args.benchmark == "mish":
+        print(bench(mish, x))
+    elif args.benchmark == "scalar_mul":
+        print(bench(scalar_mult, x))
+    elif args.benchmark == "cross_entropy":
+        # Check the tensor that is actually benchmarked. The ``size`` loop
+        # variable left over from building ``xs`` holds the LAST --size given,
+        # which is not necessarily the shape of ``x`` (the first tensor).
+        if x.dim() != 2:
+            raise ValueError("Error: [cross_entropy] benchmark requires a 2 dim size")
+        targets = torch.zeros(len(x), dtype=torch.long).to(x.device)
+        print(bench(cross_entropy, targets, x))
+    elif args.benchmark == "logsumexp":
+        print(bench(logsumexp, axis, x))
+    elif args.benchmark == "rope":
+        print(bench(rope, x))
+    elif args.benchmark == "concatenate":
+        print(bench(concatenate, axis, *xs))
+    elif args.benchmark == "cumsum":
+        print(bench(cumsum, axis, *xs))
+    elif args.benchmark == "conv1d":
+        print(bench(conv1d, *xs))
+    elif args.benchmark == "conv2d":
+        print(bench(conv2d, *xs))
+    elif args.benchmark == "sort":
+        print(bench(sort, axis, x))
+    elif args.benchmark == "topk":
+        print(bench(topk, axis, x))
+    elif args.benchmark == "step":
+        print(bench(step_function, x))
+    elif args.benchmark == "selu":
+        print(bench(selu, x))
+    elif args.benchmark == "sum_and_add":
+        print(bench(sum_and_add, axis, *xs))
+    else:
+        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")

ml-stable-diffusion/mlx/benchmarks/python/comparative/compare.py ADDED Viewed

	@@ -0,0 +1,284 @@

+# Copyright © 2023 Apple Inc.
+#!/usr/bin/env python
+import argparse
+import re
+from pathlib import Path
+from subprocess import run
+BENCH_MLX = Path(__file__).parent / "bench_mlx.py"
+BENCH_TORCH = Path(__file__).parent / "bench_torch.py"
+def run_or_raise(*args, **kwargs):
+    """Run a subprocess and return the number it printed on stdout.
+    All arguments are forwarded to ``subprocess.run`` with
+    ``capture_output=True`` added.
+    Returns:
+        The captured stdout converted with ``float``.
+    Raises:
+        ValueError: if stdout is not a valid float; the message carries the
+            decoded stdout and stderr of the child process.
+    """
+    # Launch outside the ``try``: an error raised by ``run`` itself must
+    # propagate unchanged instead of reaching the handler below, where
+    # ``result`` would not be bound yet.
+    result = run(*args, capture_output=True, **kwargs)
+    try:
+        return float(result.stdout)
+    except ValueError as err:
+        raise ValueError(
+            f"stdout: {result.stdout.decode()}\nstderr: {result.stderr.decode()}"
+        ) from err
+def compare(args):
+    """Run the MLX and the PyTorch benchmark scripts with the same ``args`` and
+    print ``(t_torch - t_mlx) / t_torch`` followed by the arguments,
+    tab-separated.
+    """
+    timings = {}
+    for label, script in (("mlx", BENCH_MLX), ("torch", BENCH_TORCH)):
+        timings[label] = run_or_raise(["python", script] + args)
+    relative_gain = (timings["torch"] - timings["mlx"]) / timings["torch"]
+    print(relative_gain, " ".join(args), sep="\t")
+def compare_mlx_dtypes(args, dt1, dt2):
+    """Run the MLX benchmark script twice, once per dtype, and print
+    ``(t_dt2 - t_dt1) / t_dt2`` followed by the arguments, tab-separated.
+    """
+    base_cmd = ["python", BENCH_MLX] + args
+    first = run_or_raise(base_cmd + ["--dtype", dt1])
+    second = run_or_raise(base_cmd + ["--dtype", dt2])
+    relative_gain = (second - first) / second
+    print(relative_gain, " ".join(args), sep="\t")
+def make_regex_search(regexes):
+    """Compile ``regexes`` once and return a function that, for a string,
+    lazily yields one boolean per pattern: whether that pattern is found
+    anywhere in the string.
+    """
+    patterns = [re.compile(expr) for expr in regexes]
+    def search(x):
+        for pattern in patterns:
+            yield pattern.search(x) is not None
+    return search
+def make_predicate(positive_filter, negative_filter):
+    """Build a string predicate from two optional lists of regexes.
+    A string is accepted when every positive regex matches it and no negative
+    regex matches it; a filter given as ``None`` imposes no constraint.
+    """
+    must_match = None
+    if positive_filter is not None:
+        must_match = make_regex_search(positive_filter)
+    must_not_match = None
+    if negative_filter is not None:
+        must_not_match = make_regex_search(negative_filter)
+    def predicate(x):
+        if must_match is not None and not all(must_match(x)):
+            return False
+        if must_not_match is not None and any(must_not_match(x)):
+            return False
+        return True
+    return predicate
+if __name__ == "__main__":
+    # Driver: run every benchmark configuration listed below, in order, that
+    # passes the --filter / --negative_filter regexes. Unrecognized
+    # command-line arguments are forwarded to each benchmark invocation.
+    parser = argparse.ArgumentParser(description="Run comparisons against PyTorch")
+    parser.add_argument(
+        "--filter", "-f", help="Regex filter to select benchmarks", nargs="+"
+    )
+    parser.add_argument(
+        "--negative_filter", "-n", help="Regex filter to remove benchmarks", nargs="+"
+    )
+    parser.add_argument(
+        "--mlx_dtypes",
+        "-d",
+        help="Compare mlx benchmarks between the 2 provided data types",
+        nargs=2,
+    )
+    args, rest = parser.parse_known_args()
+    _filter = make_predicate(args.filter, args.negative_filter)
+    def compare_filtered(x):
+        # Skip configurations rejected by the filters; otherwise compare either
+        # two MLX dtypes (when --mlx_dtypes is given) or MLX against PyTorch.
+        if not _filter(x):
+            return None
+        if args.mlx_dtypes:
+            return compare_mlx_dtypes(
+                x.split() + rest, args.mlx_dtypes[0], args.mlx_dtypes[1]
+            )
+        return compare(x.split() + rest)
+    benchmark_specs = (
+        # Binary ops
+        "add --size 10x1024x128 --size 1x1024x128 --cpu",
+        "add --size 10x1024x128 --size 1x1024x128",
+        "add --size 1024x128 --size 1x128 --cpu",
+        "add --size 1024x128 --size 1x128",
+        "add --size 1024x4096 --size 1x4096 --cpu",
+        "add --size 1024x4096 --size 1x4096",
+        "add --size 1024x4096 --size 1x1024 --transpose 1,0 --cpu",
+        "add --size 1024x4096 --size 1x1024 --transpose 1,0",
+        "add --size 1024x1024 --size 1024x1024 --cpu",
+        "add --size 1024x1024 --size 1024x1024",
+        "add --size 1024x1024 --size 1024x1024 --transpose 1,0 --cpu",
+        "add --size 1024x1024 --size 1024x1024 --transpose 1,0",
+        "add --size 1024x1024 --size 1024x1024 --transpose 1,0 --transpose 1,0 --cpu",
+        "add --size 1024x1024 --size 1024x1024 --transpose 1,0 --transpose 1,0",
+        # Reduction ops
+        "sum_all --size 10x1024x128 --cpu",
+        "sum_all --size 10x1024x128",
+        "sum_axis --size 16x1024x128 --axis 2 --cpu",
+        "sum_axis --size 16x1024x128 --axis 2",
+        "sum_axis --size 16x128x1024 --axis 2 --cpu",
+        "sum_axis --size 16x128x1024 --axis 2",
+        "sum_axis --size 1024x1024 --axis 1 --cpu",
+        "sum_axis --size 1024x1024 --axis 1",
+        "sum_axis --size 1024x1024 --axis 0 --cpu",
+        "sum_axis --size 1024x1024 --axis 0",
+        "sum_axis --size 16x128x1024 --axis 1 --cpu",
+        "sum_axis --size 16x128x1024 --axis 1",
+        "sum_axis --size 16x128x1024 --axis 0 --cpu",
+        "sum_axis --size 16x128x1024 --axis 0",
+        "sum_axis --size 16x128x1024 --axis 0,1 --cpu",
+        "sum_axis --size 16x128x1024 --axis 0,1",
+        "sum_axis --size 16x128x1024 --axis 0,2 --cpu",
+        "sum_axis --size 16x128x1024 --axis 0,2",
+        "sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1 --cpu",
+        "sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1",
+        "sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1 --cpu",
+        "sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1",
+        "argmax --size 10x1024x128 --axis 1 --cpu",
+        "argmax --size 10x1024x128 --axis 1",
+        "argmax --size 10x1024x128 --axis 2 --cpu",
+        "argmax --size 10x1024x128 --axis 2",
+        "argmax --size 1024x1024 --axis 1 --cpu",
+        "argmax --size 1024x1024 --axis 1",
+        # Matmul ops
+        "matmul_square --size 1024x1024",
+        "matmul_square --size 1024x1024 --cpu",
+        "matmul_square --size 16x1024x1024",
+        "matmul_square --size 16x1024x1024 --cpu",
+        "matmul --size 16x768x768 --size 16x768x768 --transpose= --transpose 0,2,1",
+        "matmul --size 16x768x768 --size 16x768x768 --transpose= --transpose 0,2,1 --cpu",
+        "matmul --size 16x768x128 --size 16x768x128 --transpose= --transpose 0,2,1",
+        "matmul --size 16x768x128 --size 16x768x128 --transpose= --transpose 0,2,1 --cpu",
+        "matmul --size 512x8192 --size 8192x512",
+        "matmul --size 512x8192 --size 8192x512 --cpu",
+        # "matmul --size 512x131072 --size 131072x512",
+        # "matmul --size 512x131072 --size 131072x512 --cpu",
+        "matmul --size 8192x512 --size 512x8192",
+        "matmul --size 8192x512 --size 512x8192 --cpu",
+        # "matmul --size 131072x512 --size 512x512",
+        # "matmul --size 131072x512 --size 512x512 --cpu",
+        "linear --size 1024x1024 --size 1024 --size 128x1024",
+        "linear --size 1024x1024 --size 1024 --size 128x1024 --cpu",
+        "linear --size 1024x1024 --size 1024 --size 128x1024 --fused",
+        "linear --size 1024x1024 --size 1024 --size 128x1024 --fused --cpu",
+        # Matvec ops
+        "matmul --size 1x1x4096 --size 4096x4096 --cpu",
+        "matmul --size 1x1x4096 --size 4096x4096",
+        "matmul --size 1x1x4096 --size 4096x4096 --transpose= --transpose 1,0 --cpu",
+        "matmul --size 1x1x4096 --size 4096x4096 --transpose= --transpose 1,0",
+        "matmul --size 32x1x1000 --size 32x1000x128 --cpu",
+        "matmul --size 32x1x1000 --size 32x1000x128",
+        "matmul --size 32x1x1000 --size 32x128x1000 --transpose= --transpose 0,2,1 --cpu",
+        "matmul --size 32x1x1000 --size 32x128x1000 --transpose= --transpose 0,2,1",
+        # Various ops
+        "softmax --size 32x16x1024 --axis 2",
+        "softmax --size 32x16x1024 --axis 2 --cpu",
+        "softmax --size 32x16x1024 --axis 2 --fused",
+        "softmax --size 32x16x1024 --axis 2 --fused --cpu",
+        "softmax --size 2x1024x1024 --axis 1",
+        "softmax --size 2x1024x1024 --axis 1 --cpu",
+        "softmax --size 2x1024x1024 --axis 1 --fused",
+        "softmax --size 2x1024x1024 --axis 1 --fused --cpu",
+        "relu --size 32x16x1024",
+        "relu --size 32x16x1024 --cpu",
+        "leaky_relu --size 32x16x1024",
+        "leaky_relu --size 32x16x1024 --cpu",
+        "elu --size 32x16x1024",
+        "elu --size 32x16x1024 --cpu",
+        "relu6 --size 32x16x1024",
+        "relu6 --size 32x16x1024 --cpu",
+        "softplus --size 32x16x1024",
+        "softplus --size 32x16x1024 --cpu",
+        "celu --size 32x16x1024",
+        "celu --size 32x16x1024 --cpu",
+        "log_sigmoid --size 32x16x1024",
+        "log_sigmoid --size 32x16x1024 --cpu",
+        "step --size 32x16x1024",
+        "step --size 32x16x1024 --cpu",
+        "selu --size 32x16x1024",
+        "selu --size 32x16x1024 --cpu",
+        # "mish --size 32x16x1024" NOTE: Torch does not implement Mish in MPS atm
+        "mish --size 32x16x1024 --cpu",
+        "prelu --size 32x16x1024",
+        "prelu --size 32x16x1024 --cpu",
+        "scalar_mul --size 32x16x1024",
+        "scalar_mul --size 32x16x1024 --cpu",
+        "cross_entropy --size 256x1024",
+        "cross_entropy --size 256x1024 --cpu",
+        "logsumexp --size 1024x1024 --axis 1",
+        "logsumexp --size 1024x1024 --axis 1 --cpu",
+        "logsumexp --size 1024x1024 --axis 0",
+        "logsumexp --size 1024x1024 --axis 0 --cpu",
+        "concatenate --size 32x1024x128 --size 32x1024x128 --axis 2",
+        "concatenate --size 32x1024x128 --size 32x1024x128 --axis 2 --cpu",
+        "concatenate --size 32x1024x128 --size 32x1024x128 --axis 1",
+        "concatenate --size 32x1024x128 --size 32x1024x128 --axis 1 --cpu",
+        "concatenate --size 32x1024x128 --size 32x1024x128 --axis 0",
+        "concatenate --size 32x1024x128 --size 32x1024x128 --axis 0 --cpu",
+        "concatenate --size 32x1024x128 --size 32x16x128 --axis 1",
+        "concatenate --size 32x1024x128 --size 32x16x128 --axis 1 --cpu",
+        "concatenate --size 32x1024x128 --size 32x1x128 --axis 1",
+        "concatenate --size 32x1024x128 --size 32x1x128 --axis 1 --cpu",
+        "concatenate --size 1x32x1024x128 --size 1x32x1x128 --axis 2",
+        "concatenate --size 1x32x1024x128 --size 1x32x1x128 --axis 2 --cpu",
+        "conv1d --size 1x1000x80 --size 128x11x80",
+        "conv1d --size 1x1000x80 --size 128x11x80 --cpu",
+        "conv1d --size 16x1000x80 --size 128x11x80",
+        "conv1d --size 4x1000x80 --size 128x11x80 --cpu",
+        "conv2d --size 1x256x256x3 --size 8x3x3x3",
+        "conv2d --size 1x256x256x3 --size 8x3x3x3 --cpu",
+        "conv2d --size 16x256x256x3 --size 8x3x3x3",
+        "conv2d --size 4x256x256x3 --size 8x3x3x3 --cpu",
+        "cumsum --size 1024x1024 --axis 1 --cpu",
+        "cumsum --size 1024x1024 --axis 0 --cpu",
+        "cumsum --size 1024x1024 --axis 1",
+        "cumsum --size 1024x1024 --axis 0",
+        "cumsum --size 128x1024 --axis 1",
+        "cumsum --size 128x1024 --axis 0",
+        "cumsum --size 1024x4096 --axis 1",
+        "cumsum --size 1024x4096 --axis 0",
+        "cumsum --size 128x4096 --axis 1",
+        "cumsum --size 128x4096 --axis 0",
+        "cumsum --size 1024x7777 --axis 1",
+        "cumsum --size 1024x7777 --axis 0",
+        "cumsum --size 128x7777 --axis 1",
+        "cumsum --size 128x7777 --axis 0",
+        "cumsum --size 32768x128 --axis 1",
+        "cumsum --size 32768x128 --axis 0",
+        "sort --size 1024x1024 --axis 0",
+        "sort --size 1024x1024 --axis 1",
+        "sort --size 32768x128 --axis 0",
+        "sort --size 32768x128 --axis 1",
+        "sort --size 128x128 --axis 0 --cpu",
+        "sort --size 128x128 --axis 1 --cpu",
+        "topk --size 1024x1024 --axis 0",
+        "topk --size 1024x1024 --axis 1",
+        "topk --size 32768x128 --axis 0",
+        "topk --size 32768x128 --axis 1",
+        "topk --size 128x128 --axis 0 --cpu",
+        "topk --size 128x128 --axis 1 --cpu",
+    )
+    for spec in benchmark_specs:
+        compare_filtered(spec)

ml-stable-diffusion/mlx/benchmarks/python/compile_bench.py ADDED Viewed

	@@ -0,0 +1,107 @@

+# Copyright © 2023-2024 Apple Inc.
+import argparse
+import math
+import random
+import mlx.core as mx
+from time_utils import time_fn
+def bench_gelu():
+    """Time an erf-based GELU, plain and wrapped in ``mx.compile``.
+    Two scenarios are timed with ``time_fn``: a fixed-shape input, and an
+    input whose leading dimension is cut to a random length on every call
+    (plain, compiled, and compiled with ``shapeless=True``).
+    """
+    def gelu(x):
+        return x * (1 + mx.erf(x / math.sqrt(2))) / 2
+    x = mx.random.uniform(shape=(1000, 1024))
+    def gen_fixed(fun):
+        # Chain 10 applications of ``fun`` on an input of constant shape.
+        def bench_fun(x):
+            for _ in range(10):
+                x = fun(x)
+            return x
+        return bench_fun
+    time_fn(gen_fixed(gelu), x, msg="fixed gelu")
+    time_fn(gen_fixed(mx.compile(gelu)), x, msg="compiled fixed gelu")
+    def randint():
+        return random.randint(1, x.shape[0])
+    def gen_variable(fun):
+        # Same chain, but ``x`` is first sliced to a random number of rows so
+        # its shape changes from call to call; ``y`` keeps a constant shape.
+        def bench_fun(x, y):
+            x = x[: randint()]
+            for _ in range(10):
+                x = fun(x)
+                y = fun(y)
+            return x, y
+        return bench_fun
+    y = mx.random.uniform(shape=(1000, 1024))
+    # Re-seed before each variant, as bench_layernorm does, so every variant
+    # is timed on the same sequence of random slice lengths.
+    random.seed(0)
+    time_fn(gen_variable(gelu), x, y, msg="variable gelu")
+    random.seed(0)
+    time_fn(gen_variable(mx.compile(gelu)), x, y, msg="compiled variable gelu")
+    random.seed(0)
+    time_fn(
+        gen_variable(mx.compile(gelu, shapeless=True)),
+        x,
+        y,
+        msg="shapeless variable gelu",
+    )
+def bench_layernorm():
+    """Time a hand-written layer norm, plain and wrapped in ``mx.compile``.
+    The norm is computed in float32 and cast back to float16 before the affine
+    step. Two scenarios are timed with ``time_fn``: a fixed-shape input, and
+    an input cut to a random number of rows on each call, with ``random``
+    re-seeded to 0 before each variable-shape variant.
+    """
+    weight = mx.random.uniform(shape=(4096,)).astype(mx.float16)
+    bias = mx.random.uniform(shape=(4096,)).astype(mx.float16)
+    mx.eval(weight, bias)
+    def layernorm(x):
+        h = x.astype(mx.float32)
+        mu = mx.mean(h, axis=-1, keepdims=True)
+        sigma2 = mx.var(h, axis=-1, keepdims=True)
+        normed = ((h - mu) * mx.rsqrt(sigma2 + 1e-4)).astype(mx.float16)
+        return weight * normed + bias
+    x = mx.random.uniform(shape=(1000, 4096)).astype(mx.float16)
+    def repeat_fixed(fun):
+        # Chain 10 applications of ``fun`` on an input of constant shape.
+        def bench_fun(x):
+            for _ in range(10):
+                x = fun(x)
+            return x
+        return bench_fun
+    time_fn(repeat_fixed(layernorm), x, msg="fixed layernorm")
+    time_fn(repeat_fixed(mx.compile(layernorm)), x, msg="compiled fixed layernorm")
+    def random_rows():
+        return random.randint(1, x.shape[0])
+    def repeat_variable(fun):
+        # Same chain, after slicing the input to a random number of rows.
+        def bench_fun(x):
+            x = x[: random_rows()]
+            for _ in range(10):
+                x = fun(x)
+            return x
+        return bench_fun
+    variants = (
+        (layernorm, "variable layernorm"),
+        (mx.compile(layernorm), "compiled variable layernorm"),
+    )
+    for fun, label in variants:
+        random.seed(0)
+        time_fn(repeat_variable(fun), x, msg=label)
+    random.seed(0)
+    time_fn(
+        repeat_variable(mx.compile(layernorm, shapeless=True)),
+        x,
+        msg="shapeless variable layernorm",
+    )
+if __name__ == "__main__":
+    # The first positional parameter of ArgumentParser is ``prog``, so the text
+    # must be passed as ``description=`` to show up as the help description
+    # instead of replacing the program name in the usage line.
+    parser = argparse.ArgumentParser(description="Compile benchmarks.")
+    # No options are defined; parsing still provides ``--help``.
+    args = parser.parse_args()
+    bench_gelu()
+    bench_layernorm()

ml-stable-diffusion/mlx/benchmarks/python/conv1d_bench.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import argparse
+import math
+import os
+import subprocess
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# CPU brand string as printed by ``sysctl -n machdep.cpu.brand_string``, with
+# newline characters stripped from both ends.
+device_name = (
+    subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
+    .decode("utf-8")
+    .strip("\n")
+)
+# Untimed calls made by ``bench`` before measuring.
+N_warmup = 10
+# Timed calls made by ``bench``.
+N_iter_bench = 100
+# Convolutions issued inside each benchmarked call.
+N_iter_func = 5
+def bench(f, a, b):
+    """Return the wall-clock seconds taken by ``N_iter_bench`` calls of
+    ``f(a, b)``, after ``N_warmup`` untimed calls and one
+    ``torch.mps.synchronize()``.
+    """
+    for _ in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+    started_ns = time.perf_counter_ns()
+    for _ in range(N_iter_bench):
+        f(a, b)
+    elapsed_ns = time.perf_counter_ns() - started_ns
+    return elapsed_ns * 1e-9
+def make_mx_conv_1D(strides=1, padding=0, groups=1):
+    """Return a function ``(a, b)`` that runs ``mx.conv1d`` with the given
+    settings ``N_iter_func`` times, evaluates all outputs with ``mx.eval`` and
+    returns them as a list.
+    """
+    def mx_conv_1D(a, b):
+        outputs = [
+            mx.conv1d(a, b, stride=strides, padding=padding, groups=groups)
+            for _ in range(N_iter_func)
+        ]
+        mx.eval(outputs)
+        return outputs
+    return mx_conv_1D
+def make_pt_conv_1D(strides=1, padding=0, groups=1):
+    """Return a no-grad function ``(a, b)`` that runs ``torch.conv1d`` with the
+    given settings ``N_iter_func`` times, calls ``torch.mps.synchronize()`` and
+    returns the outputs as a list.
+    """
+    @torch.no_grad()
+    def pt_conv_1D(a, b):
+        outputs = [
+            torch.conv1d(a, b, stride=strides, padding=padding, groups=groups)
+            for _ in range(N_iter_func)
+        ]
+        torch.mps.synchronize()
+        return outputs
+    return pt_conv_1D
+def bench_shape(N, iH, C, wH, O, strides, padding, np_dtype, groups):
+    """Time a 1D convolution in MLX and in PyTorch (on "mps") for one shape.
+    Random NumPy data is generated once and shared by both frameworks; the
+    PyTorch copies have their last two axes swapped. After timing, one MLX
+    output is compared with a PyTorch output computed on the CPU, and a
+    message is printed if they differ beyond the tolerance.
+    Returns:
+        ``(time_mlx, time_torch)`` as reported by ``bench``.
+    """
+    scale = 1.0 / math.sqrt(wH * C)
+    a_np = np.random.uniform(0, 0.5, (N, iH, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, wH, int(C / groups))).astype(np_dtype)
+    a_mx, b_mx = mx.array(a_np), mx.array(b_np)
+    # Swap the last two axes of both operands for the PyTorch side.
+    swap_last_two = (0, 2, 1)
+    a_pt = torch.from_numpy(a_np.transpose(swap_last_two)).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose(swap_last_two)).to("mps")
+    torch.mps.synchronize()
+    # PyTorch is timed first, then MLX.
+    time_torch = bench(make_pt_conv_1D(strides, padding, groups), a_pt, b_pt)
+    time_mlx = bench(make_mx_conv_1D(strides, padding, groups), a_mx, b_mx)
+    out_mx = mx.conv1d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv1d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    # Swap the axes back so the layout matches the MLX output.
+    out_pt = torch.permute(out_pt, (0, 2, 1)).numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, iH, C)}, {(O, wH, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    # NOTE(review): the parser is built but never asked to parse, so
+    # command-line arguments are ignored here.
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+    dtypes = ("float32",)
+    # Each row: (N, iH, C, wH, O, strides, padding, groups).
+    shapes = (
+        (4, 32, 32, 5, 32, 1, 2, 1),
+        (4, 32, 32, 5, 32, 1, 2, 2),
+        (4, 32, 32, 5, 32, 1, 2, 4),
+        (4, 32, 32, 5, 32, 1, 2, 8),
+        (4, 32, 32, 5, 32, 1, 2, 8),
+        (4, 32, 32, 5, 32, 1, 2, 16),
+        (4, 32, 32, 5, 32, 1, 2, 32),
+        (4, 32, 256, 5, 512, 1, 2, 2),
+        (4, 32, 256, 5, 512, 1, 2, 128),
+        (4, 32, 256, 5, 512, 1, 2, 256),
+    )
+    for dtype in dtypes:
+        np_dtype = getattr(np, dtype)
+        print("(N,  iH,  C),  (O,  wH,  C),   dtype,  stride, pads, groups, diff%")
+        for shape in shapes:
+            N, iH, C, wH, O, strides, padding, groups = shape
+            time_mlx, time_torch = bench_shape(
+                N, iH, C, wH, O, strides, padding, np_dtype, groups
+            )
+            # Positive when MLX is faster than PyTorch.
+            diff = time_torch / time_mlx - 1.0
+            print(
+                f"({N}, {iH:3d}, {C:3d}), ({O:3d}, {wH:2d}, {C:3d}), {dtype}, {strides:5d}, {padding:4d}, {groups:6d}, {100. * diff:+5.2f}%"
+            )
+            # Flag shapes where MLX takes at least twice as long as PyTorch.
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/conv2d_bench_cpu.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import argparse
+import math
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# Untimed calls made by ``bench`` before measuring.
+N_warmup = 1
+# Timed calls made by ``bench``.
+N_iter_bench = 10
+# Convolutions issued inside each benchmarked call.
+N_iter_func = 5
+# Set MLX's default device to the CPU for this benchmark.
+mx.set_default_device(mx.cpu)
+def bench(f, a, b):
+    """Return the wall-clock seconds taken by ``N_iter_bench`` calls of
+    ``f(a, b)``, after ``N_warmup`` untimed calls.
+    """
+    for _ in range(N_warmup):
+        f(a, b)
+    started_ns = time.perf_counter_ns()
+    for _ in range(N_iter_bench):
+        f(a, b)
+    elapsed_ns = time.perf_counter_ns() - started_ns
+    return elapsed_ns * 1e-9
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    """Return a function ``(a, b)`` that runs ``mx.conv2d`` with the given
+    settings ``N_iter_func`` times, evaluates all outputs with ``mx.eval`` and
+    returns them as a list.
+    """
+    def mx_conv_2D(a, b):
+        outputs = [
+            mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            for _ in range(N_iter_func)
+        ]
+        mx.eval(outputs)
+        return outputs
+    return mx_conv_2D
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    """Return a no-grad function ``(a, b)`` that runs ``torch.conv2d`` with the
+    given settings ``N_iter_func`` times and returns the outputs as a list.
+    """
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        return [
+            torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            for _ in range(N_iter_func)
+        ]
+    return pt_conv_2D
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    """Time a 2D convolution in MLX and in PyTorch (CPU) for one shape.
+    Random NumPy data is generated once and shared by both frameworks; the
+    PyTorch copies are transposed with ``(0, 3, 1, 2)``. After timing, one MLX
+    output is compared with a PyTorch output, and a message is printed if they
+    differ beyond the tolerance.
+    Returns:
+        ``(time_mlx, time_torch)`` as reported by ``bench``.
+    """
+    # Weight range derived from the kernel footprint: height * width *
+    # channels (the original used kH twice, ignoring kW).
+    scale = 1.0 / math.sqrt(kH * kW * C)
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    # Move the last axis to position 1 for the PyTorch side.
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("cpu")
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    # Move axis 1 back to the end so the layout matches the MLX output.
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    # NOTE(review): the parser is built but never asked to parse, so
+    # command-line arguments are ignored here.
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+    dtypes = ("float32",)
+    # Each row: (N, H, W, C, kH, kW, O, strides, padding, groups).
+    shapes = (
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
+        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
+        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
+    )
+    for dtype in dtypes:
+        np_dtype = getattr(np, dtype)
+        print(
+            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
+        )
+        for shape in shapes:
+            N, H, W, C, kH, kW, O, strides, padding, groups = shape
+            time_mlx, time_torch = bench_shape(
+                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            # Positive when MLX is faster than PyTorch.
+            diff = time_torch / time_mlx - 1.0
+            print(
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            # Flag shapes where MLX takes at least twice as long as PyTorch.
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/conv2d_train_bench_cpu.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import time
+import mlx.core as mx
+import mlx.nn
+import mlx.optimizers as opt
+import torch
+def bench_mlx(steps: int = 20) -> float:
+    """Train a small conv encoder-decoder with MLX on the CPU and time it.
+    Runs ``steps`` optimizer steps on one fixed random batch, printing the
+    duration of each step in milliseconds.
+    Returns:
+        The sum of the per-step durations, in milliseconds.
+    """
+    mx.set_default_device(mx.cpu)
+    class BenchNetMLX(mlx.nn.Module):
+        # simple encoder-decoder net
+        def __init__(self, in_channels, hidden_channels=32):
+            super().__init__()
+            self.net = mlx.nn.Sequential(
+                mlx.nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
+                mlx.nn.ReLU(),
+                mlx.nn.Conv2d(
+                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose2d(
+                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose2d(
+                    hidden_channels, in_channels, kernel_size=3, padding=1
+                ),
+            )
+        def __call__(self, input):
+            return self.net(input)
+    benchNet = BenchNetMLX(3)
+    mx.eval(benchNet.parameters())
+    optim = opt.Adam(learning_rate=1e-3)
+    # One fixed batch, reused as both input and reconstruction target.
+    inputs = mx.random.normal([10, 256, 256, 3])
+    params = benchNet.parameters()
+    optim.init(params)
+    # Model and optimizer state, evaluated together after every step below.
+    state = [benchNet.state, optim.state]
+    def loss_fn(params, image):
+        # Mean absolute error between the network output and its own input.
+        benchNet.update(params)
+        pred_image = benchNet(image)
+        return (pred_image - image).abs().mean()
+    def step(params, image):
+        loss, grads = mx.value_and_grad(loss_fn)(params, image)
+        optim.update(benchNet, grads)
+        return loss
+    total_time = 0.0
+    print("MLX:")
+    for i in range(steps):
+        start_time = time.perf_counter()
+        step(benchNet.parameters(), inputs)
+        # NOTE(review): presumably this forces the pending computation to
+        # finish before the timer stops -- confirm against MLX's evaluation model.
+        mx.eval(state)
+        end_time = time.perf_counter()
+        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
+        total_time += (end_time - start_time) * 1000
+    return total_time
+def bench_torch(steps: int = 20) -> float:
+    device = torch.device("cpu")
+    class BenchNetTorch(torch.nn.Module):
+        # simple encoder-decoder net
+        def __init__(self, in_channels, hidden_channels=32):
+            super().__init__()
+            self.net = torch.nn.Sequential(
+                torch.nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
+                torch.nn.ReLU(),
+                torch.nn.Conv2d(
+                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
+                ),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose2d(
+                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
+                ),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose2d(
+                    hidden_channels, in_channels, kernel_size=3, padding=1
+                ),
+            )
+        def forward(self, input):
+            return self.net(input)
+    benchNet = BenchNetTorch(3).to(device)
+    optim = torch.optim.Adam(benchNet.parameters(), lr=1e-3)
+    inputs = torch.randn(10, 3, 256, 256, device=device)
+    def loss_fn(pred_image, image):
+        return (pred_image - image).abs().mean()
+    total_time = 0.0
+    print("PyTorch:")
+    for i in range(steps):
+        start_time = time.perf_counter()
+        optim.zero_grad()
+        pred_image = benchNet(inputs)
+        loss = loss_fn(pred_image, inputs)
+        loss.backward()
+        optim.step()
+        end_time = time.perf_counter()
+        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
+        total_time += (end_time - start_time) * 1000
+    return total_time
+def main():
+    steps = 20
+    time_mlx = bench_mlx(steps)
+    time_torch = bench_torch(steps)
+    print(f"average time of MLX:     {time_mlx/steps:9.2f} ms")
+    print(f"total time of MLX:       {time_mlx:9.2f} ms")
+    print(f"average time of PyTorch: {time_torch/steps:9.2f} ms")
+    print(f"total time of PyTorch:   {time_torch:9.2f} ms")
+    diff = time_torch / time_mlx - 1.0
+    print(f"torch/mlx diff: {100. * diff:+5.2f}%")
+if __name__ == "__main__":
+    main()

ml-stable-diffusion/mlx/benchmarks/python/conv2d_transpose_bench_cpu.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import argparse
+import math
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# Untimed calls that `bench` makes before starting the clock.
+N_warmup = 1
+# Timed calls that `bench` makes per measurement.
+N_iter_bench = 10
+# Convolutions issued inside each call of the benchmarked closures.
+N_iter_func = 5
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def make_mx_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_transpose_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv_transpose2d(
+                a, b, stride=strides, padding=padding, groups=groups, stream=mx.cpu
+            )
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+    return mx_conv_transpose_2D
+def make_pt_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_transpose_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv_transpose2d(
+                a, b, stride=strides, padding=padding, groups=groups
+            )
+            ys.append(y)
+        return ys
+    return pt_conv_transpose_2D
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kH * kH * C)
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (int(O / groups), kH, kW, C)).astype(
+        np_dtype
+    )
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((3, 0, 1, 2))).to("cpu")
+    f_mx = make_mx_conv_transpose_2D(strides, padding, groups)
+    f_pt = make_pt_conv_transpose_2D(strides, padding, groups)
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    out_mx = mx.conv_transpose2d(
+        a_mx, b_mx, stride=strides, padding=padding, groups=groups, stream=mx.cpu
+    )
+    out_pt = torch.conv_transpose2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+    dtypes = ("float32",)
+    shapes = (
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
+    )
+    for dtype in dtypes:
+        print(
+            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
+        )
+        for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)
+            time_mlx, time_torch = bench_shape(
+                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0
+            print(
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/conv3d_bench_cpu.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import argparse
+import math
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# Untimed calls that `bench` makes before starting the clock.
+N_warmup = 1
+# Timed calls that `bench` makes per measurement.
+N_iter_bench = 10
+# Convolutions issued inside each call of the benchmarked closures.
+N_iter_func = 5
+# Make the CPU the default MLX device for everything below.
+mx.set_default_device(mx.cpu)
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def make_mx_conv_3D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_3D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv3d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+    return mx_conv_3D
+def make_pt_conv_3D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_3D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv3d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        return ys
+    return pt_conv_3D
+def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kD * kH * kW * C)
+    a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kD, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    a_pt = torch.from_numpy(a_np.transpose((0, 4, 1, 2, 3))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((0, 4, 1, 2, 3))).to("cpu")
+    f_mx = make_mx_conv_3D(strides, padding, groups)
+    f_pt = make_pt_conv_3D(strides, padding, groups)
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    out_mx = mx.conv3d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv3d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1))
+    out_pt = out_pt.numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+    dtypes = ("float32",)
+    shapes = (
+        (4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
+        (4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
+    )
+    for dtype in dtypes:
+        print(
+            "(N,   D,   H,   W,   C), (  O, kD, kH, kW,   C),   dtype,    stride,      pads,  groups, diff%"
+        )
+        for N, D, H, W, C, kD, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)
+            time_mlx, time_torch = bench_shape(
+                N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0
+            print(
+                f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/conv3d_train_bench_cpu.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import time
+import mlx.core as mx
+import mlx.nn
+import mlx.optimizers as opt
+import torch
+def bench_mlx(steps: int = 20, shape=(10, 32, 32, 32, 3)) -> float:
+    mx.set_default_device(mx.cpu)
+    class BenchNetMLX(mlx.nn.Module):
+        # simple encoder-decoder net
+        def __init__(self, in_channels, hidden_channels=16):
+            super().__init__()
+            self.net = mlx.nn.Sequential(
+                mlx.nn.Conv3d(in_channels, hidden_channels, kernel_size=3, padding=1),
+                mlx.nn.ReLU(),
+                mlx.nn.Conv3d(
+                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose3d(
+                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
+                ),
+                mlx.nn.ReLU(),
+                mlx.nn.ConvTranspose3d(
+                    hidden_channels, in_channels, kernel_size=3, padding=1
+                ),
+            )
+        def __call__(self, input):
+            return self.net(input)
+    benchNet = BenchNetMLX(3)
+    mx.eval(benchNet.parameters())
+    optim = opt.Adam(learning_rate=1e-3)
+    inputs = mx.random.normal(shape)
+    params = benchNet.parameters()
+    optim.init(params)
+    state = [benchNet.state, optim.state]
+    def loss_fn(params, image):
+        benchNet.update(params)
+        pred_image = benchNet(image)
+        return (pred_image - image).abs().mean()
+    def step(params, image):
+        loss, grads = mx.value_and_grad(loss_fn)(params, image)
+        optim.update(benchNet, grads)
+        return loss
+    total_time = 0.0
+    print("MLX:")
+    for i in range(steps):
+        start_time = time.perf_counter()
+        step(benchNet.parameters(), inputs)
+        mx.eval(state)
+        end_time = time.perf_counter()
+        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
+        total_time += (end_time - start_time) * 1000
+    return total_time
+def bench_torch(steps: int = 20, shape=(10, 3, 32, 32, 32)) -> float:
+    device = torch.device("cpu")
+    class BenchNetTorch(torch.nn.Module):
+        # simple encoder-decoder net
+        def __init__(self, in_channels, hidden_channels=16):
+            super().__init__()
+            self.net = torch.nn.Sequential(
+                torch.nn.Conv3d(in_channels, hidden_channels, kernel_size=3, padding=1),
+                torch.nn.ReLU(),
+                torch.nn.Conv3d(
+                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
+                ),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose3d(
+                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
+                ),
+                torch.nn.ReLU(),
+                torch.nn.ConvTranspose3d(
+                    hidden_channels, in_channels, kernel_size=3, padding=1
+                ),
+            )
+        def forward(self, input):
+            return self.net(input)
+    benchNet = BenchNetTorch(3).to(device)
+    optim = torch.optim.Adam(benchNet.parameters(), lr=1e-3)
+    inputs = torch.randn(*shape, device=device)
+    def loss_fn(pred_image, image):
+        return (pred_image - image).abs().mean()
+    total_time = 0.0
+    print("PyTorch:")
+    for i in range(steps):
+        start_time = time.perf_counter()
+        optim.zero_grad()
+        pred_image = benchNet(inputs)
+        loss = loss_fn(pred_image, inputs)
+        loss.backward()
+        optim.step()
+        end_time = time.perf_counter()
+        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
+        total_time += (end_time - start_time) * 1000
+    return total_time
+def main():
+    steps = 10
+    time_mlx = bench_mlx(steps)
+    time_torch = bench_torch(steps)
+    print(f"average time of MLX:     {time_mlx/steps:9.2f} ms")
+    print(f"total time of MLX:       {time_mlx:9.2f} ms")
+    print(f"average time of PyTorch: {time_torch/steps:9.2f} ms")
+    print(f"total time of PyTorch:   {time_torch:9.2f} ms")
+    diff = time_torch / time_mlx - 1.0
+    print(f"torch/mlx diff: {100. * diff:+5.2f}%")
+if __name__ == "__main__":
+    main()

ml-stable-diffusion/mlx/benchmarks/python/conv3d_transpose_bench_cpu.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import argparse
+import math
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# Untimed calls that `bench` makes before starting the clock.
+N_warmup = 1
+# Timed calls that `bench` makes per measurement.
+N_iter_bench = 10
+# Convolutions issued inside each call of the benchmarked closures.
+N_iter_func = 5
+# Make the CPU the default MLX device for everything below.
+mx.set_default_device(mx.cpu)
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def make_mx_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
+    def mx_conv_3D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv_transpose3d(
+                a, b, stride=strides, padding=padding, groups=groups
+            )
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+    return mx_conv_3D
+def make_pt_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_3D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv_transpose3d(
+                a, b, stride=strides, padding=padding, groups=groups
+            )
+            ys.append(y)
+        return ys
+    return pt_conv_3D
+def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kD * kH * kW * C)
+    a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kD, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    a_pt = torch.from_numpy(a_np.transpose((0, 4, 1, 2, 3))).to("cpu")
+    b_pt = torch.from_numpy(b_np.transpose((4, 0, 1, 2, 3))).to("cpu")
+    f_mx = make_mx_conv_3D(strides, padding, groups)
+    f_pt = make_pt_conv_3D(strides, padding, groups)
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    out_mx = mx.conv_transpose3d(
+        a_mx, b_mx, stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.conv_transpose3d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1))
+    out_pt = out_pt.numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+    dtypes = ("float32",)
+    shapes = (
+        (4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
+        (4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
+    )
+    for dtype in dtypes:
+        print(
+            "(N,   D,   H,   W,   C), (  O, kD, kH, kW,   C),   dtype,    stride,      pads,  groups, diff%"
+        )
+        for N, D, H, W, C, kD, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)
+            time_mlx, time_torch = bench_shape(
+                N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0
+            print(
+                f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/conv_bench.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import argparse
+import math
+import os
+import subprocess
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# Output of `sysctl -n machdep.cpu.brand_string`, decoded as UTF-8 with
+# newline characters stripped from both ends. Runs at import time.
+# NOTE(review): no later use of this value is visible in this file - confirm.
+device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
+device_name = device_name.decode("utf-8").strip("\n")
+# Untimed calls that `bench` makes before starting the clock.
+N_warmup = 10
+# Timed calls that `bench` makes per measurement.
+N_iter_bench = 100
+# Convolutions issued inside each call of the benchmarked closures.
+N_iter_func = 5
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+    return mx_conv_2D
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        torch.mps.synchronize()
+        return ys
+    return pt_conv_2D
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kH * kH * C)
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
+    torch.mps.synchronize()
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+    dtypes = ("float32",)
+    shapes = (
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
+    )
+    for dtype in dtypes:
+        print(
+            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
+        )
+        for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)
+            time_mlx, time_torch = bench_shape(
+                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0
+            print(
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/conv_transpose_bench.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import argparse
+import math
+import os
+import subprocess
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# Untimed calls that `bench` makes before starting the clock.
+N_warmup = 10
+# Timed calls that `bench` makes per measurement.
+N_iter_bench = 100
+# Convolutions issued inside each call of the benchmarked closures.
+N_iter_func = 5
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def make_mx_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_transpose_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv_transpose2d(
+                a, b, stride=strides, padding=padding, groups=groups
+            )
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+    return mx_conv_transpose_2D
+def make_pt_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_transpose_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv_transpose2d(
+                a, b, stride=strides, padding=padding, groups=groups
+            )
+            ys.append(y)
+        torch.mps.synchronize()
+        return ys
+    return pt_conv_transpose_2D
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kH * kH * C)
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose((3, 0, 1, 2))).to("mps")
+    torch.mps.synchronize()
+    f_mx = make_mx_conv_transpose_2D(strides, padding, groups)
+    f_pt = make_pt_conv_transpose_2D(strides, padding, groups)
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    out_mx = mx.conv_transpose2d(
+        a_mx, b_mx, stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.conv_transpose2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run conv benchmarks")
+    dtypes = ("float32",)
+    shapes = (
+        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
+        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
+        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
+        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
+        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
+        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
+        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
+    )
+    for dtype in dtypes:
+        print(
+            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
+        )
+        for N, H, W, C, kH, kW, O, strides, padding, groups in shapes:
+            np_dtype = getattr(np, dtype)
+            time_mlx, time_torch = bench_shape(
+                N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype
+            )
+            diff = time_torch / time_mlx - 1.0
+            print(
+                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
+            )
+            if time_mlx >= 2.0 * time_torch:
+                print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/conv_unaligned_bench.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import math
+import time
+import mlx.core as mx
+import numpy as np
+import torch
+# Untimed calls that `bench` makes before starting the clock.
+N_warmup = 10
+# Timed calls that `bench` makes per measurement.
+N_iter_bench = 100
+# Convolutions issued inside each call of the benchmarked closures.
+N_iter_func = 5
+def bench(f, a, b):
+    for i in range(N_warmup):
+        f(a, b)
+    torch.mps.synchronize()
+    s = time.perf_counter_ns()
+    for i in range(N_iter_bench):
+        f(a, b)
+    e = time.perf_counter_ns()
+    return (e - s) * 1e-9
+def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    def mx_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        mx.eval(ys)
+        return ys
+    return mx_conv_2D
+def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
+    @torch.no_grad()
+    def pt_conv_2D(a, b):
+        ys = []
+        for i in range(N_iter_func):
+            y = torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
+            ys.append(y)
+        torch.mps.synchronize()
+        return ys
+    return pt_conv_2D
+def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
+    scale = 1.0 / math.sqrt(kH * kH * C)
+    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
+    b_np = np.random.uniform(-scale, scale, (O, kH, kW, int(C / groups))).astype(
+        np_dtype
+    )
+    a_mx = mx.array(a_np)
+    b_mx = mx.array(b_np)
+    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
+    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")
+    torch.mps.synchronize()
+    f_mx = make_mx_conv_2D(strides, padding, groups)
+    f_pt = make_pt_conv_2D(strides, padding, groups)
+    time_torch = bench(f_pt, a_pt, b_pt)
+    time_mlx = bench(f_mx, a_mx, b_mx)
+    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
+    out_pt = torch.conv2d(
+        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
+    )
+    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
+    out_pt = out_pt.numpy(force=True)
+    atol = 2e-5 if np_dtype == np.float32 else 1e-4
+    if not np.allclose(out_pt, out_mx, atol=atol):
+        print(
+            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
+        )
+    return time_mlx, time_torch
+if __name__ == "__main__":
+    dtype = "float32"
+    shapes = (
+        (4, 32, 32, 21, 3, 3, 128),
+        (4, 32, 32, 21, 3, 3, 37),
+        (4, 32, 32, 370, 3, 3, 370),
+        (4, 32, 32, 370, 7, 7, 128),
+        (2, 320, 640, 21, 7, 7, 21),
+    )
+    for N, H, W, C, kh, kw, O in shapes:
+        time_mlx, time_torch = bench_shape(
+            N, H, W, C, kh, kw, O, (1, 1), (0, 0), 1, dtype
+        )
+        diff = time_torch / time_mlx - 1.0
+        print(
+            f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kh:2d}, {kw:2d}, {C:3d}), {dtype}, {100. * diff:+5.2f}%"
+        )
+        if time_mlx >= 2.0 * time_torch:
+            print("ATTENTION ^^^^^^^")

ml-stable-diffusion/mlx/benchmarks/python/distributed_bench.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# Copyright © 2024 Apple Inc.
+"""
+Run with:
+    mpirun -n 2 python /path/to/distributed_bench.py
+"""
+import time
+import mlx.core as mx
+def time_fn(fn, *args, **kwargs):
+    msg = kwargs.pop("msg", None)
+    world = mx.distributed.init()
+    if world.rank() == 0:
+        if msg:
+            print(f"Timing {msg} ...", end=" ")
+        else:
+            print(f"Timing {fn.__name__} ...", end=" ")
+    # warmup
+    for _ in range(5):
+        mx.eval(fn(*args, **kwargs))
+    num_iters = 100
+    tic = time.perf_counter()
+    for _ in range(num_iters):
+        x = mx.eval(fn(*args, **kwargs))
+    toc = time.perf_counter()
+    msec = 1e3 * (toc - tic) / num_iters
+    if world.rank() == 0:
+        print(f"{msec:.5f} msec")
+def time_all_sum():
+    """Time chains of sin, all_sum, and interleaved sin/all_sum on a 4096-vector."""
+    chain_length = 20
+    x = mx.random.uniform(shape=(4096,))
+    mx.eval(x)
+    # time_fn labels its output with each function's __name__, so the nested
+    # function names below are part of the printed output.
+    def sine(v):
+        for _ in range(chain_length):
+            v = mx.sin(v)
+        return v
+    def all_sum_plain(v):
+        for _ in range(chain_length):
+            v = mx.distributed.all_sum(v)
+        return v
+    def all_sum_with_sine(v):
+        for _ in range(chain_length):
+            v = mx.distributed.all_sum(mx.sin(v))
+        return v
+    for bench in (sine, all_sum_plain, all_sum_with_sine):
+        time_fn(bench, x)
+if __name__ == "__main__":  # script entry point (see the mpirun command in the module docstring)
+    time_all_sum()

ml-stable-diffusion/mlx/benchmarks/python/einsum_bench.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# Copyright © 2024 Apple Inc.
+import time
+import mlx.core as mx
+import numpy as np
+def timeit(fn, its=100, args=()):
+    """Return the mean wall-clock time of ``fn(*args)`` in milliseconds.
+    ``fn`` is called 5 times as warmup and then ``its`` times under the timer.
+    Args:
+        fn: Callable to benchmark.
+        its: Number of timed calls.
+        args: Positional arguments for ``fn``. The default is an immutable
+            empty tuple rather than a shared mutable list.
+    Returns:
+        Average time per call in milliseconds.
+    """
+    for _ in range(5):
+        fn(*args)
+    tic = time.perf_counter()
+    for _ in range(its):
+        fn(*args)
+    toc = time.perf_counter()
+    return 1e3 * (toc - tic) / its
+def time_little_einsum_path():
+    """Compare einsum_path planning time in MLX and NumPy for a 32x32 matmul."""
+    subscripts = "ik,kj->ij"
+    mx_operands = (mx.ones((32, 32)), mx.ones((32, 32)))
+    mx_time = timeit(mx.einsum_path, args=(subscripts, *mx_operands))
+    # NumPy is timed on the same operands after conversion.
+    np_operands = tuple(np.array(operand) for operand in mx_operands)
+    np_time = timeit(np.einsum_path, args=(subscripts, *np_operands))
+    print("Timing little einsum path...")
+    print(f"MLX ... {mx_time:.3f} ms")
+    print(f"NumPy... {np_time:.3f} ms")
+def time_big_einsum_path():
+    """Compare einsum_path planning time for 10 random 5-index operands."""
+    chars = list("abcdefgh")
+    # Each index letter gets its position as the axis extent.
+    # NOTE(review): "a" therefore maps to extent 0 -- confirm this is intended.
+    char_to_dim = dict(zip(chars, range(len(chars))))
+    num_inputs = 10
+    terms = []
+    np_inputs = []
+    for _ in range(num_inputs):
+        term = np.random.choice(chars, size=5, replace=False).tolist()
+        terms.append("".join(term))
+        np_inputs.append(np.ones([char_to_dim[c] for c in term]))
+    subscripts = ",".join(terms)
+    np_time = timeit(np.einsum_path, args=(subscripts, *np_inputs))
+    mx_inputs = [mx.array(a) for a in np_inputs]
+    mx_time = timeit(mx.einsum_path, args=(subscripts, *mx_inputs))
+    print("Timing big einsum path...")
+    print(f"MLX ... {mx_time:.3f} ms")
+    print(f"NumPy... {np_time:.3f} ms")
+def time_attention():
+    """Time attention written with transposes/matmuls against an einsum version."""
+    def regular_attention(x):
+        # shape [batch, sequence, num_heads, head_dim]; x is used as queries,
+        # keys and values at once.
+        q = x.transpose(0, 2, 1, 3)
+        k_t = x.transpose(0, 2, 3, 1)
+        v = x.transpose(0, 2, 1, 3)
+        weights = mx.softmax(q @ k_t, axis=-1)
+        mx.eval((weights @ v).swapaxes(1, 2))
+    def einsum_attention(x):
+        # shape [batch, sequence, num_heads, head_dim]; x is used as queries,
+        # keys and values at once.
+        weights = mx.softmax(mx.einsum("itjk,iujk->ijtu", x, x), axis=-1)
+        mx.eval(mx.einsum("ijtu,iujk->itjk", weights, x))
+    x = mx.random.uniform(shape=(8, 512, 32, 128))
+    regular_time = timeit(regular_attention, args=(x,))
+    ein_time = timeit(einsum_attention, args=(x,))
+    print("Timing einsum attention...")
+    print(f"Regular ... {regular_time:.3f} ms")
+    print(f"Einsum ... {ein_time:.3f} ms")
+if __name__ == "__main__":  # script entry point: run the three benchmarks in order
+    time_little_einsum_path()
+    time_big_einsum_path()
+    time_attention()

ml-stable-diffusion/mlx/benchmarks/python/fft_bench.py ADDED Viewed

	@@ -0,0 +1,118 @@

+# Copyright © 2024 Apple Inc.
+import matplotlib
+import mlx.core as mx
+import numpy as np
+import sympy
+import torch
+from time_utils import measure_runtime
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+def bandwidth_gb(runtime_ms, system_size):
+    """Convert a runtime in milliseconds into an effective bandwidth in GB/s.
+    Args:
+        runtime_ms: Measured runtime in milliseconds.
+        system_size: Number of complex64 elements processed.
+    """
+    # Two complex64 items per element; presumably one read plus one write.
+    bytes_per_element = np.dtype(np.complex64).itemsize * 2
+    bytes_per_ms = system_size * bytes_per_element / runtime_ms
+    # 1e3 ms per second, 1e9 bytes per GB.
+    return bytes_per_ms * 1e3 / 1e9
+def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
+    """Benchmark batched FFTs of several sizes and return their bandwidths.
+    For every ``n`` in ``fft_sizes`` a complex64 input of shape
+    ``[system_size // n**dim] + [n] * dim`` is created, the FFT is timed with
+    ``measure_runtime`` and the bandwidth in GB/s is printed and collected.
+    Args:
+        system_size: Target number of elements per benchmark input.
+        fft_sizes: Iterable of transform lengths.
+        backend: ``"mlx"`` or ``"mps"`` (PyTorch on the MPS device).
+        dim: 1 for ``fft``, 2 for ``fft2``.
+    Returns:
+        NumPy array with one bandwidth per entry of ``fft_sizes``.
+    Raises:
+        ValueError: If ``dim`` is not 1 or 2.
+        NotImplementedError: If ``backend`` is not supported.
+    """
+    # Previously an unsupported dim surfaced as an UnboundLocalError inside the
+    # timed function; fail early with a clear message instead.
+    if dim not in (1, 2):
+        raise ValueError(f"dim must be 1 or 2, got {dim}")
+    def fft_mlx(x):
+        out = mx.fft.fft(x) if dim == 1 else mx.fft.fft2(x)
+        mx.eval(out)
+        return out
+    def fft_mps(x):
+        out = torch.fft.fft(x) if dim == 1 else torch.fft.fft2(x)
+        torch.mps.synchronize()
+        return out
+    bandwidths = []
+    for n in fft_sizes:
+        batch_size = system_size // n**dim
+        shape = [batch_size] + [n] * dim
+        # Build the input from `shape` so that dim=2 really benchmarks a batch
+        # of n x n transforms; for dim=1 this equals (system_size // n, n).
+        x_np = np.random.uniform(size=shape).astype(np.complex64)
+        if backend == "mlx":
+            x = mx.array(x_np)
+            mx.eval(x)
+            fft = fft_mlx
+        elif backend == "mps":
+            x = torch.tensor(x_np, device="mps")
+            torch.mps.synchronize()
+            fft = fft_mps
+        else:
+            raise NotImplementedError()
+        runtime_ms = measure_runtime(fft, x=x)
+        bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
+        print(n, bandwidth)
+        bandwidths.append(bandwidth)
+    return np.array(bandwidths)
+def time_fft():
+    """Benchmark FFT sizes 2..511 on MLX GPU, PyTorch MPS and MLX CPU.
+    Saves one scatter plot per size group (all sizes, sizes whose prime
+    factors are all <= 13, and sizes with a prime factor > 13) and prints
+    average bandwidths plus the share of sizes where MLX GPU beats MPS.
+    """
+    sizes = np.array(range(2, 512))
+    print("MLX GPU")
+    with mx.stream(mx.gpu):
+        gpu_bandwidths = run_bench(system_size=int(2**26), fft_sizes=sizes)
+    print("MPS GPU")
+    mps_bandwidths = run_bench(system_size=int(2**26), fft_sizes=sizes, backend="mps")
+    print("CPU")
+    with mx.stream(mx.cpu):
+        cpu_bandwidths = run_bench(system_size=int(2**20), fft_sizes=sizes)
+    # Result arrays are indexed by (size - smallest size).
+    first = sizes[0]
+    def select(predicate):
+        return np.array([n for n in sizes if predicate(sympy.primefactors(n))]) - first
+    groups = [
+        (sizes - first, "All"),
+        (select(lambda primes: all(p <= 13 for p in primes)), "Radix 2-13"),
+        (select(lambda primes: any(p > 13 for p in primes)), "Bluestein's"),
+    ]
+    series = [
+        (gpu_bandwidths, "green", "GPU"),
+        (mps_bandwidths, "blue", "MPS"),
+        (cpu_bandwidths, "red", "CPU"),
+    ]
+    for indices, name in groups:
+        # plot bandwidths
+        print(name)
+        for values, color, label in series:
+            plt.scatter(sizes[indices], values[indices], color=color, label=label)
+        plt.title(f"MLX FFT Benchmark -- {name}")
+        plt.xlabel("N")
+        plt.ylabel("Bandwidth (GB/s)")
+        plt.legend()
+        plt.savefig(f"{name}.png")
+        plt.clf()
+    print("Average bandwidths:")
+    print("GPU:", np.mean(gpu_bandwidths))
+    print("MPS:", np.mean(mps_bandwidths))
+    print("CPU:", np.mean(cpu_bandwidths))
+    wins = len(np.where(gpu_bandwidths > mps_bandwidths)[0])
+    portion_faster = wins / len(sizes)
+    print("Percent MLX faster than MPS: ", portion_faster * 100)
+if __name__ == "__main__":  # script entry point
+    time_fft()

ml-stable-diffusion/mlx/benchmarks/python/gather_bench.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# Copyright © 2023-2024 Apple Inc.
+import argparse
+import mlx.core as mx
+import torch
+from time_utils import measure_runtime
+def benchmark_gather_mlx(x_shape, idx_shape):
+    """Time ``x[idx]`` in MLX and print the runtime in milliseconds.
+    Args:
+        x_shape: Shape of the float32 source array.
+        idx_shape: Shape of the random index array into axis 0 of ``x``.
+    """
+    def gather(x, idx):
+        mx.eval(x[idx])
+    # randint samples from the half-open interval [low, high), so the upper
+    # bound must be x_shape[0] for the last row to be reachable.
+    idx = mx.random.randint(0, x_shape[0], idx_shape)
+    x = mx.random.normal(x_shape).astype(mx.float32)
+    runtime = measure_runtime(gather, x=x, idx=idx)
+    print(f"MLX: {runtime:.3f}ms")
+def benchmark_gather_torch(x_shape, idx_shape, device):
+    """Time ``x[idx]`` in PyTorch and print the runtime in milliseconds.
+    Args:
+        x_shape: Shape of the float32 source tensor.
+        idx_shape: Shape of the random index tensor into axis 0 of ``x``.
+        device: ``torch.device`` on which the tensors are placed.
+    """
+    def gather(x, idx, device):
+        _ = x[idx]
+        # MPS work is asynchronous; wait for it so the timing is meaningful.
+        if device == torch.device("mps"):
+            torch.mps.synchronize()
+    # torch.randint excludes `high`, so the upper bound must be x_shape[0] for
+    # the last row to be reachable.
+    idx = torch.randint(0, x_shape[0], idx_shape).to(device)
+    x = torch.randn(x_shape, dtype=torch.float32).to(device)
+    runtime = measure_runtime(gather, x=x, idx=idx, device=device)
+    print(f"PyTorch: {runtime:.3f}ms")
+if __name__ == "__main__":
+    # Run the MLX and PyTorch gather benchmarks on the same device family.
+    parser = argparse.ArgumentParser("Gather benchmarks.")
+    parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
+    args = parser.parse_args()
+    device = torch.device("cpu" if args.cpu else "mps")
+    if args.cpu:
+        mx.set_default_device(mx.cpu)
+    # (x_shape, idx_shape) pairs to benchmark.
+    cases = [
+        ((100, 64), (1_000_000,)),
+        ((100, 1024), (100_000,)),
+        ((4, 1_000_000), ()),
+    ]
+    for x_shape, idx_shape in cases:
+        print("=" * 20)
+        print(f"X {x_shape}, Indices {idx_shape}")
+        benchmark_gather_mlx(x_shape, idx_shape)
+        benchmark_gather_torch(x_shape, idx_shape, device=device)

ml-stable-diffusion/mlx/benchmarks/python/gather_mm_bench.py ADDED Viewed

	@@ -0,0 +1,74 @@

+# Copyright © 2025 Apple Inc.
+import mlx.core as mx
+from time_utils import time_fn
+N = 1024  # leading dimension of the input x and of the index array
+D = 1024  # last dimension of x
+M = 1024  # other dimension of the weights: w1 is (E, M, D), w2 is (E, D, M)
+E = 32  # number of weight matrices; indices are drawn from [0, E)
+I = 4  # number of indices per input row
+def gather_sort(x, indices):
+    """Reorder the rows of ``x`` so that their indices are sorted.
+    Returns:
+        Tuple of (gathered rows of ``x``, sorted flat indices, inverse
+        permutation that undoes the sort).
+    """
+    _, cols = indices.shape
+    flat = indices.flatten()
+    order = mx.argsort(flat)
+    inv_order = mx.argsort(order)
+    # Position p of the flattened indices belongs to input row p // cols.
+    rows = x.flatten(0, -3)
+    return rows[order // cols], flat[order], inv_order
+def scatter_unsort(x, inv_order, shape=None):
+    """Index ``x`` with ``inv_order``; optionally unflatten axis 0 to ``shape``."""
+    restored = x[inv_order]
+    if shape is None:
+        return restored
+    return mx.unflatten(restored, 0, shape)
+def gather_mm_simulate(x, w, indices):
+    """Emulate the gathered matmul with one plain matmul per sorted row, twice."""
+    x, idx, inv_order = gather_sort(x, indices)
+    rhs_ids = idx.tolist()
+    for _ in range(2):
+        products = [x[row] @ w[j].T for row, j in enumerate(rhs_ids)]
+        x = mx.concatenate(products, axis=0)[:, None]
+    return scatter_unsort(x, inv_order, indices.shape)
+def time_gather_mm():
+    """Time two chained gather_mm calls against an equivalent dense matmul.
+    The gathered variant is timed with unsorted indices, pre-sorted indices,
+    and with sorting done inside the timed function.
+    """
+    scale = 1024**0.5
+    x = mx.random.normal((N, 1, 1, D)) / scale
+    w1 = mx.random.normal((E, M, D)) / scale
+    w2 = mx.random.normal((E, D, M)) / scale
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+    def gather_mm(x, w1, w2, indices, sort):
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        else:
+            idx, inv_order = indices, None
+        for w in (w1, w2):
+            x = mx.gather_mm(
+                x, w.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort
+            )
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+    for idx_arg, sort in ((indices, False), (sorted_indices, False), (indices, True)):
+        time_fn(gather_mm, x, w1, w2, idx_arg, sort)
+    # Dense baseline with the same number of rows (N * I) and a single weight.
+    x = mx.random.normal((N * I, D)) / scale
+    w1 = mx.random.normal((M, D)) / scale
+    w2 = mx.random.normal((D, M)) / scale
+    mx.eval(x, w1, w2)
+    def equivalent_matmul(x, w1, w2):
+        return (x @ w1.T) @ w2.T
+    time_fn(equivalent_matmul, x, w1, w2)
+if __name__ == "__main__":  # script entry point
+    time_gather_mm()

ml-stable-diffusion/mlx/benchmarks/python/gather_qmm_bench.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# Copyright © 2025 Apple Inc.
+import mlx.core as mx
+from time_utils import time_fn
+N = 1024  # leading dimension of the input x and of the index array
+D = 1024  # last dimension of x
+M = 1024  # other dimension of the weights before quantization: w1 is (E, M, D), w2 is (E, D, M)
+E = 32  # number of weight matrices; indices are drawn from [0, E)
+I = 4  # number of indices per input row
+def gather_sort(x, indices):
+    """Reorder the rows of ``x`` so that their indices are sorted.
+    Returns:
+        Tuple of (gathered rows of ``x``, sorted flat indices, inverse
+        permutation that undoes the sort).
+    """
+    _, per_row = indices.shape
+    flat_idx = indices.flatten()
+    order = mx.argsort(flat_idx)
+    inv_order = mx.argsort(order)
+    # Position p of the flattened indices belongs to input row p // per_row.
+    x_rows = x.flatten(0, -3)
+    return x_rows[order // per_row], flat_idx[order], inv_order
+def scatter_unsort(x, inv_order, shape=None):
+    """Index ``x`` with ``inv_order``; optionally unflatten axis 0 to ``shape``."""
+    unsorted = x[inv_order]
+    return unsorted if shape is None else mx.unflatten(unsorted, 0, shape)
+def gather_mm_simulate(x, w, indices):
+    """Emulate the gathered quantized matmul one sorted row at a time, twice."""
+    x, idx, inv_order = gather_sort(x, indices)
+    # w is indexed as three parallel arrays; presumably the quantized weights,
+    # scales and biases produced by mx.quantize -- TODO confirm.
+    w_q, w_scales, w_biases = w[0], w[1], w[2]
+    rhs_ids = idx.tolist()
+    for _ in range(2):
+        products = [
+            mx.quantized_matmul(
+                x[row], w_q[j], w_scales[j], w_biases[j], transpose=True
+            )
+            for row, j in enumerate(rhs_ids)
+        ]
+        x = mx.concatenate(products, axis=0)[:, None]
+    return scatter_unsort(x, inv_order, indices.shape)
+def time_gather_qmm():
+    """Time two chained gather_qmm calls against an equivalent quantized matmul.
+    The gathered variant is timed with unsorted indices, pre-sorted indices,
+    and with sorting done inside the timed function.
+    """
+    scale = 1024**0.5
+    x = mx.random.normal((N, 1, 1, D)) / scale
+    w1 = mx.random.normal((E, M, D)) / scale
+    w2 = mx.random.normal((E, D, M)) / scale
+    w1, w2 = mx.quantize(w1), mx.quantize(w2)
+    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
+    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
+    mx.eval(x, w1, w2, indices, sorted_indices)
+    def gather_mm(x, w1, w2, indices, sort):
+        if sort:
+            x, idx, inv_order = gather_sort(x, indices)
+        else:
+            idx, inv_order = indices, None
+        for w in (w1, w2):
+            x = mx.gather_qmm(
+                x, *w, transpose=True, rhs_indices=idx, sorted_indices=sort
+            )
+        if sort:
+            x = scatter_unsort(x, inv_order, indices.shape)
+        return x
+    for idx_arg, sort in ((indices, False), (sorted_indices, False), (indices, True)):
+        time_fn(gather_mm, x, w1, w2, idx_arg, sort)
+    # Quantized baseline with the same number of rows (N * I) and a single weight.
+    x = mx.random.normal((N * I, D)) / scale
+    w1 = mx.random.normal((M, D)) / scale
+    w2 = mx.random.normal((D, M)) / scale
+    w1, w2 = mx.quantize(w1), mx.quantize(w2)
+    mx.eval(x, w1, w2)
+    def equivalent_matmul(x, w1, w2):
+        hidden = mx.quantized_matmul(x, *w1, transpose=True)
+        return mx.quantized_matmul(hidden, *w2, transpose=True)
+    time_fn(equivalent_matmul, x, w1, w2)
+if __name__ == "__main__":  # script entry point
+    time_gather_qmm()

ml-stable-diffusion/mlx/benchmarks/python/hadamard_bench.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import argparse
+import matplotlib
+import mlx.core as mx
+import numpy as np
+from time_utils import measure_runtime
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+def had(x):
+    """Apply ``mx.hadamard_transform`` to ``x`` and force its evaluation."""
+    mx.eval(mx.hadamard_transform(x))
+def copy(x):
+    """Add 1.0 to ``x`` and force evaluation; used as the "copy" series."""
+    mx.eval(x + 1.0)
+def run(dtype):
+    """Benchmark ``had`` and ``copy`` over several sizes and save a scatter plot.
+    Args:
+        dtype: NumPy scalar type of the inputs; its ``__name__`` is used in
+            the plot title and in the output file name.
+    """
+    system_size = 2**26
+    bytes_per_gb = 1e9
+    ms_per_s = 1e3
+    def series_key(fn, m):
+        # One series for the copy baseline, one for pure powers of two and
+        # one for sizes of the form m * 2^k.
+        if fn is copy:
+            return "copy"
+        return "had_2^k" if m == 1 else "had_m*2^k"
+    outputs = {}
+    for test_fn in (had, copy):
+        for m in [1, 12, 20, 28]:
+            results = outputs.setdefault(series_key(test_fn, m), {})
+            for n in (m * 2**k for k in range(7, 14)):
+                # Sizes above 2**15 are skipped.
+                if n > 2**15:
+                    continue
+                x_np = np.random.normal(size=(system_size // n, n)).astype(dtype)
+                runtime_ms = measure_runtime(test_fn, x=mx.array(x_np))
+                bytes_per_had = np.dtype(x_np.dtype).itemsize * 2
+                bandwidth_gb = (
+                    system_size * bytes_per_had / runtime_ms * ms_per_s / bytes_per_gb
+                )
+                print(n, bandwidth_gb)
+                results[n] = bandwidth_gb
+    colors = {
+        "copy": "black",
+        "had_2^k": "steelblue",
+        "had_m*2^k": "skyblue",
+    }
+    for key, output in outputs.items():
+        plt.scatter(output.keys(), output.values(), color=colors[key], label=key)
+    plt.title(f"MLX Hadamard Benchmark -- {dtype.__name__}")
+    plt.xlabel("N")
+    plt.ylabel("Bandwidth (GB/s)")
+    plt.legend()
+    plt.savefig(f"bench_{dtype.__name__}.png")
+    plt.clf()
+if __name__ == "__main__":
+    # --fp16 switches the benchmark inputs from float32 to float16.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--fp16", action="store_true")
+    use_fp16 = parser.parse_args().fp16
+    run(np.float16 if use_fp16 else np.float32)

ml-stable-diffusion/mlx/benchmarks/python/layer_norm_bench.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Copyright © 2023-2024 Apple Inc.
+from functools import partial
+import mlx.core as mx
+import mlx.nn as nn
+from time_utils import time_fn
+def layer_norm(x, w, b, eps):
+    """Reference layer normalization over the last axis, built from basic ops.
+    The statistics are computed in float32 and the result is cast back to the
+    dtype of ``x``, so half-precision inputs yield half-precision outputs.
+    Args:
+        x: Input array; normalized along its last axis.
+        w: Optional scale applied after normalization, or ``None``.
+        b: Optional bias added after scaling, or ``None``.
+        eps: Constant added to the variance before the reciprocal square root.
+    Returns:
+        The normalized array in the original dtype of ``x``.
+    """
+    ot = x.dtype
+    x = x.astype(mx.float32)
+    mu = mx.mean(x, -1, keepdims=True)
+    v = mx.var(x, -1, keepdims=True)
+    y = (x - mu) * mx.rsqrt(v + eps)
+    if w is not None:
+        y = y * w
+    if b is not None:
+        y = y + b
+    # `ot` used to be captured but never applied, so float16/bfloat16 inputs
+    # silently came back (and kept looping) as float32.
+    return y.astype(ot)
+def time_layer_norm(N, dt):  # benchmark the reference layer_norm against mx.fast.layer_norm for feature size N and dtype dt
+    L = 1024  # middle dimension of the (8, L, N) inputs
+    f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()  # scalar loss built on the reference implementation
+    f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()  # same loss built on the fast kernel
+    g1 = mx.grad(f1, argnums=(0, 1, 2))  # gradients w.r.t. x, w and b
+    g2 = mx.grad(f2, argnums=(0, 1, 2))
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    mx.eval(x, w, b, y)  # materialize the inputs before timing
+    def layer_norm_loop(f, x, w, b):  # forward pass: feed the output back in 32 times
+        for _ in range(32):
+            x = f(x, w, b)
+        return x
+    time_fn(layer_norm_loop, partial(layer_norm, eps=1e-5), x, w, b)
+    time_fn(layer_norm_loop, partial(mx.fast.layer_norm, eps=1e-5), x, w, b)
+    def layer_norm_grad_loop(g, x, w, b):  # backward pass: feed the gradients back in 32 times; y comes from the enclosing scope
+        gx, gw, gb = x, w, b
+        for _ in range(32):
+            gx, gw, gb = g(gx, gw, gb, y)
+        return gx, gw, gb
+    time_fn(layer_norm_grad_loop, g1, x, w, b)
+    time_fn(layer_norm_grad_loop, g2, x, w, b)
+    time_fn(layer_norm_grad_loop, mx.compile(g1), x, w, b)  # same gradients wrapped in mx.compile
+    time_fn(layer_norm_grad_loop, mx.compile(g2), x, w, b)
+    f1 = lambda x, y: (layer_norm(x, None, None, 1e-5) * y).sum()  # second round: no scale or bias
+    f2 = lambda x, y: (mx.fast.layer_norm(x, None, None, 1e-5) * y).sum()
+    g1 = mx.grad(f1, argnums=(0,))  # gradient w.r.t. x only
+    g2 = mx.grad(f2, argnums=(0,))
+    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
+    w = mx.random.uniform(shape=(N,)).astype(dt)  # w and b are regenerated here but not used by the timings below
+    b = mx.random.uniform(shape=(N,)).astype(dt)
+    y = mx.random.uniform(shape=(8, L, N)).astype(dt)  # rebinding y changes what the loop below reads through its closure
+    mx.eval(x, w, b, y)
+    def layer_norm_grad_x_loop(g, x):  # backward pass w.r.t. x only, fed back 32 times
+        gx = x
+        for _ in range(32):
+            gx = g(gx, y)
+        return gx
+    time_fn(layer_norm_grad_x_loop, g1, x)
+    time_fn(layer_norm_grad_x_loop, g2, x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g1), x)
+    time_fn(layer_norm_grad_x_loop, mx.compile(g2), x)
+if __name__ == "__main__":  # script entry point: sweep every dtype / feature-size combination
+    for dt in [mx.float32, mx.float16, mx.bfloat16]:
+        for n in [1024, 2048, 4096, 8192, 8192 + 1024]:
+            print(dt, n)  # header line for the timings that follow
+            time_layer_norm(n, dt)