trans2

Sleeping

App Files Community

Mayo committed on Apr 25

Commit

aab9632

unverified ·

1 Parent(s): 504f8c2

perf: FLUX.2 improvements

Browse files

Files changed (9) hide show

.cargo/config.toml +1 -1
Cargo.lock +44 -24
Cargo.toml +7 -3
koharu-ml/Cargo.toml +6 -0
koharu-ml/src/flux2_klein/mod.rs +69 -7
koharu-ml/src/flux2_klein/transformer.rs +150 -40
koharu-ml/src/flux2_klein/vae.rs +18 -25
koharu-runtime/src/cuda.rs +43 -2
koharu/tauri.windows.conf.json +2 -2

.cargo/config.toml CHANGED Viewed

@@ -5,4 +5,4 @@ LLAMA_CPP_TAG = "b8665"
 # CUDA 13.0 requires C++17
 NVCC_PREPEND_FLAGS = "-std=c++17"
 # override nvidia-smi compute capability
-CUDA_COMPUTE_CAP = "75"

 # CUDA 13.0 requires C++17
 NVCC_PREPEND_FLAGS = "-std=c++17"
 # override nvidia-smi compute capability
+CUDA_COMPUTE_CAP = "80"

Cargo.lock CHANGED Viewed

@@ -828,9 +828,9 @@ dependencies = [
 [[package]]
 name = "blake3"
-version = "1.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e"
 dependencies = [
  "arrayref",
  "arrayvec",
@@ -1007,7 +1007,7 @@ dependencies = [
 [[package]]
 name = "candle-core"
 version = "0.9.2"
-source = "git+https://github.com/mayocream/candle?branch=cuda-dynamic-loading#38be780754e954d88b63bbe1ef7e4098bbaa4c02"
 dependencies = [
  "byteorder",
  "candle-kernels",
@@ -1033,10 +1033,29 @@ dependencies = [
  "zip 7.2.0",
 ]
 [[package]]
 name = "candle-kernels"
 version = "0.9.2"
-source = "git+https://github.com/mayocream/candle?branch=cuda-dynamic-loading#38be780754e954d88b63bbe1ef7e4098bbaa4c02"
 dependencies = [
  "bindgen_cuda",
 ]
@@ -1044,7 +1063,7 @@ dependencies = [
 [[package]]
 name = "candle-metal-kernels"
 version = "0.9.2"
-source = "git+https://github.com/mayocream/candle?branch=cuda-dynamic-loading#38be780754e954d88b63bbe1ef7e4098bbaa4c02"
 dependencies = [
  "half",
  "objc2",
@@ -1058,7 +1077,7 @@ dependencies = [
 [[package]]
 name = "candle-nn"
 version = "0.9.2"
-source = "git+https://github.com/mayocream/candle?branch=cuda-dynamic-loading#38be780754e954d88b63bbe1ef7e4098bbaa4c02"
 dependencies = [
  "candle-core",
  "candle-metal-kernels",
@@ -1075,7 +1094,7 @@ dependencies = [
 [[package]]
 name = "candle-transformers"
 version = "0.9.2"
-source = "git+https://github.com/mayocream/candle?branch=cuda-dynamic-loading#38be780754e954d88b63bbe1ef7e4098bbaa4c02"
 dependencies = [
  "byteorder",
  "candle-core",
@@ -1093,7 +1112,7 @@ dependencies = [
 [[package]]
 name = "candle-ug"
 version = "0.9.2"
-source = "git+https://github.com/mayocream/candle?branch=cuda-dynamic-loading#38be780754e954d88b63bbe1ef7e4098bbaa4c02"
 dependencies = [
  "ug",
  "ug-cuda",
@@ -1150,9 +1169,9 @@ dependencies = [
 [[package]]
 name = "cc"
-version = "1.2.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -1790,9 +1809,9 @@ dependencies = [
 [[package]]
 name = "data-encoding"
-version = "2.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
 [[package]]
 name = "debugid"
@@ -2078,14 +2097,14 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
 [[package]]
 name = "embed-resource"
-version = "3.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63a1d0de4f2249aa0ff5884d7080814f446bb241a559af6c170a41e878ed2d45"
 dependencies = [
  "cc",
  "memchr",
  "rustc_version",
- "toml 0.9.12+spec-1.1.0",
  "vswhom",
  "winreg 0.55.0",
 ]
@@ -4677,6 +4696,7 @@ version = "0.49.0"
 dependencies = [
  "anyhow",
  "candle-core",
  "candle-nn",
  "candle-transformers",
  "clap",
@@ -4866,9 +4886,9 @@ dependencies = [
 [[package]]
 name = "libc"
-version = "0.2.185"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
 [[package]]
 name = "libfuzzer-sys"
@@ -6080,9 +6100,9 @@ checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec"
 [[package]]
 name = "pastey"
-version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec"
 [[package]]
 name = "pathdiff"
@@ -7354,7 +7374,7 @@ dependencies = [
  "http 1.4.0",
  "http-body",
  "http-body-util",
- "pastey 0.2.1",
  "pin-project-lite",
  "rand 0.10.1",
  "rmcp-macros",
@@ -7462,9 +7482,9 @@ dependencies = [
 [[package]]
 name = "rustls"
-version = "0.23.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69f9466fb2c14ea04357e91413efb882e2a6d4a406e625449bc0a5d360d53a21"
 dependencies = [
  "aws-lc-rs",
  "log",
@@ -7490,9 +7510,9 @@ dependencies = [
 [[package]]
 name = "rustls-pki-types"
-version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
 dependencies = [
  "web-time",
  "zeroize",

 [[package]]
 name = "blake3"
+version = "1.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
 dependencies = [
  "arrayref",
  "arrayvec",
 [[package]]
 name = "candle-core"
 version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
 dependencies = [
  "byteorder",
  "candle-kernels",
  "zip 7.2.0",
 ]
+[[package]]
+name = "candle-flash-attn"
+version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
+dependencies = [
+ "anyhow",
+ "candle-core",
+ "candle-flash-attn-build",
+ "half",
+]
+[[package]]
+name = "candle-flash-attn-build"
+version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
+dependencies = [
+ "anyhow",
+]
 [[package]]
 name = "candle-kernels"
 version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
 dependencies = [
  "bindgen_cuda",
 ]
 [[package]]
 name = "candle-metal-kernels"
 version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
 dependencies = [
  "half",
  "objc2",
 [[package]]
 name = "candle-nn"
 version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
 dependencies = [
  "candle-core",
  "candle-metal-kernels",
 [[package]]
 name = "candle-transformers"
 version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
 dependencies = [
  "byteorder",
  "candle-core",
 [[package]]
 name = "candle-ug"
 version = "0.9.2"
+source = "git+https://github.com/mayocream/candle?branch=flash-attn#e7e71e18414db8de91113963beaabb6b4046a0a5"
 dependencies = [
  "ug",
  "ug-cuda",
 [[package]]
 name = "cc"
+version = "1.2.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
 [[package]]
 name = "data-encoding"
+version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8"
 [[package]]
 name = "debugid"
 [[package]]
 name = "embed-resource"
+version = "3.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31a88c8d26de40ed18fe748c547845aa39de1db3afd958f8cb91579f3644bcb"
 dependencies = [
  "cc",
  "memchr",
  "rustc_version",
+ "toml 1.1.2+spec-1.1.0",
  "vswhom",
  "winreg 0.55.0",
 ]
 dependencies = [
  "anyhow",
  "candle-core",
+ "candle-flash-attn",
  "candle-nn",
  "candle-transformers",
  "clap",
 [[package]]
 name = "libc"
+version = "0.2.186"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
 [[package]]
 name = "libfuzzer-sys"
 [[package]]
 name = "pastey"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5a797f0e07bdf071d15742978fc3128ec6c22891c31a3a931513263904c982a"
 [[package]]
 name = "pathdiff"
  "http 1.4.0",
  "http-body",
  "http-body-util",
+ "pastey 0.2.2",
  "pin-project-lite",
  "rand 0.10.1",
  "rmcp-macros",
 [[package]]
 name = "rustls"
+version = "0.23.39"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c2c118cb077cca2822033836dfb1b975355dfb784b5e8da48f7b6c5db74e60e"
 dependencies = [
  "aws-lc-rs",
  "log",
 [[package]]
 name = "rustls-pki-types"
+version = "1.14.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9"
 dependencies = [
  "web-time",
  "zeroize",

Cargo.toml CHANGED Viewed

@@ -44,6 +44,7 @@ koharu-rpc = { path = "koharu-rpc", default-features = false }
 candle-transformers = "=0.9.2"
 candle-core = "=0.9.2"
 candle-nn = "=0.9.2"
 hf-hub = "0.5"
 image = "0.25"
 anyhow = "1.0"
@@ -102,7 +103,9 @@ cudarc = { version = "0.19.4", features = [
     "cublas",
     "cublaslt",
     "curand",
     "driver",
     "nvrtc",
     "f16",
     "f8",
@@ -166,9 +169,10 @@ natord = "1.0.9"
 sentry = { version = "0.47", features = ["tracing"] }
 [patch.crates-io]
-candle-transformers = { git = "https://github.com/mayocream/candle", branch = "cuda-dynamic-loading" }
-candle-core = { git = "https://github.com/mayocream/candle", branch = "cuda-dynamic-loading" }
-candle-nn = { git = "https://github.com/mayocream/candle", branch = "cuda-dynamic-loading" }
 ug = { git = "https://github.com/mayocream/ug", branch = "cuda-dynamic-loading" }
 ug-cuda = { git = "https://github.com/mayocream/ug", branch = "cuda-dynamic-loading" }

 candle-transformers = "=0.9.2"
 candle-core = "=0.9.2"
 candle-nn = "=0.9.2"
+candle-flash-attn = "=0.9.2"
 hf-hub = "0.5"
 image = "0.25"
 anyhow = "1.0"
     "cublas",
     "cublaslt",
     "curand",
+    "cudnn",
     "driver",
+    "dynamic-loading",
     "nvrtc",
     "f16",
     "f8",
 sentry = { version = "0.47", features = ["tracing"] }
 [patch.crates-io]
+candle-transformers = { git = "https://github.com/mayocream/candle", branch = "flash-attn" }
+candle-core = { git = "https://github.com/mayocream/candle", branch = "flash-attn" }
+candle-nn = { git = "https://github.com/mayocream/candle", branch = "flash-attn" }
+candle-flash-attn = { git = "https://github.com/mayocream/candle", branch = "flash-attn" }
 ug = { git = "https://github.com/mayocream/ug", branch = "cuda-dynamic-loading" }
 ug-cuda = { git = "https://github.com/mayocream/ug", branch = "cuda-dynamic-loading" }

koharu-ml/Cargo.toml CHANGED Viewed

@@ -20,6 +20,7 @@ imageproc = { workspace = true }
 candle-core = { workspace = true }
 candle-transformers = { workspace = true }
 candle-nn = { workspace = true }
 tokenizers = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
@@ -44,9 +45,14 @@ objc2-foundation = { workspace = true, optional = true }
 [features]
 cuda = [
     "candle-core/cuda",
     "candle-nn/cuda",
     "candle-transformers/cuda",
     "cudarc",
 ]
 metal = [
     "candle-core/metal",

 candle-core = { workspace = true }
 candle-transformers = { workspace = true }
 candle-nn = { workspace = true }
+candle-flash-attn = { workspace = true, optional = true }
 tokenizers = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 [features]
 cuda = [
     "candle-core/cuda",
+    "candle-core/cudnn",
     "candle-nn/cuda",
+    "candle-nn/cudnn",
     "candle-transformers/cuda",
+    "candle-transformers/cudnn",
     "cudarc",
+    "candle-flash-attn",
+    "candle-flash-attn/cudnn",
 ]
 metal = [
     "candle-core/metal",

koharu-ml/src/flux2_klein/mod.rs CHANGED Viewed

@@ -191,6 +191,7 @@ impl Flux2Klein {
             return Ok(image.clone());
         }
         let (latents, packed_h, packed_w, size) = {
             let (rgb, size) = prepare_rgb_image(image, options.max_pixels);
             let image_latents = self.encode_image_latents(&rgb)?;
@@ -226,6 +227,7 @@ impl Flux2Klein {
                 )?;
             }
             let condition_latents = condition_latents.to_dtype(transformer_dtype)?;
             let mut scheduler =
                 FlowMatchScheduler::new(options.num_inference_steps, packed_h * packed_w);
@@ -236,6 +238,9 @@ impl Flux2Klein {
             let initial_timestep = timesteps[start_index];
             let mut latents =
                 pack_latents(&scheduler.scale_noise(&image_latents, initial_timestep, &noise)?)?;
             for step_idx in start_index..timesteps.len() {
                 let timestep = Tensor::from_vec(
@@ -250,7 +255,6 @@ impl Flux2Klein {
                     ],
                     1,
                 )?;
-                let img_ids = Tensor::cat(&[latent_ids.clone(), condition_ids.clone()], 1)?;
                 let noise_pred = self.transformer.forward(
                     &latent_model_input,
                     &img_ids,
@@ -258,10 +262,15 @@ impl Flux2Klein {
                     &text_ids,
                     &timestep,
                 )?;
                 let noise_pred = noise_pred
                     .narrow(1, 0, latents.dim(1)?)?
                     .to_dtype(DType::F32)?;
-                latents = scheduler.step(&noise_pred, &latents)?;
             }
             (latents, packed_h, packed_w, size)
@@ -322,6 +331,7 @@ impl Flux2Klein {
         reference_image: Option<&DynamicImage>,
         options: &Flux2InpaintOptions,
     ) -> Result<DynamicImage> {
         let (latents, packed_h, packed_w, size) = {
             let (rgb, size) = prepare_rgb_image(image, options.max_pixels);
             let resized_mask = expand_mask(
@@ -362,6 +372,7 @@ impl Flux2Klein {
                 )?;
             }
             let condition_latents = condition_latents.to_dtype(transformer_dtype)?;
             let mut scheduler =
                 FlowMatchScheduler::new(options.num_inference_steps, packed_h * packed_w);
@@ -373,6 +384,8 @@ impl Flux2Klein {
             let initial_timestep = timesteps[start_index];
             let mut latents =
                 pack_latents(&scheduler.scale_noise(&image_latents, initial_timestep, &noise)?)?;
             for step_idx in start_index..timesteps.len() {
                 let timestep = Tensor::from_vec(
@@ -387,7 +400,6 @@ impl Flux2Klein {
                     ],
                     1,
                 )?;
-                let img_ids = Tensor::cat(&[latent_ids.clone(), condition_ids.clone()], 1)?;
                 let noise_pred = self.transformer.forward(
                     &latent_model_input,
                     &img_ids,
@@ -395,10 +407,15 @@ impl Flux2Klein {
                     &text_ids,
                     &timestep,
                 )?;
                 let noise_pred = noise_pred
                     .narrow(1, 0, latents.dim(1)?)?
                     .to_dtype(DType::F32)?;
-                latents = scheduler.step(&noise_pred, &latents)?;
                 let init_latents = if step_idx + 1 < timesteps.len() {
                     scheduler.scale_noise(
@@ -409,9 +426,11 @@ impl Flux2Klein {
                 } else {
                     image_latents_packed.clone()
                 };
-                let keep_mask = ((&latent_mask * -1.0)? + 1.0)?;
-                latents = (keep_mask.broadcast_mul(&init_latents)?
                     + latent_mask.broadcast_mul(&latents)?)?;
             }
             (latents, packed_h, packed_w, size)
@@ -457,10 +476,53 @@ impl Flux2Klein {
     }
 }
-fn transformer_dtype(_device: &Device) -> DType {
     DType::F32
 }
 fn inpaint_crop_bounds(
     image: &DynamicImage,
     mask: &DynamicImage,

             return Ok(image.clone());
         }
+        let _cuda_cleanup = CudaTemporaryMemoryCleanup::new(&self.device);
         let (latents, packed_h, packed_w, size) = {
             let (rgb, size) = prepare_rgb_image(image, options.max_pixels);
             let image_latents = self.encode_image_latents(&rgb)?;
                 )?;
             }
             let condition_latents = condition_latents.to_dtype(transformer_dtype)?;
+            let img_ids = Tensor::cat(&[latent_ids, condition_ids], 1)?;
             let mut scheduler =
                 FlowMatchScheduler::new(options.num_inference_steps, packed_h * packed_w);
             let initial_timestep = timesteps[start_index];
             let mut latents =
                 pack_latents(&scheduler.scale_noise(&image_latents, initial_timestep, &noise)?)?;
+            drop(image_latents_packed);
+            drop(image_latents);
+            drop(noise);
             for step_idx in start_index..timesteps.len() {
                 let timestep = Tensor::from_vec(
                     ],
                     1,
                 )?;
                 let noise_pred = self.transformer.forward(
                     &latent_model_input,
                     &img_ids,
                     &text_ids,
                     &timestep,
                 )?;
+                drop(latent_model_input);
+                drop(timestep);
                 let noise_pred = noise_pred
                     .narrow(1, 0, latents.dim(1)?)?
                     .to_dtype(DType::F32)?;
+                let next_latents = scheduler.step(&noise_pred, &latents)?;
+                drop(noise_pred);
+                let previous_latents = std::mem::replace(&mut latents, next_latents);
+                drop(previous_latents);
             }
             (latents, packed_h, packed_w, size)
         reference_image: Option<&DynamicImage>,
         options: &Flux2InpaintOptions,
     ) -> Result<DynamicImage> {
+        let _cuda_cleanup = CudaTemporaryMemoryCleanup::new(&self.device);
         let (latents, packed_h, packed_w, size) = {
             let (rgb, size) = prepare_rgb_image(image, options.max_pixels);
             let resized_mask = expand_mask(
                 )?;
             }
             let condition_latents = condition_latents.to_dtype(transformer_dtype)?;
+            let img_ids = Tensor::cat(&[latent_ids, condition_ids], 1)?;
             let mut scheduler =
                 FlowMatchScheduler::new(options.num_inference_steps, packed_h * packed_w);
             let initial_timestep = timesteps[start_index];
             let mut latents =
                 pack_latents(&scheduler.scale_noise(&image_latents, initial_timestep, &noise)?)?;
+            let keep_mask = ((&latent_mask * -1.0)? + 1.0)?;
+            drop(noise);
             for step_idx in start_index..timesteps.len() {
                 let timestep = Tensor::from_vec(
                     ],
                     1,
                 )?;
                 let noise_pred = self.transformer.forward(
                     &latent_model_input,
                     &img_ids,
                     &text_ids,
                     &timestep,
                 )?;
+                drop(latent_model_input);
+                drop(timestep);
                 let noise_pred = noise_pred
                     .narrow(1, 0, latents.dim(1)?)?
                     .to_dtype(DType::F32)?;
+                let next_latents = scheduler.step(&noise_pred, &latents)?;
+                drop(noise_pred);
+                let previous_latents = std::mem::replace(&mut latents, next_latents);
+                drop(previous_latents);
                 let init_latents = if step_idx + 1 < timesteps.len() {
                     scheduler.scale_noise(
                 } else {
                     image_latents_packed.clone()
                 };
+                let masked_latents = (keep_mask.broadcast_mul(&init_latents)?
                     + latent_mask.broadcast_mul(&latents)?)?;
+                drop(init_latents);
+                let previous_latents = std::mem::replace(&mut latents, masked_latents);
+                drop(previous_latents);
             }
             (latents, packed_h, packed_w, size)
     }
 }
+struct CudaTemporaryMemoryCleanup<'a> {
+    device: &'a Device,
+}
+impl<'a> CudaTemporaryMemoryCleanup<'a> {
+    fn new(device: &'a Device) -> Self {
+        Self { device }
+    }
+}
+impl Drop for CudaTemporaryMemoryCleanup<'_> {
+    fn drop(&mut self) {
+        let _ = release_cuda_temporary_memory(self.device);
+    }
+}
+fn transformer_dtype(device: &Device) -> DType {
+    if device.is_cuda() {
+        return DType::BF16;
+    }
     DType::F32
 }
+fn release_cuda_temporary_memory(device: &Device) -> Result<()> {
+    device.synchronize()?;
+    #[cfg(feature = "cuda")]
+    if let Ok(cuda_device) = device.as_cuda_device() {
+        let stream = cuda_device.cuda_stream();
+        let context = stream.context();
+        if context.has_async_alloc() {
+            context.bind_to_thread()?;
+            let pool = unsafe {
+                candle_core::cuda::cudarc::driver::result::device::get_mem_pool(
+                    context.cu_device(),
+                )?
+            };
+            unsafe {
+                candle_core::cuda::cudarc::driver::result::mem_pool::trim_to(pool, 0)?;
+            }
+        }
+    }
+    Ok(())
+}
 fn inpaint_crop_bounds(
     image: &DynamicImage,
     mask: &DynamicImage,

koharu-ml/src/flux2_klein/transformer.rs CHANGED Viewed

@@ -1,8 +1,6 @@
 use std::path::Path;
-use candle_core::{D, DType, IndexOp, Module, Result, Tensor};
-use candle_nn::{LayerNorm, RmsNorm};
-use candle_transformers::quantized_nn::{Linear, linear_b};
 use candle_transformers::quantized_var_builder::VarBuilder;
 #[derive(Debug, Clone)]
@@ -32,8 +30,97 @@ impl Default for Flux2TransformerConfig {
     }
 }
 fn qlinear_no_bias(in_dim: usize, out_dim: usize, vb: VarBuilder) -> Result<Linear> {
-    linear_b(in_dim, out_dim, false, vb)
 }
 fn layer_norm(dim: usize, vb: &VarBuilder) -> Result<LayerNorm> {
@@ -83,6 +170,18 @@ fn apply_rope(xs: &Tensor, freq_cis: &Tensor) -> Result<Tensor> {
 fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
     let dim = q.dim(D::Minus1)?;
     let scale = 1.0 / (dim as f64).sqrt();
     if q.device().is_metal() {
         return candle_nn::ops::sdpa(q, k, v, None, false, scale as f32, 1.0);
     }
@@ -107,6 +206,8 @@ fn attention(q: &Tensor, k: &Tensor, v: &Tensor, pe: &Tensor) -> Result<Tensor>
     let q = apply_rope(q, pe)?.contiguous()?;
     let k = apply_rope(k, pe)?.contiguous()?;
     let xs = scaled_dot_product_attention(&q, &k, v)?;
     xs.transpose(1, 2)?.flatten_from(2)
 }
@@ -265,6 +366,7 @@ impl SelfAttention {
         let v = qkv.i((.., .., 2))?.transpose(1, 2)?;
         let q = q.apply(&self.norm.query_norm)?;
         let k = k.apply(&self.norm.key_norm)?;
         Ok((q, k, v))
     }
 }
@@ -284,7 +386,9 @@ impl Mlp {
     }
     fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        swiglu(&xs.apply(&self.lin1)?)?.apply(&self.lin2)
     }
 }
@@ -336,8 +440,10 @@ impl DoubleStreamBlock {
         let img_modulated = img_mod1.scale_shift(&img.apply(&self.img_norm1)?)?;
         let (img_q, img_k, img_v) = self.img_attn.qkv(&img_modulated)?;
         let txt_modulated = txt_mod1.scale_shift(&txt.apply(&self.txt_norm1)?)?;
         let (txt_q, txt_k, txt_v) = self.txt_attn.qkv(&txt_modulated)?;
         let attn = {
             let q = Tensor::cat(&[&txt_q, &img_q], 2)?;
@@ -361,44 +467,31 @@ impl DoubleStreamBlock {
         let img_attn = img_attn.apply(&self.img_attn.proj)?;
         let txt_attn = txt_attn.apply(&self.txt_attn.proj)?;
         drop(attn);
         drop(img_modulated);
         drop(txt_modulated);
-        let img = (img + img_mod1.gate(&img_attn)?)?;
-        drop(img_attn);
-        let img_mlp = img_mod2
-            .scale_shift(&img.apply(&self.img_norm2)?)?
-            .apply_fn(|xs| self.img_mlp.forward(xs))?;
-        let img = (img + img_mod2.gate(&img_mlp)?)?;
-        drop(img_mlp);
-        let txt = (txt + txt_mod1.gate(&txt_attn)?)?;
-        drop(txt_attn);
-        let txt_mlp = txt_mod2
-            .scale_shift(&txt.apply(&self.txt_norm2)?)?
-            .apply_fn(|xs| self.txt_mlp.forward(xs))?;
-        let txt = (txt + txt_mod2.gate(&txt_mlp)?)?;
-        drop(txt_mlp);
         Ok((img, txt))
     }
 }
-trait ApplyFn {
-    fn apply_fn<F>(&self, f: F) -> Result<Tensor>
-    where
-        F: FnOnce(&Tensor) -> Result<Tensor>;
-}
-impl ApplyFn for Tensor {
-    fn apply_fn<F>(&self, f: F) -> Result<Tensor>
-    where
-        F: FnOnce(&Tensor) -> Result<Tensor>,
-    {
-        f(self)
-    }
-}
 #[derive(Debug, Clone)]
 struct SingleStreamBlock {
     linear1: Linear,
@@ -432,8 +525,11 @@ impl SingleStreamBlock {
     fn forward(&self, xs: &Tensor, mods: &[ModulationOut], pe: &Tensor) -> Result<Tensor> {
         let mod_ = &mods[0];
-        let x_mod = mod_.scale_shift(&xs.apply(&self.pre_norm)?)?;
         let qkv_mlp = x_mod.apply(&self.linear1)?;
         let qkv = qkv_mlp.narrow(D::Minus1, 0, 3 * self.hidden_size)?;
         let (b, len, _) = qkv.dims3()?;
         let qkv = qkv.reshape((b, len, 3, self.num_heads, ()))?;
@@ -441,6 +537,8 @@ impl SingleStreamBlock {
         let k = qkv.i((.., .., 1))?.transpose(1, 2)?;
         let v = qkv.i((.., .., 2))?.transpose(1, 2)?;
         let mlp = qkv_mlp.narrow(D::Minus1, 3 * self.hidden_size, self.mlp_size * 2)?;
         let q = q.apply(&self.norm.query_norm)?;
         let k = k.apply(&self.norm.key_norm)?;
         let attn = attention(&q, &k, &v, pe)?;
@@ -448,10 +546,13 @@ impl SingleStreamBlock {
         drop(k);
         drop(v);
         let mlp = swiglu(&mlp)?;
-        let output = Tensor::cat(&[&attn, &mlp], D::Minus1)?.apply(&self.linear2)?;
         drop(attn);
         drop(mlp);
-        xs + mod_.gate(&output)?
     }
 }
@@ -585,6 +686,7 @@ impl Flux2Transformer {
         let dtype = img.dtype();
         let ids = Tensor::cat(&[txt_ids, img_ids], 1)?;
         let pe = self.pe_embedder.forward(&ids)?;
         let mut img = img.apply(&self.img_in)?;
         let mut txt = txt.apply(&self.txt_in)?;
         let vec_ = timestep_embedding(timesteps, 256, dtype)?.apply(&self.time_in)?;
@@ -595,14 +697,22 @@ impl Flux2Transformer {
         for block in &self.double_blocks {
             (img, txt) = block.forward(&img, &txt, &ds_img_mods, &ds_txt_mods, &pe)?;
         }
         let txt_len = txt.dim(1)?;
         let img_len = img.dim(1)?;
         let mut xs = Tensor::cat(&[&txt, &img], 1)?;
         for block in &self.single_blocks {
             xs = block.forward(&xs, &ss_mods, &pe)?;
         }
         let img = xs.narrow(1, txt_len, img_len)?;
-        self.final_layer.forward(&img, &vec_)
     }
     pub fn in_channels(&self) -> usize {

 use std::path::Path;
+use candle_core::{D, DType, IndexOp, Module, Result, Tensor, quantized::QMatMul};
 use candle_transformers::quantized_var_builder::VarBuilder;
 #[derive(Debug, Clone)]
     }
 }
+#[derive(Debug, Clone)]
+struct Linear {
+    weight: QMatMul,
+}
+impl Module for Linear {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let dtype = xs.dtype();
+        let xs = if should_promote_for_cuda(xs) {
+            xs.to_dtype(DType::F32)?
+        } else {
+            xs.clone()
+        };
+        let ys = xs.apply(&self.weight)?;
+        if ys.dtype() != dtype && matches!(dtype, DType::BF16 | DType::F16) {
+            ys.to_dtype(dtype)
+        } else {
+            Ok(ys)
+        }
+    }
+}
 fn qlinear_no_bias(in_dim: usize, out_dim: usize, vb: VarBuilder) -> Result<Linear> {
+    let weight = vb.get((out_dim, in_dim), "weight")?;
+    Ok(Linear {
+        weight: QMatMul::from_arc(weight)?,
+    })
+}
+#[derive(Debug, Clone)]
+struct LayerNorm {
+    inner: candle_nn::LayerNorm,
+}
+impl LayerNorm {
+    fn new_no_bias(weight: Tensor, eps: f64) -> Self {
+        Self {
+            inner: candle_nn::LayerNorm::new_no_bias(weight, eps),
+        }
+    }
+}
+impl Module for LayerNorm {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let dtype = xs.dtype();
+        let xs = if should_promote_for_cuda(xs) {
+            xs.to_dtype(DType::F32)?
+        } else {
+            xs.clone()
+        };
+        let ys = xs.apply(&self.inner)?;
+        if ys.dtype() != dtype && matches!(dtype, DType::BF16 | DType::F16) {
+            ys.to_dtype(dtype)
+        } else {
+            Ok(ys)
+        }
+    }
+}
+#[derive(Debug, Clone)]
+struct RmsNorm {
+    inner: candle_nn::RmsNorm,
+}
+impl RmsNorm {
+    fn new(weight: Tensor, eps: f64) -> Self {
+        Self {
+            inner: candle_nn::RmsNorm::new(weight, eps),
+        }
+    }
+}
+impl Module for RmsNorm {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let dtype = xs.dtype();
+        let xs = if should_promote_for_cuda(xs) {
+            xs.to_dtype(DType::F32)?
+        } else {
+            xs.clone()
+        };
+        let ys = xs.apply(&self.inner)?;
+        if ys.dtype() != dtype && matches!(dtype, DType::BF16 | DType::F16) {
+            ys.to_dtype(dtype)
+        } else {
+            Ok(ys)
+        }
+    }
+}
+fn should_promote_for_cuda(xs: &Tensor) -> bool {
+    xs.device().is_cuda() && matches!(xs.dtype(), DType::BF16 | DType::F16)
 }
 fn layer_norm(dim: usize, vb: &VarBuilder) -> Result<LayerNorm> {
 fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
     let dim = q.dim(D::Minus1)?;
     let scale = 1.0 / (dim as f64).sqrt();
+    #[cfg(feature = "cuda")]
+    if q.device().is_cuda() {
+        let q = q.transpose(1, 2)?.contiguous()?;
+        let k = k.transpose(1, 2)?.contiguous()?;
+        let v = v.transpose(1, 2)?.contiguous()?;
+        let xs = candle_flash_attn::flash_attn(&q, &k, &v, scale as f32, false)?;
+        drop(q);
+        drop(k);
+        drop(v);
+        return xs.transpose(1, 2);
+    }
     if q.device().is_metal() {
         return candle_nn::ops::sdpa(q, k, v, None, false, scale as f32, 1.0);
     }
     let q = apply_rope(q, pe)?.contiguous()?;
     let k = apply_rope(k, pe)?.contiguous()?;
     let xs = scaled_dot_product_attention(&q, &k, v)?;
+    drop(q);
+    drop(k);
     xs.transpose(1, 2)?.flatten_from(2)
 }
         let v = qkv.i((.., .., 2))?.transpose(1, 2)?;
         let q = q.apply(&self.norm.query_norm)?;
         let k = k.apply(&self.norm.key_norm)?;
+        drop(qkv);
         Ok((q, k, v))
     }
 }
     }
     fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let xs = xs.apply(&self.lin1)?;
+        let xs = swiglu(&xs)?;
+        xs.apply(&self.lin2)
     }
 }
         let img_modulated = img_mod1.scale_shift(&img.apply(&self.img_norm1)?)?;
         let (img_q, img_k, img_v) = self.img_attn.qkv(&img_modulated)?;
+        drop(img_modulated);
         let txt_modulated = txt_mod1.scale_shift(&txt.apply(&self.txt_norm1)?)?;
         let (txt_q, txt_k, txt_v) = self.txt_attn.qkv(&txt_modulated)?;
+        drop(txt_modulated);
         let attn = {
             let q = Tensor::cat(&[&txt_q, &img_q], 2)?;
         let img_attn = img_attn.apply(&self.img_attn.proj)?;
         let txt_attn = txt_attn.apply(&self.txt_attn.proj)?;
         drop(attn);
+        let img_attn = img_mod1.gate(&img_attn)?;
+        let img = (img + img_attn)?;
+        let img_normed = img.apply(&self.img_norm2)?;
+        let img_modulated = img_mod2.scale_shift(&img_normed)?;
+        drop(img_normed);
+        let img_mlp = self.img_mlp.forward(&img_modulated)?;
         drop(img_modulated);
+        let img_mlp = img_mod2.gate(&img_mlp)?;
+        let img = (img + img_mlp)?;
+        let txt_attn = txt_mod1.gate(&txt_attn)?;
+        let txt = (txt + txt_attn)?;
+        let txt_normed = txt.apply(&self.txt_norm2)?;
+        let txt_modulated = txt_mod2.scale_shift(&txt_normed)?;
+        drop(txt_normed);
+        let txt_mlp = self.txt_mlp.forward(&txt_modulated)?;
         drop(txt_modulated);
+        let txt_mlp = txt_mod2.gate(&txt_mlp)?;
+        let txt = (txt + txt_mlp)?;
         Ok((img, txt))
     }
 }
 #[derive(Debug, Clone)]
 struct SingleStreamBlock {
     linear1: Linear,
     fn forward(&self, xs: &Tensor, mods: &[ModulationOut], pe: &Tensor) -> Result<Tensor> {
         let mod_ = &mods[0];
+        let x_normed = xs.apply(&self.pre_norm)?;
+        let x_mod = mod_.scale_shift(&x_normed)?;
+        drop(x_normed);
         let qkv_mlp = x_mod.apply(&self.linear1)?;
+        drop(x_mod);
         let qkv = qkv_mlp.narrow(D::Minus1, 0, 3 * self.hidden_size)?;
         let (b, len, _) = qkv.dims3()?;
         let qkv = qkv.reshape((b, len, 3, self.num_heads, ()))?;
         let k = qkv.i((.., .., 1))?.transpose(1, 2)?;
         let v = qkv.i((.., .., 2))?.transpose(1, 2)?;
         let mlp = qkv_mlp.narrow(D::Minus1, 3 * self.hidden_size, self.mlp_size * 2)?;
+        drop(qkv_mlp);
+        drop(qkv);
         let q = q.apply(&self.norm.query_norm)?;
         let k = k.apply(&self.norm.key_norm)?;
         let attn = attention(&q, &k, &v, pe)?;
         drop(k);
         drop(v);
         let mlp = swiglu(&mlp)?;
+        let output = Tensor::cat(&[&attn, &mlp], D::Minus1)?;
         drop(attn);
         drop(mlp);
+        let output = output.apply(&self.linear2)?;
+        let gated = mod_.gate(&output)?;
+        drop(output);
+        xs + gated
     }
 }
         let dtype = img.dtype();
         let ids = Tensor::cat(&[txt_ids, img_ids], 1)?;
         let pe = self.pe_embedder.forward(&ids)?;
+        drop(ids);
         let mut img = img.apply(&self.img_in)?;
         let mut txt = txt.apply(&self.txt_in)?;
         let vec_ = timestep_embedding(timesteps, 256, dtype)?.apply(&self.time_in)?;
         for block in &self.double_blocks {
             (img, txt) = block.forward(&img, &txt, &ds_img_mods, &ds_txt_mods, &pe)?;
         }
+        drop(ds_img_mods);
+        drop(ds_txt_mods);
         let txt_len = txt.dim(1)?;
         let img_len = img.dim(1)?;
         let mut xs = Tensor::cat(&[&txt, &img], 1)?;
+        drop(txt);
+        drop(img);
         for block in &self.single_blocks {
             xs = block.forward(&xs, &ss_mods, &pe)?;
         }
+        drop(ss_mods);
+        drop(pe);
         let img = xs.narrow(1, txt_len, img_len)?;
+        let xs = self.final_layer.forward(&img, &vec_)?;
+        drop(img);
+        Ok(xs)
     }
     pub fn in_channels(&self) -> usize {

koharu-ml/src/flux2_klein/vae.rs CHANGED Viewed

@@ -1,4 +1,4 @@
-use candle_core::{D, Module, Result, Tensor};
 use candle_nn::{Conv2d, Conv2dConfig, GroupNorm, VarBuilder, conv2d, group_norm};
 use super::latents::{patchify_latents, unpatchify_latents};
@@ -30,6 +30,15 @@ impl Default for Flux2VaeConfig {
     }
 }
 fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
     let dim = q.dim(D::Minus1)?;
     let scale = 1.0 / (dim as f64).sqrt();
@@ -113,10 +122,7 @@ impl ResnetBlock2D {
         num_groups: usize,
         vb: VarBuilder,
     ) -> Result<Self> {
-        let conv_cfg = Conv2dConfig {
-            padding: 1,
-            ..Default::default()
-        };
         let norm1 = group_norm(num_groups, in_channels, 1e-6, vb.pp("norm1"))?;
         let conv1 = conv2d(in_channels, out_channels, 3, conv_cfg, vb.pp("conv1"))?;
         let norm2 = group_norm(num_groups, out_channels, 1e-6, vb.pp("norm2"))?;
@@ -126,7 +132,7 @@ impl ResnetBlock2D {
                 in_channels,
                 out_channels,
                 1,
-                Default::default(),
                 vb.pp("conv_shortcut"),
             )?)
         } else {
@@ -165,11 +171,7 @@ struct Downsample2D {
 impl Downsample2D {
     /// Builds the layer's single convolution: kernel size 3, stride 2, no padding,
     /// with the same number of input and output channels.
     fn new(channels: usize, vb: VarBuilder) -> Result<Self> {
-        let conv_cfg = Conv2dConfig {
-            stride: 2,
-            padding: 0,
-            ..Default::default()
-        };
         // Parameters are looked up under the "conv" prefix of `vb`.
         let conv = conv2d(channels, channels, 3, conv_cfg, vb.pp("conv"))?;
         Ok(Self { conv })
     }
@@ -243,10 +245,7 @@ struct Upsample2D {
 impl Upsample2D {
     /// Builds the layer's single convolution: kernel size 3, padding 1, default stride,
     /// with the same number of input and output channels.
     fn new(channels: usize, vb: VarBuilder) -> Result<Self> {
-        let conv_cfg = Conv2dConfig {
-            padding: 1,
-            ..Default::default()
-        };
         // Parameters are looked up under the "conv" prefix of `vb`.
         let conv = conv2d(channels, channels, 3, conv_cfg, vb.pp("conv"))?;
         Ok(Self { conv })
     }
@@ -342,10 +341,7 @@ struct Encoder {
 impl Encoder {
     fn new(cfg: &Flux2VaeConfig, vb: VarBuilder) -> Result<Self> {
-        let conv_cfg = Conv2dConfig {
-            padding: 1,
-            ..Default::default()
-        };
         let conv_in = conv2d(
             cfg.in_channels,
             cfg.block_out_channels[0],
@@ -419,10 +415,7 @@ struct Decoder {
 impl Decoder {
     fn new(cfg: &Flux2VaeConfig, vb: VarBuilder) -> Result<Self> {
-        let conv_cfg = Conv2dConfig {
-            padding: 1,
-            ..Default::default()
-        };
         let mid_channels = *cfg.decoder_block_out_channels.last().unwrap();
         let conv_in = conv2d(
             cfg.latent_channels,
@@ -512,14 +505,14 @@ impl Flux2Vae {
             2 * cfg.latent_channels,
             2 * cfg.latent_channels,
             1,
-            Default::default(),
             vb.pp("quant_conv"),
         )?;
         let post_quant_conv = conv2d(
             cfg.latent_channels,
             cfg.latent_channels,
             1,
-            Default::default(),
             vb.pp("post_quant_conv"),
         )?;
         let bn_running_mean = vb.get(4 * cfg.latent_channels, "bn.running_mean")?;

+use candle_core::{D, Module, Result, Tensor, conv::CudnnFwdAlgo};
 use candle_nn::{Conv2d, Conv2dConfig, GroupNorm, VarBuilder, conv2d, group_norm};
 use super::latents::{patchify_latents, unpatchify_latents};
     }
 }
+fn vae_conv_config(padding: usize, stride: usize) -> Conv2dConfig { // Shared Conv2dConfig builder for the VAE convolutions: caller picks padding and stride.
+    Conv2dConfig {
+        padding,
+        stride,
+        cudnn_fwd_algo: Some(CudnnFwdAlgo::ImplicitGemm), // always selects ImplicitGemm; NOTE(review): presumably a perf choice — confirm with benchmarks
+        ..Default::default() // every other field keeps its default
+    }
+}
 fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
     let dim = q.dim(D::Minus1)?;
     let scale = 1.0 / (dim as f64).sqrt();
         num_groups: usize,
         vb: VarBuilder,
     ) -> Result<Self> {
+        let conv_cfg = vae_conv_config(1, 1);
         let norm1 = group_norm(num_groups, in_channels, 1e-6, vb.pp("norm1"))?;
         let conv1 = conv2d(in_channels, out_channels, 3, conv_cfg, vb.pp("conv1"))?;
         let norm2 = group_norm(num_groups, out_channels, 1e-6, vb.pp("norm2"))?;
                 in_channels,
                 out_channels,
                 1,
+                vae_conv_config(0, 1),
                 vb.pp("conv_shortcut"),
             )?)
         } else {
 impl Downsample2D {
     /// Builds the layer's single convolution: kernel size 3, with the same number of
     /// input and output channels.
     fn new(channels: usize, vb: VarBuilder) -> Result<Self> {
+        let conv_cfg = vae_conv_config(0, 2); // arguments are (padding, stride): padding 0, stride 2
         // Parameters are looked up under the "conv" prefix of `vb`.
         let conv = conv2d(channels, channels, 3, conv_cfg, vb.pp("conv"))?;
         Ok(Self { conv })
     }
 impl Upsample2D {
     /// Builds the layer's single convolution: kernel size 3, with the same number of
     /// input and output channels.
     fn new(channels: usize, vb: VarBuilder) -> Result<Self> {
+        let conv_cfg = vae_conv_config(1, 1); // arguments are (padding, stride): padding 1, stride 1
         // Parameters are looked up under the "conv" prefix of `vb`.
         let conv = conv2d(channels, channels, 3, conv_cfg, vb.pp("conv"))?;
         Ok(Self { conv })
     }
 impl Encoder {
     fn new(cfg: &Flux2VaeConfig, vb: VarBuilder) -> Result<Self> {
+        let conv_cfg = vae_conv_config(1, 1);
         let conv_in = conv2d(
             cfg.in_channels,
             cfg.block_out_channels[0],
 impl Decoder {
     fn new(cfg: &Flux2VaeConfig, vb: VarBuilder) -> Result<Self> {
+        let conv_cfg = vae_conv_config(1, 1);
         let mid_channels = *cfg.decoder_block_out_channels.last().unwrap();
         let conv_in = conv2d(
             cfg.latent_channels,
             2 * cfg.latent_channels,
             2 * cfg.latent_channels,
             1,
+            vae_conv_config(0, 1),
             vb.pp("quant_conv"),
         )?;
         let post_quant_conv = conv2d(
             cfg.latent_channels,
             cfg.latent_channels,
             1,
+            vae_conv_config(0, 1),
             vb.pp("post_quant_conv"),
         )?;
         let bn_running_mean = vb.get(4 * cfg.latent_channels, "bn.running_mean")?;

koharu-runtime/src/cuda.rs CHANGED Viewed

@@ -11,6 +11,7 @@ use crate::loader::{add_runtime_search_path, preload_library};
 const CUDA_SUCCESS: i32 = 0;
 const CUDA_13_0_DRIVER_VERSION: i32 = 13000;
 const CUDA_13_1_DRIVER_VERSION: i32 = 13010;
 const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: i32 = 75;
 const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: i32 = 76;
 const MIN_COMPUTE_CAPABILITY: (i32, i32) = (7, 5); // Turing (RTX 20xx) and above
@@ -64,6 +65,31 @@ const WHEELS: &[WheelSpec] = &[
         windows_dylibs: &["curand64_10.dll"],
         linux_dylibs: &["libcurand.so.10"],
     },
 ];
 impl CudaDriverVersion {
@@ -379,9 +405,10 @@ impl WheelSpec {
 /// Returns an identifier string built from the platform tags and the package
 /// names of every entry in `WHEELS`, each list joined with commas.
 ///
 /// Fails only if `platform_tags()` fails.
 fn source_id() -> Result<String> {
     let packages = WHEELS.iter().map(|wheel| wheel.package).collect::<Vec<_>>();
     Ok(format!(
-        "cuda;platform={};wheels={}",
         platform_tags()?.join(","),
-        packages.join(",")
     ))
 }
@@ -458,6 +485,20 @@ mod tests {
         }
     }
     #[test]
     fn parses_major_minor_from_driver_version() {
         let version = CudaDriverVersion::from_raw(13010);

 const CUDA_SUCCESS: i32 = 0;
 const CUDA_13_0_DRIVER_VERSION: i32 = 13000;
 const CUDA_13_1_DRIVER_VERSION: i32 = 13010;
+const CUDA_EXTRACT_REVISION: u32 = 2; // Embedded in source_id() as "extract=<n>". NOTE(review): presumably bumped to invalidate earlier extractions — confirm.
 const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: i32 = 75;
 const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: i32 = 76;
 const MIN_COMPUTE_CAPABILITY: (i32, i32) = (7, 5); // Turing (RTX 20xx) and above
         windows_dylibs: &["curand64_10.dll"],
         linux_dylibs: &["libcurand.so.10"],
     },
+    WheelSpec {
+        package: "nvidia-cudnn-cu13/9.21.0.82",
+        windows_dylibs: &[
+            "cudnn64_9.dll",
+            "cudnn_adv64_9.dll",
+            "cudnn_cnn64_9.dll",
+            "cudnn_engines_precompiled64_9.dll",
+            "cudnn_engines_runtime_compiled64_9.dll",
+            "cudnn_engines_tensor_ir64_9.dll",
+            "cudnn_graph64_9.dll",
+            "cudnn_heuristic64_9.dll",
+            "cudnn_ops64_9.dll",
+        ],
+        linux_dylibs: &[
+            "libcudnn.so.9",
+            "libcudnn_adv.so.9",
+            "libcudnn_cnn.so.9",
+            "libcudnn_engines_precompiled.so.9",
+            "libcudnn_engines_runtime_compiled.so.9",
+            "libcudnn_engines_tensor_ir.so.9",
+            "libcudnn_graph.so.9",
+            "libcudnn_heuristic.so.9",
+            "libcudnn_ops.so.9",
+        ],
+    },
 ];
 impl CudaDriverVersion {
 /// Returns an identifier string built from the platform tags, the package names
 /// of every entry in `WHEELS` (each list joined with commas), and
 /// `CUDA_EXTRACT_REVISION`.
 ///
 /// Fails only if `platform_tags()` fails.
 fn source_id() -> Result<String> {
     let packages = WHEELS.iter().map(|wheel| wheel.package).collect::<Vec<_>>();
     Ok(format!(
+        "cuda;platform={};wheels={};extract={}",
         platform_tags()?.join(","),
+        packages.join(","),
+        CUDA_EXTRACT_REVISION
     ))
 }
         }
     }
+    #[test] // Checks that WHEELS lists a cuDNN wheel and that it names the main cuDNN library for this OS.
+    fn cuda_runtime_includes_cudnn() {
+        let wheel = WHEELS
+            .iter()
+            .find(|wheel| wheel.package.starts_with("nvidia-cudnn-cu13/")) // match on the package-name prefix, whatever version follows
+            .expect("missing cuDNN runtime wheel");
+        #[cfg(target_os = "windows")]
+        assert!(wheel.dylibs().contains(&"cudnn64_9.dll"));
+        #[cfg(target_os = "linux")]
+        assert!(wheel.dylibs().contains(&"libcudnn.so.9"));
+    } // NOTE(review): on targets other than Windows/Linux neither assert is compiled, so only the lookup above is exercised.
     #[test]
     fn parses_major_minor_from_driver_version() {
         let version = CudaDriverVersion::from_raw(13010);

koharu/tauri.windows.conf.json CHANGED Viewed

@@ -1,5 +1,5 @@
-{
-  "$schema": "../node_modules/@tauri-apps/cli/config.schema.json",
   "identifier": "Koharu",
   "build": {
     "features": [

+{
+  "$schema": "../node_modules/@tauri-apps/cli/config.schema.json",
   "identifier": "Koharu",
   "build": {
     "features": [