trans1

Sleeping

App Files Files Community

Mayo committed on May 1

Commit

aa8927a

unverified ·

1 Parent(s): 6911069

feat: anime-text model

Browse files

Files changed (11) hide show

README.md +1 -0
koharu-app/src/config.rs +1 -1
koharu-app/src/pipeline/engines/anime_text.rs +59 -0
koharu-app/src/pipeline/engines/mod.rs +1 -0
koharu-app/src/pipeline/mod.rs +16 -0
koharu-ml/Cargo.toml +4 -0
koharu-ml/bin/anime-text.rs +112 -0
koharu-ml/src/anime_text/mod.rs +440 -0
koharu-ml/src/anime_text/model.rs +982 -0
koharu-ml/src/lib.rs +1 -0
koharu-ml/tests/anime_text.rs +30 -0

README.md CHANGED Viewed

@@ -186,6 +186,7 @@ Koharu uses multiple pretrained models, each tuned for a specific part of the pa
 These models find text regions, speech bubbles, and page structure.
 - [comic-text-bubble-detector](https://huggingface.co/ogkalu/comic-text-and-bubble-detector) for joint text block and speech bubble detection
 - [comic-text-detector](https://huggingface.co/mayocream/comic-text-detector) for text segmentation masks
 - [PP-DocLayoutV3](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3_safetensors) for document layout analysis

 These models find text regions, speech bubbles, and page structure.
+- [anime-text-yolo](https://huggingface.co/mayocream/anime-text-yolo) for text block detection
 - [comic-text-bubble-detector](https://huggingface.co/ogkalu/comic-text-and-bubble-detector) for joint text block and speech bubble detection
 - [comic-text-detector](https://huggingface.co/mayocream/comic-text-detector) for text segmentation masks
 - [PP-DocLayoutV3](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3_safetensors) for document layout analysis

koharu-app/src/config.rs CHANGED Viewed

@@ -83,7 +83,7 @@ pub struct PipelineConfig {
 impl Default for PipelineConfig {
     fn default() -> Self {
         Self {
-            detector: "pp-doclayout-v3".to_string(),
             font_detector: "yuzumarker-font-detection".to_string(),
             segmenter: "comic-text-detector-seg".to_string(),
             bubble_segmenter: "speech-bubble-segmentation".to_string(),

 impl Default for PipelineConfig {
     fn default() -> Self {
         Self {
+            detector: "anime-text".to_string(),
             font_detector: "yuzumarker-font-detection".to_string(),
             segmenter: "comic-text-detector-seg".to_string(),
             bubble_segmenter: "speech-bubble-segmentation".to_string(),

koharu-app/src/pipeline/engines/anime_text.rs ADDED Viewed

	@@ -0,0 +1,59 @@

+//! Anime Text YOLO detector. Emits `AddNode` ops for each detected text region.
+use anyhow::Result;
+use async_trait::async_trait;
+use koharu_core::{Op, TextData};
+use koharu_ml::anime_text::AnimeTextDetector;
+use crate::pipeline::artifacts::Artifact;
+use crate::pipeline::engine::{Engine, EngineCtx, EngineInfo};
+use crate::pipeline::engines::support::{
+    clear_text_nodes_ops, load_source_image, new_text_node, page_node_count,
+    sort_manga_reading_order, text_region_to_pair,
+};
+const DETECTOR_NAME: &str = "anime-text";
+/// Pipeline engine that wraps an [`AnimeTextDetector`].
+pub struct Model(AnimeTextDetector);
+#[async_trait]
+impl Engine for Model {
+    /// Runs text detection on the page's source image and returns the ops that
+    /// replace the page's text nodes: the ops produced by
+    /// `clear_text_nodes_ops` first, then one `Op::AddNode` per detected block
+    /// in manga reading order.
+    async fn run(&self, ctx: EngineCtx<'_>) -> Result<Vec<Op>> {
+        let source = load_source_image(ctx.scene, ctx.page, ctx.blobs)?;
+        let detection = self.0.inference(&source)?;
+        let mut regions: Vec<([f32; 4], TextData)> = detection
+            .text_blocks
+            .into_iter()
+            .map(|region| text_region_to_pair(region, DETECTOR_NAME))
+            .collect();
+        sort_manga_reading_order(&mut regions);
+        let mut ops = clear_text_nodes_ops(ctx.scene, ctx.page);
+        // New nodes are inserted starting at the node count minus the number
+        // of clearing ops emitted above.
+        let first_index = page_node_count(ctx.scene, ctx.page).saturating_sub(ops.len());
+        ops.extend(
+            (first_index..)
+                .zip(regions)
+                .map(|(at, (bbox, text))| Op::AddNode {
+                    page: ctx.page,
+                    node: new_text_node(bbox, text),
+                    at,
+                }),
+        );
+        Ok(ops)
+    }
+}
+// Registers the detector with the engine registry. The id reuses
+// `DETECTOR_NAME` so the registry id and the detector tag stamped on emitted
+// text nodes share a single definition.
+inventory::submit! {
+    EngineInfo {
+        id: DETECTOR_NAME,
+        name: "Anime Text YOLO (N)",
+        needs: &[],
+        produces: &[Artifact::TextBoxes],
+        load: |runtime, cpu| Box::pin(async move {
+            let m = AnimeTextDetector::load(runtime, cpu).await?;
+            Ok(Box::new(Model(m)) as Box<dyn Engine>)
+        }),
+    }
+}

koharu-app/src/pipeline/engines/mod.rs CHANGED Viewed

@@ -4,6 +4,7 @@
 //! `inventory::submit! { EngineInfo { … } }`. The registry picks them up
 //! automatically at link time.
 pub mod aot;
 pub mod bubble_segmentation;
 pub mod comic_text_bubble;

 //! `inventory::submit! { EngineInfo { … } }`. The registry picks them up
 //! automatically at link time.
+pub mod anime_text;
 pub mod aot;
 pub mod bubble_segmentation;
 pub mod comic_text_bubble;

koharu-app/src/pipeline/mod.rs CHANGED Viewed

@@ -350,3 +350,19 @@ pub fn catalog() -> EngineCatalog {
             .collect(),
     }
 }

             .collect(),
     }
 }
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // Checks that `catalog()` lists a detector with id "anime-text", the
+    // expected display name, and exactly one produced artifact, "TextBoxes".
+    #[test]
+    fn catalog_includes_anime_text_detector() {
+        let catalog = catalog();
+        assert!(catalog.detectors.iter().any(|engine| {
+            engine.id == "anime-text"
+                && engine.name == "Anime Text YOLO (N)"
+                && engine.produces.iter().map(String::as_str).eq(["TextBoxes"])
+        }));
+    }
+}

koharu-ml/Cargo.toml CHANGED Viewed

@@ -102,6 +102,10 @@ path = "bin/manga-text-segmentation-2025.rs"
 name = "speech-bubble-segmentation"
 path = "bin/speech-bubble-segmentation.rs"
 [[bin]]
 name = "aot-inpainting"
 path = "bin/aot-inpainting.rs"

 name = "speech-bubble-segmentation"
 path = "bin/speech-bubble-segmentation.rs"
+[[bin]]
+name = "anime-text"
+path = "bin/anime-text.rs"
 [[bin]]
 name = "aot-inpainting"
 path = "bin/aot-inpainting.rs"

koharu-ml/bin/anime-text.rs ADDED Viewed

	@@ -0,0 +1,112 @@

+use anyhow::{Result, anyhow, ensure};
+use clap::{Parser, ValueEnum};
+use imageproc::{drawing::draw_hollow_rect_mut, rect::Rect};
+use koharu_ml::anime_text::{AnimeTextDetector, AnimeTextYoloVariant};
+use koharu_runtime::{ComputePolicy, RuntimeManager, default_app_data_root};
+use tokio::runtime::Builder;
+#[path = "common.rs"]
+mod common;
+// Command-line mirror of `AnimeTextYoloVariant`, kept separate so the CLI can
+// derive `ValueEnum`. Plain `//` comments are used on purpose: `///` comments
+// on a clap-derived type feed into the generated help output.
+#[derive(Clone, Copy, Debug, ValueEnum)]
+enum Variant {
+    N,
+    S,
+    M,
+    L,
+    X,
+}
+// One-to-one mapping from the CLI value onto the library's variant enum.
+impl From<Variant> for AnimeTextYoloVariant {
+    fn from(value: Variant) -> Self {
+        match value {
+            Variant::N => Self::N,
+            Variant::S => Self::S,
+            Variant::M => Self::M,
+            Variant::L => Self::L,
+            Variant::X => Self::X,
+        }
+    }
+}
+/// Detect text blocks in an image with Anime Text YOLO and outline them in red.
+#[derive(Parser)]
+struct Cli {
+    /// Image file to run detection on.
+    #[arg(short, long, value_name = "FILE")]
+    input: String,
+    /// Where to save the image with the detected boxes drawn on it.
+    #[arg(short, long, value_name = "FILE")]
+    output: String,
+    /// Optional file that receives the detection result as pretty-printed JSON.
+    #[arg(long, value_name = "FILE")]
+    json_output: Option<String>,
+    /// Model size variant to load.
+    #[arg(long, value_enum, default_value_t = Variant::N)]
+    variant: Variant,
+    /// Minimum score a prediction needs in order to be kept.
+    #[arg(long, default_value_t = 0.25)]
+    confidence_threshold: f32,
+    /// Threshold passed to non-maximum suppression.
+    #[arg(long, default_value_t = 0.45)]
+    nms_threshold: f32,
+    /// Run on the CPU only instead of preferring a GPU.
+    #[arg(long, default_value_t = false)]
+    cpu: bool,
+}
+/// Entry point: runs `async_main` on a dedicated thread and returns its result.
+///
+/// # Errors
+/// Returns an error if the thread or the tokio runtime cannot be created, if
+/// the worker thread panics, or if `async_main` itself fails.
+fn main() -> Result<()> {
+    common::init_tracing();
+    // The worker thread gets a 64 MiB stack and drives a current-thread tokio
+    // runtime that blocks on `async_main`.
+    std::thread::Builder::new()
+        .name("anime-text-yolo".to_string())
+        .stack_size(64 * 1024 * 1024)
+        .spawn(|| {
+            let runtime = Builder::new_current_thread().enable_all().build()?;
+            runtime.block_on(async_main())
+        })?
+        .join()
+        // A panic payload is replaced by a fixed error message.
+        .map_err(|_| anyhow!("anime-text-yolo thread panicked"))?
+}
+/// Parses the CLI, loads the requested model variant, runs detection on the
+/// input image, saves a copy with every detected region outlined in red and,
+/// when requested, writes the detection result as JSON.
+///
+/// # Errors
+/// Fails on runtime/model setup errors, unreadable or undecodable input,
+/// inference errors, output write errors, and when no region is detected.
+async fn async_main() -> Result<()> {
+    let cli = Cli::parse();
+    let variant = AnimeTextYoloVariant::from(cli.variant);
+    let policy = if cli.cpu {
+        ComputePolicy::CpuOnly
+    } else {
+        ComputePolicy::PreferGpu
+    };
+    let runtime = RuntimeManager::new(default_app_data_root(), policy)?;
+    runtime.prepare().await?;
+    let model = AnimeTextDetector::load_variant(&runtime, variant, cli.cpu).await?;
+    let bytes = std::fs::read(&cli.input)?;
+    let format = image::guess_format(&bytes)?;
+    let image = image::load_from_memory_with_format(&bytes, format)?;
+    let detection =
+        model.inference_with_thresholds(&image, cli.confidence_threshold, cli.nms_threshold)?;
+    ensure!(
+        !detection.regions.is_empty(),
+        "No anime text blocks detected in the image."
+    );
+    let mut canvas = image.to_rgba8();
+    let outline = image::Rgba([255, 0, 0, 255]);
+    for region in &detection.regions {
+        let [x_min, y_min, x_max, y_max] = region.bbox;
+        // Each side is kept at one pixel or more.
+        let box_width = (x_max - x_min).max(1.0) as u32;
+        let box_height = (y_max - y_min).max(1.0) as u32;
+        let rect = Rect::at(x_min as i32, y_min as i32).of_size(box_width, box_height);
+        draw_hollow_rect_mut(&mut canvas, rect, outline);
+    }
+    image::DynamicImage::ImageRgba8(canvas).save(&cli.output)?;
+    if let Some(path) = &cli.json_output {
+        let json = serde_json::to_vec_pretty(&detection)?;
+        std::fs::write(path, json)?;
+    }
+    Ok(())
+}

koharu-ml/src/anime_text/mod.rs ADDED Viewed

	@@ -0,0 +1,440 @@

+mod model;
+use std::{path::Path, path::PathBuf, time::Instant};
+use anyhow::{Context, Result, bail};
+use candle_core::{DType, Device, IndexOp, Tensor};
+use candle_transformers::object_detection::{Bbox, non_maximum_suppression};
+use image::{
+    DynamicImage, Rgb, RgbImage,
+    imageops::{self, FilterType},
+};
+use koharu_runtime::RuntimeManager;
+use serde::{Deserialize, Serialize};
+use tracing::instrument;
+use crate::{device, loading, types::TextRegion};
+use self::model::{Yolo12, Yolo12Scale};
+/// Hugging Face repository the weight files are downloaded from.
+pub const HF_REPO: &str = "mayocream/anime-text-yolo";
+/// Side length, in pixels, of the square letterboxed model input.
+const INPUT_SIZE: u32 = 640;
+/// Number of class-score rows expected after the four box rows.
+const NUM_CLASSES: usize = 1;
+/// Variant loaded by `AnimeTextDetector::load` and fetched by `prefetch`.
+const DEFAULT_VARIANT: AnimeTextYoloVariant = AnimeTextYoloVariant::N;
+/// Score cut-off used by `AnimeTextDetector::inference`.
+const DEFAULT_CONFIDENCE_THRESHOLD: f32 = 0.25;
+/// Non-maximum-suppression threshold used by `AnimeTextDetector::inference`.
+const DEFAULT_NMS_THRESHOLD: f32 = 0.45;
+/// Value of every channel of the letterbox padding colour.
+const LETTERBOX_COLOR: u8 = 114;
+/// Detector tag stored on every `TextRegion` this module produces.
+const DETECTOR_NAME: &str = "anime-text-yolo";
+/// Label text for each class id.
+const CLASS_NAMES: [&str; NUM_CLASSES] = ["text_block"];
+koharu_runtime::declare_hf_model_package!(
+    id: "model:anime-text-yolo:yolo12n",
+    repo: HF_REPO,
+    file: "yolo12n_animetext.safetensors",
+    bootstrap: false,
+    order: 118,
+);
+koharu_runtime::declare_hf_model_package!(
+    id: "model:anime-text-yolo:yolo12s",
+    repo: HF_REPO,
+    file: "yolo12s_animetext.safetensors",
+    bootstrap: false,
+    order: 119,
+);
+koharu_runtime::declare_hf_model_package!(
+    id: "model:anime-text-yolo:yolo12m",
+    repo: HF_REPO,
+    file: "yolo12m_animetext.safetensors",
+    bootstrap: false,
+    order: 120,
+);
+koharu_runtime::declare_hf_model_package!(
+    id: "model:anime-text-yolo:yolo12l",
+    repo: HF_REPO,
+    file: "yolo12l_animetext.safetensors",
+    bootstrap: false,
+    order: 121,
+);
+koharu_runtime::declare_hf_model_package!(
+    id: "model:anime-text-yolo:yolo12x",
+    repo: HF_REPO,
+    file: "yolo12x_animetext.safetensors",
+    bootstrap: false,
+    order: 122,
+);
+/// Model size variants of the detector; each one has its own weight file (see
+/// [`AnimeTextYoloVariant::filename`]). Serialized as a lowercase letter.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum AnimeTextYoloVariant {
+    N,
+    S,
+    M,
+    L,
+    X,
+}
+impl AnimeTextYoloVariant {
+    /// File name of this variant's safetensors weights inside [`HF_REPO`].
+    pub fn filename(self) -> &'static str {
+        match self {
+            Self::N => "yolo12n_animetext.safetensors",
+            Self::S => "yolo12s_animetext.safetensors",
+            Self::M => "yolo12m_animetext.safetensors",
+            Self::L => "yolo12l_animetext.safetensors",
+            Self::X => "yolo12x_animetext.safetensors",
+        }
+    }
+    /// Lowercase one-letter name of the variant; also what `Display` prints.
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::N => "n",
+            Self::S => "s",
+            Self::M => "m",
+            Self::L => "l",
+            Self::X => "x",
+        }
+    }
+    /// Maps the public variant onto the model module's scale enum.
+    fn scale(self) -> Yolo12Scale {
+        match self {
+            Self::N => Yolo12Scale::N,
+            Self::S => Yolo12Scale::S,
+            Self::M => Yolo12Scale::M,
+            Self::L => Yolo12Scale::L,
+            Self::X => Yolo12Scale::X,
+        }
+    }
+}
+/// Prints the variant as its lowercase letter (same text as `as_str`).
+impl std::fmt::Display for AnimeTextYoloVariant {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let letter = self.as_str();
+        write!(f, "{}", letter)
+    }
+}
+/// A loaded detector: the network plus the variant, device and dtype it was
+/// loaded with.
+#[derive(Debug)]
+pub struct AnimeTextDetector {
+    model: Yolo12,
+    variant: AnimeTextYoloVariant,
+    // Device the input tensor is created on in `preprocess`.
+    device: Device,
+    // Element type the input tensor is converted to in `preprocess`.
+    dtype: DType,
+}
+/// Letterboxed model input together with the geometry needed to map predicted
+/// boxes back onto the original image.
+#[derive(Debug, Clone)]
+struct PreparedInput {
+    // Input tensor built by `preprocess`: shape (1, 3, INPUT_SIZE, INPUT_SIZE),
+    // pixel values multiplied by 1/255.
+    pixel_values: Tensor,
+    original_width: u32,
+    original_height: u32,
+    // Offset of the resized image inside the letterboxed canvas, in pixels.
+    pad_x: u32,
+    pad_y: u32,
+    // Factor the original image was resized by before padding.
+    scale: f32,
+}
+/// Result of one inference call. Serialized with camelCase keys.
+#[derive(Debug, Clone, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct AnimeTextDetection {
+    /// Width of the image that was passed in, in pixels.
+    pub image_width: u32,
+    /// Height of the image that was passed in, in pixels.
+    pub image_height: u32,
+    /// Model variant that produced the detections.
+    pub variant: AnimeTextYoloVariant,
+    /// Every detection that survived filtering, sorted by descending score.
+    pub regions: Vec<AnimeTextRegion>,
+    /// `regions` converted to `TextRegion`s; boxes that are not more than one
+    /// pixel wide and one pixel tall are left out.
+    pub text_blocks: Vec<TextRegion>,
+}
+/// A single detected box. Serialized with camelCase keys.
+#[derive(Debug, Clone, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct AnimeTextRegion {
+    /// Index of the predicted class.
+    pub label_id: usize,
+    /// Class name looked up from `CLASS_NAMES`.
+    pub label: String,
+    /// Class score of the prediction.
+    pub score: f32,
+    /// `[xmin, ymin, xmax, ymax]` in original-image pixels.
+    pub bbox: [f32; 4],
+}
+impl AnimeTextDetector {
+    /// Loads the default variant (`DEFAULT_VARIANT`), downloading the weights
+    /// through `runtime` if needed.
+    pub async fn load(runtime: &RuntimeManager, cpu: bool) -> Result<Self> {
+        Self::load_variant(runtime, DEFAULT_VARIANT, cpu).await
+    }
+    /// Resolves the weight file for `variant` through `runtime` and loads it.
+    pub async fn load_variant(
+        runtime: &RuntimeManager,
+        variant: AnimeTextYoloVariant,
+        cpu: bool,
+    ) -> Result<Self> {
+        let weights_path = resolve_model_path(runtime, variant).await?;
+        Self::load_from_path(weights_path, variant, cpu)
+    }
+    /// Loads `variant` from a safetensors file on disk.
+    ///
+    /// The device is chosen by `device(cpu)` and the dtype by
+    /// `loading::model_dtype`.
+    ///
+    /// # Errors
+    /// Fails when the device cannot be created or the weights cannot be
+    /// loaded; the latter error names the variant and the path.
+    pub fn load_from_path(
+        weights_path: impl AsRef<Path>,
+        variant: AnimeTextYoloVariant,
+        cpu: bool,
+    ) -> Result<Self> {
+        let device = device(cpu)?;
+        let dtype = loading::model_dtype(&device);
+        let model = loading::load_mmaped_safetensors_path_with_dtype(
+            weights_path.as_ref(),
+            &device,
+            dtype,
+            |vb| Yolo12::load(vb, variant.scale(), NUM_CLASSES),
+        )
+        .with_context(|| {
+            format!(
+                "failed to load anime text YOLO {} weights from {}",
+                variant,
+                weights_path.as_ref().display()
+            )
+        })?;
+        Ok(Self {
+            model,
+            variant,
+            device,
+            dtype,
+        })
+    }
+    /// Variant this detector was loaded with.
+    pub fn variant(&self) -> AnimeTextYoloVariant {
+        self.variant
+    }
+    /// Runs detection with the default confidence and NMS thresholds.
+    #[instrument(level = "debug", skip_all)]
+    pub fn inference(&self, image: &DynamicImage) -> Result<AnimeTextDetection> {
+        self.inference_with_thresholds(image, DEFAULT_CONFIDENCE_THRESHOLD, DEFAULT_NMS_THRESHOLD)
+    }
+    /// Runs detection on `image`: preprocess, forward pass, postprocess, then
+    /// conversion of the surviving regions to text blocks. Logs the image
+    /// size, variant, detection count and total time at info level.
+    #[instrument(level = "debug", skip_all)]
+    pub fn inference_with_thresholds(
+        &self,
+        image: &DynamicImage,
+        confidence_threshold: f32,
+        nms_threshold: f32,
+    ) -> Result<AnimeTextDetection> {
+        let started = Instant::now();
+        let prepared = self.preprocess(image)?;
+        let outputs = self.model.forward(&prepared.pixel_values)?;
+        let regions = postprocess(&outputs, &prepared, confidence_threshold, nms_threshold)?;
+        let text_blocks = regions_to_text_blocks(&regions);
+        tracing::info!(
+            width = image.width(),
+            height = image.height(),
+            variant = %self.variant,
+            detections = regions.len(),
+            total_ms = started.elapsed().as_millis(),
+            "anime text YOLO timings"
+        );
+        Ok(AnimeTextDetection {
+            image_width: prepared.original_width,
+            image_height: prepared.original_height,
+            variant: self.variant,
+            regions,
+            text_blocks,
+        })
+    }
+    /// Letterboxes `image` into an `INPUT_SIZE` x `INPUT_SIZE` RGB canvas and
+    /// turns it into the model's input tensor, recording the scale and padding
+    /// that were applied.
+    fn preprocess(&self, image: &DynamicImage) -> Result<PreparedInput> {
+        let rgb = image.to_rgb8();
+        let (original_width, original_height) = rgb.dimensions();
+        // Uniform scale that fits the longer side into INPUT_SIZE; `max(1)`
+        // avoids dividing by zero for an empty image.
+        let scale = f32::min(
+            INPUT_SIZE as f32 / original_width.max(1) as f32,
+            INPUT_SIZE as f32 / original_height.max(1) as f32,
+        );
+        let resized_width = ((original_width as f32 * scale).round() as u32).clamp(1, INPUT_SIZE);
+        let resized_height = ((original_height as f32 * scale).round() as u32).clamp(1, INPUT_SIZE);
+        // The resized image is centred; integer division puts any odd
+        // leftover pixel on the right/bottom side.
+        let pad_x = (INPUT_SIZE - resized_width) / 2;
+        let pad_y = (INPUT_SIZE - resized_height) / 2;
+        // Skip the resize when the image already has the target size.
+        let resized = if resized_width == original_width && resized_height == original_height {
+            rgb
+        } else {
+            imageops::resize(&rgb, resized_width, resized_height, FilterType::Triangle)
+        };
+        let mut letterboxed =
+            RgbImage::from_pixel(INPUT_SIZE, INPUT_SIZE, Rgb([LETTERBOX_COLOR; 3]));
+        imageops::overlay(
+            &mut letterboxed,
+            &resized,
+            i64::from(pad_x),
+            i64::from(pad_y),
+        );
+        // Raw bytes are laid out height x width x channel; permute to
+        // batch x channel x height x width before converting the dtype.
+        let pixel_values = Tensor::from_vec(
+            letterboxed.into_raw(),
+            (1, INPUT_SIZE as usize, INPUT_SIZE as usize, 3),
+            &self.device,
+        )?
+        .permute((0, 3, 1, 2))?
+        .to_dtype(self.dtype)?;
+        // Multiply by 1/255 to bring the 0..=255 byte values into 0..=1.
+        let pixel_values = (pixel_values * (1.0 / 255.0))?;
+        Ok(PreparedInput {
+            pixel_values,
+            original_width,
+            original_height,
+            pad_x,
+            pad_y,
+            scale,
+        })
+    }
+}
+/// Resolves (downloading if necessary) the weights of the default variant.
+pub async fn prefetch(runtime: &RuntimeManager) -> Result<()> {
+    prefetch_variant(runtime, DEFAULT_VARIANT).await
+}
+/// Resolves (downloading if necessary) the weights of `variant`; the
+/// resulting path is discarded.
+pub async fn prefetch_variant(
+    runtime: &RuntimeManager,
+    variant: AnimeTextYoloVariant,
+) -> Result<()> {
+    resolve_model_path(runtime, variant).await.map(drop)
+}
+/// Asks the runtime's download manager for the local path of `variant`'s
+/// weight file in [`HF_REPO`], attaching the file and repo names to any error.
+async fn resolve_model_path(
+    runtime: &RuntimeManager,
+    variant: AnimeTextYoloVariant,
+) -> Result<PathBuf> {
+    let file_name = variant.filename();
+    let downloads = runtime.downloads();
+    downloads
+        .huggingface_model(HF_REPO, file_name)
+        .await
+        .with_context(|| format!("failed to download {} from {}", file_name, HF_REPO))
+}
+/// Decodes raw model predictions into scored boxes in original-image pixels.
+///
+/// Only batch item 0 of `outputs` is read; it must be a
+/// `(4 + NUM_CLASSES, anchors)` matrix. Rows 0..4 hold centre x, centre y,
+/// width and height of each box in letterboxed-input pixels, the remaining
+/// rows hold the per-class scores. Predictions whose best score is NaN or
+/// below `confidence_threshold`, or whose mapped box is NaN or empty, are
+/// dropped. The rest go through per-class non-maximum suppression with
+/// `nms_threshold` and are returned sorted by descending score.
+///
+/// # Errors
+/// Fails when the tensor cannot be converted or copied to the CPU, or when
+/// its channel count is not `4 + NUM_CLASSES`.
+fn postprocess(
+    outputs: &Tensor,
+    prepared: &PreparedInput,
+    confidence_threshold: f32,
+    nms_threshold: f32,
+) -> Result<Vec<AnimeTextRegion>> {
+    let pred = outputs
+        .to_dtype(DType::F32)?
+        .to_device(&Device::Cpu)?
+        .i(0)?;
+    let (channels, anchors) = pred.dims2()?;
+    let expected_channels = 4 + NUM_CLASSES;
+    if channels != expected_channels {
+        bail!(
+            "unexpected anime text YOLO prediction channels {channels}, expected {expected_channels}"
+        );
+    }
+    // Copy the whole matrix out once (`rows[channel][anchor]`) instead of
+    // slicing and copying a tensor column for every anchor.
+    let rows = pred.to_vec2::<f32>()?;
+    let (box_rows, score_rows) = rows.split_at(4);
+    let mut grouped: Vec<Vec<Bbox<usize>>> = (0..NUM_CLASSES).map(|_| Vec::new()).collect();
+    for anchor_idx in 0..anchors {
+        let Some((label_id, score)) = score_rows
+            .iter()
+            .map(|row| row[anchor_idx])
+            .enumerate()
+            .max_by(|(_, a), (_, b)| a.total_cmp(b))
+        else {
+            continue;
+        };
+        // A NaN score compares false against everything, so reject it
+        // explicitly rather than letting it slip past the threshold.
+        if score.is_nan() || score < confidence_threshold {
+            continue;
+        }
+        let (cx, cy) = (box_rows[0][anchor_idx], box_rows[1][anchor_idx]);
+        let (half_w, half_h) = (
+            box_rows[2][anchor_idx] * 0.5,
+            box_rows[3][anchor_idx] * 0.5,
+        );
+        let bbox = map_bbox_to_original(
+            [cx - half_w, cy - half_h, cx + half_w, cy + half_h],
+            prepared,
+        );
+        // Drop NaN coordinates and boxes that collapsed to nothing after
+        // clamping to the image bounds.
+        if bbox.iter().any(|value| value.is_nan()) || bbox[2] <= bbox[0] || bbox[3] <= bbox[1] {
+            continue;
+        }
+        grouped[label_id].push(Bbox {
+            xmin: bbox[0],
+            ymin: bbox[1],
+            xmax: bbox[2],
+            ymax: bbox[3],
+            confidence: score,
+            data: label_id,
+        });
+    }
+    non_maximum_suppression(&mut grouped, nms_threshold);
+    let mut regions = Vec::new();
+    for (label_id, bboxes) in grouped.into_iter().enumerate() {
+        let label = CLASS_NAMES
+            .get(label_id)
+            .copied()
+            .unwrap_or("text_block")
+            .to_string();
+        for bbox in bboxes {
+            regions.push(AnimeTextRegion {
+                label_id,
+                label: label.clone(),
+                score: bbox.confidence,
+                bbox: [bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax],
+            });
+        }
+    }
+    regions.sort_by(|a, b| b.score.total_cmp(&a.score));
+    Ok(regions)
+}
+/// Maps an `[xmin, ymin, xmax, ymax]` box from letterboxed-input pixels back
+/// to original-image pixels: subtract the padding, divide by the resize
+/// scale, then clamp to the image bounds.
+fn map_bbox_to_original(bbox: [f32; 4], prepared: &PreparedInput) -> [f32; 4] {
+    let max_x = prepared.original_width as f32;
+    let max_y = prepared.original_height as f32;
+    let to_original_x =
+        |x: f32| ((x - prepared.pad_x as f32) / prepared.scale).clamp(0.0, max_x);
+    let to_original_y =
+        |y: f32| ((y - prepared.pad_y as f32) / prepared.scale).clamp(0.0, max_y);
+    let [x_min, y_min, x_max, y_max] = bbox;
+    [
+        to_original_x(x_min),
+        to_original_y(y_min),
+        to_original_x(x_max),
+        to_original_y(y_max),
+    ]
+}
+/// Converts detections into `TextRegion`s tagged with `DETECTOR_NAME`,
+/// skipping boxes that are not more than one pixel wide and one pixel tall.
+fn regions_to_text_blocks(regions: &[AnimeTextRegion]) -> Vec<TextRegion> {
+    let mut blocks = Vec::new();
+    for region in regions {
+        let [x_min, y_min, x_max, y_max] = region.bbox;
+        let width = (x_max - x_min).max(0.0);
+        let height = (y_max - y_min).max(0.0);
+        if width <= 1.0 || height <= 1.0 {
+            continue;
+        }
+        blocks.push(TextRegion {
+            x: x_min,
+            y: y_min,
+            width,
+            height,
+            confidence: region.score,
+            detector: Some(DETECTOR_NAME.to_string()),
+            ..Default::default()
+        });
+    }
+    blocks
+}
+#[cfg(test)]
+mod tests {
+    use super::{PreparedInput, map_bbox_to_original};
+    use candle_core::{DType, Device, Tensor};
+    // A 1000x500 image scaled by 0.64 becomes 640x320, leaving 160 px of
+    // vertical padding on each side. Each coordinate is expected to come back
+    // as (value - pad) / 0.64.
+    #[test]
+    fn map_bbox_to_original_removes_letterbox_padding() {
+        let prepared = PreparedInput {
+            pixel_values: Tensor::zeros((1, 3, 640, 640), DType::F32, &Device::Cpu)
+                .expect("tensor"),
+            original_width: 1000,
+            original_height: 500,
+            pad_x: 0,
+            pad_y: 160,
+            scale: 0.64,
+        };
+        let bbox = map_bbox_to_original([100.0, 200.0, 540.0, 440.0], &prepared);
+        // 100 / 0.64
+        assert!((bbox[0] - 156.25).abs() < 1e-3);
+        // (200 - 160) / 0.64
+        assert!((bbox[1] - 62.5).abs() < 1e-3);
+        // 540 / 0.64
+        assert!((bbox[2] - 843.75).abs() < 1e-3);
+        // (440 - 160) / 0.64
+        assert!((bbox[3] - 437.5).abs() < 1e-3);
+    }
+}

koharu-ml/src/anime_text/model.rs ADDED Viewed

	@@ -0,0 +1,982 @@

+use candle_core::{D, IndexOp, Result, Tensor};
+use candle_nn::{BatchNorm, Conv2d, Conv2dConfig, Module, ModuleT, VarBuilder, batch_norm};
+use crate::ops::{conv2d, conv2d_no_bias};
+// Epsilon passed to every `batch_norm` layer created by `ConvBlock::load`.
+const BN_EPS: f64 = 1e-3;
+// NOTE(review): not referenced in the part of the file visible here;
+// presumably used by the detection head further down — confirm.
+const REG_MAX: usize = 16;
+/// Model size selector; determines the depth/width multipliers and which
+/// block variants the network is built from.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Yolo12Scale {
+    N,
+    S,
+    M,
+    L,
+    X,
+}
+/// Scaling factors applied to the base layer configuration.
+#[derive(Debug, Clone, Copy)]
+struct Multiples {
+    // Multiplier for block repeat counts (see `repeats`).
+    depth: f64,
+    // Multiplier for channel counts (see `channels`).
+    width: f64,
+    // Cap applied to a base channel count before `width` is applied.
+    max_channels: usize,
+}
+impl Yolo12Scale {
+    /// Depth multiplier, width multiplier and channel cap for this scale.
+    fn multiples(self) -> Multiples {
+        let (depth, width, max_channels) = match self {
+            Self::N => (0.50, 0.25, 1024),
+            Self::S => (0.50, 0.50, 1024),
+            Self::M => (0.50, 1.00, 512),
+            Self::L => (1.00, 1.00, 512),
+            Self::X => (1.00, 1.50, 512),
+        };
+        Multiples {
+            depth,
+            width,
+            max_channels,
+        }
+    }
+    /// True for the M, L and X scales.
+    fn uses_large_c3k(self) -> bool {
+        match self {
+            Self::N | Self::S => false,
+            Self::M | Self::L | Self::X => true,
+        }
+    }
+    /// True for the L and X scales.
+    fn uses_a2_residual(self) -> bool {
+        match self {
+            Self::N | Self::S | Self::M => false,
+            Self::L | Self::X => true,
+        }
+    }
+}
+impl Multiples {
+    /// Channel count for a layer: `base` capped at `max_channels`, scaled by
+    /// `width`, then rounded up to a multiple of 8.
+    fn channels(&self, base: usize) -> usize {
+        let capped = base.min(self.max_channels);
+        make_divisible(capped as f64 * self.width, 8)
+    }
+    /// Repeat count for a block: values of 0 or 1 are returned unchanged,
+    /// larger ones are scaled by `depth`, rounded, and kept at 1 or more.
+    fn repeats(&self, base: usize) -> usize {
+        if base <= 1 {
+            return base;
+        }
+        let scaled = (base as f64 * self.depth).round() as usize;
+        scaled.max(1)
+    }
+}
+/// Rounds `value` up to the nearest multiple of `divisor`.
+fn make_divisible(value: f64, divisor: usize) -> usize {
+    let units = (value / divisor as f64).ceil() as usize;
+    units * divisor
+}
+/// Nearest-neighbour upsampling by an integer factor.
+#[derive(Debug)]
+struct Upsample {
+    scale_factor: usize,
+}
+impl Upsample {
+    /// Creates an upsampler that multiplies both spatial sizes by `scale_factor`.
+    fn new(scale_factor: usize) -> Self {
+        Self { scale_factor }
+    }
+}
+impl Module for Upsample {
+    /// Expects a rank-4 tensor and scales its last two dimensions.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let (_batch, _channels, height, width) = xs.dims4()?;
+        let target_height = self.scale_factor * height;
+        let target_width = self.scale_factor * width;
+        xs.upsample_nearest2d(target_height, target_width)
+    }
+}
+/// Bias-free convolution followed by batch norm and an optional SiLU.
+#[derive(Debug)]
+struct ConvBlock {
+    conv: Conv2d,
+    bn: BatchNorm,
+    activation: bool,
+}
+impl ConvBlock {
+    /// Loads the `conv` and `bn` weights found under `vb`.
+    ///
+    /// When `padding` is `None`, `kernel_size / 2` is used.
+    #[allow(clippy::too_many_arguments)]
+    fn load(
+        vb: VarBuilder,
+        in_channels: usize,
+        out_channels: usize,
+        kernel_size: usize,
+        stride: usize,
+        padding: Option<usize>,
+        groups: usize,
+        activation: bool,
+    ) -> Result<Self> {
+        let padding = padding.unwrap_or(kernel_size / 2);
+        let cfg = Conv2dConfig {
+            padding,
+            stride,
+            groups,
+            dilation: 1,
+            cudnn_fwd_algo: None,
+        };
+        let conv = conv2d_no_bias(in_channels, out_channels, kernel_size, cfg, vb.pp("conv"))?;
+        let bn = batch_norm(out_channels, BN_EPS, vb.pp("bn"))?;
+        Ok(Self {
+            conv,
+            bn,
+            activation,
+        })
+    }
+}
+impl Module for ConvBlock {
+    /// Applies conv, then batch norm with `train = false`, then SiLU when
+    /// `activation` is set.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let convolved = self.conv.forward(xs)?;
+        let normed = self.bn.forward_t(&convolved, false)?;
+        if !self.activation {
+            return Ok(normed);
+        }
+        candle_nn::ops::silu(&normed)
+    }
+}
+/// Two stacked `ConvBlock`s with an optional residual connection.
+#[derive(Debug)]
+struct Bottleneck {
+    cv1: ConvBlock,
+    cv2: ConvBlock,
+    residual: bool,
+}
+impl Bottleneck {
+    /// Loads `cv1` and `cv2` from `vb`. The hidden width is
+    /// `out_channels * expansion` truncated to an integer; the residual add is
+    /// enabled only when `shortcut` is set and the input and output channel
+    /// counts match. `groups` applies to `cv2` only.
+    fn load(
+        vb: VarBuilder,
+        in_channels: usize,
+        out_channels: usize,
+        shortcut: bool,
+        groups: usize,
+        kernel_size: usize,
+        expansion: f64,
+    ) -> Result<Self> {
+        let hidden = (out_channels as f64 * expansion) as usize;
+        let cv1 = ConvBlock::load(
+            vb.pp("cv1"),
+            in_channels,
+            hidden,
+            kernel_size,
+            1,
+            None,
+            1,
+            true,
+        )?;
+        let cv2 = ConvBlock::load(
+            vb.pp("cv2"),
+            hidden,
+            out_channels,
+            kernel_size,
+            1,
+            None,
+            groups,
+            true,
+        )?;
+        let residual = shortcut && in_channels == out_channels;
+        Ok(Self { cv1, cv2, residual })
+    }
+}
+impl Module for Bottleneck {
+    /// Returns `cv2(cv1(xs))`, plus `xs` when the residual is enabled.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let hidden = self.cv1.forward(xs)?;
+        let ys = self.cv2.forward(&hidden)?;
+        if !self.residual {
+            return Ok(ys);
+        }
+        xs + ys
+    }
+}
+/// Two-branch block: one branch runs `cv1` followed by a chain of
+/// `Bottleneck`s, the other runs `cv2`; both are concatenated on the channel
+/// axis and fused by `cv3`.
+#[derive(Debug)]
+struct C3k {
+    cv1: ConvBlock,
+    cv2: ConvBlock,
+    cv3: ConvBlock,
+    blocks: Vec<Bottleneck>,
+}
+/// Settings forwarded to the bottlenecks inside a `C3k`.
+#[derive(Debug, Clone, Copy)]
+struct C3kOptions {
+    shortcut: bool,
+    groups: usize,
+    expansion: f64,
+    kernel_size: usize,
+}
+impl C3k {
+    /// Loads `repeats` bottlenecks from `m.0`, `m.1`, … and then the three
+    /// 1x1 convolutions. The hidden width is `out_channels * options.expansion`
+    /// truncated to an integer.
+    fn load(
+        vb: VarBuilder,
+        in_channels: usize,
+        out_channels: usize,
+        repeats: usize,
+        options: C3kOptions,
+    ) -> Result<Self> {
+        let hidden = (out_channels as f64 * options.expansion) as usize;
+        let blocks = (0..repeats)
+            .map(|index| {
+                Bottleneck::load(
+                    vb.pp(format!("m.{index}")),
+                    hidden,
+                    hidden,
+                    options.shortcut,
+                    options.groups,
+                    options.kernel_size,
+                    1.0,
+                )
+            })
+            .collect::<Result<Vec<_>>>()?;
+        let cv1 = ConvBlock::load(vb.pp("cv1"), in_channels, hidden, 1, 1, None, 1, true)?;
+        let cv2 = ConvBlock::load(vb.pp("cv2"), in_channels, hidden, 1, 1, None, 1, true)?;
+        let cv3 = ConvBlock::load(vb.pp("cv3"), hidden * 2, out_channels, 1, 1, None, 1, true)?;
+        Ok(Self {
+            cv1,
+            cv2,
+            cv3,
+            blocks,
+        })
+    }
+}
+impl Module for C3k {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        // Main branch: cv1, then every bottleneck in order.
+        let main = self
+            .blocks
+            .iter()
+            .try_fold(self.cv1.forward(xs)?, |acc, block| block.forward(&acc))?;
+        // Side branch: a single 1x1 convolution of the input.
+        let side = self.cv2.forward(xs)?;
+        let merged = Tensor::cat(&[&main, &side], 1)?;
+        self.cv3.forward(&merged)
+    }
+}
+/// Inner block of a `C3k2`: either a plain `Bottleneck` or a nested `C3k`.
+#[derive(Debug)]
+enum C3k2Block {
+    Bottleneck(Box<Bottleneck>),
+    C3k(Box<C3k>),
+}
+impl Module for C3k2Block {
+    /// Forwards to whichever block this variant wraps.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        match self {
+            Self::Bottleneck(block) => block.forward(xs),
+            Self::C3k(block) => block.forward(xs),
+        }
+    }
+}
+/// Block that splits the output of `cv1` into two halves, feeds the most
+/// recent tensor through each inner block in turn, and fuses the concatenation
+/// of every intermediate tensor with `cv2`.
+#[derive(Debug)]
+struct C3k2 {
+    cv1: ConvBlock,
+    cv2: ConvBlock,
+    blocks: Vec<C3k2Block>,
+}
+/// Settings for a `C3k2`: which inner block type to use and how to size it.
+#[derive(Debug, Clone, Copy)]
+struct C3k2Options {
+    use_c3k: bool,
+    expansion: f64,
+    groups: usize,
+    shortcut: bool,
+}
+impl C3k2 {
+    /// Loads `repeats` inner blocks from `m.0`, `m.1`, … (nested `C3k`s when
+    /// `options.use_c3k` is set, `Bottleneck`s otherwise), then `cv1` and
+    /// `cv2`. The hidden width is `out_channels * options.expansion` truncated
+    /// to an integer.
+    fn load(
+        vb: VarBuilder,
+        in_channels: usize,
+        out_channels: usize,
+        repeats: usize,
+        options: C3k2Options,
+    ) -> Result<Self> {
+        let hidden = (out_channels as f64 * options.expansion) as usize;
+        let load_block = |index: usize| -> Result<C3k2Block> {
+            let block_vb = vb.pp(format!("m.{index}"));
+            if options.use_c3k {
+                let inner = C3k::load(
+                    block_vb,
+                    hidden,
+                    hidden,
+                    2,
+                    C3kOptions {
+                        shortcut: options.shortcut,
+                        groups: options.groups,
+                        expansion: 0.5,
+                        kernel_size: 3,
+                    },
+                )?;
+                return Ok(C3k2Block::C3k(Box::new(inner)));
+            }
+            let inner = Bottleneck::load(
+                block_vb,
+                hidden,
+                hidden,
+                options.shortcut,
+                options.groups,
+                3,
+                0.5,
+            )?;
+            Ok(C3k2Block::Bottleneck(Box::new(inner)))
+        };
+        let blocks = (0..repeats).map(load_block).collect::<Result<Vec<_>>>()?;
+        let cv1 = ConvBlock::load(vb.pp("cv1"), in_channels, hidden * 2, 1, 1, None, 1, true)?;
+        // cv2 sees the two halves of cv1's output plus one tensor per block.
+        let cv2 = ConvBlock::load(
+            vb.pp("cv2"),
+            (2 + repeats) * hidden,
+            out_channels,
+            1,
+            1,
+            None,
+            1,
+            true,
+        )?;
+        Ok(Self { cv1, cv2, blocks })
+    }
+}
+impl Module for C3k2 {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let mut parts = self.cv1.forward(xs)?.chunk(2, 1)?;
+        for block in &self.blocks {
+            // Each block consumes the tensor appended most recently.
+            let latest = parts.last().expect("c3k2 chunk");
+            let next = block.forward(latest)?;
+            parts.push(next);
+        }
+        let views: Vec<&Tensor> = parts.iter().collect();
+        let merged = Tensor::cat(&views, 1)?;
+        self.cv2.forward(&merged)
+    }
+}
+/// Multi-head attention layer whose projections are `ConvBlock`s without
+/// activation. `area` is the number of groups the token sequence is split
+/// into before attention (values of 1 or less leave it unsplit).
+#[derive(Debug)]
+struct AreaAttention {
+    area: usize,
+    num_heads: usize,
+    head_dim: usize,
+    qkv: ConvBlock,
+    proj: ConvBlock,
+    pe: ConvBlock,
+}
+impl AreaAttention {
+    /// Loads the `qkv`, `proj` and `pe` blocks from `vb`.
+    ///
+    /// `head_dim` is `dim / num_heads` (integer division), so `num_heads`
+    /// must be non-zero.
+    fn load(vb: VarBuilder, dim: usize, num_heads: usize, area: usize) -> Result<Self> {
+        let head_dim = dim / num_heads;
+        let all_head_dim = head_dim * num_heads;
+        Ok(Self {
+            area,
+            num_heads,
+            head_dim,
+            // 1x1 convolution producing query, key and value channels at once.
+            qkv: ConvBlock::load(vb.pp("qkv"), dim, all_head_dim * 3, 1, 1, None, 1, false)?,
+            proj: ConvBlock::load(vb.pp("proj"), all_head_dim, dim, 1, 1, None, 1, false)?,
+            // 7x7 convolution with padding 3 and `dim` groups.
+            pe: ConvBlock::load(vb.pp("pe"), all_head_dim, dim, 7, 1, Some(3), dim, false)?,
+        })
+    }
+}
+impl Module for AreaAttention {
+    /// Applies area attention to a rank-4 input read as
+    /// `(batch, channels, height, width)` and returns the projected result.
+    ///
+    /// When `area > 1` the `height * width` tokens are regrouped into
+    /// `batch * area` sequences of `num_tokens / area` tokens (integer
+    /// division) so that attention is computed within each group only.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let (batch, channels, height, width) = xs.dims4()?;
+        let num_tokens = height * width;
+        // (batch, tokens, packed qkv channels)
+        let tokens = self.qkv.forward(xs)?.flatten_from(2)?;
+        let tokens = tokens.transpose(1, 2)?.contiguous()?;
+        let tokens = match self.area {
+            area if area > 1 => {
+                tokens.reshape((batch * area, num_tokens / area, channels * 3))?
+            }
+            _ => tokens,
+        };
+        let (area_batch, area_tokens, _) = tokens.dims3()?;
+        // (area_batch, heads, 3 * head_dim, area_tokens)
+        let packed = tokens
+            .reshape((area_batch, area_tokens, self.num_heads, self.head_dim * 3))?
+            .permute((0, 2, 3, 1))?
+            .contiguous()?;
+        let query = packed.narrow(2, 0, self.head_dim)?;
+        let key = packed.narrow(2, self.head_dim, self.head_dim)?;
+        let value = packed.narrow(2, self.head_dim * 2, self.head_dim)?;
+        let scale = (self.head_dim as f64).powf(-0.5);
+        let scores = (query.transpose(2, 3)?.matmul(&key)? * scale)?;
+        let weights = candle_nn::ops::softmax(&scores, D::Minus1)?;
+        let attended = value.matmul(&weights.transpose(2, 3)?)?;
+        // Brings a per-head tensor back to (batch, channels, height, width).
+        let to_feature_map = |t: &Tensor| -> Result<Tensor> {
+            let t = t.permute((0, 3, 1, 2))?.contiguous()?;
+            let t = if self.area > 1 {
+                t.reshape((batch, num_tokens, channels))?
+            } else {
+                t
+            };
+            t.reshape((batch, height, width, channels))?
+                .permute((0, 3, 1, 2))?
+                .contiguous()
+        };
+        let attended = to_feature_map(&attended)?;
+        let value = to_feature_map(&value)?;
+        let with_position = (attended + self.pe.forward(&value)?)?;
+        self.proj.forward(&with_position)
+    }
+}
+/// Residual block made of an [`AreaAttention`] layer followed by a two-layer
+/// 1x1 convolutional MLP.
+#[derive(Debug)]
+struct AreaBlock {
+    /// Attention sub-layer.
+    attn: AreaAttention,
+    /// First MLP convolution (`dim -> mlp_hidden`).
+    mlp0: ConvBlock,
+    /// Second MLP convolution (`mlp_hidden -> dim`).
+    mlp1: ConvBlock,
+}
+impl AreaBlock {
+    /// Loads the block from the `attn`, `mlp.0` and `mlp.1` sub-prefixes of
+    /// `vb`. The MLP width is `dim * mlp_ratio` truncated to an integer.
+    fn load(
+        vb: VarBuilder,
+        dim: usize,
+        num_heads: usize,
+        mlp_ratio: f64,
+        area: usize,
+    ) -> Result<Self> {
+        let mlp_hidden = (dim as f64 * mlp_ratio) as usize;
+        let attn = AreaAttention::load(vb.pp("attn"), dim, num_heads, area)?;
+        let mlp0 = ConvBlock::load(vb.pp("mlp.0"), dim, mlp_hidden, 1, 1, None, 1, true)?;
+        let mlp1 = ConvBlock::load(vb.pp("mlp.1"), mlp_hidden, dim, 1, 1, None, 1, false)?;
+        Ok(Self { attn, mlp0, mlp1 })
+    }
+}
+impl Module for AreaBlock {
+    /// Adds the attention output to the input, then adds the MLP output of
+    /// that sum to the sum itself.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let attended = self.attn.forward(xs)?;
+        let residual = (xs + attended)?;
+        let hidden = self.mlp0.forward(&residual)?;
+        let projected = self.mlp1.forward(&hidden)?;
+        residual + projected
+    }
+}
+/// One repeat inside an [`A2C2f`] module: either a chain of attention blocks
+/// or a single `C3k` block.
+#[derive(Debug)]
+enum A2C2fBlock {
+    /// Attention blocks applied one after another.
+    Attention(Vec<AreaBlock>),
+    /// Convolutional alternative used when attention is disabled.
+    C3k(Box<C3k>),
+}
+impl Module for A2C2fBlock {
+    /// Runs the wrapped block(s); for `Attention` the blocks are chained,
+    /// each consuming the output of the previous one.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        match self {
+            Self::C3k(block) => block.forward(xs),
+            Self::Attention(blocks) => blocks
+                .iter()
+                .try_fold(xs.clone(), |ys, block| block.forward(&ys)),
+        }
+    }
+}
+/// Feature-fusion module: a 1x1 input convolution, `repeats` inner blocks
+/// whose outputs are all kept, and a 1x1 convolution over their
+/// concatenation. An optional learned per-channel `gamma` scales the result
+/// before it is added back to the input.
+#[derive(Debug)]
+struct A2C2f {
+    /// Input convolution (`in_channels -> hidden`).
+    cv1: ConvBlock,
+    /// Fusion convolution (`(1 + repeats) * hidden -> out_channels`).
+    cv2: ConvBlock,
+    /// Residual scale; present only when both attention and residual are on.
+    gamma: Option<Tensor>,
+    /// Inner blocks, applied sequentially.
+    blocks: Vec<A2C2fBlock>,
+}
+#[allow(clippy::too_many_arguments)]
+impl A2C2f {
+    /// Loads the module from `vb`.
+    ///
+    /// Inner blocks live under `m.{index}`. With `attention` set, each one is
+    /// a pair of [`AreaBlock`]s (sub-prefixes `0` and `1`) using
+    /// `hidden / 32` heads; otherwise it is a `C3k` block built from
+    /// `shortcut` and `groups`. `hidden` is `out_channels * expansion`
+    /// truncated to an integer.
+    fn load(
+        vb: VarBuilder,
+        in_channels: usize,
+        out_channels: usize,
+        repeats: usize,
+        attention: bool,
+        area: usize,
+        residual: bool,
+        mlp_ratio: f64,
+        expansion: f64,
+        groups: usize,
+        shortcut: bool,
+    ) -> Result<Self> {
+        let hidden = (out_channels as f64 * expansion) as usize;
+        let gamma = match (attention, residual) {
+            (true, true) => Some(vb.get(out_channels, "gamma")?),
+            _ => None,
+        };
+        let blocks = (0..repeats)
+            .map(|index| -> Result<A2C2fBlock> {
+                let block_vb = vb.pp(format!("m.{index}"));
+                if !attention {
+                    let options = C3kOptions {
+                        shortcut,
+                        groups,
+                        expansion: 0.5,
+                        kernel_size: 3,
+                    };
+                    let c3k = C3k::load(block_vb, hidden, hidden, 2, options)?;
+                    return Ok(A2C2fBlock::C3k(Box::new(c3k)));
+                }
+                let area_blocks = (0..2)
+                    .map(|block_index| {
+                        AreaBlock::load(
+                            block_vb.pp(block_index),
+                            hidden,
+                            hidden / 32,
+                            mlp_ratio,
+                            area,
+                        )
+                    })
+                    .collect::<Result<Vec<_>>>()?;
+                Ok(A2C2fBlock::Attention(area_blocks))
+            })
+            .collect::<Result<Vec<_>>>()?;
+        let cv1 = ConvBlock::load(vb.pp("cv1"), in_channels, hidden, 1, 1, None, 1, true)?;
+        let cv2 = ConvBlock::load(
+            vb.pp("cv2"),
+            (1 + repeats) * hidden,
+            out_channels,
+            1,
+            1,
+            None,
+            1,
+            true,
+        )?;
+        Ok(Self {
+            cv1,
+            cv2,
+            gamma,
+            blocks,
+        })
+    }
+}
+impl Module for A2C2f {
+    /// Runs `cv1`, chains the inner blocks while keeping every intermediate
+    /// output, concatenates them along dim 1 and fuses with `cv2`. When
+    /// `gamma` is present the fused result is scaled per channel and added
+    /// to the input; otherwise it is returned as is.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let mut stages = Vec::with_capacity(self.blocks.len() + 1);
+        stages.push(self.cv1.forward(xs)?);
+        for block in self.blocks.iter() {
+            let next = block.forward(stages.last().expect("a2c2f output"))?;
+            stages.push(next);
+        }
+        let parts = stages.iter().collect::<Vec<_>>();
+        let fused = self.cv2.forward(&Tensor::cat(&parts, 1)?)?;
+        let Some(gamma) = &self.gamma else {
+            return Ok(fused);
+        };
+        // Reshape to (1, C, 1, 1) so the scale broadcasts over batch and space.
+        let scale = gamma.reshape((1, gamma.elem_count(), 1, 1))?;
+        xs + fused.broadcast_mul(&scale)?
+    }
+}
+/// Backbone of the detector: layers `model.0` through `model.8`, applied
+/// strictly in sequence.
+#[derive(Debug)]
+struct Yolo12Backbone {
+    l0: ConvBlock,
+    l1: ConvBlock,
+    l2: C3k2,
+    l3: ConvBlock,
+    l4: C3k2,
+    l5: ConvBlock,
+    l6: A2C2f,
+    l7: ConvBlock,
+    l8: A2C2f,
+}
+impl Yolo12Backbone {
+    /// Loads the nine backbone layers from `vb`, sizing channel widths and
+    /// repeat counts through `scale.multiples()`.
+    fn load(vb: VarBuilder, scale: Yolo12Scale) -> Result<Self> {
+        let m = scale.multiples();
+        let [c64, c128, c256, c512, c1024] = [64, 128, 256, 512, 1024].map(|c| m.channels(c));
+        let a2_residual = scale.uses_a2_residual();
+        let mlp_ratio = if a2_residual { 1.2 } else { 2.0 };
+        // Both C3k2 stages share the same option set.
+        let c3k2_options = || C3k2Options {
+            use_c3k: scale.uses_large_c3k(),
+            expansion: 0.25,
+            groups: 1,
+            shortcut: true,
+        };
+        let l0 = ConvBlock::load(vb.pp("model.0"), 3, c64, 3, 2, None, 1, true)?;
+        let l1 = ConvBlock::load(vb.pp("model.1"), c64, c128, 3, 2, None, 1, true)?;
+        let l2 = C3k2::load(vb.pp("model.2"), c128, c256, m.repeats(2), c3k2_options())?;
+        let l3 = ConvBlock::load(vb.pp("model.3"), c256, c256, 3, 2, None, 1, true)?;
+        let l4 = C3k2::load(vb.pp("model.4"), c256, c512, m.repeats(2), c3k2_options())?;
+        let l5 = ConvBlock::load(vb.pp("model.5"), c512, c512, 3, 2, None, 1, true)?;
+        let l6 = A2C2f::load(
+            vb.pp("model.6"),
+            c512,
+            c512,
+            m.repeats(4),
+            true,
+            4,
+            a2_residual,
+            mlp_ratio,
+            0.5,
+            1,
+            true,
+        )?;
+        let l7 = ConvBlock::load(vb.pp("model.7"), c512, c1024, 3, 2, None, 1, true)?;
+        let l8 = A2C2f::load(
+            vb.pp("model.8"),
+            c1024,
+            c1024,
+            m.repeats(4),
+            true,
+            1,
+            a2_residual,
+            mlp_ratio,
+            0.5,
+            1,
+            true,
+        )?;
+        Ok(Self {
+            l0,
+            l1,
+            l2,
+            l3,
+            l4,
+            l5,
+            l6,
+            l7,
+            l8,
+        })
+    }
+    /// Runs all layers in order and returns the outputs of layers 4, 6 and 8.
+    fn forward(&self, xs: &Tensor) -> Result<(Tensor, Tensor, Tensor)> {
+        let stem = self.l1.forward(&self.l0.forward(xs)?)?;
+        let stage2 = self.l2.forward(&stem)?;
+        let out4 = self.l4.forward(&self.l3.forward(&stage2)?)?;
+        let out6 = self.l6.forward(&self.l5.forward(&out4)?)?;
+        let out8 = self.l8.forward(&self.l7.forward(&out6)?)?;
+        Ok((out4, out6, out8))
+    }
+}
+/// Neck of the detector: a top-down pass (upsample + concat) followed by a
+/// bottom-up pass (strided convolution + concat) over the three backbone
+/// outputs. Field names follow the `model.N` weight prefixes.
+#[derive(Debug)]
+struct Yolo12Neck {
+    upsample: Upsample,
+    l11: A2C2f,
+    l14: A2C2f,
+    l15: ConvBlock,
+    l17: A2C2f,
+    l18: ConvBlock,
+    l20: C3k2,
+}
+impl Yolo12Neck {
+    /// Loads the neck layers from `vb`, sizing channel widths and repeat
+    /// counts through `scale.multiples()`.
+    fn load(vb: VarBuilder, scale: Yolo12Scale) -> Result<Self> {
+        let m = scale.multiples();
+        let c256 = m.channels(256);
+        let c512 = m.channels(512);
+        let c1024 = m.channels(1024);
+        let repeats = m.repeats(2);
+        // Every A2C2f in the neck is the non-attention variant and differs
+        // only in its prefix and channel counts.
+        let a2c2f = |name: &str, in_channels: usize, out_channels: usize| {
+            A2C2f::load(
+                vb.pp(name),
+                in_channels,
+                out_channels,
+                repeats,
+                false,
+                1,
+                false,
+                2.0,
+                0.5,
+                1,
+                true,
+            )
+        };
+        let l11 = a2c2f("model.11", c1024 + c512, c512)?;
+        let l14 = a2c2f("model.14", c512 + c512, c256)?;
+        let l15 = ConvBlock::load(vb.pp("model.15"), c256, c256, 3, 2, None, 1, true)?;
+        let l17 = a2c2f("model.17", c256 + c512, c512)?;
+        let l18 = ConvBlock::load(vb.pp("model.18"), c512, c512, 3, 2, None, 1, true)?;
+        let l20 = C3k2::load(
+            vb.pp("model.20"),
+            c512 + c1024,
+            c1024,
+            repeats,
+            C3k2Options {
+                use_c3k: true,
+                expansion: 0.5,
+                groups: 1,
+                shortcut: true,
+            },
+        )?;
+        Ok(Self {
+            upsample: Upsample::new(2),
+            l11,
+            l14,
+            l15,
+            l17,
+            l18,
+            l20,
+        })
+    }
+    /// Fuses the three backbone outputs and returns the outputs of layers
+    /// 14, 17 and 20, in that order.
+    fn forward(&self, p3: &Tensor, p4: &Tensor, p5: &Tensor) -> Result<(Tensor, Tensor, Tensor)> {
+        // Top-down: upsample the coarser map and concatenate along dim 1.
+        let up5 = self.upsample.forward(p5)?;
+        let x11 = self.l11.forward(&Tensor::cat(&[&up5, p4], 1)?)?;
+        let up11 = self.upsample.forward(&x11)?;
+        let x14 = self.l14.forward(&Tensor::cat(&[&up11, p3], 1)?)?;
+        // Bottom-up: downsample with a stride-2 convolution and concatenate.
+        let down14 = self.l15.forward(&x14)?;
+        let x17 = self.l17.forward(&Tensor::cat(&[&down14, &x11], 1)?)?;
+        let down17 = self.l18.forward(&x17)?;
+        let x20 = self.l20.forward(&Tensor::cat(&[&down17, p5], 1)?)?;
+        Ok((x14, x17, x20))
+    }
+}
+/// Reduces a `reg_max`-bin distribution per box side to a single value with
+/// a bias-free 1x1 convolution applied after a softmax.
+#[derive(Debug)]
+struct Dfl {
+    /// Bias-free 1x1 convolution, `reg_max -> 1` channels.
+    conv: Conv2d,
+    /// Number of distribution bins per box side.
+    reg_max: usize,
+}
+impl Dfl {
+    /// Loads the convolution weights from the `conv` sub-prefix of `vb`.
+    fn load(vb: VarBuilder, reg_max: usize) -> Result<Self> {
+        let conv = conv2d_no_bias(reg_max, 1, 1, Default::default(), vb.pp("conv"))?;
+        Ok(Self { conv, reg_max })
+    }
+}
+impl Module for Dfl {
+    /// Takes a rank-3 input whose middle dimension holds `4 * reg_max`
+    /// values per anchor and returns a `(batch, 4, anchors)` tensor.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let (batch, _, anchors) = xs.dims3()?;
+        // Move the bin axis to dim 1 so softmax and the conv act across bins.
+        let bins = xs.reshape((batch, 4, self.reg_max, anchors))?;
+        let bins = bins.transpose(2, 1)?;
+        let probabilities = candle_nn::ops::softmax(&bins, 1)?;
+        let reduced = self.conv.forward(&probabilities)?;
+        reduced.reshape((batch, 4, anchors))
+    }
+}
+/// Classification branch of the detection head for one scale: two
+/// (`dw`, `pw`) convolution pairs followed by a plain 1x1 convolution that
+/// outputs `num_classes` channels.
+#[derive(Debug)]
+struct DetectCv3 {
+    dw0: ConvBlock,
+    pw0: ConvBlock,
+    dw1: ConvBlock,
+    pw1: ConvBlock,
+    conv: Conv2d,
+}
+impl DetectCv3 {
+    /// Loads the branch from the `0.0`, `0.1`, `1.0`, `1.1` and `2`
+    /// sub-prefixes of `vb`.
+    fn load(vb: VarBuilder, in_channels: usize, hidden: usize, num_classes: usize) -> Result<Self> {
+        let dw0 = ConvBlock::load(
+            vb.pp("0.0"),
+            in_channels,
+            in_channels,
+            3,
+            1,
+            None,
+            in_channels,
+            true,
+        )?;
+        let pw0 = ConvBlock::load(vb.pp("0.1"), in_channels, hidden, 1, 1, None, 1, true)?;
+        let dw1 = ConvBlock::load(vb.pp("1.0"), hidden, hidden, 3, 1, None, hidden, true)?;
+        let pw1 = ConvBlock::load(vb.pp("1.1"), hidden, hidden, 1, 1, None, 1, true)?;
+        let conv = conv2d(hidden, num_classes, 1, Default::default(), vb.pp("2"))?;
+        Ok(Self {
+            dw0,
+            pw0,
+            dw1,
+            pw1,
+            conv,
+        })
+    }
+}
+impl Module for DetectCv3 {
+    /// Applies `dw0 -> pw0 -> dw1 -> pw1 -> conv` in sequence.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let first = self.dw0.forward(xs)?;
+        let first = self.pw0.forward(&first)?;
+        let second = self.dw1.forward(&first)?;
+        let second = self.pw1.forward(&second)?;
+        self.conv.forward(&second)
+    }
+}
+/// Detection head operating on three feature maps. For every scale a box
+/// branch (`cv2`) predicts `REG_MAX * 4` distribution logits and a class
+/// branch (`cv3`) predicts `num_classes` logits.
+#[derive(Debug)]
+struct DetectionHead {
+    /// Turns box distributions into one distance per box side.
+    dfl: Dfl,
+    /// Box branch per scale: two conv blocks and a final 1x1 convolution.
+    cv2: [(ConvBlock, ConvBlock, Conv2d); 3],
+    /// Class branch per scale.
+    cv3: [DetectCv3; 3],
+    /// Number of distribution bins per box side.
+    reg_max: usize,
+    /// Channels per anchor in the raw prediction (`num_classes + reg_max * 4`).
+    no: usize,
+}
+impl DetectionHead {
+    /// Loads the head from the `dfl`, `cv2.{0,1,2}` and `cv3.{0,1,2}`
+    /// sub-prefixes of `vb`. `filters` holds the channel counts of the three
+    /// input feature maps.
+    fn load(vb: VarBuilder, num_classes: usize, filters: (usize, usize, usize)) -> Result<Self> {
+        // The reference Ultralytics head sizes the box branch with floor
+        // division (`ch[0] // 4`). Rounding up would yield a different hidden
+        // width, and thus mismatching weight shapes, whenever the first
+        // filter count is not a multiple of four.
+        let c2 = (filters.0 / 4).max(REG_MAX * 4).max(16);
+        let c3 = filters.0.max(num_classes.min(100));
+        Ok(Self {
+            dfl: Dfl::load(vb.pp("dfl"), REG_MAX)?,
+            cv2: [
+                Self::load_cv2(vb.pp("cv2.0"), filters.0, c2)?,
+                Self::load_cv2(vb.pp("cv2.1"), filters.1, c2)?,
+                Self::load_cv2(vb.pp("cv2.2"), filters.2, c2)?,
+            ],
+            cv3: [
+                DetectCv3::load(vb.pp("cv3.0"), filters.0, c3, num_classes)?,
+                DetectCv3::load(vb.pp("cv3.1"), filters.1, c3, num_classes)?,
+                DetectCv3::load(vb.pp("cv3.2"), filters.2, c3, num_classes)?,
+            ],
+            reg_max: REG_MAX,
+            no: num_classes + REG_MAX * 4,
+        })
+    }
+    /// Loads one box branch from the `0`, `1` and `2` sub-prefixes of `vb`.
+    fn load_cv2(
+        vb: VarBuilder,
+        in_channels: usize,
+        hidden: usize,
+    ) -> Result<(ConvBlock, ConvBlock, Conv2d)> {
+        Ok((
+            ConvBlock::load(vb.pp("0"), in_channels, hidden, 3, 1, None, 1, true)?,
+            ConvBlock::load(vb.pp("1"), hidden, hidden, 3, 1, None, 1, true)?,
+            conv2d(hidden, REG_MAX * 4, 1, Default::default(), vb.pp("2"))?,
+        ))
+    }
+    /// Applies the three layers of a box branch in order.
+    fn forward_cv2(block: &(ConvBlock, ConvBlock, Conv2d), xs: &Tensor) -> Result<Tensor> {
+        block.2.forward(&block.1.forward(&block.0.forward(xs)?)?)
+    }
+    /// Produces the decoded predictions for all three scales.
+    ///
+    /// The result concatenates, along dim 1, the four box values from
+    /// [`dist2bbox`] (multiplied by the per-anchor stride) and the sigmoid
+    /// of the class logits; anchors from the three scales are laid out along
+    /// dim 2 in input order.
+    fn forward(&self, xs0: &Tensor, xs1: &Tensor, xs2: &Tensor) -> Result<Tensor> {
+        // Raw per-scale prediction: box logits first, then class logits.
+        let predict = |index: usize, xs: &Tensor| -> Result<Tensor> {
+            let boxes = Self::forward_cv2(&self.cv2[index], xs)?;
+            let classes = self.cv3[index].forward(xs)?;
+            Tensor::cat(&[&boxes, &classes], 1)
+        };
+        let xs0 = predict(0, xs0)?;
+        let xs1 = predict(1, xs1)?;
+        let xs2 = predict(2, xs2)?;
+        let (anchors, strides) = make_anchors(&xs0, &xs1, &xs2, (8, 16, 32), 0.5)?;
+        let anchors = anchors.transpose(0, 1)?.unsqueeze(0)?;
+        let strides = strides.transpose(0, 1)?;
+        // Flatten the spatial dimensions so scales can be joined per anchor.
+        let flatten = |xs: &Tensor| {
+            let batch = xs.dim(0)?;
+            xs.reshape((batch, self.no, xs.elem_count() / (batch * self.no)))
+        };
+        let ys0 = flatten(&xs0)?;
+        let ys1 = flatten(&xs1)?;
+        let ys2 = flatten(&xs2)?;
+        let x_cat = Tensor::cat(&[&ys0, &ys1, &ys2], 2)?;
+        let box_ = x_cat.i((.., ..self.reg_max * 4, ..))?;
+        let cls = x_cat.i((.., self.reg_max * 4.., ..))?;
+        let dbox = dist2bbox(&self.dfl.forward(&box_)?, &anchors)?.broadcast_mul(&strides)?;
+        Tensor::cat(&[&dbox, &candle_nn::ops::sigmoid(&cls)?], 1)
+    }
+}
+/// Builds grid-cell anchor points and matching strides for three feature
+/// maps.
+///
+/// Each map is read as rank 4 with height and width in the last two
+/// dimensions. Returns `(anchor_points, stride_tensor)`: the first has one
+/// `(x, y)` row per cell (each coordinate shifted by `grid_cell_offset`),
+/// the second one stride value per cell in a trailing dimension of size 1.
+/// Cells are ordered map by map, row-major within a map.
+fn make_anchors(
+    xs0: &Tensor,
+    xs1: &Tensor,
+    xs2: &Tensor,
+    strides: (usize, usize, usize),
+    grid_cell_offset: f64,
+) -> Result<(Tensor, Tensor)> {
+    let device = xs0.device();
+    let dtype = xs0.dtype();
+    // Offsets 0.5, 1.5, ... (for an offset of 0.5) along one axis.
+    let axis_centers = |len: usize| -> Result<Tensor> {
+        Tensor::arange(0, len as u32, device)?.to_dtype(dtype)? + grid_cell_offset
+    };
+    let levels = [(xs0, strides.0), (xs1, strides.1), (xs2, strides.2)];
+    let mut points_per_level = Vec::with_capacity(3);
+    let mut strides_per_level = Vec::with_capacity(3);
+    for (features, stride) in levels {
+        let (_, _, rows, cols) = features.dims4()?;
+        let xs = axis_centers(cols)?;
+        let ys = axis_centers(rows)?;
+        // Tile x across rows and y across columns, then flatten row-major.
+        let xs = xs.reshape((1, xs.elem_count()))?;
+        let xs = xs.repeat((rows, 1))?.flatten_all()?;
+        let ys = ys.reshape((ys.elem_count(), 1))?;
+        let ys = ys.repeat((1, cols))?.flatten_all()?;
+        points_per_level.push(Tensor::stack(&[&xs, &ys], D::Minus1)?);
+        let ones = Tensor::ones(rows * cols, dtype, device)?;
+        strides_per_level.push((ones * stride as f64)?);
+    }
+    let anchor_points = Tensor::cat(points_per_level.as_slice(), 0)?;
+    let stride_tensor = Tensor::cat(strides_per_level.as_slice(), 0)?.unsqueeze(1)?;
+    Ok((anchor_points, stride_tensor))
+}
+/// Converts per-anchor distances into center/size boxes.
+///
+/// `distance` is split into two halves along dim 1; the first is subtracted
+/// from `anchor_points` and the second added to it to get the two box
+/// corners. The result concatenates the corner midpoint and the corner
+/// difference along dim 1.
+fn dist2bbox(distance: &Tensor, anchor_points: &Tensor) -> Result<Tensor> {
+    let halves = distance.chunk(2, 1)?;
+    let (left_top, right_bottom) = (&halves[0], &halves[1]);
+    let min_corner = (anchor_points - left_top)?;
+    let max_corner = (anchor_points + right_bottom)?;
+    let center = ((&min_corner + &max_corner)? * 0.5)?;
+    let size = (&max_corner - &min_corner)?;
+    Tensor::cat(&[&center, &size], 1)
+}
+/// Complete detector: backbone, neck and detection head.
+#[derive(Debug)]
+pub struct Yolo12 {
+    backbone: Yolo12Backbone,
+    neck: Yolo12Neck,
+    head: DetectionHead,
+}
+impl Yolo12 {
+    /// Loads all weights from `vb`. Backbone and neck read from the root
+    /// prefix; the head reads from `model.21`.
+    pub fn load(vb: VarBuilder, scale: Yolo12Scale, num_classes: usize) -> Result<Self> {
+        let m = scale.multiples();
+        // Channel counts of the three neck outputs fed to the head.
+        let filters = (m.channels(256), m.channels(512), m.channels(1024));
+        let backbone = Yolo12Backbone::load(vb.clone(), scale)?;
+        let neck = Yolo12Neck::load(vb.clone(), scale)?;
+        let head = DetectionHead::load(vb.pp("model.21"), num_classes, filters)?;
+        Ok(Self {
+            backbone,
+            neck,
+            head,
+        })
+    }
+    /// Runs backbone, neck and head in sequence and returns the head output.
+    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let features = self.backbone.forward(xs)?;
+        let fused = self.neck.forward(&features.0, &features.1, &features.2)?;
+        self.head.forward(&fused.0, &fused.1, &fused.2)
+    }
+}

koharu-ml/src/lib.rs CHANGED Viewed

@@ -1,5 +1,6 @@
 mod hf_hub;
 pub mod aot_inpainting;
 pub mod comic_text_bubble_detector;
 pub mod comic_text_detector;

 mod hf_hub;
+pub mod anime_text;
 pub mod aot_inpainting;
 pub mod comic_text_bubble_detector;
 pub mod comic_text_detector;

koharu-ml/tests/anime_text.rs ADDED Viewed

	@@ -0,0 +1,30 @@

+use std::path::Path;
+use koharu_ml::anime_text::AnimeTextDetector;
+mod support;
+/// Smoke test: loads the detector on the CPU runtime, runs it on the bundled
+/// fixture page and checks that the reported image size matches the input
+/// and that at least one block is returned, all tagged `anime-text-yolo`.
+#[tokio::test]
+#[ignore = "requires model download and is not critical for CI"]
+async fn anime_text_yolo() -> anyhow::Result<()> {
+    let runtime = support::cpu_runtime();
+    let detector = AnimeTextDetector::load(&runtime, false).await?;
+    let fixture = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/1.jpg");
+    let page = image::open(&fixture)?;
+    let detection = detector.inference(&page)?;
+    assert_eq!(detection.image_width, page.width());
+    assert_eq!(detection.image_height, page.height());
+    assert!(
+        !detection.text_blocks.is_empty(),
+        "expected anime text YOLO to detect text blocks"
+    );
+    assert!(
+        detection
+            .text_blocks
+            .iter()
+            .all(|block| block.detector.as_deref() == Some("anime-text-yolo"))
+    );
+    Ok(())
+}