ModerRAS
/

AniFileBERT

@@ -207,7 +207,7 @@ static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
 });
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
@@ -2192,6 +2192,7 @@ fn is_special_title_phrase(text: &str) -> bool {
             | "TOKUTEN"
             | "TRAILER"
             | "TV SPOT"
             | "WORLD PREMIERE"
             | "予告"
             | "番宣"
@@ -2203,6 +2204,7 @@ fn is_special_title_phrase(text: &str) -> bool {
         || normalized.contains("番宣")
         || normalized.contains("宣番")
         || normalized.contains("TV SPOT")
         || text.contains("予告")
         || SPECIAL_TITLE_PHRASE_RE.is_match(text)
 }
@@ -2224,6 +2226,7 @@ const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
     &["Zom", "100"],
     &["Kamisama", "Hajimemashita", "2"],
     &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
     &["Ghiblies", "Episode", "2"],
     &["Lupin The Thrid Jigen Daisuke no Bohyou"],
     &["Lupin The Third Jigen Daisuke no Bohyou"],
@@ -2445,6 +2448,48 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             output[index] = "O".to_string();
             continue;
         }
         if roles[index].starts_with("EPISODE")
             && index >= 1
             && output[index - 1] == "TITLE"
@@ -2773,12 +2818,18 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
             }
             if matches!(
                 previous_real_text.to_ascii_lowercase().as_str(),
-                "lesson" | "part"
             )
             {
                 output[index] = "O".to_string();
                 continue;
             }
             if output[..index].iter().any(|role| role == "TITLE")
                 && (output[..index]
                     .iter()
@@ -3174,7 +3225,56 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
         if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
             let previous_word = (0..index)
                 .rev()
                 .find(|&cursor| {
@@ -3182,7 +3282,7 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
                         || tokens[cursor].chars().any(|ch| ch.is_alphabetic())
                 })
                 .map(|cursor| tokens[cursor].to_ascii_lowercase());
-            if matches!(previous_word.as_deref(), Some("lesson" | "part")) {
                 output[index] = "O".to_string();
                 continue;
             }
@@ -3884,5 +3984,33 @@ mod tests {
             labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
         assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
     }
 }

     Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
 /// Case-insensitive, fully anchored pattern for a bare season marker: the
 /// letter `S` followed by one or two ASCII digits (e.g. `S2`, `s01`).
 /// Capture group 1 holds the digits.
 static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
 /// Case-insensitive, fully anchored pattern for "special" (non-episode) markers.
 ///
 /// Accepted alternatives, each matching the whole input:
 /// - `NCOP`/`NCED`/`OP`/`ED`/`PV`/`CM`, optionally followed by one separator
 ///   (whitespace, `_`, `.`, `-`) and then 1-4 digits, `v` + 1-3 digits, or a
 ///   single letter.
 /// - `SP`, or `OVA`/`OAD`/`IV`, optionally followed by one separator and 0-4
 ///   digits. Because the digit count may be zero, a trailing lone separator
 ///   such as `SP-` also matches.
 /// - `Menu` (optionally prefixed by `BD`), `Spot` (optionally prefixed by `BD`
 ///   plus at most one of `-`, `_`, `.`, space), or one of
 ///   `Intro`/`Preview`/`Trailer`/`Teaser`/`Animatic(s)`, each optionally
 ///   followed by one separator and 0-4 digits, `Ep` + 1-4 digits, or a single
 ///   letter.
 ///
 /// The literal is known-good, so the `unwrap()` only fires if the pattern is
 /// edited into something invalid.
 static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
 });
 /// Case-insensitive, fully anchored pattern for a volume/disc marker: one of
 /// `Vol`/`Volume` (with an optional trailing `.`), `Disc`, `CD`, `BD`, `DVD`
 /// or `D`, then optional whitespace and 1-3 ASCII digits (e.g. `Vol.3`,
 /// `Disc 2`, `BD1`). The optional `.` applies only to the `Vol`/`Volume` form.
 static VOLUME_RE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
             | "TOKUTEN"
             | "TRAILER"
             | "TV SPOT"
+            | "SPOT"
             | "WORLD PREMIERE"
             | "予告"
             | "番宣"
         || normalized.contains("番宣")
         || normalized.contains("宣番")
         || normalized.contains("TV SPOT")
+        || normalized.contains("BD SPOT")
         || text.contains("予告")
         || SPECIAL_TITLE_PHRASE_RE.is_match(text)
 }
     &["Zom", "100"],
     &["Kamisama", "Hajimemashita", "2"],
     &["Phantasy", "Star", "Online", "2", "Episode", "Oracle"],
+    &["Durarara", "2", "Ketsu"],
     &["Ghiblies", "Episode", "2"],
     &["Lupin The Thrid Jigen Daisuke no Bohyou"],
     &["Lupin The Third Jigen Daisuke no Bohyou"],
             output[index] = "O".to_string();
             continue;
         }
+        if roles[index].starts_with("EPISODE")
+            && index >= 2
+            && matches!(group_text(tokens, &groups[index - 1]).as_str(), "×" | "x" | "X")
+            && output[index - 2] == "TITLE"
+            && !roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
+        {
+            output[index] = "TITLE".to_string();
+            if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
+                groups[cursor].class_name != "SEP" && groups[cursor].class_name == "TEXT"
+            }) {
+                output[next_text_index] = "TITLE".to_string();
+            }
+            continue;
+        }
+        if roles[index].starts_with("EPISODE")
+            && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
+            && group_text(
+                tokens,
+                &groups[(0..index)
+                    .rev()
+                    .find(|&cursor| groups[cursor].class_name != "SEP")
+                    .unwrap_or(index)],
+            )
+            .eq_ignore_ascii_case("Movie")
+        {
+            output[index] = "TITLE".to_string();
+            continue;
+        }
+        if output[index] == "TITLE"
+            && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
+        {
+            let next_source_lang = (index + 1..roles.len())
+                .find(|&cursor| groups[cursor].class_name != "SEP")
+                .is_some_and(|cursor| {
+                    output[cursor] == "SOURCE"
+                        && group_text(tokens, &groups[cursor]).contains('语')
+                });
+            if next_source_lang {
+                output[index] = "SOURCE".to_string();
+                continue;
+            }
+        }
         if roles[index].starts_with("EPISODE")
             && index >= 1
             && output[index - 1] == "TITLE"
             }
             if matches!(
                 previous_real_text.to_ascii_lowercase().as_str(),
+                "lesson" | "part" | "no"
             )
             {
                 output[index] = "O".to_string();
                 continue;
             }
+            if previous_real_text.contains("予告")
+                || previous_real_text.eq_ignore_ascii_case("Spot")
+            {
+                output[index] = "SPECIAL".to_string();
+                continue;
+            }
             if output[..index].iter().any(|role| role == "TITLE")
                 && (output[..index]
                     .iter()
     ];
     let mut output = labels.to_vec();
     for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
+        if label == "B-TITLE"
+            && token.chars().all(|ch| ch.is_ascii_digit())
+            && token.len() == 3
+            && index + 1 < tokens.len()
+            && matches!(tokens[index + 1].as_str(), "「" | "｢" | "\"" | "'")
+        {
+            output[index] = "B-EPISODE".to_string();
+            let mut cursor = index + 1;
+            while cursor < tokens.len() {
+                output[cursor] = "O".to_string();
+                if matches!(tokens[cursor].as_str(), "」" | "｣" | "\"" | "'") && cursor > index + 1 {
+                    break;
+                }
+                cursor += 1;
+            }
+            continue;
+        }
+        if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英") {
+            let next_word = (index + 1..tokens.len()).find(|&cursor| {
+                tokens[cursor].chars().any(|ch| ch.is_alphanumeric())
+            });
+            if next_word.is_some_and(|cursor| {
+                labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语')
+            }) {
+                output[index] = "B-SOURCE".to_string();
+                continue;
+            }
+        }
         if label == "B-EPISODE" && token.chars().all(|ch| ch.is_ascii_digit()) {
+            let previous_non_space = (0..index)
+                .rev()
+                .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
+            if previous_non_space
+                .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "×" | "x" | "X"))
+            {
+                let left_title = (0..previous_non_space.unwrap())
+                    .rev()
+                    .find(|&cursor| labels[cursor] != "O")
+                    .is_some_and(|cursor| labels[cursor] == "B-TITLE");
+                if left_title {
+                    output[index] = "B-TITLE".to_string();
+                    if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
+                        labels[cursor] == "O"
+                            && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
+                    }) {
+                        output[next_word] = "B-TITLE".to_string();
+                    }
+                    continue;
+                }
+            }
             let previous_word = (0..index)
                 .rev()
                 .find(|&cursor| {
                         || tokens[cursor].chars().any(|ch| ch.is_alphabetic())
                 })
                 .map(|cursor| tokens[cursor].to_ascii_lowercase());
+            if matches!(previous_word.as_deref(), Some("lesson" | "part" | "no")) {
                 output[index] = "O".to_string();
                 continue;
             }
             labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
         assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
         assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
+        let durarara = labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
+        assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
+        assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
+        assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
+        let bd_spot =
+            labels_for("[Moozzi2] Amanchu! [SP05] BD-Spot - 01 (BD 1920x1080 x.264 Flac)");
+        assert!(bd_spot.contains(&("Spot".to_string(), "B-SPECIAL".to_string())));
+        assert!(bd_spot.contains(&("01".to_string(), "B-SPECIAL".to_string())));
+        assert!(!bd_spot.contains(&("01".to_string(), "B-EPISODE".to_string())));
+        let preview_number =
+            labels_for("[Snow-Raws] 刀使ノ巫女 第02話 予告01 (BD 1920x1080 HEVC-YUV420P10 FLAC)");
+        assert!(preview_number.contains(&("02".to_string(), "B-EPISODE".to_string())));
+        assert!(preview_number.contains(&("01".to_string(), "B-SPECIAL".to_string())));
+        let bleach_movie = labels_for("Bleach the Movie 3 - Fade to Black, I Call Your Name");
+        assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
+        assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
+        let no_number = labels_for("[甜甜圈字幕组] 小讨厌 081「爷爷的礼物 No.1」");
+        assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
+        assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
+        let bilingual = labels_for("辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]");
+        assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
+        assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
     }
 }