AniFileBERT / tools /encoded_dataset_cache /src /bin /regex_benchmark.rs
ModerRAS's picture
Fix Rust encoded cache label repairs
7934324
raw
history blame
13.2 kB
use anyhow::{ensure, Context, Result};
use clap::Parser;
use fancy_regex::Regex as FancyRegex;
use regex::Regex;
use serde_json::Value;
use std::fs::File;
use std::hint::black_box;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::time::Instant;
/// Alternation of literal release-tag tokens (for example `WEB-DL`, `BDRip`, `HEVC`, `FLAC`,
/// `CHS`, `繁中`). It has no flags or anchors of its own; the pattern constructors in this file
/// embed it inside `(?i)(?:...)` wrappers via `format!`.
const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
/// Resolution body with three alternatives: 3-4 digits followed by `p`/`P`, one digit followed
/// by `K`/`k`, or two 3-4 digit numbers joined by `x`, `X` or `×`. Wrapped in `(?i)(?:...)` by
/// the pattern constructors.
const RESOLUTION_BODY: &str = r"\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4}";
/// Case-insensitive, start-anchored pattern: one of the listed search/alias keywords, optional
/// whitespace, a colon, and then at least one further character.
// NOTE(review): the character class `[::]` lists the same colon twice; presumably one of them
// was meant to be a different (e.g. fullwidth) colon — confirm against the reference patterns.
const SPECIAL_TAG_PATTERN: &str =
r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+";
/// Anchored alternatives for short special codes: `NCOP`/`NCED`/`OP`/`ED`/`PV`/`CM` with
/// optional digits, `IV` with at least one digit, or `OVA`/`OAD`/`SP` with optional digits.
const SPECIAL_CODE_PATTERN: &str =
r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$";
/// Case-insensitive, start-anchored pattern that skips leading whitespace and then requires
/// either `-`/`_` followed by a 1-4 digit number or a listed code ending at a word boundary,
/// `#` followed by a 1-4 digit number, or an opening bracket followed by an optional
/// `E`/`EP`/`#` prefix and 1-4 digits.
const EPISODE_CONTEXT_PATTERN: &str = r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})";
/// Episode-number spans: `SxxEyy` forms, dash/underscore-prefixed numbers, bracketed numbers,
/// and `EP`/`第`/`#`-prefixed numbers, each with an optional `vN` suffix. Contains `(?=...)`
/// look-ahead, so both pattern sets compile it with `fancy_regex`.
const EPISODE_SPAN_PATTERN: &str = r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))";
/// Case-insensitive romanised phrases (for example `Ni no Sara`, `San no Shou`, `Ni Gakki`,
/// `Sono Ni`) captured as the named group `marker`. The surrounding `(?<!...)`/`(?!...)`
/// look-arounds reject matches that touch an ASCII letter or digit on either side.
const READING_MARKER_PATTERN: &str = r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])";
/// Roman numerals `II` through `IX` (ASCII spellings or the single code points `Ⅱ`-`Ⅸ`) captured
/// as `marker`, guarded by the same ASCII alphanumeric look-arounds. There is no `(?i)` flag, so
/// the ASCII spellings are matched case-sensitively.
const ROMAN_MARKER_PATTERN: &str =
r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])";
/// CJK numeral markers: either a single numeral character optionally followed by `ノ`/`の`/`之`
/// and one of `章`/`期`/`季`/`部`, or `第` followed by numerals/digits and one of `季`/`期`/`部`/`章`.
/// Because the suffix in the first alternative is optional, a lone numeral character matches.
const CJK_MARKER_PATTERN: &str = r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])";
/// Start-anchored run of zero to two bracketed tags (`menu`, `ncop`, `ova`, ...), each followed
/// by optional whitespace. Because the repetition allows zero occurrences, `is_match` succeeds
/// on any input.
const SPECIAL_CONTEXT_PREFIX_PATTERN: &str =
r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}";
// Command-line options for the benchmark binary.
// NOTE(review): these are plain `//` comments rather than `///` because clap's derive can
// surface doc comments as help text — confirm before converting them.
#[derive(Parser, Debug)]
#[command(
about = "Compare regex vs fancy-regex workload costs for AniFileBERT cache preprocessing"
)]
struct Args {
// Path to the input file; it is read line by line, each non-blank line parsed as JSON, and the
// string value under the "filename" key is collected.
#[arg(long)]
input: PathBuf,
// Maximum number of filenames to load; the default 0 disables the limit.
#[arg(long, default_value_t = 0)]
limit_rows: usize,
// How many times each workload is executed inside its timed section; `main` rejects 0.
#[arg(long, default_value_t = 3)]
repeat: usize,
}
fn main() -> Result<()> {
let args = Args::parse();
ensure!(args.repeat > 0, "--repeat must be greater than 0");
let filenames = load_filenames(&args.input, args.limit_rows)?;
if filenames.is_empty() {
anyhow::bail!("no filenames loaded from {}", args.input.display());
}
let selective = SelectivePatterns::new()?;
let fancy_all = FancyAllPatterns::new()?;
let (selective_seconds, selective_count) =
time_repeated(args.repeat, || run_selective(&filenames, &selective))?;
let (fancy_seconds, fancy_count) =
time_repeated(args.repeat, || run_fancy_all(&filenames, &fancy_all))?;
ensure!(
selective_count == fancy_count,
"selective and fancy-all match counts differ: selective={}, fancy_all={}",
selective_count,
fancy_count
);
let ratio = if selective_seconds > 0.0 {
fancy_seconds / selective_seconds
} else {
0.0
};
println!(
"{}",
serde_json::json!({
"rows": filenames.len(),
"repeat": args.repeat,
"selective_seconds": selective_seconds,
"fancy_all_seconds": fancy_seconds,
"ratio": ratio,
"match_count": selective_count,
})
);
Ok(())
}
/// Runs `run_once` `repeat` times and returns the elapsed wall-clock seconds together with the
/// wrapping sum of the returned counts.
///
/// Each returned count passes through `black_box` before it is added to the running total.
///
/// # Errors
/// Stops at, and returns, the first error produced by `run_once`.
fn time_repeated<F>(repeat: usize, mut run_once: F) -> Result<(f64, usize)>
where
    F: FnMut() -> Result<usize>,
{
    let timer = Instant::now();
    let total = (0..repeat).try_fold(0usize, |sum, _| -> Result<usize> {
        let produced = black_box(run_once()?);
        Ok(sum.wrapping_add(produced))
    })?;
    Ok((timer.elapsed().as_secs_f64(), total))
}
/// Pattern set for the "selective" workload: `fancy_regex` is used only for the three patterns
/// whose source contains look-around (`episode_span`, `reading_marker`, `roman_marker`); every
/// other pattern is a plain `regex::Regex`.
struct SelectivePatterns {
/// `RESOLUTION_BODY` wrapped in `(?i)(?:...)`.
resolution: Regex,
/// `SOURCE_TOKEN_PATTERN` wrapped in `(?i)(?:...)`.
source: Regex,
/// Fully anchored list of source tokens joined by separators.
source_tag: Regex,
/// Compiled `SPECIAL_TAG_PATTERN`.
special_tag: Regex,
/// Compiled `SPECIAL_CODE_PATTERN`.
special_code: Regex,
/// Compiled `EPISODE_CONTEXT_PATTERN`.
episode_context: Regex,
/// Compiled `EPISODE_SPAN_PATTERN` (uses look-ahead).
episode_span: FancyRegex,
/// Compiled `READING_MARKER_PATTERN` (uses look-behind and look-ahead).
reading_marker: FancyRegex,
/// Compiled `ROMAN_MARKER_PATTERN` (uses look-behind and look-ahead).
roman_marker: FancyRegex,
/// Compiled `CJK_MARKER_PATTERN`.
cjk_marker: Regex,
/// Compiled `SPECIAL_CONTEXT_PREFIX_PATTERN`.
special_context_prefix: Regex,
}
impl SelectivePatterns {
    /// Compiles every pattern of the selective set, using `fancy_regex` only for the three
    /// look-around patterns.
    ///
    /// # Errors
    /// Returns the first compilation error, in field order.
    fn new() -> Result<Self> {
        // Composite pattern sources are assembled first; formatting cannot fail.
        let resolution_src = format!(r"(?i)(?:{RESOLUTION_BODY})");
        let source_src = format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})");
        let source_tag_src = format!(
            r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
        );

        let resolution = Regex::new(&resolution_src)?;
        let source = Regex::new(&source_src)?;
        let source_tag = Regex::new(&source_tag_src)?;
        let special_tag = Regex::new(SPECIAL_TAG_PATTERN)?;
        let special_code = Regex::new(SPECIAL_CODE_PATTERN)?;
        let episode_context = Regex::new(EPISODE_CONTEXT_PATTERN)?;
        let episode_span = FancyRegex::new(EPISODE_SPAN_PATTERN)?;
        let reading_marker = FancyRegex::new(READING_MARKER_PATTERN)?;
        let roman_marker = FancyRegex::new(ROMAN_MARKER_PATTERN)?;
        let cjk_marker = Regex::new(CJK_MARKER_PATTERN)?;
        let special_context_prefix = Regex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?;

        Ok(Self {
            resolution,
            source,
            source_tag,
            special_tag,
            special_code,
            episode_context,
            episode_span,
            reading_marker,
            roman_marker,
            cjk_marker,
            special_context_prefix,
        })
    }
}
/// Pattern set for the "fancy-all" workload: the same eleven patterns as `SelectivePatterns`,
/// but every one of them is compiled with `fancy_regex`, including those without look-around.
struct FancyAllPatterns {
/// `RESOLUTION_BODY` wrapped in `(?i)(?:...)`.
resolution: FancyRegex,
/// `SOURCE_TOKEN_PATTERN` wrapped in `(?i)(?:...)`.
source: FancyRegex,
/// Fully anchored list of source tokens joined by separators.
source_tag: FancyRegex,
/// Compiled `SPECIAL_TAG_PATTERN`.
special_tag: FancyRegex,
/// Compiled `SPECIAL_CODE_PATTERN`.
special_code: FancyRegex,
/// Compiled `EPISODE_CONTEXT_PATTERN`.
episode_context: FancyRegex,
/// Compiled `EPISODE_SPAN_PATTERN`.
episode_span: FancyRegex,
/// Compiled `READING_MARKER_PATTERN`.
reading_marker: FancyRegex,
/// Compiled `ROMAN_MARKER_PATTERN`.
roman_marker: FancyRegex,
/// Compiled `CJK_MARKER_PATTERN`.
cjk_marker: FancyRegex,
/// Compiled `SPECIAL_CONTEXT_PREFIX_PATTERN`.
special_context_prefix: FancyRegex,
}
impl FancyAllPatterns {
    /// Compiles every pattern of the set with `fancy_regex`.
    ///
    /// # Errors
    /// Returns the first compilation error, in field order.
    fn new() -> Result<Self> {
        // Composite pattern sources are assembled first; formatting cannot fail.
        let resolution_src = format!(r"(?i)(?:{RESOLUTION_BODY})");
        let source_src = format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})");
        let source_tag_src = format!(
            r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
        );

        let resolution = FancyRegex::new(&resolution_src)?;
        let source = FancyRegex::new(&source_src)?;
        let source_tag = FancyRegex::new(&source_tag_src)?;
        let special_tag = FancyRegex::new(SPECIAL_TAG_PATTERN)?;
        let special_code = FancyRegex::new(SPECIAL_CODE_PATTERN)?;
        let episode_context = FancyRegex::new(EPISODE_CONTEXT_PATTERN)?;
        let episode_span = FancyRegex::new(EPISODE_SPAN_PATTERN)?;
        let reading_marker = FancyRegex::new(READING_MARKER_PATTERN)?;
        let roman_marker = FancyRegex::new(ROMAN_MARKER_PATTERN)?;
        let cjk_marker = FancyRegex::new(CJK_MARKER_PATTERN)?;
        let special_context_prefix = FancyRegex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?;

        Ok(Self {
            resolution,
            source,
            source_tag,
            special_tag,
            special_code,
            episode_context,
            episode_span,
            reading_marker,
            roman_marker,
            cjk_marker,
            special_context_prefix,
        })
    }
}
fn run_selective(filenames: &[String], patterns: &SelectivePatterns) -> Result<usize> {
let mut count = 0usize;
for filename in filenames {
count = count.wrapping_add(
patterns
.resolution
.find_iter(filename)
.filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
.count(),
);
count = count.wrapping_add(
patterns
.source
.find_iter(filename)
.filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end()))
.count(),
);
count = count.wrapping_add(patterns.episode_context.is_match(filename) as usize);
count = count.wrapping_add(patterns.cjk_marker.find_iter(filename).count());
count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?);
count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?);
count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?);
for inner in bracket_inners(filename) {
count = count.wrapping_add(patterns.source_tag.is_match(&inner) as usize);
count = count.wrapping_add(patterns.special_tag.is_match(&inner) as usize);
count = count.wrapping_add(patterns.special_code.is_match(&inner) as usize);
count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner) as usize);
}
}
Ok(count)
}
/// Runs the all-`fancy_regex` workload over `filenames` and returns the wrapping total of all
/// matches, performing the same sequence of searches as `run_selective`.
///
/// # Errors
/// Propagates the first error reported by any `fancy_regex` search or match test.
fn run_fancy_all(filenames: &[String], patterns: &FancyAllPatterns) -> Result<usize> {
    let mut total = 0usize;
    for name in filenames.iter().map(String::as_str) {
        // Array elements are evaluated left to right, so the search order (and therefore which
        // error surfaces first) is fixed.
        let whole_name_hits = [
            fancy_count_with_boundaries(&patterns.resolution, name)?,
            fancy_count_with_boundaries(&patterns.source, name)?,
            usize::from(patterns.episode_context.is_match(name)?),
            fancy_count(&patterns.cjk_marker, name)?,
            fancy_count(&patterns.episode_span, name)?,
            fancy_count(&patterns.reading_marker, name)?,
            fancy_count(&patterns.roman_marker, name)?,
        ];
        total = whole_name_hits
            .iter()
            .fold(total, |sum, &hits| sum.wrapping_add(hits));

        for inner in bracket_inners(name) {
            let tag_checks = [
                patterns.source_tag.is_match(&inner)?,
                patterns.special_tag.is_match(&inner)?,
                patterns.special_code.is_match(&inner)?,
                patterns.special_context_prefix.is_match(&inner)?,
            ];
            let matched = tag_checks.iter().filter(|&&hit| hit).count();
            total = total.wrapping_add(matched);
        }
    }
    Ok(total)
}
/// Counts the matches `regex` yields when iterating over `text`.
///
/// # Errors
/// Returns the first error yielded by the match iterator.
fn fancy_count(regex: &FancyRegex, text: &str) -> Result<usize> {
    regex
        .find_iter(text)
        .try_fold(0usize, |seen, hit| -> Result<usize> {
            // Only success matters here; the match itself is discarded.
            hit?;
            Ok(seen + 1)
        })
}
/// Counts the matches of `regex` in `text` that pass `has_ascii_token_boundaries`, i.e. that
/// are not directly adjacent to an ASCII letter or digit.
///
/// # Errors
/// Returns the first error yielded by the match iterator.
fn fancy_count_with_boundaries(regex: &FancyRegex, text: &str) -> Result<usize> {
    let mut accepted = 0usize;
    for candidate in regex.find_iter(text) {
        let hit = candidate?;
        let isolated = has_ascii_token_boundaries(text, hit.start(), hit.end());
        accepted += usize::from(isolated);
    }
    Ok(accepted)
}
fn load_filenames(path: &PathBuf, limit_rows: usize) -> Result<Vec<String>> {
let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
let reader = BufReader::new(file);
let mut filenames = Vec::new();
for (idx, line) in reader.lines().enumerate() {
if limit_rows > 0 && filenames.len() >= limit_rows {
break;
}
let raw = line.with_context(|| format!("failed reading line {}", idx + 1))?;
if raw.trim().is_empty() {
continue;
}
let value: Value = serde_json::from_str(&raw)
.with_context(|| format!("invalid JSONL line {}", idx + 1))?;
if let Some(filename) = value.get("filename").and_then(Value::as_str) {
filenames.push(filename.to_string());
}
}
Ok(filenames)
}
/// Extracts the text enclosed by each supported bracket pair (`[]`, `()`, `【】`, `《》`).
///
/// Scanning is left to right. An opener is paired with the first matching closer after it, and
/// scanning resumes after that closer, so nested brackets are not extracted separately. An
/// opener with no closer is skipped.
fn bracket_inners(text: &str) -> Vec<String> {
    /// Maps an opening bracket to its closing counterpart.
    fn closer_for(opener: char) -> Option<char> {
        match opener {
            '[' => Some(']'),
            '(' => Some(')'),
            '【' => Some('】'),
            '《' => Some('》'),
            _ => None,
        }
    }

    let symbols: Vec<char> = text.chars().collect();
    let mut inners = Vec::new();
    let mut cursor = 0usize;
    while let Some(&symbol) = symbols.get(cursor) {
        // Step past the current character; from here `cursor` is the first body position.
        cursor += 1;
        let Some(closer) = closer_for(symbol) else {
            continue;
        };
        if let Some(offset) = symbols[cursor..].iter().position(|&ch| ch == closer) {
            inners.push(symbols[cursor..cursor + offset].iter().collect::<String>());
            cursor += offset + 1;
        }
    }
    inners
}
/// Reports whether the byte range `start..end` of `text` stands alone as a token: the
/// character just before it and the character just after it must each be absent or not an
/// ASCII letter or digit.
///
/// # Panics
/// Panics if `start` or `end` is out of range or does not fall on a UTF-8 character boundary.
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
    // Take both slices up front so invalid offsets fail regardless of the left-hand result.
    let (before, after) = (&text[..start], &text[end..]);
    let glued_left = before.ends_with(|ch: char| ch.is_ascii_alphanumeric());
    let glued_right = after.starts_with(|ch: char| ch.is_ascii_alphanumeric());
    !(glued_left || glued_right)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both workloads must report the same total for a mixed sample of release names.
    #[test]
    fn selective_and_fancy_all_count_the_same_matches() -> Result<()> {
        let samples: Vec<String> = [
            "[GM-Team][國漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4",
            "[YYDM&VCB-Studio] Shinsekai Yori II [NCED02][Ma10p_1080p][x265_flac].mkv",
            "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub",
            "[Lilith-Raws] San no Sara - 03 [Baha][WEB-DL][1080p][AVC AAC][CHT].mp4",
            "[Test] 搜索: 别名 [OVA01][BDRip][720p]",
        ]
        .iter()
        .map(|name| name.to_string())
        .collect();

        let selective = SelectivePatterns::new()?;
        let fancy_all = FancyAllPatterns::new()?;

        let selective_total = run_selective(&samples, &selective)?;
        let fancy_total = run_fancy_all(&samples, &fancy_all)?;
        assert_eq!(selective_total, fancy_total);
        Ok(())
    }

    /// Each of the four supported bracket pairs yields its inner text, in order.
    #[test]
    fn bracket_inners_extract_supported_pairs() {
        let expected = vec!["A", "B", "C", "D"];
        assert_eq!(bracket_inners("[A](B)【C】《D》"), expected);
    }

    /// Matches glued to ASCII letters or digits are rejected; isolated ones are accepted.
    #[test]
    fn ascii_token_boundaries_reject_embedded_matches() {
        let text = "ABC1080p 1080p HEVC2 HEVC";
        let cases = [
            ((3usize, 8usize), false),
            ((9, 14), true),
            ((15, 19), false),
            ((21, 25), true),
        ];
        for ((start, end), expected) in cases {
            assert_eq!(
                has_ascii_token_boundaries(text, start, end),
                expected,
                "range {}..{}",
                start,
                end
            );
        }
    }
}