use anyhow::{ensure, Context, Result}; use fancy_regex::Regex as FancyRegex; use regex::Regex; use serde_json::Value; use std::fs::File; use std::hint::black_box; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::time::Instant; const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中"; const RESOLUTION_BODY: &str = r"\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4}"; const SPECIAL_TAG_PATTERN: &str = r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+"; const SPECIAL_CODE_PATTERN: &str = r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$"; const EPISODE_CONTEXT_PATTERN: &str = r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})"; const EPISODE_SPAN_PATTERN: &str = r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))"; const READING_MARKER_PATTERN: &str = r"(?i)(?Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])"; const ROMAN_MARKER_PATTERN: &str = r"(?II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])"; const CJK_MARKER_PATTERN: &str = r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])"; const SPECIAL_CONTEXT_PREFIX_PATTERN: &str = r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}"; pub fn run(input: &PathBuf, limit_rows: usize, repeat: usize) -> Result<()> { ensure!(repeat > 0, "--regex-benchmark-repeat must be greater than 0"); let filenames = load_filenames(input, limit_rows)?; if filenames.is_empty() { anyhow::bail!("no filenames loaded from {}", input.display()); } let selective = SelectivePatterns::new()?; let fancy_all = FancyAllPatterns::new()?; let (selective_seconds, selective_count) = time_repeated(repeat, || run_selective(&filenames, &selective))?; let (fancy_seconds, fancy_count) = time_repeated(repeat, || run_fancy_all(&filenames, &fancy_all))?; ensure!( selective_count == fancy_count, "selective and fancy-all match counts differ: selective={}, fancy_all={}", selective_count, fancy_count ); let ratio = if selective_seconds > 0.0 { fancy_seconds / selective_seconds } else { 0.0 }; println!( "{}", serde_json::json!({ "rows": filenames.len(), "repeat": repeat, "selective_seconds": selective_seconds, "fancy_all_seconds": fancy_seconds, "ratio": ratio, "match_count": selective_count, }) ); Ok(()) } fn time_repeated(repeat: usize, mut run_once: F) -> Result<(f64, usize)> where F: FnMut() -> Result, { let started = Instant::now(); let mut count = 0usize; for _ in 0..repeat { count = count.wrapping_add(black_box(run_once()?)); } Ok((started.elapsed().as_secs_f64(), count)) } struct SelectivePatterns { resolution: Regex, source: Regex, source_tag: Regex, special_tag: Regex, special_code: Regex, episode_context: Regex, episode_span: FancyRegex, reading_marker: FancyRegex, roman_marker: FancyRegex, cjk_marker: Regex, special_context_prefix: Regex, } impl SelectivePatterns { fn new() -> Result { Ok(Self { resolution: Regex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?, source: Regex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?, source_tag: Regex::new(&format!( r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$" ))?, special_tag: Regex::new(SPECIAL_TAG_PATTERN)?, special_code: Regex::new(SPECIAL_CODE_PATTERN)?, episode_context: Regex::new(EPISODE_CONTEXT_PATTERN)?, episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?, reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?, roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?, cjk_marker: Regex::new(CJK_MARKER_PATTERN)?, special_context_prefix: Regex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?, }) } } struct FancyAllPatterns { resolution: FancyRegex, source: FancyRegex, source_tag: FancyRegex, special_tag: FancyRegex, special_code: FancyRegex, episode_context: FancyRegex, episode_span: FancyRegex, reading_marker: FancyRegex, roman_marker: FancyRegex, cjk_marker: FancyRegex, special_context_prefix: FancyRegex, } impl FancyAllPatterns { fn new() -> Result { Ok(Self { resolution: FancyRegex::new(&format!(r"(?i)(?:{RESOLUTION_BODY})"))?, source: FancyRegex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})"))?, source_tag: FancyRegex::new(&format!( r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$" ))?, special_tag: FancyRegex::new(SPECIAL_TAG_PATTERN)?, special_code: FancyRegex::new(SPECIAL_CODE_PATTERN)?, episode_context: FancyRegex::new(EPISODE_CONTEXT_PATTERN)?, episode_span: FancyRegex::new(EPISODE_SPAN_PATTERN)?, reading_marker: FancyRegex::new(READING_MARKER_PATTERN)?, roman_marker: FancyRegex::new(ROMAN_MARKER_PATTERN)?, cjk_marker: FancyRegex::new(CJK_MARKER_PATTERN)?, special_context_prefix: FancyRegex::new(SPECIAL_CONTEXT_PREFIX_PATTERN)?, }) } } fn run_selective(filenames: &[String], patterns: &SelectivePatterns) -> Result { let mut count = 0usize; for filename in filenames { count = count.wrapping_add( patterns .resolution .find_iter(filename) .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end())) .count(), ); count = count.wrapping_add( patterns .source .find_iter(filename) .filter(|mat| has_ascii_token_boundaries(filename, mat.start(), mat.end())) .count(), ); count = count.wrapping_add(patterns.episode_context.is_match(filename) as usize); count = count.wrapping_add(patterns.cjk_marker.find_iter(filename).count()); count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?); count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?); count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?); for inner in bracket_inners(filename) { count = count.wrapping_add(patterns.source_tag.is_match(&inner) as usize); count = count.wrapping_add(patterns.special_tag.is_match(&inner) as usize); count = count.wrapping_add(patterns.special_code.is_match(&inner) as usize); count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner) as usize); } } Ok(count) } fn run_fancy_all(filenames: &[String], patterns: &FancyAllPatterns) -> Result { let mut count = 0usize; for filename in filenames { count = count.wrapping_add(fancy_count_with_boundaries(&patterns.resolution, filename)?); count = count.wrapping_add(fancy_count_with_boundaries(&patterns.source, filename)?); count = count.wrapping_add(patterns.episode_context.is_match(filename)? as usize); count = count.wrapping_add(fancy_count(&patterns.cjk_marker, filename)?); count = count.wrapping_add(fancy_count(&patterns.episode_span, filename)?); count = count.wrapping_add(fancy_count(&patterns.reading_marker, filename)?); count = count.wrapping_add(fancy_count(&patterns.roman_marker, filename)?); for inner in bracket_inners(filename) { count = count.wrapping_add(patterns.source_tag.is_match(&inner)? as usize); count = count.wrapping_add(patterns.special_tag.is_match(&inner)? as usize); count = count.wrapping_add(patterns.special_code.is_match(&inner)? as usize); count = count.wrapping_add(patterns.special_context_prefix.is_match(&inner)? as usize); } } Ok(count) } fn fancy_count(regex: &FancyRegex, text: &str) -> Result { let mut count = 0usize; for item in regex.find_iter(text) { let _ = item?; count += 1; } Ok(count) } fn fancy_count_with_boundaries(regex: &FancyRegex, text: &str) -> Result { let mut count = 0usize; for item in regex.find_iter(text) { let mat = item?; if has_ascii_token_boundaries(text, mat.start(), mat.end()) { count += 1; } } Ok(count) } fn load_filenames(path: &PathBuf, limit_rows: usize) -> Result> { let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?; let reader = BufReader::new(file); let mut filenames = Vec::new(); for (idx, line) in reader.lines().enumerate() { if limit_rows > 0 && filenames.len() >= limit_rows { break; } let raw = line.with_context(|| format!("failed reading line {}", idx + 1))?; if raw.trim().is_empty() { continue; } let value: Value = serde_json::from_str(&raw) .with_context(|| format!("invalid JSONL line {}", idx + 1))?; if let Some(filename) = value.get("filename").and_then(Value::as_str) { filenames.push(filename.to_string()); } } Ok(filenames) } fn bracket_inners(text: &str) -> Vec { let chars = text.chars().collect::>(); let mut spans = Vec::new(); let mut idx = 0usize; while idx < chars.len() { let close = match chars[idx] { '[' => ']', '(' => ')', '【' => '】', '《' => '》', _ => { idx += 1; continue; } }; if let Some(relative_end) = chars[idx + 1..].iter().position(|ch| *ch == close) { let end = idx + 1 + relative_end; spans.push(chars[idx + 1..end].iter().collect::()); idx = end + 1; } else { idx += 1; } } spans } fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool { let previous_ok = text[..start] .chars() .next_back() .map(|ch| !ch.is_ascii_alphanumeric()) .unwrap_or(true); let next_ok = text[end..] .chars() .next() .map(|ch| !ch.is_ascii_alphanumeric()) .unwrap_or(true); previous_ok && next_ok } #[cfg(test)] mod tests { use super::*; #[test] fn selective_and_fancy_all_count_the_same_matches() -> Result<()> { let filenames = vec![ "[GM-Team][國漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4".to_string(), "[YYDM&VCB-Studio] Shinsekai Yori II [NCED02][Ma10p_1080p][x265_flac].mkv".to_string(), "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub".to_string(), "[Lilith-Raws] San no Sara - 03 [Baha][WEB-DL][1080p][AVC AAC][CHT].mp4".to_string(), "[Test] 搜索: 别名 [OVA01][BDRip][720p]".to_string(), ]; let selective = SelectivePatterns::new()?; let fancy_all = FancyAllPatterns::new()?; assert_eq!( run_selective(&filenames, &selective)?, run_fancy_all(&filenames, &fancy_all)? ); Ok(()) } #[test] fn bracket_inners_extract_supported_pairs() { assert_eq!(bracket_inners("[A](B)【C】《D》"), vec!["A", "B", "C", "D"]); } #[test] fn ascii_token_boundaries_reject_embedded_matches() { let text = "ABC1080p 1080p HEVC2 HEVC"; assert!(!has_ascii_token_boundaries(text, 3, 8)); assert!(has_ascii_token_boundaries(text, 9, 14)); assert!(!has_ascii_token_boundaries(text, 15, 19)); assert!(has_ascii_token_boundaries(text, 21, 25)); } }