Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
| use anyhow::{bail, Context, Result}; | |
| use clap::Parser; | |
| use fancy_regex::Regex as FancyRegex; | |
| use rand::rngs::StdRng; | |
| use rand::seq::SliceRandom; | |
| use rand::SeedableRng; | |
| use rayon::prelude::*; | |
| use regex::Regex; | |
| use serde::{Deserialize, Serialize}; | |
| use serde_json::{json, Value}; | |
| use std::collections::HashMap; | |
| use std::fs::{self, File}; | |
| use std::io::{BufRead, BufReader, BufWriter, Write}; | |
| use std::path::{Path, PathBuf}; | |
| use std::sync::OnceLock; | |
| use std::time::Instant; | |
/// Built-in BIO label list. `load_label_ids` uses it when the label schema file cannot be read;
/// a label's position in this array becomes its numeric id.
| const FALLBACK_LABELS: [&str; 37] = [ | |
| "O", | |
| "B-TITLE_CHS", | |
| "I-TITLE_CHS", | |
| "B-TITLE_CHT", | |
| "I-TITLE_CHT", | |
| "B-TITLE_JPN", | |
| "I-TITLE_JPN", | |
| "B-TITLE_LATIN", | |
| "I-TITLE_LATIN", | |
| "B-TITLE_MIXED", | |
| "I-TITLE_MIXED", | |
| "B-PATH_TITLE_CHS", | |
| "I-PATH_TITLE_CHS", | |
| "B-PATH_TITLE_CHT", | |
| "I-PATH_TITLE_CHT", | |
| "B-PATH_TITLE_JPN", | |
| "I-PATH_TITLE_JPN", | |
| "B-PATH_TITLE_LATIN", | |
| "I-PATH_TITLE_LATIN", | |
| "B-PATH_TITLE_MIXED", | |
| "I-PATH_TITLE_MIXED", | |
| "B-PATH_SEASON", | |
| "I-PATH_SEASON", | |
| "B-SEASON", | |
| "I-SEASON", | |
| "B-EPISODE", | |
| "I-EPISODE", | |
| "B-SPECIAL", | |
| "I-SPECIAL", | |
| "B-GROUP", | |
| "I-GROUP", | |
| "B-RESOLUTION", | |
| "I-RESOLUTION", | |
| "B-SOURCE", | |
| "I-SOURCE", | |
| "B-TAG", | |
| "I-TAG", | |
| ]; | |
// Regex alternation of release-source style tokens (e.g. WEB-DL, BDRip, HEVC, FLAC, CHS).
// NOTE(review): where this fragment is compiled is not visible in this section.
| const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中"; | |
// One-time-initialised regex slots. The accessor functions that fill them
// (`resolution_re()`, `source_re()`, ...) are defined outside this section.
| static RESOLUTION_RE: OnceLock<FancyRegex> = OnceLock::new(); | |
| static SOURCE_RE: OnceLock<Regex> = OnceLock::new(); | |
| static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new(); | |
| static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new(); | |
| static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new(); | |
| static EPISODE_CONTEXT_RE: OnceLock<Regex> = OnceLock::new(); | |
| static EPISODE_SPAN_RE: OnceLock<FancyRegex> = OnceLock::new(); | |
| static READING_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new(); | |
| static ROMAN_MARKER_RE: OnceLock<FancyRegex> = OnceLock::new(); | |
| static CJK_MARKER_RE: OnceLock<Regex> = OnceLock::new(); | |
| static SPECIAL_CONTEXT_PREFIX_RE: OnceLock<Regex> = OnceLock::new(); | |
// Characters treated as separators by `mark_adjacent_title_separators_o`.
// NOTE(review): '~' is listed twice here; one entry may originally have been a full-width tilde — confirm.
| const SEPARATOR_CHARS: &[char] = &[' ', '\t', '-', '_', '.', '|', '~', '~']; | |
/// Command-line options; `main` obtains them through `Args::parse()`.
// NOTE(review): the clap derive/arg attributes are not visible in this view.
| struct Args { | |
// JSONL file read by `load_rows`.
| input: PathBuf, | |
// JSON vocab file read by `load_vocab`.
| vocab_file: PathBuf, | |
// Directory that receives the split directories, `eval_records.jsonl` and `manifest.json`.
| output_dir: PathBuf, | |
// JSON label schema read by `load_label_ids`.
| label_schema_file: PathBuf, | |
// Length of every encoded row; `main` rejects values below 4.
| max_length: usize, | |
// Maximum number of rows per shard; `main` rejects 0.
| shard_size: usize, | |
// Maximum number of rows to load; 0 disables the limit.
| limit_rows: usize, | |
// Fraction of rows assigned to the train split.
| train_split: f64, | |
// Seed for the shuffle RNG.
| seed: u64, | |
// When true, rows keep their input order.
| no_shuffle: bool, | |
// Rayon worker count; 0 leaves the global pool unconfigured.
| threads: usize, | |
| } | |
/// Shape of the label schema JSON parsed by `load_label_ids`.
| struct LabelSchema { | |
// Label names; each label's position becomes its numeric id.
| labels: Vec<String>, | |
| } | |
/// One parsed record of the input JSONL file, as built by `load_rows`.
| struct SourceRow { | |
// Zero-based line index in the input file (blank lines are counted).
| row_index: usize, | |
// The unparsed JSON line.
| raw_line: String, | |
// Optional "filename" string field of the record.
| filename: Option<String>, | |
// "tokens" array; same length as `labels`.
| tokens: Vec<String>, | |
// "labels" array; same length as `tokens`.
| labels: Vec<String>, | |
// Optional "tokenizer_variant" string field; the value "char" is special-cased when encoding.
| tokenizer_variant: Option<String>, | |
| } | |
/// Token vocabulary loaded by `load_vocab`.
| struct Vocab { | |
// Token text -> id; every id fits in u16.
| ids: HashMap<String, u16>, | |
// Id of "[PAD]".
| pad_id: u16, | |
// Id of "[UNK]".
| unk_id: u16, | |
// Id of "[CLS]".
| cls_id: u16, | |
// Id of "[SEP]".
| sep_id: u16, | |
| } | |
/// Read-only state shared by all shard writers while encoding rows.
| struct EncodeContext { | |
| vocab: Vocab, | |
// Label text -> numeric id.
| label_ids: HashMap<String, i16>, | |
// Fixed length of every encoded row.
| max_length: usize, | |
| } | |
/// Description of one written shard, returned by `write_shard` and embedded in the split manifest.
| struct ShardManifest { | |
// Number of rows stored in the shard.
| rows: usize, | |
// File name of the input-id array.
| input_ids: String, | |
// File name of the attention-mask array.
| attention_mask: String, | |
// File name of the label array.
| labels: String, | |
| } | |
/// Per-split summary returned by `write_split` and embedded in the top-level manifest.
| struct SplitSummary { | |
// Split name.
| split: String, | |
// Total rows in the split.
| rows: usize, | |
// Number of shards written.
| shards: usize, | |
// Directory name of the split; `write_split` sets it to the split name.
| directory: String, | |
| } | |
| fn main() -> Result<()> { | |
| let args = Args::parse(); | |
| if args.max_length < 4 { | |
| bail!("--max-length must be at least 4"); | |
| } | |
| if args.shard_size == 0 { | |
| bail!("--shard-size must be positive"); | |
| } | |
| if !(0.0..1.0).contains(&args.train_split) { | |
| bail!("--train-split must be > 0 and < 1"); | |
| } | |
| if args.threads > 0 { | |
| rayon::ThreadPoolBuilder::new() | |
| .num_threads(args.threads) | |
| .build_global() | |
| .context("failed to configure rayon thread pool")?; | |
| } | |
| let started = Instant::now(); | |
| let vocab = load_vocab(&args.vocab_file)?; | |
| let label_ids = load_label_ids(&args.label_schema_file)?; | |
| let mut rows = load_rows(&args.input, args.limit_rows)?; | |
| if rows.len() < 2 { | |
| bail!("need at least two rows to build train/eval cache"); | |
| } | |
| if !args.no_shuffle { | |
| let mut rng = StdRng::seed_from_u64(args.seed); | |
| rows.shuffle(&mut rng); | |
| } | |
| let split_idx = ((rows.len() as f64) * args.train_split) as usize; | |
| let split_idx = split_idx.max(1).min(rows.len() - 1); | |
| let (train_rows, eval_rows) = rows.split_at(split_idx); | |
| fs::create_dir_all(&args.output_dir).with_context(|| { | |
| format!( | |
| "failed to create output directory {}", | |
| args.output_dir.display() | |
| ) | |
| })?; | |
| let context = EncodeContext { | |
| vocab, | |
| label_ids, | |
| max_length: args.max_length, | |
| }; | |
| let train_summary = write_split( | |
| "train", | |
| train_rows, | |
| &args.output_dir, | |
| &context, | |
| args.shard_size, | |
| )?; | |
| let eval_summary = write_split( | |
| "eval", | |
| eval_rows, | |
| &args.output_dir, | |
| &context, | |
| args.shard_size, | |
| )?; | |
| write_eval_records(eval_rows, &args.output_dir.join("eval_records.jsonl"))?; | |
| let manifest = json!({ | |
| "format": "anifilebert.encoded_dataset_cache.v1", | |
| "input": args.input, | |
| "vocab_file": args.vocab_file, | |
| "label_schema_file": args.label_schema_file, | |
| "output_dir": args.output_dir, | |
| "max_length": args.max_length, | |
| "shard_size": args.shard_size, | |
| "limit_rows": args.limit_rows, | |
| "source_rows": train_rows.len() + eval_rows.len(), | |
| "train_split": args.train_split, | |
| "seed": args.seed, | |
| "shuffle": !args.no_shuffle, | |
| "train": train_summary, | |
| "eval": eval_summary, | |
| "eval_records": "eval_records.jsonl", | |
| "elapsed_seconds": started.elapsed().as_secs_f64(), | |
| }); | |
| let manifest_path = args.output_dir.join("manifest.json"); | |
| fs::write(&manifest_path, serde_json::to_string_pretty(&manifest)?) | |
| .with_context(|| format!("failed to write {}", manifest_path.display()))?; | |
| println!("{}", serde_json::to_string_pretty(&manifest)?); | |
| Ok(()) | |
| } | |
| fn load_vocab(path: &Path) -> Result<Vocab> { | |
| let text = fs::read_to_string(path) | |
| .with_context(|| format!("failed to read vocab {}", path.display()))?; | |
| let raw: HashMap<String, u64> = | |
| serde_json::from_str(&text).with_context(|| format!("invalid vocab {}", path.display()))?; | |
| let mut ids = HashMap::with_capacity(raw.len()); | |
| for (token, id) in raw { | |
| if id > u16::MAX as u64 { | |
| bail!("vocab id for token '{token}' exceeds u16: {id}"); | |
| } | |
| ids.insert(token, id as u16); | |
| } | |
| let special = |token: &str| -> Result<u16> { | |
| ids.get(token) | |
| .copied() | |
| .with_context(|| format!("vocab is missing special token {token}")) | |
| }; | |
| Ok(Vocab { | |
| pad_id: special("[PAD]")?, | |
| unk_id: special("[UNK]")?, | |
| cls_id: special("[CLS]")?, | |
| sep_id: special("[SEP]")?, | |
| ids, | |
| }) | |
| } | |
| fn load_label_ids(path: &Path) -> Result<HashMap<String, i16>> { | |
| let labels = match fs::read_to_string(path) { | |
| Ok(text) => { | |
| serde_json::from_str::<LabelSchema>(&text) | |
| .with_context(|| format!("invalid label schema {}", path.display()))? | |
| .labels | |
| } | |
| Err(_) => FALLBACK_LABELS | |
| .iter() | |
| .map(|label| (*label).to_string()) | |
| .collect(), | |
| }; | |
| if labels.is_empty() { | |
| bail!("label schema has no labels"); | |
| } | |
| Ok(labels | |
| .into_iter() | |
| .enumerate() | |
| .map(|(idx, label)| (label, idx as i16)) | |
| .collect()) | |
| } | |
| fn load_rows(path: &Path, limit_rows: usize) -> Result<Vec<SourceRow>> { | |
| let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?; | |
| let reader = BufReader::new(file); | |
| let mut rows = Vec::new(); | |
| for (idx, line) in reader.lines().enumerate() { | |
| if limit_rows > 0 && rows.len() >= limit_rows { | |
| break; | |
| } | |
| let raw_line = line.with_context(|| format!("failed reading line {}", idx + 1))?; | |
| if raw_line.trim().is_empty() { | |
| continue; | |
| } | |
| let value: Value = serde_json::from_str(&raw_line) | |
| .with_context(|| format!("failed to parse JSONL line {}", idx + 1))?; | |
| let tokens = string_array_field(&value, "tokens", idx + 1)?; | |
| let labels = string_array_field(&value, "labels", idx + 1)?; | |
| if tokens.len() != labels.len() { | |
| bail!( | |
| "line {} has mismatched token/label lengths: {} vs {}", | |
| idx + 1, | |
| tokens.len(), | |
| labels.len() | |
| ); | |
| } | |
| rows.push(SourceRow { | |
| row_index: idx, | |
| raw_line, | |
| filename: value | |
| .get("filename") | |
| .and_then(Value::as_str) | |
| .map(ToOwned::to_owned), | |
| tokens, | |
| labels, | |
| tokenizer_variant: value | |
| .get("tokenizer_variant") | |
| .and_then(Value::as_str) | |
| .map(ToOwned::to_owned), | |
| }); | |
| } | |
| Ok(rows) | |
| } | |
| fn string_array_field(value: &Value, field: &str, line_no: usize) -> Result<Vec<String>> { | |
| let array = value | |
| .get(field) | |
| .and_then(Value::as_array) | |
| .with_context(|| format!("line {line_no} missing array field '{field}'"))?; | |
| array | |
| .iter() | |
| .map(|item| match item { | |
| Value::String(text) => Ok(text.clone()), | |
| other => Ok(match other { | |
| Value::Null => String::new(), | |
| _ => other.to_string(), | |
| }), | |
| }) | |
| .collect() | |
| } | |
| fn write_split( | |
| split: &str, | |
| rows: &[SourceRow], | |
| output_dir: &Path, | |
| context: &EncodeContext, | |
| shard_size: usize, | |
| ) -> Result<SplitSummary> { | |
| let split_dir = output_dir.join(split); | |
| fs::create_dir_all(&split_dir) | |
| .with_context(|| format!("failed to create {}", split_dir.display()))?; | |
| let chunks = rows | |
| .chunks(shard_size) | |
| .enumerate() | |
| .collect::<Vec<(usize, &[SourceRow])>>(); | |
| let shards = chunks | |
| .par_iter() | |
| .map(|(shard_idx, chunk)| write_shard(split, *shard_idx, chunk, &split_dir, context)) | |
| .collect::<Result<Vec<_>>>()?; | |
| let manifest = json!({ | |
| "format": "anifilebert.virtual_dataset.shards.v1", | |
| "generated_by": "tools/encoded_dataset_cache", | |
| "split": split, | |
| "max_length": context.max_length, | |
| "total_rows": rows.len(), | |
| "shards": shards, | |
| }); | |
| let manifest_path = split_dir.join("manifest.json"); | |
| fs::write(&manifest_path, serde_json::to_string_pretty(&manifest)?) | |
| .with_context(|| format!("failed to write {}", manifest_path.display()))?; | |
| Ok(SplitSummary { | |
| split: split.to_string(), | |
| rows: rows.len(), | |
| shards: chunks.len(), | |
| directory: split.to_string(), | |
| }) | |
| } | |
| fn write_shard( | |
| split: &str, | |
| shard_idx: usize, | |
| rows: &[SourceRow], | |
| split_dir: &Path, | |
| context: &EncodeContext, | |
| ) -> Result<ShardManifest> { | |
| let capacity = rows.len().saturating_mul(context.max_length); | |
| let mut input_ids = Vec::with_capacity(capacity); | |
| let mut attention_mask = Vec::with_capacity(capacity); | |
| let mut labels = Vec::with_capacity(capacity); | |
| for row in rows { | |
| let encoded = encode_row(row, context) | |
| .with_context(|| format!("failed to encode source line {}", row.row_index + 1))?; | |
| input_ids.extend_from_slice(&encoded.0); | |
| attention_mask.extend_from_slice(&encoded.1); | |
| labels.extend_from_slice(&encoded.2); | |
| } | |
| let base = format!("part-{split}-s{shard_idx:06}"); | |
| let input_name = format!("{base}.input_ids.npy"); | |
| let mask_name = format!("{base}.attention_mask.npy"); | |
| let label_name = format!("{base}.labels.npy"); | |
| write_npy_u16( | |
| &split_dir.join(&input_name), | |
| &input_ids, | |
| rows.len(), | |
| context.max_length, | |
| )?; | |
| write_npy_u8( | |
| &split_dir.join(&mask_name), | |
| &attention_mask, | |
| rows.len(), | |
| context.max_length, | |
| )?; | |
| write_npy_i16( | |
| &split_dir.join(&label_name), | |
| &labels, | |
| rows.len(), | |
| context.max_length, | |
| )?; | |
| Ok(ShardManifest { | |
| rows: rows.len(), | |
| input_ids: input_name, | |
| attention_mask: mask_name, | |
| labels: label_name, | |
| }) | |
| } | |
| fn encode_row(row: &SourceRow, context: &EncodeContext) -> Result<(Vec<u16>, Vec<u8>, Vec<i16>)> { | |
| let (tokens, labels) = labels_for_char_tokenizer(row); | |
| let mut input_ids = vec![context.vocab.pad_id; context.max_length]; | |
| let mut attention_mask = vec![0u8; context.max_length]; | |
| let mut label_ids = vec![-100i16; context.max_length]; | |
| input_ids[0] = context.vocab.cls_id; | |
| attention_mask[0] = 1; | |
| let available = context.max_length.saturating_sub(2); | |
| let token_count = tokens.len().min(labels.len()).min(available); | |
| for idx in 0..token_count { | |
| input_ids[idx + 1] = token_id(&context.vocab, &tokens[idx]); | |
| attention_mask[idx + 1] = 1; | |
| let label = canonical_bio_label(&labels[idx]); | |
| label_ids[idx + 1] = context | |
| .label_ids | |
| .get(&label) | |
| .copied() | |
| .with_context(|| format!("unknown label '{label}'"))?; | |
| } | |
| let sep_pos = token_count + 1; | |
| input_ids[sep_pos] = context.vocab.sep_id; | |
| attention_mask[sep_pos] = 1; | |
| Ok((input_ids, attention_mask, label_ids)) | |
| } | |
| fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) { | |
| let mut source_labels = row.labels.clone(); | |
| if let Some(filename) = row.filename.as_deref() { | |
| repair_known_label_issues(filename, &row.tokens, &mut source_labels); | |
| if row.tokenizer_variant.as_deref() == Some("char") { | |
| let filename_chars = chars_as_strings(filename); | |
| if row.tokens == filename_chars { | |
| return (row.tokens.clone(), source_labels); | |
| } | |
| } | |
| if let Some(projected) = project_labels_from_filename(filename, &row.tokens, &source_labels) | |
| { | |
| let (tokens, labels) = projected; | |
| return (tokens, labels); | |
| } | |
| } | |
| align_tokens_to_chars(&row.tokens, &source_labels) | |
| } | |
| fn project_labels_from_filename( | |
| filename: &str, | |
| source_tokens: &[String], | |
| source_labels: &[String], | |
| ) -> Option<(Vec<String>, Vec<String>)> { | |
| let offsets = token_offsets_in_text(filename, source_tokens)?; | |
| if offsets.len() != source_labels.len() { | |
| return None; | |
| } | |
| let char_len = filename.chars().count(); | |
| let mut char_entities: Vec<Option<String>> = vec![None; char_len]; | |
| for ((token, label), (mut start, mut end)) in source_tokens | |
| .iter() | |
| .zip(source_labels.iter()) | |
| .zip(offsets.into_iter()) | |
| { | |
| let Some(entity) = bio_entity(label) else { | |
| continue; | |
| }; | |
| if is_wrapped_token(token) && end > start + 1 { | |
| start += 1; | |
| end -= 1; | |
| } | |
| for pos in start..end.min(char_entities.len()) { | |
| char_entities[pos] = Some(entity.clone()); | |
| } | |
| } | |
| let tokens = chars_as_strings(filename); | |
| let mut labels = Vec::with_capacity(tokens.len()); | |
| let mut active_entity: Option<String> = None; | |
| for entity in char_entities { | |
| match entity { | |
| Some(entity) => { | |
| let prefix = if active_entity.as_deref() == Some(entity.as_str()) { | |
| "I" | |
| } else { | |
| "B" | |
| }; | |
| labels.push(format!("{prefix}-{entity}")); | |
| active_entity = Some(entity); | |
| } | |
| None => { | |
| labels.push("O".to_string()); | |
| active_entity = None; | |
| } | |
| } | |
| } | |
| Some((tokens, labels)) | |
| } | |
| fn token_offsets_in_text(text: &str, tokens: &[String]) -> Option<Vec<(usize, usize)>> { | |
| let mut offsets = Vec::with_capacity(tokens.len()); | |
| let mut cursor = 0usize; | |
| for token in tokens { | |
| if token.is_empty() { | |
| let char_cursor = char_index_at_byte(text, cursor); | |
| offsets.push((char_cursor, char_cursor)); | |
| continue; | |
| } | |
| let relative = text.get(cursor..)?.find(token)?; | |
| let start_byte = cursor + relative; | |
| let end_byte = start_byte + token.len(); | |
| offsets.push(( | |
| char_index_at_byte(text, start_byte), | |
| char_index_at_byte(text, end_byte), | |
| )); | |
| cursor = end_byte; | |
| } | |
| Some(offsets) | |
| } | |
| fn align_tokens_to_chars(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) { | |
| let mut char_tokens = Vec::new(); | |
| let mut char_labels = Vec::new(); | |
| for (token, label) in tokens.iter().zip(labels.iter()) { | |
| let chars = chars_as_strings(token); | |
| if chars.is_empty() { | |
| continue; | |
| } | |
| let label = label.as_str(); | |
| if label.starts_with("B-") { | |
| let entity = label | |
| .split_once('-') | |
| .map(|(_, entity)| entity) | |
| .unwrap_or(""); | |
| char_labels.push(label.to_string()); | |
| char_labels.extend((1..chars.len()).map(|_| format!("I-{entity}"))); | |
| } else if label.starts_with("I-") { | |
| char_labels.extend((0..chars.len()).map(|_| label.to_string())); | |
| } else { | |
| char_labels.extend((0..chars.len()).map(|_| label.to_string())); | |
| } | |
| char_tokens.extend(chars); | |
| } | |
| (char_tokens, char_labels) | |
| } | |
| fn repair_structural_meta_labels( | |
| text: &str, | |
| _tokens: &[String], | |
| labels: &mut [String], | |
| offsets: &[(usize, usize)], | |
| ) { | |
| let episode_end = first_episode_span_end(labels, offsets, text); | |
| for (inner_start, inner_end) in bracket_inner_spans(text) { | |
| let bracket_start = inner_start.saturating_sub(1); | |
| if bracket_start < episode_end { | |
| continue; | |
| } | |
| let inner = chars_range_to_string(text, inner_start, inner_end); | |
| let (trim_start, trim_end) = trimmed_bounds(&inner); | |
| if trim_start >= trim_end { | |
| continue; | |
| } | |
| let clean = chars_slice_to_string(&inner, trim_start, trim_end); | |
| if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) { | |
| let indices = token_indices_for_span(offsets, inner_start, inner_end); | |
| label_span_if_safe(labels, &indices, "SPECIAL"); | |
| continue; | |
| } | |
| if source_tag_re().is_match(&clean) { | |
| let indices = token_indices_for_span(offsets, inner_start, inner_end); | |
| label_span_if_safe(labels, &indices, "SOURCE"); | |
| continue; | |
| } | |
| for mat in resolution_re() | |
| .find_iter(&clean) | |
| .filter_map(|item| item.ok()) | |
| { | |
| let start = inner_start + char_index_at_byte(&clean, mat.start()); | |
| let end = inner_start + char_index_at_byte(&clean, mat.end()); | |
| let indices = token_indices_for_span(offsets, start, end); | |
| label_span_if_safe(labels, &indices, "RESOLUTION"); | |
| } | |
| for mat in source_re().find_iter(&clean) { | |
| if !has_ascii_token_boundaries(&clean, mat.start(), mat.end()) { | |
| continue; | |
| } | |
| let start = inner_start + char_index_at_byte(&clean, mat.start()); | |
| let end = inner_start + char_index_at_byte(&clean, mat.end()); | |
| let indices = token_indices_for_span(offsets, start, end); | |
| label_span_if_safe(labels, &indices, "SOURCE"); | |
| } | |
| } | |
| for mat in resolution_re().find_iter(text).filter_map(|item| item.ok()) { | |
| let start = char_index_at_byte(text, mat.start()); | |
| if start < episode_end { | |
| continue; | |
| } | |
| let end = char_index_at_byte(text, mat.end()); | |
| let indices = token_indices_for_span(offsets, start, end); | |
| label_span_if_safe(labels, &indices, "RESOLUTION"); | |
| } | |
| for mat in source_re().find_iter(text) { | |
| if !has_ascii_token_boundaries(text, mat.start(), mat.end()) { | |
| continue; | |
| } | |
| let start = char_index_at_byte(text, mat.start()); | |
| if start < episode_end { | |
| continue; | |
| } | |
| let end = char_index_at_byte(text, mat.end()); | |
| let indices = token_indices_for_span(offsets, start, end); | |
| label_span_if_safe(labels, &indices, "SOURCE"); | |
| } | |
| } | |
| fn repair_known_label_issues(text: &str, tokens: &[String], labels: &mut [String]) { | |
| if tokens.len() != labels.len() { | |
| return; | |
| } | |
| let Some(offsets) = token_offsets_in_text(text, tokens) else { | |
| return; | |
| }; | |
| let quick_text = text.to_lowercase(); | |
| let has_sequel_marker_hint = [ | |
| " II", " III", " IV", " V", " VI", " VII", " VIII", " IX", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", | |
| "Ⅷ", "Ⅸ", "之章", "之期", "之季", "之部", "ノ章", "ノ期", "の章", "の期", "貳", "贰", "弐", | |
| "弍", "參", "叁", "参", "肆", "陸", "陆", "Ni ", " ni ", " no Sara", "Gakki", | |
| ] | |
| .iter() | |
| .any(|needle| text.contains(needle) || quick_text.contains(&needle.to_lowercase())); | |
| if has_sequel_marker_hint { | |
| for (start, end) in find_sequel_season_markers(text) { | |
| if labels_have_season_before(labels, &offsets, start) { | |
| continue; | |
| } | |
| let indices = token_indices_for_span(&offsets, start, end); | |
| if indices.is_empty() { | |
| continue; | |
| } | |
| if indices.iter().any(|idx| { | |
| matches!( | |
| label_entity(&labels[*idx]), | |
| Some( | |
| "GROUP" | |
| | "EPISODE" | |
| | "RESOLUTION" | |
| | "SOURCE" | |
| | "SPECIAL" | |
| | "TAG" | |
| | "PATH_SEASON" | |
| ) | |
| ) | |
| }) { | |
| continue; | |
| } | |
| if !indices.iter().any(|idx| is_title_like_label(&labels[*idx])) { | |
| continue; | |
| } | |
| label_span_indices(labels, &indices, "SEASON"); | |
| mark_adjacent_title_separators_o(tokens, labels, &indices); | |
| } | |
| } | |
| repair_structural_meta_labels(text, tokens, labels, &offsets); | |
| } | |
| fn find_sequel_season_markers(text: &str) -> Vec<(usize, usize)> { | |
| let mut repairs = Vec::new(); | |
| for mat in reading_marker_re() | |
| .find_iter(text) | |
| .filter_map(|item| item.ok()) | |
| { | |
| let marker = mat.as_str(); | |
| if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) { | |
| continue; | |
| } | |
| repairs.push(( | |
| char_index_at_byte(text, mat.start()), | |
| char_index_at_byte(text, mat.end()), | |
| )); | |
| } | |
| for mat in roman_marker_re() | |
| .find_iter(text) | |
| .filter_map(|item| item.ok()) | |
| { | |
| let marker = mat.as_str(); | |
| if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) { | |
| continue; | |
| } | |
| repairs.push(( | |
| char_index_at_byte(text, mat.start()), | |
| char_index_at_byte(text, mat.end()), | |
| )); | |
| } | |
| for mat in cjk_marker_re().find_iter(text) { | |
| let marker = mat.as_str(); | |
| if season_marker_number(marker).is_none() || !has_episode_context(text, mat.end()) { | |
| continue; | |
| } | |
| repairs.push(( | |
| char_index_at_byte(text, mat.start()), | |
| char_index_at_byte(text, mat.end()), | |
| )); | |
| } | |
| for (base, value) in standalone_ni_season_bases() { | |
| let mut search_start = 0usize; | |
| while let Some(relative) = text[search_start..].find(base) { | |
| let base_start = search_start + relative; | |
| let base_end = base_start + base.len(); | |
| let Some((ni_start, ni_end)) = standalone_ni_after_base(text, base_end) else { | |
| search_start = base_end; | |
| continue; | |
| }; | |
| if *value == 2 | |
| && has_episode_context(text, ni_end) | |
| && has_ascii_token_boundaries(text, ni_start, ni_end) | |
| { | |
| repairs.push(( | |
| char_index_at_byte(text, ni_start), | |
| char_index_at_byte(text, ni_end), | |
| )); | |
| } | |
| search_start = base_end; | |
| } | |
| } | |
| repairs.sort_by_key(|(start, end)| (*start, *end)); | |
| let mut deduped: Vec<(usize, usize)> = Vec::new(); | |
| for repair in repairs { | |
| if let Some(previous) = deduped.last_mut() { | |
| if repair.0 < previous.1 { | |
| if repair.1.saturating_sub(repair.0) > previous.1.saturating_sub(previous.0) { | |
| *previous = repair; | |
| } | |
| continue; | |
| } | |
| } | |
| deduped.push(repair); | |
| } | |
| deduped | |
| } | |
| fn season_marker_number(text: &str) -> Option<u8> { | |
| let clean = clean_marker_text(text); | |
| if clean.is_empty() { | |
| return None; | |
| } | |
| if let Some(value) = roman_numeral_value(&clean) { | |
| return Some(value); | |
| } | |
| let lowered = clean | |
| .split_whitespace() | |
| .collect::<Vec<_>>() | |
| .join(" ") | |
| .to_lowercase(); | |
| if let Some(value) = reading_marker_value(&lowered) { | |
| return Some(value); | |
| } | |
| if lowered == "ni" { | |
| return Some(2); | |
| } | |
| if clean.starts_with('第') { | |
| if let Some(last) = clean.chars().last() { | |
| if matches!(last, '季' | '期' | '部' | '章') { | |
| let inner = clean | |
| .chars() | |
| .skip(1) | |
| .take(clean.chars().count().saturating_sub(2)) | |
| .collect::<String>(); | |
| return cn_number_to_int(&inner); | |
| } | |
| } | |
| } | |
| let cjk_chars = clean.chars().collect::<Vec<_>>(); | |
| if let Some(first) = cjk_chars.first() { | |
| if let Some(value) = cn_number_to_int(&first.to_string()) { | |
| let rest = cjk_chars.iter().skip(1).collect::<String>(); | |
| if rest.trim().is_empty() || cjk_marker_suffix_remainder_ok(&rest) { | |
| return Some(value); | |
| } | |
| } | |
| } | |
| None | |
| } | |
/// Strips surrounding whitespace and bracket characters from a marker.
///
/// Order: trim whitespace, trim any run of bracket characters from both ends, trim whitespace
/// again.
fn clean_marker_text(text: &str) -> String {
    const BRACKETS: &[char] = &['[', ']', '(', ')', '【', '】', '《', '》'];
    let unwrapped = text.trim().trim_matches(BRACKETS);
    unwrapped.trim().to_string()
}
| fn cn_number_to_int(text: &str) -> Option<u8> { | |
| let text = text.trim(); | |
| if text.is_empty() { | |
| return None; | |
| } | |
| if let Ok(value) = text.parse::<u8>() { | |
| return Some(value); | |
| } | |
| if let Some(value) = cn_digit_value(text) { | |
| return Some(value); | |
| } | |
| let chars = text.chars().collect::<Vec<_>>(); | |
| if chars.len() == 2 && chars[0] == '十' { | |
| return Some(10 + cn_digit_value(&chars[1].to_string()).unwrap_or(0)); | |
| } | |
| if chars.len() == 2 && chars[1] == '十' { | |
| return Some(cn_digit_value(&chars[0].to_string()).unwrap_or(0) * 10); | |
| } | |
| if chars.len() == 3 && chars[1] == '十' { | |
| return Some( | |
| cn_digit_value(&chars[0].to_string()).unwrap_or(0) * 10 | |
| + cn_digit_value(&chars[2].to_string()).unwrap_or(0), | |
| ); | |
| } | |
| None | |
| } | |
/// Maps a single Chinese/Japanese numeral (including formal and variant forms) to 1..=10.
///
/// Returns `None` for any other input.
fn cn_digit_value(text: &str) -> Option<u8> {
    const DIGIT_FORMS: [(&[&str], u8); 10] = [
        (&["一"], 1),
        (&["二", "兩", "两", "貳", "贰", "弐", "弍"], 2),
        (&["三", "參", "叁", "参"], 3),
        (&["四", "肆"], 4),
        (&["五", "伍"], 5),
        (&["六", "陸", "陆"], 6),
        (&["七", "柒"], 7),
        (&["八", "捌"], 8),
        (&["九", "玖"], 9),
        (&["十"], 10),
    ];
    DIGIT_FORMS
        .iter()
        .find(|(forms, _)| forms.iter().any(|form| *form == text))
        .map(|(_, value)| *value)
}
/// Maps a roman numeral for 2..=9, written in ASCII letters or as a single Unicode numeral
/// glyph, to its value. The match is exact and case-sensitive.
fn roman_numeral_value(text: &str) -> Option<u8> {
    const ASCII_FORMS: [&str; 8] = ["II", "III", "IV", "V", "VI", "VII", "VIII", "IX"];
    const GLYPH_FORMS: [&str; 8] = ["Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ"];
    ASCII_FORMS
        .iter()
        .zip(GLYPH_FORMS.iter())
        .zip(2u8..)
        .find(|((ascii, glyph), _)| **ascii == text || **glyph == text)
        .map(|(_, value)| value)
}
/// Maps a romanised season reading (lower-cased, single-spaced) to its season number.
///
/// Only the exact phrases listed below are recognised.
fn reading_marker_value(text: &str) -> Option<u8> {
    const READINGS: [(&[&str], u8); 4] = [
        (
            &[
                "ni no sara",
                "ni no shou",
                "ni no sho",
                "ni no syo",
                "ni no shō",
                "ni gakki",
                "sono ni",
            ],
            2,
        ),
        (
            &["san no sara", "san no shou", "san no sho", "san no syo"],
            3,
        ),
        (&["yon no sara", "shi no sara", "shin no sara"], 4),
        (&["go no sara", "gou no sara"], 5),
    ];
    READINGS
        .iter()
        .find(|(phrases, _)| phrases.iter().any(|phrase| *phrase == text))
        .map(|(_, value)| *value)
}
/// Checks whether the text after a leading CJK digit is an accepted marker suffix.
///
/// Ignoring whitespace, the remainder must be exactly one connective (`ノ`, `の` or `之`)
/// followed by one unit (`章`, `期`, `季` or `部`).
fn cjk_marker_suffix_remainder_ok(rest: &str) -> bool {
    let mut glyphs = rest.chars().filter(|ch| !ch.is_whitespace());
    let connective = glyphs.next();
    let unit = glyphs.next();
    let extra = glyphs.next();
    matches!(
        (connective, unit, extra),
        (
            Some('ノ' | 'の' | '之'),
            Some('章' | '期' | '季' | '部'),
            None
        )
    )
}
| fn has_episode_context(text: &str, marker_end_byte: usize) -> bool { | |
| let tail = &text[marker_end_byte..]; | |
| if episode_context_re().is_match(tail) { | |
| return true; | |
| } | |
| let mut tail = tail.trim_start(); | |
| if let Some(ch) = tail.chars().next() { | |
| if matches!(ch, ']' | ')' | '】' | '》') { | |
| tail = &tail[ch.len_utf8()..]; | |
| tail = tail.trim_start(); | |
| } | |
| } | |
| if let Some(mat) = special_context_prefix_re().find(tail) { | |
| tail = &tail[mat.end()..]; | |
| } | |
| episode_context_re().is_match(tail) | |
| } | |
| fn first_episode_regex_end(text: &str) -> Option<usize> { | |
| episode_span_re() | |
| .find_iter(text) | |
| .filter_map(|item| item.ok()) | |
| .map(|mat| char_index_at_byte(text, mat.end())) | |
| .next() | |
| } | |
| fn labels_have_season_before( | |
| labels: &[String], | |
| offsets: &[(usize, usize)], | |
| marker_start: usize, | |
| ) -> bool { | |
| labels | |
| .iter() | |
| .zip(offsets.iter()) | |
| .any(|(label, (_start, end))| is_season_like_label(label) && *end <= marker_start) | |
| } | |
/// Returns the indices of all tokens whose `(start, end)` span overlaps `start..end`.
///
/// A token overlaps when it begins before `end` and finishes after `start`.
fn token_indices_for_span(offsets: &[(usize, usize)], start: usize, end: usize) -> Vec<usize> {
    let mut hits = Vec::new();
    for (idx, &(token_start, token_end)) in offsets.iter().enumerate() {
        let overlaps = token_start < end && token_end > start;
        if overlaps {
            hits.push(idx);
        }
    }
    hits
}
| fn label_span(labels: &mut [String], start: usize, end: usize, entity: &str) { | |
| let previous_same = start > 0 && label_entity(&labels[start - 1]) == Some(entity); | |
| let mut first = !previous_same; | |
| for label in labels.iter_mut().take(end).skip(start) { | |
| *label = if first { | |
| format!("B-{entity}") | |
| } else { | |
| format!("I-{entity}") | |
| }; | |
| first = false; | |
| } | |
| } | |
| fn label_span_indices(labels: &mut [String], indices: &[usize], entity: &str) { | |
| if indices.is_empty() { | |
| return; | |
| } | |
| let previous_same = indices[0] > 0 && label_entity(&labels[indices[0] - 1]) == Some(entity); | |
| let mut first = !previous_same; | |
| for idx in indices { | |
| labels[*idx] = if first { | |
| format!("B-{entity}") | |
| } else { | |
| format!("I-{entity}") | |
| }; | |
| first = false; | |
| } | |
| } | |
| fn mark_adjacent_title_separators_o( | |
| tokens: &[String], | |
| labels: &mut [String], | |
| marker_indices: &[usize], | |
| ) { | |
| if marker_indices.is_empty() { | |
| return; | |
| } | |
| let mut idx = marker_indices[0]; | |
| while idx > 0 { | |
| let prev = idx - 1; | |
| if !tokens[prev].trim().is_empty() || !is_title_like_label(&labels[prev]) { | |
| break; | |
| } | |
| labels[prev] = "O".to_string(); | |
| idx = prev; | |
| } | |
| let mut idx = marker_indices[marker_indices.len() - 1] + 1; | |
| while idx < tokens.len() | |
| && tokens[idx].chars().all(|ch| SEPARATOR_CHARS.contains(&ch)) | |
| && is_title_like_label(&labels[idx]) | |
| { | |
| labels[idx] = "O".to_string(); | |
| idx += 1; | |
| } | |
| } | |
/// Base titles after which a standalone "Ni" is treated as a season marker, paired with the
/// season number it denotes.
fn standalone_ni_season_bases() -> &'static [(&'static str, u8)] {
    const BASES: &[(&str, u8)] = &[("Kakuriyo no Yadomeshi", 2)];
    BASES
}
/// Looks for the literal `Ni` right after `base_end`, skipping any whitespace in between.
///
/// Returns the byte range of `Ni`, or `None` when something else follows.
fn standalone_ni_after_base(text: &str, base_end: usize) -> Option<(usize, usize)> {
    let tail = &text[base_end..];
    let skipped = tail.len() - tail.trim_start().len();
    let ni_start = base_end + skipped;
    let ni_end = ni_start.checked_add(2)?;
    let candidate = text.get(ni_start..ni_end)?;
    (candidate == "Ni").then_some((ni_start, ni_end))
}
| fn is_title_like_label(label: &str) -> bool { | |
| matches!( | |
| label_entity(label), | |
| Some( | |
| "TITLE" | |
| | "TITLE_CHS" | |
| | "TITLE_CHT" | |
| | "TITLE_JPN" | |
| | "TITLE_LATIN" | |
| | "TITLE_MIXED" | |
| | "PATH_TITLE_CHS" | |
| | "PATH_TITLE_CHT" | |
| | "PATH_TITLE_JPN" | |
| | "PATH_TITLE_LATIN" | |
| | "PATH_TITLE_MIXED" | |
| ) | |
| ) | |
| } | |
| fn is_season_like_label(label: &str) -> bool { | |
| matches!(label_entity(label), Some("SEASON" | "PATH_SEASON")) | |
| } | |
| fn first_episode_span_end(labels: &[String], offsets: &[(usize, usize)], text: &str) -> usize { | |
| let ends = labels | |
| .iter() | |
| .zip(offsets.iter()) | |
| .filter_map(|(label, (_start, end))| { | |
| if label_entity(label) == Some("EPISODE") { | |
| Some(*end) | |
| } else { | |
| None | |
| } | |
| }) | |
| .collect::<Vec<_>>(); | |
| if let Some(end) = ends.into_iter().min() { | |
| return end; | |
| } | |
| first_episode_regex_end(text).unwrap_or(0) | |
| } | |
/// Collects the character-index ranges enclosed by bracket pairs in `text`.
///
/// Recognised openers are `[`, `(`, `【` and `《`; each is paired with the first
/// occurrence of its own closer that follows it. Every returned `(start, end)`
/// is a half-open range of character (not byte) indices covering the content
/// between the brackets. Scanning resumes after the closer, and an opener
/// without a closer is skipped.
fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
    let symbols: Vec<char> = text.chars().collect();
    let mut found = Vec::new();
    let mut cursor = 0usize;
    while let Some(&opener) = symbols.get(cursor) {
        let inner_start = cursor + 1;
        let closer = match opener {
            '[' => Some(']'),
            '(' => Some(')'),
            '【' => Some('】'),
            '《' => Some('》'),
            _ => None,
        };
        let closer_offset = closer.and_then(|wanted| {
            symbols[inner_start..]
                .iter()
                .position(|&candidate| candidate == wanted)
        });
        cursor = match closer_offset {
            Some(offset) => {
                let inner_end = inner_start + offset;
                found.push((inner_start, inner_end));
                inner_end + 1
            }
            None => inner_start,
        };
    }
    found
}
/// Returns the half-open character-index range of `text` without its leading
/// and trailing whitespace.
///
/// Indices count characters, not bytes. For empty or all-whitespace input both
/// bounds equal the character count.
fn trimmed_bounds(text: &str) -> (usize, usize) {
    let total = text.chars().count();
    let leading = text.chars().take_while(|ch| ch.is_whitespace()).count();
    if leading == total {
        return (total, total);
    }
    let trailing = text
        .chars()
        .rev()
        .take_while(|ch| ch.is_whitespace())
        .count();
    (leading, total - trailing)
}
/// Copies the characters of `text` at character indices `start..end` into a new `String`.
///
/// Indices past the end are clamped, and `end <= start` yields an empty string.
fn chars_range_to_string(text: &str, start: usize, end: usize) -> String {
    text.chars()
        .enumerate()
        .skip_while(|(position, _)| *position < start)
        .take_while(|(position, _)| *position < end)
        .map(|(_, ch)| ch)
        .collect()
}
/// Returns the substring of `text` covering character indices `start..end`.
///
/// Indices past the end are clamped, and `end <= start` yields an empty string.
fn chars_slice_to_string(text: &str, start: usize, end: usize) -> String {
    // Translate a character index into its byte offset, clamping to the end.
    let byte_at = |char_index: usize| {
        text.char_indices()
            .nth(char_index)
            .map_or(text.len(), |(offset, _)| offset)
    };
    let from = byte_at(start);
    let to = byte_at(end.max(start));
    text[from..to].to_string()
}
| fn label_span_if_safe(labels: &mut [String], indices: &[usize], entity: &str) { | |
| if indices.is_empty() { | |
| return; | |
| } | |
| if indices.iter().any(|idx| { | |
| matches!( | |
| label_entity(&labels[*idx]), | |
| Some("GROUP" | "EPISODE" | "SEASON" | "PATH_SEASON") | |
| ) | |
| }) { | |
| return; | |
| } | |
| label_span_indices(labels, indices, entity); | |
| } | |
/// Reports whether the byte range `start..end` of `text` stands alone as a token.
///
/// The range qualifies when neither the character just before `start` nor the
/// one at `end` is an ASCII letter or digit; the edges of `text` count as
/// boundaries. Both offsets must lie on character boundaries within `text`.
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
    let (before, after) = (&text[..start], &text[end..]);
    let glued_left = matches!(
        before.chars().next_back(),
        Some(ch) if ch.is_ascii_alphanumeric()
    );
    let glued_right = matches!(
        after.chars().next(),
        Some(ch) if ch.is_ascii_alphanumeric()
    );
    !(glued_left || glued_right)
}
/// Extracts the entity part of a BIO label.
///
/// The label is split at its first `-`; the remainder is returned when the
/// prefix is exactly `B` or `I`. Any other label, including `O`, gives `None`.
fn label_entity(label: &str) -> Option<&str> {
    match label.split_once('-') {
        Some(("B" | "I", entity)) => Some(entity),
        _ => None,
    }
}
| fn resolution_re() -> &'static FancyRegex { | |
| RESOLUTION_RE.get_or_init(|| { | |
| FancyRegex::new( | |
| r"(?i)(?<![A-Za-z0-9])(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]\d{3,4})(?![A-Za-z0-9])", | |
| ) | |
| .unwrap() | |
| }) | |
| } | |
| fn source_re() -> &'static Regex { | |
| SOURCE_RE.get_or_init(|| Regex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})")).unwrap()) | |
| } | |
| fn source_tag_re() -> &'static Regex { | |
| SOURCE_TAG_RE.get_or_init(|| { | |
| Regex::new(&format!( | |
| r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$" | |
| )) | |
| .unwrap() | |
| }) | |
| } | |
| fn special_tag_re() -> &'static Regex { | |
| SPECIAL_TAG_RE.get_or_init(|| { | |
| Regex::new(r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+") | |
| .unwrap() | |
| }) | |
| } | |
| fn special_code_re() -> &'static Regex { | |
| SPECIAL_CODE_RE.get_or_init(|| { | |
| Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$").unwrap() | |
| }) | |
| } | |
| fn episode_context_re() -> &'static Regex { | |
| EPISODE_CONTEXT_RE.get_or_init(|| { | |
| Regex::new( | |
| r"(?i)^\s*(?:[-_]\s*(?:\d{1,4}|NCOP|NCED|OP|ED|OVA|OAD|SP|END)\b|#\s*\d{1,4}|[\[\(【《]\s*(?:EP?|#)?\d{1,4})", | |
| ) | |
| .unwrap() | |
| }) | |
| } | |
| fn episode_span_re() -> &'static FancyRegex { | |
| EPISODE_SPAN_RE.get_or_init(|| { | |
| FancyRegex::new( | |
| r"(?i)(?:[Ss]\d{1,2}[Ee]\d{1,4}(?:v\d+)?|(?:^|[\s._])[-_]\s*\d{1,4}(?:v\d+)?(?=$|[\s._\-\]\)】》\[])|[\[\(【《](?:EP?|#)?\d{1,4}(?:v\d+)?[\]\)】》]|(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)\d{1,4}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》]))", | |
| ) | |
| .unwrap() | |
| }) | |
| } | |
| fn reading_marker_re() -> &'static FancyRegex { | |
| READING_MARKER_RE.get_or_init(|| { | |
| FancyRegex::new( | |
| r"(?i)(?<![A-Za-z0-9])(?P<marker>Ni\s+no\s+(?:Sara|Shou|Sho|Syo|Shō)|San\s+no\s+(?:Sara|Shou|Sho|Syo)|(?:Yon|Shi|Shin)\s+no\s+Sara|(?:Go|Gou)\s+no\s+Sara|Ni\s+Gakki|Sono\s+Ni)(?![A-Za-z0-9])", | |
| ) | |
| .unwrap() | |
| }) | |
| } | |
| fn roman_marker_re() -> &'static FancyRegex { | |
| ROMAN_MARKER_RE.get_or_init(|| { | |
| FancyRegex::new( | |
| r"(?<![A-Za-z0-9])(?P<marker>II|III|IV|V|VI|VII|VIII|IX|[ⅡⅢⅣⅤⅥⅦⅧⅨ])(?![A-Za-z0-9])", | |
| ) | |
| .unwrap() | |
| }) | |
| } | |
| fn cjk_marker_re() -> &'static Regex { | |
| CJK_MARKER_RE.get_or_init(|| { | |
| Regex::new( | |
| r"(?:[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖](?:\s*(?:ノ|の|之)\s*(?:章|期|季|部))?|第[一二三四五六七八九十兩两貳贰弐弍參叁参肆伍陸陆柒捌玖\d]+[季期部章])", | |
| ) | |
| .unwrap() | |
| }) | |
| } | |
| fn special_context_prefix_re() -> &'static Regex { | |
| SPECIAL_CONTEXT_PREFIX_RE.get_or_init(|| { | |
| Regex::new( | |
| r"(?i)^(?:[\[\(【《]\s*(?:menu|menus|bdmenu|ncop|nced|op|ed|ova|oad|sp)\s*[\]\)】》]\s*){0,2}", | |
| ) | |
| .unwrap() | |
| }) | |
| } | |
/// Splits `text` into one single-character `String` per Unicode scalar value.
fn chars_as_strings(text: &str) -> Vec<String> {
    text.chars().map(String::from).collect()
}
/// Converts a byte offset into `text` to the number of characters before it.
///
/// `byte_index` must lie on a character boundary within `text`; otherwise the
/// split panics.
fn char_index_at_byte(text: &str, byte_index: usize) -> usize {
    let (prefix, _rest) = text.split_at(byte_index);
    prefix.chars().count()
}
/// Returns an owned copy of the entity part of a BIO label.
///
/// The label is split at its first `-`; the remainder is returned when the
/// prefix is exactly `B` or `I`. Any other label, including `O`, gives `None`.
fn bio_entity(label: &str) -> Option<String> {
    label
        .split_once('-')
        .filter(|(prefix, _)| matches!(*prefix, "B" | "I"))
        .map(|(_, entity)| entity.to_owned())
}
/// Reports whether `token` starts with an opening bracket and ends with a
/// closing bracket.
///
/// Openers are `[`, `【`, `(`, `《`; closers are `]`, `】`, `)`, `》`. The two
/// ends are checked independently, so mismatched kinds such as `[x)` also
/// count as wrapped. An empty token is not wrapped.
fn is_wrapped_token(token: &str) -> bool {
    const OPENERS: &[char] = &['[', '【', '(', '《'];
    const CLOSERS: &[char] = &[']', '】', ')', '》'];
    token.starts_with(OPENERS) && token.ends_with(CLOSERS)
}
/// Rewrites legacy BIO labels to their canonical entity names.
///
/// `B-`/`I-` labels for `TITLE` become `TITLE_MIXED`, and those for
/// `PATH_TITLE` become `PATH_TITLE_MIXED`. Every other label, including `O` and
/// labels with an unknown prefix, is returned unchanged.
fn canonical_bio_label(label: &str) -> String {
    match label.split_once('-') {
        Some((prefix @ ("B" | "I"), "TITLE")) => format!("{prefix}-TITLE_MIXED"),
        Some((prefix @ ("B" | "I"), "PATH_TITLE")) => format!("{prefix}-PATH_TITLE_MIXED"),
        _ => label.to_string(),
    }
}
| fn token_id(vocab: &Vocab, token: &str) -> u16 { | |
| *vocab.ids.get(token).unwrap_or(&vocab.unk_id) | |
| } | |
| fn write_eval_records(rows: &[SourceRow], path: &Path) -> Result<()> { | |
| let mut writer = BufWriter::new( | |
| File::create(path).with_context(|| format!("failed to create {}", path.display()))?, | |
| ); | |
| for row in rows { | |
| writer.write_all(row.raw_line.as_bytes())?; | |
| writer.write_all(b"\n")?; | |
| } | |
| Ok(()) | |
| } | |
| fn write_npy_u16(path: &Path, data: &[u16], rows: usize, cols: usize) -> Result<()> { | |
| let mut writer = BufWriter::new( | |
| File::create(path).with_context(|| format!("failed to create {}", path.display()))?, | |
| ); | |
| write_npy_header(&mut writer, "<u2", rows, cols)?; | |
| for value in data { | |
| writer.write_all(&value.to_le_bytes())?; | |
| } | |
| Ok(()) | |
| } | |
| fn write_npy_u8(path: &Path, data: &[u8], rows: usize, cols: usize) -> Result<()> { | |
| let mut writer = BufWriter::new( | |
| File::create(path).with_context(|| format!("failed to create {}", path.display()))?, | |
| ); | |
| write_npy_header(&mut writer, "|u1", rows, cols)?; | |
| writer.write_all(data)?; | |
| Ok(()) | |
| } | |
| fn write_npy_i16(path: &Path, data: &[i16], rows: usize, cols: usize) -> Result<()> { | |
| let mut writer = BufWriter::new( | |
| File::create(path).with_context(|| format!("failed to create {}", path.display()))?, | |
| ); | |
| write_npy_header(&mut writer, "<i2", rows, cols)?; | |
| for value in data { | |
| writer.write_all(&value.to_le_bytes())?; | |
| } | |
| Ok(()) | |
| } | |
| fn write_npy_header<W: Write>(writer: &mut W, descr: &str, rows: usize, cols: usize) -> Result<()> { | |
| let mut header = format!( | |
| "{{'descr': '{}', 'fortran_order': False, 'shape': ({}, {}), }}", | |
| descr, rows, cols | |
| ) | |
| .into_bytes(); | |
| let preamble_len = 10usize; | |
| let pad_len = (16 - ((preamble_len + header.len() + 1) % 16)) % 16; | |
| header.extend(std::iter::repeat(b' ').take(pad_len)); | |
| header.push(b'\n'); | |
| if header.len() > u16::MAX as usize { | |
| bail!("npy header too large"); | |
| } | |
| writer.write_all(b"\x93NUMPY")?; | |
| writer.write_all(&[1, 0])?; | |
| writer.write_all(&(header.len() as u16).to_le_bytes())?; | |
| writer.write_all(&header)?; | |
| Ok(()) | |
| } | |
/// Unit tests for the sequel-marker repair done by `labels_for_char_tokenizer`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a char-tokenised `SourceRow` for `text`, labelling the given
    /// character ranges as `TITLE_LATIN` and `EPISODE` respectively.
    fn char_row(
        text: &str,
        title_spans: &[(usize, usize)],
        episode_spans: &[(usize, usize)],
    ) -> SourceRow {
        let tokens = chars_as_strings(text);
        let mut labels = vec!["O".to_string(); tokens.len()];
        for (start, end) in title_spans {
            label_span(&mut labels, *start, *end, "TITLE_LATIN");
        }
        for (start, end) in episode_spans {
            label_span(&mut labels, *start, *end, "EPISODE");
        }
        SourceRow {
            row_index: 0,
            raw_line: String::new(),
            filename: Some(text.to_string()),
            tokens,
            labels,
            tokenizer_variant: Some("char".to_string()),
        }
    }

    /// A trailing CJK numeral inside the title is relabelled as a season, and
    /// the space before it is reset to `O`.
    #[test]
    fn repairs_cjk_sequel_marker_in_char_fast_path() {
        let text = "妖怪旅館營業中 貳 - 11";
        let title_end = char_index_at_byte(text, text.find(" - ").unwrap());
        let episode_start = char_index_at_byte(text, text.find("11").unwrap());
        let row = char_row(
            text,
            &[(0, title_end)],
            &[(episode_start, episode_start + 2)],
        );
        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let marker = char_index_at_byte(text, text.find('貳').unwrap());
        let before_marker = marker - 1;
        assert_eq!(labels[before_marker], "O");
        assert_eq!(labels[marker], "B-SEASON");
        assert_eq!(labels[episode_start], "B-EPISODE");
    }

    /// A romanised marker such as `Ni no Sara` becomes one season span.
    #[test]
    fn repairs_reading_sequel_marker() {
        // The text is pure ASCII, so byte offsets double as character indices.
        let text = "Shokugeki no Souma Ni no Sara - 13";
        let title_end = text.find(" - ").unwrap();
        let episode_start = text.find("13").unwrap();
        let row = char_row(
            text,
            &[(0, title_end)],
            &[(episode_start, episode_start + 2)],
        );
        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let marker_start = text.find("Ni").unwrap();
        let marker_end = text.find(" - ").unwrap();
        assert_eq!(labels[marker_start - 1], "O");
        assert_eq!(labels[marker_start], "B-SEASON");
        assert!(labels[marker_start + 1..marker_end]
            .iter()
            .all(|label| label == "I-SEASON"));
    }

    /// A plain digit at the end of the title stays part of the title.
    #[test]
    fn keeps_numeric_title_suffix_out_of_sequel_repair() {
        // The text is pure ASCII, so byte offsets double as character indices.
        let text = "Kamisama Hajimemashita 2 - 01";
        let title_end = text.find(" - ").unwrap();
        let episode_start = text.find("01").unwrap();
        let row = char_row(
            text,
            &[(0, title_end)],
            &[(episode_start, episode_start + 2)],
        );
        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let suffix = text.find('2').unwrap();
        assert_eq!(labels[suffix], "I-TITLE_LATIN");
        assert!(!labels
            .iter()
            .any(|label| label_entity(label) == Some("SEASON")));
    }

    /// A Roman numeral inside an alias title is left alone when the row
    /// already has a season span.
    #[test]
    fn skips_alias_marker_when_season_already_exists() {
        let text = "樱桃小丸子第二期(Chibi Maruko-chan II)[1439]";
        let tokens = chars_as_strings(text);
        let mut labels = vec!["O".to_string(); tokens.len()];
        let title_end = char_index_at_byte(text, text.find("第二期").unwrap());
        label_span(&mut labels, 0, title_end, "TITLE_CHS");
        let season_start = title_end;
        let season_end = season_start + "第二期".chars().count();
        label_span(&mut labels, season_start, season_end, "SEASON");
        let alias_start = char_index_at_byte(text, text.find("Chibi").unwrap());
        let alias_end = char_index_at_byte(text, text.find(")").unwrap());
        label_span(&mut labels, alias_start, alias_end, "TITLE_LATIN");
        let episode_start = char_index_at_byte(text, text.find("1439").unwrap());
        label_span(&mut labels, episode_start, episode_start + 4, "EPISODE");
        let row = SourceRow {
            row_index: 0,
            raw_line: String::new(),
            filename: Some(text.to_string()),
            tokens,
            labels,
            tokenizer_variant: Some("char".to_string()),
        };
        let (_tokens, labels) = labels_for_char_tokenizer(&row);
        let roman = char_index_at_byte(text, text.find("II").unwrap());
        assert_eq!(labels[roman], "I-TITLE_LATIN");
    }
}