| | |
| |
|
use std::collections::HashMap;
use std::sync::LazyLock;

use lazy_static::lazy_static;
use regex::Regex;

use crate::{Error, Result};
| |
|
/// Coarse language classification for a span of text.
///
/// NOTE(review): nothing in this file constructs or matches these variants;
/// the classification rules (e.g. what counts as `Mixed`) live elsewhere —
/// confirm against the code that assigns them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    Chinese,
    English,
    Mixed,
}
| |
|
/// Text normalizer holding the lookup tables used by the `normalize_*`
/// methods; construct via [`TextNormalizer::new`] or `Default`.
#[derive(Debug)]
pub struct TextNormalizer {
    // Full-width/CJK punctuation char -> ASCII replacement (populated in `new`,
    // consumed by `normalize_punctuation`).
    punct_map: HashMap<char, char>,
    // Integer -> English number word, populated in `new`.
    // NOTE(review): not read by any method visible in this file — presumably
    // reserved for a number-expansion pass; confirm before removing.
    number_words: HashMap<u64, &'static str>,
}
| |
|
| | lazy_static! { |
| | static ref NUMBER_REGEX: Regex = Regex::new(r"\d+").unwrap(); |
| | static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap(); |
| | } |
| |
|
| | impl TextNormalizer { |
| | pub fn new() -> Self { |
| | let mut punct_map = HashMap::new(); |
| | punct_map.insert('\u{FF0C}', ','); |
| | punct_map.insert('\u{3002}', '.'); |
| | punct_map.insert('\u{FF01}', '!'); |
| | punct_map.insert('\u{FF1F}', '?'); |
| | punct_map.insert('\u{FF1B}', ';'); |
| | punct_map.insert('\u{FF1A}', ':'); |
| | punct_map.insert('\u{201C}', '\u{0022}'); |
| | punct_map.insert('\u{201D}', '\u{0022}'); |
| | punct_map.insert('\u{2018}', '\''); |
| | punct_map.insert('\u{2019}', '\''); |
| |
|
| | let mut number_words = HashMap::new(); |
| | number_words.insert(0, "zero"); |
| | number_words.insert(1, "one"); |
| | number_words.insert(2, "two"); |
| | number_words.insert(3, "three"); |
| | number_words.insert(4, "four"); |
| | number_words.insert(5, "five"); |
| | number_words.insert(6, "six"); |
| | number_words.insert(7, "seven"); |
| | number_words.insert(8, "eight"); |
| | number_words.insert(9, "nine"); |
| | number_words.insert(10, "ten"); |
| | number_words.insert(20, "twenty"); |
| | number_words.insert(30, "thirty"); |
| |
|
| | Self { punct_map, number_words } |
| | } |
| |
|
| | pub fn normalize(&self, text: &str) -> Result<String> { |
| | let mut result = self.normalize_punctuation(text); |
| | result = self.normalize_whitespace(&result); |
| | Ok(result) |
| | } |
| |
|
| | pub fn normalize_punctuation(&self, text: &str) -> String { |
| | text.chars() |
| | .map(|c| *self.punct_map.get(&c).unwrap_or(&c)) |
| | .collect() |
| | } |
| |
|
| | pub fn normalize_whitespace(&self, text: &str) -> String { |
| | WHITESPACE_REGEX.replace_all(text, " ").trim().to_string() |
| | } |
| |
|
| | pub fn split_sentences(&self, text: &str) -> Vec<String> { |
| | let mut sentences = Vec::new(); |
| | let mut current = String::new(); |
| |
|
| | for ch in text.chars() { |
| | current.push(ch); |
| | if ch == '.' || ch == '!' || ch == '?' { |
| | let trimmed = current.trim().to_string(); |
| | if !trimmed.is_empty() { |
| | sentences.push(trimmed); |
| | } |
| | current.clear(); |
| | } |
| | } |
| |
|
| | let trimmed = current.trim().to_string(); |
| | if !trimmed.is_empty() { |
| | sentences.push(trimmed); |
| | } |
| |
|
| | sentences |
| | } |
| | } |
| |
|
| | impl Default for TextNormalizer { |
| | fn default() -> Self { |
| | Self::new() |
| | } |
| | } |
| |
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn collapses_and_trims_whitespace() {
        let n = TextNormalizer::new();
        // The original assertion only checked the length (3), which any
        // 3-byte string would satisfy; assert the exact content instead.
        assert_eq!(n.normalize_whitespace(" a b "), "a b");
        assert_eq!(n.normalize_whitespace("  a \t\n b  "), "a b");
        assert_eq!(n.normalize_whitespace("   "), "");
    }

    #[test]
    fn maps_fullwidth_punctuation_to_ascii() {
        let n = TextNormalizer::new();
        assert_eq!(n.normalize_punctuation("\u{FF0C}\u{3002}\u{FF01}"), ",.!");
        // Non-mapped characters pass through unchanged.
        assert_eq!(n.normalize_punctuation("abc"), "abc");
    }

    #[test]
    fn splits_sentences_keeping_terminators() {
        let n = TextNormalizer::new();
        assert_eq!(
            n.split_sentences("One. Two! Three"),
            vec!["One.", "Two!", "Three"]
        );
        assert!(n.split_sentences("").is_empty());
    }
}
| |
|