File size: 4,653 Bytes
use crate::tokenizer::pattern::Pattern;
use crate::tokenizer::Decoder;
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::SysRegex;
use serde::{Deserialize, Serialize};
/// Represents the different patterns that `Replace` can use.
///
/// Both variants carry the pattern text as a `String`; they differ only in how
/// `Replace::new` turns that text into the compiled matcher.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
pub enum ReplacePattern {
/// Literal text: `Replace::new` escapes it before compiling it.
String(String),
/// Regular-expression source: `Replace::new` compiles it without escaping.
Regex(String),
}
impl From<String> for ReplacePattern {
fn from(v: String) -> Self {
Self::String(v)
}
}
impl From<&str> for ReplacePattern {
fn from(v: &str) -> Self {
Self::String(v.to_owned())
}
}
/// Intermediate shape used when deserializing a `Replace`.
///
/// `Replace` is declared with `try_from = "ReplaceDeserializer"`, so serde
/// first fills this struct and then converts it through `Replace::new`. That
/// conversion is what provides the value for `regex`, a field `Replace` marks
/// with `#[serde(skip)]`.
#[doc(hidden)]
#[derive(Deserialize)]
#[serde(tag = "type")]
struct ReplaceDeserializer {
// Pattern exactly as found in the serialized form.
pattern: ReplacePattern,
// Replacement text exactly as found in the serialized form.
content: String,
}
impl std::convert::TryFrom<ReplaceDeserializer> for Replace {
type Error = Box<dyn std::error::Error + Send + Sync>;
fn try_from(v: ReplaceDeserializer) -> Result<Self> {
Self::new(v.pattern, v.content)
}
}
/// This normalizer will take a `pattern` (either literal text or a regular
/// expression, see `ReplacePattern`) and replace every occurrence with `content`.
///
/// Only `pattern` and `content` are serialized; deserialization goes through
/// `ReplaceDeserializer` (see the `try_from` attribute below).
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "type", try_from = "ReplaceDeserializer")]
pub struct Replace {
// The pattern as given to `Replace::new`.
pattern: ReplacePattern,
// The text substituted for each match.
content: String,
// Matcher compiled from `pattern` in `Replace::new`; never serialized.
#[serde(skip)]
regex: SysRegex,
}
impl Clone for Replace {
fn clone(&self) -> Self {
Self::new(self.pattern.clone(), &self.content).unwrap()
}
}
/// Two `Replace` values are equal when their `pattern` and `content` are
/// equal; the compiled `regex` field is not compared.
impl PartialEq for Replace {
    fn eq(&self, other: &Self) -> bool {
        if self.pattern != other.pattern {
            return false;
        }
        self.content == other.content
    }
}
impl Replace {
pub fn new<I: Into<ReplacePattern>, C: Into<String>>(pattern: I, content: C) -> Result<Self> {
let pattern: ReplacePattern = pattern.into();
let regex = match &pattern {
ReplacePattern::String(s) => SysRegex::new(®ex::escape(s))?,
ReplacePattern::Regex(r) => SysRegex::new(r)?,
};
Ok(Self {
pattern,
content: content.into(),
regex,
})
}
}
/// As a normalizer, `Replace` delegates to `NormalizedString::replace`,
/// passing the compiled matcher and the replacement text.
impl Normalizer for Replace {
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        let Self { regex, content, .. } = self;
        normalized.replace(regex, content)
    }
}
/// As a decoder, `Replace` rewrites every token independently.
impl Decoder for Replace {
    /// Rebuilds each token from the spans reported by `find_matches`: spans
    /// flagged as matches are replaced by `content`, all other spans are
    /// copied from the token unchanged. The first error from `find_matches`
    /// aborts the whole call.
    fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>> {
        let mut decoded = Vec::with_capacity(tokens.len());
        for token in tokens {
            let mut rebuilt = String::new();
            for ((start, stop), is_match) in (&self.regex).find_matches(&token)? {
                let piece = if is_match {
                    self.content.as_str()
                } else {
                    &token[start..stop]
                };
                rebuilt.push_str(piece);
            }
            decoded.push(rebuilt);
        }
        Ok(decoded)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A literal pattern is replaced wherever it occurs.
    #[test]
    fn test_replace() {
        let original = "This is a ''test''";
        let normalized = "This is a \"test\"";

        let mut n = NormalizedString::from(original);
        Replace::new("''", "\"").unwrap().normalize(&mut n).unwrap();

        assert_eq!(&n.get(), &normalized);
    }

    /// A regex pattern collapses each run of whitespace into a single space.
    #[test]
    fn test_replace_regex() {
        // The input must contain runs of several whitespace characters,
        // otherwise replacing `\s+` with ' ' would leave it unchanged and the
        // test would not observe the replacement at all.
        let original = "This     is   a         test";
        let normalized = "This is a test";

        let mut n = NormalizedString::from(original);
        Replace::new(ReplacePattern::Regex(r"\s+".into()), ' ')
            .unwrap()
            .normalize(&mut n)
            .unwrap();

        assert_eq!(&n.get(), &normalized);
    }

    /// Both pattern kinds round-trip through JSON; the compiled regex is not
    /// part of the serialized form.
    #[test]
    fn serialization() {
        let replace = Replace::new("Hello", "Hey").unwrap();
        let replace_s = r#"{"type":"Replace","pattern":{"String":"Hello"},"content":"Hey"}"#;
        assert_eq!(serde_json::to_string(&replace).unwrap(), replace_s);
        assert_eq!(serde_json::from_str::<Replace>(replace_s).unwrap(), replace);

        let replace = Replace::new(ReplacePattern::Regex(r"\s+".into()), ' ').unwrap();
        let replace_s = r#"{"type":"Replace","pattern":{"Regex":"\\s+"},"content":" "}"#;
        assert_eq!(serde_json::to_string(&replace).unwrap(), replace_s);
        assert_eq!(serde_json::from_str::<Replace>(replace_s).unwrap(), replace);
    }

    /// Used as a decoder, the replacement is applied to every token.
    #[test]
    fn test_replace_decode() {
        let original = vec!["hello".to_string(), "_hello".to_string()];
        let replace = Replace::new("_", " ").unwrap();
        assert_eq!(
            replace.decode_chain(original).unwrap(),
            vec!["hello", " hello"]
        );
    }
}
|