Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/ContentChooser.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ccnet_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/hash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/minhash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ngramhash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/sem_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/simhash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/debertav3_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/fineweb_edu_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/heuristics.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/lexical_diversity_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perplexity_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perspective_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerFormatterFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerGroundTruthFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerJudger_MathVerify.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerNgramFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerPipelineRoot.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerTokenLengthFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/QuestionFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__init__.py +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_entity_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_url_remover_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/lowercase_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ner_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/pii_anonymize_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ref_removal_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_contractions_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_emoji_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_extra_spaces_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_image_ref_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_number_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_punctuation_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_repetitions_punctuation_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_stopwords_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/spelling_correction_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/stemming_lemmatization_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/text_normalization_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/html_entity_refiner.py +70 -0
- DataFlow/dataflow/operators/refine/GeneralText/html_url_remover_refiner.py +46 -0
- DataFlow/dataflow/operators/refine/GeneralText/lowercase_refiner.py +36 -0
- DataFlow/dataflow/operators/refine/GeneralText/ner_refiner.py +79 -0
- DataFlow/dataflow/operators/refine/GeneralText/pii_anonymize_refiner.py +62 -0
- DataFlow/dataflow/operators/refine/GeneralText/ref_removal_refiner.py +62 -0
- DataFlow/dataflow/operators/refine/GeneralText/remove_contractions_refiner.py +36 -0
DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/ContentChooser.cpython-310.pyc
ADDED
|
Binary file (7.67 kB). View file
|
|
|
DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (251 Bytes). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (205 Bytes). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ccnet_deduplicator.cpython-310.pyc
ADDED
|
Binary file (3.46 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/hash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (2.75 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/minhash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (3.43 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ngramhash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (3.45 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/sem_deduplicator.cpython-310.pyc
ADDED
|
Binary file (4.78 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/simhash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (2.92 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/debertav3_filter.cpython-310.pyc
ADDED
|
Binary file (1.92 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/fineweb_edu_filter.cpython-310.pyc
ADDED
|
Binary file (2.11 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/heuristics.cpython-310.pyc
ADDED
|
Binary file (36.2 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/lexical_diversity_filter.cpython-310.pyc
ADDED
|
Binary file (2.66 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perplexity_filter.cpython-310.pyc
ADDED
|
Binary file (1.84 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perspective_filter.cpython-310.pyc
ADDED
|
Binary file (1.91 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerFormatterFilter.cpython-310.pyc
ADDED
|
Binary file (3.36 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerGroundTruthFilter.cpython-310.pyc
ADDED
|
Binary file (3.29 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerJudger_MathVerify.cpython-310.pyc
ADDED
|
Binary file (3.75 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerNgramFilter.cpython-310.pyc
ADDED
|
Binary file (4.19 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerPipelineRoot.cpython-310.pyc
ADDED
|
Binary file (3.45 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerTokenLengthFilter.cpython-310.pyc
ADDED
|
Binary file (3.71 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/QuestionFilter.cpython-310.pyc
ADDED
|
Binary file (3.79 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (545 Bytes). View file
|
|
|
DataFlow/dataflow/operators/process/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (1.02 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__init__.py
ADDED
|
File without changes
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (1.57 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_entity_refiner.cpython-310.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_url_remover_refiner.cpython-310.pyc
ADDED
|
Binary file (2.08 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/lowercase_refiner.cpython-310.pyc
ADDED
|
Binary file (1.7 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ner_refiner.cpython-310.pyc
ADDED
|
Binary file (2.42 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/pii_anonymize_refiner.cpython-310.pyc
ADDED
|
Binary file (2.84 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ref_removal_refiner.cpython-310.pyc
ADDED
|
Binary file (2.24 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_contractions_refiner.cpython-310.pyc
ADDED
|
Binary file (1.75 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_emoji_refiner.cpython-310.pyc
ADDED
|
Binary file (2.1 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_extra_spaces_refiner.cpython-310.pyc
ADDED
|
Binary file (2 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_image_ref_refiner.cpython-310.pyc
ADDED
|
Binary file (2.41 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_number_refiner.cpython-310.pyc
ADDED
|
Binary file (1.88 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_punctuation_refiner.cpython-310.pyc
ADDED
|
Binary file (1.82 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_repetitions_punctuation_refiner.cpython-310.pyc
ADDED
|
Binary file (1.88 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_stopwords_refiner.cpython-310.pyc
ADDED
|
Binary file (2.39 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/spelling_correction_refiner.cpython-310.pyc
ADDED
|
Binary file (3.37 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/stemming_lemmatization_refiner.cpython-310.pyc
ADDED
|
Binary file (2.45 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/text_normalization_refiner.cpython-310.pyc
ADDED
|
Binary file (2.22 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/html_entity_refiner.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class HtmlEntityRefiner(OperatorABC):
    """Remove HTML entities (e.g. ``&nbsp;``, ``&lt;``) from a text column.

    Each configured entity is matched in four spellings, per the original
    author's notes: ASCII ``&`` + ASCII ``;``, full-width ``＆`` + ASCII
    ``;``, ASCII ``&`` + Chinese ``；``, and full-width ``＆`` + Chinese
    ``；``.
    """

    # Entities removed when the caller does not supply a custom list.
    DEFAULT_HTML_ENTITIES = (
        "nbsp", "lt", "gt", "amp", "quot", "apos", "hellip", "ndash", "mdash",
        "lsquo", "rsquo", "ldquo", "rdquo",
    )

    def __init__(self, html_entities: list = None):
        """
        Args:
            html_entities: entity names (without the ``&``/``;`` markers) to
                strip; defaults to DEFAULT_HTML_ENTITIES. Using ``None`` as
                the default (instead of a literal list) avoids the shared
                mutable-default-argument pitfall while keeping the same
                effective default for existing callers.
        """
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.html_entities = list(self.DEFAULT_HTML_ENTITIES) if html_entities is None else html_entities

        # Build one alternation that matches every entity in all four
        # ampersand/semicolon variants listed in the class docstring.
        # NOTE(review): the full-width ＆ / Chinese ； characters are
        # reconstructed from the original inline comments — confirm against
        # the pristine source file.
        entity_patterns = []
        for entity in self.html_entities:
            escaped = re.escape(entity)  # defensive: harmless for the default alnum names
            entity_patterns.append(fr'&{escaped};')    # &entity;
            entity_patterns.append(fr'＆{escaped};')   # full-width ampersand
            entity_patterns.append(fr'&{escaped}；')   # Chinese semicolon
            entity_patterns.append(fr'＆{escaped}；')  # full-width ampersand + Chinese semicolon
        self.html_entity_regex = re.compile('|'.join(entity_patterns))

    @staticmethod
    def get_desc(lang):
        return "去除文本中的HTML实体" if lang == "zh" else "Remove HTML entities from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Strip HTML entities from ``dataframe[input_key]``, write the
        dataframe back to ``storage``, and return the list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            original_text = item
            # Replace every matched HTML entity with the empty string.
            refined_text = self.html_entity_regex.sub('', original_text)
            if original_text != refined_text:
                item = refined_text
                numbers += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {original_text[:30]}... -> Refined: {refined_text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/html_url_remover_refiner.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class HtmlUrlRemoverRefiner(OperatorABC):
    """Strip URLs and HTML tags from a text column of the stored dataframe."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    @staticmethod
    def get_desc(lang: str = "zh"):
        return "去除文本中的URL和HTML标签" if lang == "zh" else "Remove URLs and HTML tags from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Remove URLs and HTML tags from ``dataframe[input_key]``, persist
        the dataframe, and return the list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        modified_count = 0
        cleaned_rows = []
        for original_text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            # Drop URLs first, then any remaining HTML tags.
            stripped = re.sub(r'https?:\/\/\S+[\r\n]*', '', original_text, flags=re.MULTILINE)
            stripped = re.sub(r'<.*?>', '', stripped)
            if stripped != original_text:
                modified_count += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {original_text[:30]}... -> Refined: {stripped[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {modified_count}")
                cleaned_rows.append(stripped)
            else:
                cleaned_rows.append(original_text)
        self.logger.info(f"Refining Complete. Total modified items: {modified_count}")
        dataframe[self.input_key] = cleaned_rows
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/lowercase_refiner.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class LowercaseRefiner(OperatorABC):
    """Lower-case every value in the configured text column."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    def run(self, storage: DataFlowStorage, input_key: str):
        """Lower-case ``dataframe[input_key]`` in place, persist the
        dataframe, and return the list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        changed = 0
        out_rows = []
        for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            lowered = text.lower()
            if lowered == text:
                out_rows.append(text)
            else:
                changed += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {text[:30]}... -> Refined: {lowered[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {changed}")
                out_rows.append(lowered)
        self.logger.info(f"Refining Complete. Total modified items: {changed}")
        dataframe[self.input_key] = out_rows
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/ner_refiner.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import spacy
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY

# Entity labels whose spans are masked by NERRefiner. The original literal
# repeated 14 keys (the duplicates silently overrode the earlier entries);
# this is the same effective mapping with each key listed once. URL/EMAIL
# are kept for completeness even though spaCy's stock NER does not emit
# them — a custom pipeline component might.
ENTITY_LABELS = {
    "PERSON": "[PERSON]",
    "ORG": "[ORG]",
    "GPE": "[GPE]",
    "LOC": "[LOC]",
    "PRODUCT": "[PRODUCT]",
    "EVENT": "[EVENT]",
    "DATE": "[DATE]",
    "TIME": "[TIME]",
    "MONEY": "[MONEY]",
    "PERCENT": "[PERCENT]",
    "QUANTITY": "[QUANTITY]",
    "ORDINAL": "[ORDINAL]",
    "CARDINAL": "[CARDINAL]",
    "NORP": "[NORP]",
    "FAC": "[FAC]",
    "LAW": "[LAW]",
    "LANGUAGE": "[LANGUAGE]",
    "WORK_OF_ART": "[WORK_OF_ART]",
    "URL": "[URL]",
    "EMAIL": "[EMAIL]",
}


@OPERATOR_REGISTRY.register()
class NERRefiner(OperatorABC):
    """Mask named entities in a text column with ``[LABEL]`` placeholders
    using spaCy's ``en_core_web_sm`` pipeline."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.nlp = spacy.load("en_core_web_sm")

    def run(self, storage: DataFlowStorage, input_key: str):
        """Replace recognized entity spans in ``dataframe[input_key]`` with
        their ``[LABEL]`` placeholder, persist the dataframe, and return the
        list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            original_text = item
            refined_text = original_text
            doc = self.nlp(refined_text)
            for ent in doc.ents:
                if ent.label_ in ENTITY_LABELS:
                    # NOTE: str.replace masks every occurrence of the span's
                    # surface text, not only the detected offset.
                    refined_text = refined_text.replace(ent.text, f"[{ent.label_}]")
            if original_text != refined_text:
                item = refined_text
                numbers += 1
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/pii_anonymize_refiner.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from tqdm import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer
from presidio_analyzer.nlp_engine import TransformersNlpEngine
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class PIIAnonymizeRefiner(OperatorABC):
    """Anonymize personally identifiable information (PII) in a text column
    using Presidio with a transformers-based NER model."""

    def __init__(self, lang='en', device='cuda', model_cache_dir='./dataflow_cache', model_name='dslim/bert-base-NER', ):
        """
        Args:
            lang: language code passed to the Presidio analyzer.
            device: torch device for the NER model.
            model_cache_dir: HuggingFace cache directory.
            model_name: token-classification model to use. Bug fix: the
                original re-assigned a hard-coded 'dslim/bert-base-NER'
                here, silently ignoring any caller-supplied model.
        """
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.lang = lang
        self.device = device
        self.model_cache_dir = model_cache_dir
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
        model_config = [{
            "lang_code": self.lang,
            "model_name": {
                "spacy": "en_core_web_sm",
                "transformers": self.model_name
            }
        }]

        self.nlp_engine = TransformersNlpEngine(models=model_config)
        self.analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine)
        self.anonymizer = AnonymizerEngine()

    @staticmethod
    def get_desc(lang: str = "zh"):
        # Bug fix: the original returned the HtmlUrlRemoverRefiner's
        # description (copy-paste error).
        return "对文本中的个人身份信息(PII)进行匿名化处理" if lang == "zh" else "Anonymize personally identifiable information (PII) in the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Anonymize PII in ``dataframe[input_key]``, persist the dataframe,
        and return the list of touched keys."""
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        anonymized_count = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            original_text = item
            results = self.analyzer.analyze(original_text, language=self.lang)
            anonymized_text = self.anonymizer.anonymize(original_text, results)
            if original_text != anonymized_text.text:
                item = anonymized_text.text
                anonymized_count += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {original_text[:30]}... -> Refined: {anonymized_text.text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {anonymized_count}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {anonymized_count}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/ref_removal_refiner.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class ReferenceRemoverRefiner(OperatorABC):
    """Remove wiki-style reference markup (``<ref>`` tags and ``{{cite}}``
    templates, including unclosed/truncated forms) from a text column."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__}...")

    @staticmethod
    def get_desc(lang):
        return "删除文本中未闭合的引用标签和引用链接" if lang == "zh" else "Remove unclosed reference tags and citation links from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Strip ref/cite markup from ``dataframe[input_key]``, persist the
        dataframe, and return the list of touched keys.

        Bug fixes vs. the original:
        - the cleaned ``refined_text`` was never assigned back to ``item``,
          so the unmodified text was appended and all cleaning was discarded;
        - ``numbers`` was incremented twice per modified item (once under the
          count check and again under ``if modified:``), doubling the total.
        """
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        numbers = 0
        # <ref> tags: complete pairs, unclosed tags running to end-of-string,
        # and tags followed by a stray /br.
        ref_pattern = re.compile(
            r'<ref\b[^>]*>.*?</ref>|'   # complete ref tag
            r'<ref\b[^>]*>[^<]*$|'      # unclosed ref tag (no closing tag)
            r'<ref\b[^>]*>.*?/br'       # ref tag followed by /br
        )

        # {{cite}} templates: complete and unclosed forms.
        cite_pattern = re.compile(
            r'\{\{cite\s+\w+\|[^}]*\}\}|'  # complete cite template
            r'\{\{cite\s+\w+\|[^}]*$'      # unclosed cite template
        )

        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            refined_text, ref_count = ref_pattern.subn('', item)
            refined_text, cite_count = cite_pattern.subn('', refined_text)
            if ref_count > 0 or cite_count > 0:
                # Keep the cleaned text (the original dropped it) and count
                # the item exactly once.
                item = refined_text
                numbers += 1
                self.logger.debug(f"Item modified, removed {ref_count} ref tags and {cite_count} cite templates")
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/remove_contractions_refiner.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import contractions
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class RemoveContractionsRefiner(OperatorABC):
    """Expand English contractions (e.g. "don't" -> "do not") in a text
    column via the ``contractions`` package."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    def run(self, storage: DataFlowStorage, input_key: str):
        """Expand contractions in ``dataframe[input_key]``, persist the
        dataframe, and return the list of touched keys."""
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        changed = 0
        result_rows = []
        for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            expanded = contractions.fix(text)
            if expanded == text:
                result_rows.append(text)
            else:
                changed += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {text[:30]}... -> Refined: {expanded[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {changed}")
                result_rows.append(expanded)
        self.logger.info(f"Refining Complete. Total modified items: {changed}")
        dataframe[self.input_key] = result_rows
        storage.write(dataframe)
        return [self.input_key]
|