Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/ContentChooser.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ccnet_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/hash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/minhash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ngramhash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/sem_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/simhash_deduplicator.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/debertav3_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/fineweb_edu_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/heuristics.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/lexical_diversity_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perplexity_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perspective_filter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerFormatterFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerGroundTruthFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerJudger_MathVerify.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerNgramFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerPipelineRoot.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerTokenLengthFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/QuestionFilter.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/Reasoning/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/process/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__init__.py +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/__init__.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_entity_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_url_remover_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/lowercase_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ner_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/pii_anonymize_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ref_removal_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_contractions_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_emoji_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_extra_spaces_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_image_ref_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_number_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_punctuation_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_repetitions_punctuation_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_stopwords_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/spelling_correction_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/stemming_lemmatization_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/__pycache__/text_normalization_refiner.cpython-310.pyc +0 -0
- DataFlow/dataflow/operators/refine/GeneralText/html_entity_refiner.py +70 -0
- DataFlow/dataflow/operators/refine/GeneralText/html_url_remover_refiner.py +46 -0
- DataFlow/dataflow/operators/refine/GeneralText/lowercase_refiner.py +36 -0
- DataFlow/dataflow/operators/refine/GeneralText/ner_refiner.py +79 -0
- DataFlow/dataflow/operators/refine/GeneralText/pii_anonymize_refiner.py +62 -0
- DataFlow/dataflow/operators/refine/GeneralText/ref_removal_refiner.py +62 -0
- DataFlow/dataflow/operators/refine/GeneralText/remove_contractions_refiner.py +36 -0
DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/ContentChooser.cpython-310.pyc
ADDED
|
Binary file (7.67 kB). View file
|
|
|
DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (251 Bytes). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (205 Bytes). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ccnet_deduplicator.cpython-310.pyc
ADDED
|
Binary file (3.46 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/hash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (2.75 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/minhash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (3.43 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ngramhash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (3.45 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/sem_deduplicator.cpython-310.pyc
ADDED
|
Binary file (4.78 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/simhash_deduplicator.cpython-310.pyc
ADDED
|
Binary file (2.92 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/debertav3_filter.cpython-310.pyc
ADDED
|
Binary file (1.92 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/fineweb_edu_filter.cpython-310.pyc
ADDED
|
Binary file (2.11 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/heuristics.cpython-310.pyc
ADDED
|
Binary file (36.2 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/lexical_diversity_filter.cpython-310.pyc
ADDED
|
Binary file (2.66 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perplexity_filter.cpython-310.pyc
ADDED
|
Binary file (1.84 kB). View file
|
|
|
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perspective_filter.cpython-310.pyc
ADDED
|
Binary file (1.91 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerFormatterFilter.cpython-310.pyc
ADDED
|
Binary file (3.36 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerGroundTruthFilter.cpython-310.pyc
ADDED
|
Binary file (3.29 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerJudger_MathVerify.cpython-310.pyc
ADDED
|
Binary file (3.75 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerNgramFilter.cpython-310.pyc
ADDED
|
Binary file (4.19 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerPipelineRoot.cpython-310.pyc
ADDED
|
Binary file (3.45 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerTokenLengthFilter.cpython-310.pyc
ADDED
|
Binary file (3.71 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/QuestionFilter.cpython-310.pyc
ADDED
|
Binary file (3.79 kB). View file
|
|
|
DataFlow/dataflow/operators/process/Reasoning/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (545 Bytes). View file
|
|
|
DataFlow/dataflow/operators/process/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (1.02 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__init__.py
ADDED
|
File without changes
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (1.57 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_entity_refiner.cpython-310.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_url_remover_refiner.cpython-310.pyc
ADDED
|
Binary file (2.08 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/lowercase_refiner.cpython-310.pyc
ADDED
|
Binary file (1.7 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ner_refiner.cpython-310.pyc
ADDED
|
Binary file (2.42 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/pii_anonymize_refiner.cpython-310.pyc
ADDED
|
Binary file (2.84 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ref_removal_refiner.cpython-310.pyc
ADDED
|
Binary file (2.24 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_contractions_refiner.cpython-310.pyc
ADDED
|
Binary file (1.75 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_emoji_refiner.cpython-310.pyc
ADDED
|
Binary file (2.1 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_extra_spaces_refiner.cpython-310.pyc
ADDED
|
Binary file (2 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_image_ref_refiner.cpython-310.pyc
ADDED
|
Binary file (2.41 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_number_refiner.cpython-310.pyc
ADDED
|
Binary file (1.88 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_punctuation_refiner.cpython-310.pyc
ADDED
|
Binary file (1.82 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_repetitions_punctuation_refiner.cpython-310.pyc
ADDED
|
Binary file (1.88 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_stopwords_refiner.cpython-310.pyc
ADDED
|
Binary file (2.39 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/spelling_correction_refiner.cpython-310.pyc
ADDED
|
Binary file (3.37 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/stemming_lemmatization_refiner.cpython-310.pyc
ADDED
|
Binary file (2.45 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/text_normalization_refiner.cpython-310.pyc
ADDED
|
Binary file (2.22 kB). View file
|
|
|
DataFlow/dataflow/operators/refine/GeneralText/html_entity_refiner.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class HtmlEntityRefiner(OperatorABC):
    """Remove HTML entities (e.g. ``&nbsp;``, ``&lt;``) from a text column.

    Each configured entity is matched in four spellings, per the original
    author's notes: ASCII ``&`` + ASCII ``;``, full-width ``＆`` + ASCII
    ``;``, ASCII ``&`` + Chinese ``；``, and full-width ``＆`` + Chinese
    ``；``.
    """

    # Entities removed when the caller does not supply a custom list.
    DEFAULT_HTML_ENTITIES = (
        "nbsp", "lt", "gt", "amp", "quot", "apos", "hellip", "ndash", "mdash",
        "lsquo", "rsquo", "ldquo", "rdquo",
    )

    def __init__(self, html_entities: list = None):
        """
        Args:
            html_entities: entity names (without the ``&``/``;`` markers) to
                strip; defaults to DEFAULT_HTML_ENTITIES. Using ``None`` as
                the default (instead of a literal list) avoids the shared
                mutable-default-argument pitfall while keeping the same
                effective default for existing callers.
        """
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.html_entities = list(self.DEFAULT_HTML_ENTITIES) if html_entities is None else html_entities

        # Build one alternation that matches every entity in all four
        # ampersand/semicolon variants listed in the class docstring.
        # NOTE(review): the full-width ＆ / Chinese ； characters are
        # reconstructed from the original inline comments — confirm against
        # the pristine source file.
        entity_patterns = []
        for entity in self.html_entities:
            escaped = re.escape(entity)  # defensive: harmless for the default alnum names
            entity_patterns.append(fr'&{escaped};')    # &entity;
            entity_patterns.append(fr'＆{escaped};')   # full-width ampersand
            entity_patterns.append(fr'&{escaped}；')   # Chinese semicolon
            entity_patterns.append(fr'＆{escaped}；')  # full-width ampersand + Chinese semicolon
        self.html_entity_regex = re.compile('|'.join(entity_patterns))

    @staticmethod
    def get_desc(lang):
        return "去除文本中的HTML实体" if lang == "zh" else "Remove HTML entities from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Strip HTML entities from ``dataframe[input_key]``, write the
        dataframe back to ``storage``, and return the list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            original_text = item
            # Replace every matched HTML entity with the empty string.
            refined_text = self.html_entity_regex.sub('', original_text)
            if original_text != refined_text:
                item = refined_text
                numbers += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {original_text[:30]}... -> Refined: {refined_text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/html_url_remover_refiner.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class HtmlUrlRemoverRefiner(OperatorABC):
    """Strip URLs and HTML tags from a text column of the stored dataframe."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    @staticmethod
    def get_desc(lang: str = "zh"):
        return "去除文本中的URL和HTML标签" if lang == "zh" else "Remove URLs and HTML tags from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Remove URLs and HTML tags from ``dataframe[input_key]``, persist
        the dataframe, and return the list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        modified_count = 0
        cleaned_rows = []
        for original_text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            # Drop URLs first, then any remaining HTML tags.
            stripped = re.sub(r'https?:\/\/\S+[\r\n]*', '', original_text, flags=re.MULTILINE)
            stripped = re.sub(r'<.*?>', '', stripped)
            if stripped != original_text:
                modified_count += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {original_text[:30]}... -> Refined: {stripped[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {modified_count}")
                cleaned_rows.append(stripped)
            else:
                cleaned_rows.append(original_text)
        self.logger.info(f"Refining Complete. Total modified items: {modified_count}")
        dataframe[self.input_key] = cleaned_rows
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/lowercase_refiner.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class LowercaseRefiner(OperatorABC):
    """Lower-case every value in the configured text column."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    def run(self, storage: DataFlowStorage, input_key: str):
        """Lower-case ``dataframe[input_key]`` in place, persist the
        dataframe, and return the list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        changed = 0
        out_rows = []
        for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            lowered = text.lower()
            if lowered == text:
                out_rows.append(text)
            else:
                changed += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {text[:30]}... -> Refined: {lowered[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {changed}")
                out_rows.append(lowered)
        self.logger.info(f"Refining Complete. Total modified items: {changed}")
        dataframe[self.input_key] = out_rows
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/ner_refiner.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import spacy
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY

# Entity labels whose spans are masked by NERRefiner. The original literal
# repeated 14 keys (the duplicates silently overrode the earlier entries);
# this is the same effective mapping with each key listed once. URL/EMAIL
# are kept for completeness even though spaCy's stock NER does not emit
# them — a custom pipeline component might.
ENTITY_LABELS = {
    "PERSON": "[PERSON]",
    "ORG": "[ORG]",
    "GPE": "[GPE]",
    "LOC": "[LOC]",
    "PRODUCT": "[PRODUCT]",
    "EVENT": "[EVENT]",
    "DATE": "[DATE]",
    "TIME": "[TIME]",
    "MONEY": "[MONEY]",
    "PERCENT": "[PERCENT]",
    "QUANTITY": "[QUANTITY]",
    "ORDINAL": "[ORDINAL]",
    "CARDINAL": "[CARDINAL]",
    "NORP": "[NORP]",
    "FAC": "[FAC]",
    "LAW": "[LAW]",
    "LANGUAGE": "[LANGUAGE]",
    "WORK_OF_ART": "[WORK_OF_ART]",
    "URL": "[URL]",
    "EMAIL": "[EMAIL]",
}


@OPERATOR_REGISTRY.register()
class NERRefiner(OperatorABC):
    """Mask named entities in a text column with ``[LABEL]`` placeholders
    using spaCy's ``en_core_web_sm`` pipeline."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.nlp = spacy.load("en_core_web_sm")

    def run(self, storage: DataFlowStorage, input_key: str):
        """Replace recognized entity spans in ``dataframe[input_key]`` with
        their ``[LABEL]`` placeholder, persist the dataframe, and return the
        list of touched keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            original_text = item
            refined_text = original_text
            doc = self.nlp(refined_text)
            for ent in doc.ents:
                if ent.label_ in ENTITY_LABELS:
                    # NOTE: str.replace masks every occurrence of the span's
                    # surface text, not only the detected offset.
                    refined_text = refined_text.replace(ent.text, f"[{ent.label_}]")
            if original_text != refined_text:
                item = refined_text
                numbers += 1
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/pii_anonymize_refiner.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from tqdm import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer
from presidio_analyzer.nlp_engine import TransformersNlpEngine
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class PIIAnonymizeRefiner(OperatorABC):
    """Anonymize personally identifiable information (PII) in a text column
    using Presidio with a transformers-based NER model."""

    def __init__(self, lang='en', device='cuda', model_cache_dir='./dataflow_cache', model_name='dslim/bert-base-NER', ):
        """
        Args:
            lang: language code passed to the Presidio analyzer.
            device: torch device for the NER model.
            model_cache_dir: HuggingFace cache directory.
            model_name: token-classification model to use. Bug fix: the
                original re-assigned a hard-coded 'dslim/bert-base-NER'
                here, silently ignoring any caller-supplied model.
        """
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.lang = lang
        self.device = device
        self.model_cache_dir = model_cache_dir
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
        model_config = [{
            "lang_code": self.lang,
            "model_name": {
                "spacy": "en_core_web_sm",
                "transformers": self.model_name
            }
        }]

        self.nlp_engine = TransformersNlpEngine(models=model_config)
        self.analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine)
        self.anonymizer = AnonymizerEngine()

    @staticmethod
    def get_desc(lang: str = "zh"):
        # Bug fix: the original returned the HtmlUrlRemoverRefiner's
        # description (copy-paste error).
        return "对文本中的个人身份信息(PII)进行匿名化处理" if lang == "zh" else "Anonymize personally identifiable information (PII) in the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Anonymize PII in ``dataframe[input_key]``, persist the dataframe,
        and return the list of touched keys."""
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        anonymized_count = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            original_text = item
            results = self.analyzer.analyze(original_text, language=self.lang)
            anonymized_text = self.anonymizer.anonymize(original_text, results)
            if original_text != anonymized_text.text:
                item = anonymized_text.text
                anonymized_count += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {original_text[:30]}... -> Refined: {anonymized_text.text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {anonymized_count}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {anonymized_count}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/ref_removal_refiner.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class ReferenceRemoverRefiner(OperatorABC):
    """Remove wiki-style reference markup (``<ref>`` tags and ``{{cite}}``
    templates, including unclosed/truncated forms) from a text column."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__}...")

    @staticmethod
    def get_desc(lang):
        return "删除文本中未闭合的引用标签和引用链接" if lang == "zh" else "Remove unclosed reference tags and citation links from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Strip ref/cite markup from ``dataframe[input_key]``, persist the
        dataframe, and return the list of touched keys.

        Bug fixes vs. the original:
        - the cleaned ``refined_text`` was never assigned back to ``item``,
          so the unmodified text was appended and all cleaning was discarded;
        - ``numbers`` was incremented twice per modified item (once under the
          count check and again under ``if modified:``), doubling the total.
        """
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        numbers = 0
        # <ref> tags: complete pairs, unclosed tags running to end-of-string,
        # and tags followed by a stray /br.
        ref_pattern = re.compile(
            r'<ref\b[^>]*>.*?</ref>|'   # complete ref tag
            r'<ref\b[^>]*>[^<]*$|'      # unclosed ref tag (no closing tag)
            r'<ref\b[^>]*>.*?/br'       # ref tag followed by /br
        )

        # {{cite}} templates: complete and unclosed forms.
        cite_pattern = re.compile(
            r'\{\{cite\s+\w+\|[^}]*\}\}|'  # complete cite template
            r'\{\{cite\s+\w+\|[^}]*$'      # unclosed cite template
        )

        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            refined_text, ref_count = ref_pattern.subn('', item)
            refined_text, cite_count = cite_pattern.subn('', refined_text)
            if ref_count > 0 or cite_count > 0:
                # Keep the cleaned text (the original dropped it) and count
                # the item exactly once.
                item = refined_text
                numbers += 1
                self.logger.debug(f"Item modified, removed {ref_count} ref tags and {cite_count} cite templates")
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
|
DataFlow/dataflow/operators/refine/GeneralText/remove_contractions_refiner.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import contractions
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY


@OPERATOR_REGISTRY.register()
class RemoveContractionsRefiner(OperatorABC):
    """Expand English contractions (e.g. "don't" -> "do not") in a text
    column via the ``contractions`` package."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    def run(self, storage: DataFlowStorage, input_key: str):
        """Expand contractions in ``dataframe[input_key]``, persist the
        dataframe, and return the list of touched keys."""
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        changed = 0
        result_rows = []
        for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            expanded = contractions.fix(text)
            if expanded == text:
                result_rows.append(text)
            else:
                changed += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {text[:30]}... -> Refined: {expanded[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {changed}")
                result_rows.append(expanded)
        self.logger.info(f"Refining Complete. Total modified items: {changed}")
        dataframe[self.input_key] = result_rows
        storage.write(dataframe)
        return [self.input_key]
|