Miaode committed on
Commit
2917cfc
·
verified ·
1 Parent(s): 4742103

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/ContentChooser.cpython-310.pyc +0 -0
  2. DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/__init__.cpython-310.pyc +0 -0
  3. DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/__init__.cpython-310.pyc +0 -0
  4. DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ccnet_deduplicator.cpython-310.pyc +0 -0
  5. DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/hash_deduplicator.cpython-310.pyc +0 -0
  6. DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/minhash_deduplicator.cpython-310.pyc +0 -0
  7. DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ngramhash_deduplicator.cpython-310.pyc +0 -0
  8. DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/sem_deduplicator.cpython-310.pyc +0 -0
  9. DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/simhash_deduplicator.cpython-310.pyc +0 -0
  10. DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/debertav3_filter.cpython-310.pyc +0 -0
  11. DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/fineweb_edu_filter.cpython-310.pyc +0 -0
  12. DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/heuristics.cpython-310.pyc +0 -0
  13. DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/lexical_diversity_filter.cpython-310.pyc +0 -0
  14. DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perplexity_filter.cpython-310.pyc +0 -0
  15. DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perspective_filter.cpython-310.pyc +0 -0
  16. DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerFormatterFilter.cpython-310.pyc +0 -0
  17. DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerGroundTruthFilter.cpython-310.pyc +0 -0
  18. DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerJudger_MathVerify.cpython-310.pyc +0 -0
  19. DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerNgramFilter.cpython-310.pyc +0 -0
  20. DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerPipelineRoot.cpython-310.pyc +0 -0
  21. DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerTokenLengthFilter.cpython-310.pyc +0 -0
  22. DataFlow/dataflow/operators/process/Reasoning/__pycache__/QuestionFilter.cpython-310.pyc +0 -0
  23. DataFlow/dataflow/operators/process/Reasoning/__pycache__/__init__.cpython-310.pyc +0 -0
  24. DataFlow/dataflow/operators/process/__pycache__/__init__.cpython-310.pyc +0 -0
  25. DataFlow/dataflow/operators/refine/GeneralText/__init__.py +0 -0
  26. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/__init__.cpython-310.pyc +0 -0
  27. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_entity_refiner.cpython-310.pyc +0 -0
  28. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_url_remover_refiner.cpython-310.pyc +0 -0
  29. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/lowercase_refiner.cpython-310.pyc +0 -0
  30. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ner_refiner.cpython-310.pyc +0 -0
  31. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/pii_anonymize_refiner.cpython-310.pyc +0 -0
  32. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ref_removal_refiner.cpython-310.pyc +0 -0
  33. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_contractions_refiner.cpython-310.pyc +0 -0
  34. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_emoji_refiner.cpython-310.pyc +0 -0
  35. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_extra_spaces_refiner.cpython-310.pyc +0 -0
  36. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_image_ref_refiner.cpython-310.pyc +0 -0
  37. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_number_refiner.cpython-310.pyc +0 -0
  38. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_punctuation_refiner.cpython-310.pyc +0 -0
  39. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_repetitions_punctuation_refiner.cpython-310.pyc +0 -0
  40. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_stopwords_refiner.cpython-310.pyc +0 -0
  41. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/spelling_correction_refiner.cpython-310.pyc +0 -0
  42. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/stemming_lemmatization_refiner.cpython-310.pyc +0 -0
  43. DataFlow/dataflow/operators/refine/GeneralText/__pycache__/text_normalization_refiner.cpython-310.pyc +0 -0
  44. DataFlow/dataflow/operators/refine/GeneralText/html_entity_refiner.py +70 -0
  45. DataFlow/dataflow/operators/refine/GeneralText/html_url_remover_refiner.py +46 -0
  46. DataFlow/dataflow/operators/refine/GeneralText/lowercase_refiner.py +36 -0
  47. DataFlow/dataflow/operators/refine/GeneralText/ner_refiner.py +79 -0
  48. DataFlow/dataflow/operators/refine/GeneralText/pii_anonymize_refiner.py +62 -0
  49. DataFlow/dataflow/operators/refine/GeneralText/ref_removal_refiner.py +62 -0
  50. DataFlow/dataflow/operators/refine/GeneralText/remove_contractions_refiner.py +36 -0
DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/ContentChooser.cpython-310.pyc ADDED
Binary file (7.67 kB). View file
 
DataFlow/dataflow/operators/process/AgenticRAG/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (251 Bytes). View file
 
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (205 Bytes). View file
 
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ccnet_deduplicator.cpython-310.pyc ADDED
Binary file (3.46 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/hash_deduplicator.cpython-310.pyc ADDED
Binary file (2.75 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/minhash_deduplicator.cpython-310.pyc ADDED
Binary file (3.43 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/ngramhash_deduplicator.cpython-310.pyc ADDED
Binary file (3.45 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/sem_deduplicator.cpython-310.pyc ADDED
Binary file (4.78 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/deduplicators/__pycache__/simhash_deduplicator.cpython-310.pyc ADDED
Binary file (2.92 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/debertav3_filter.cpython-310.pyc ADDED
Binary file (1.92 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/fineweb_edu_filter.cpython-310.pyc ADDED
Binary file (2.11 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/heuristics.cpython-310.pyc ADDED
Binary file (36.2 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/lexical_diversity_filter.cpython-310.pyc ADDED
Binary file (2.66 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perplexity_filter.cpython-310.pyc ADDED
Binary file (1.84 kB). View file
 
DataFlow/dataflow/operators/process/GeneralText/filters/__pycache__/perspective_filter.cpython-310.pyc ADDED
Binary file (1.91 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerFormatterFilter.cpython-310.pyc ADDED
Binary file (3.36 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerGroundTruthFilter.cpython-310.pyc ADDED
Binary file (3.29 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerJudger_MathVerify.cpython-310.pyc ADDED
Binary file (3.75 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerNgramFilter.cpython-310.pyc ADDED
Binary file (4.19 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerPipelineRoot.cpython-310.pyc ADDED
Binary file (3.45 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/AnswerTokenLengthFilter.cpython-310.pyc ADDED
Binary file (3.71 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/QuestionFilter.cpython-310.pyc ADDED
Binary file (3.79 kB). View file
 
DataFlow/dataflow/operators/process/Reasoning/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (545 Bytes). View file
 
DataFlow/dataflow/operators/process/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.02 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__init__.py ADDED
File without changes
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.57 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_entity_refiner.cpython-310.pyc ADDED
Binary file (2.3 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/html_url_remover_refiner.cpython-310.pyc ADDED
Binary file (2.08 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/lowercase_refiner.cpython-310.pyc ADDED
Binary file (1.7 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ner_refiner.cpython-310.pyc ADDED
Binary file (2.42 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/pii_anonymize_refiner.cpython-310.pyc ADDED
Binary file (2.84 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/ref_removal_refiner.cpython-310.pyc ADDED
Binary file (2.24 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_contractions_refiner.cpython-310.pyc ADDED
Binary file (1.75 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_emoji_refiner.cpython-310.pyc ADDED
Binary file (2.1 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_extra_spaces_refiner.cpython-310.pyc ADDED
Binary file (2 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_image_ref_refiner.cpython-310.pyc ADDED
Binary file (2.41 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_number_refiner.cpython-310.pyc ADDED
Binary file (1.88 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_punctuation_refiner.cpython-310.pyc ADDED
Binary file (1.82 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_repetitions_punctuation_refiner.cpython-310.pyc ADDED
Binary file (1.88 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/remove_stopwords_refiner.cpython-310.pyc ADDED
Binary file (2.39 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/spelling_correction_refiner.cpython-310.pyc ADDED
Binary file (3.37 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/stemming_lemmatization_refiner.cpython-310.pyc ADDED
Binary file (2.45 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/__pycache__/text_normalization_refiner.cpython-310.pyc ADDED
Binary file (2.22 kB). View file
 
DataFlow/dataflow/operators/refine/GeneralText/html_entity_refiner.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from tqdm import tqdm
3
+ from dataflow import get_logger
4
+ from dataflow.core import OperatorABC
5
+ from dataflow.utils.storage import DataFlowStorage
6
+ from dataflow.utils.registry import OPERATOR_REGISTRY
7
+
8
@OPERATOR_REGISTRY.register()
class HtmlEntityRefiner(OperatorABC):
    """Remove HTML entities (e.g. ``&nbsp;``, ``&amp;``) from a text column.

    Each configured entity name is matched in four spellings, per the original
    design notes: ASCII ``&name;`` plus the variants using the fullwidth
    ampersand (U+FF06) and/or the fullwidth semicolon (U+FF1B) that appear in
    CJK text.
    """

    # Default entity names stripped when the caller supplies none.
    DEFAULT_HTML_ENTITIES = (
        "nbsp", "lt", "gt", "amp", "quot", "apos", "hellip", "ndash", "mdash",
        "lsquo", "rsquo", "ldquo", "rdquo",
    )

    def __init__(self, html_entities: list = None):
        """Build the compiled entity-matching regex.

        Args:
            html_entities: Optional list of entity names (without ``&``/``;``).
                Defaults to :data:`DEFAULT_HTML_ENTITIES`. A ``None`` default is
                used instead of a list literal to avoid the shared
                mutable-default-argument pitfall.
        """
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.html_entities = (
            list(self.DEFAULT_HTML_ENTITIES) if html_entities is None else html_entities
        )

        # One alternation covering every entity in every spelling:
        #   1. &name;            (ASCII)
        #   2. ＆name;           (fullwidth &)
        #   3. &name；           (fullwidth ;)
        #   4. ＆name；          (fullwidth & + fullwidth ;)
        # The original code appended four byte-identical ASCII patterns here;
        # the fullwidth characters are now spelled out explicitly.
        entity_patterns = [
            f'{amp}{re.escape(entity)}{semi}'
            for entity in self.html_entities
            for amp in ('&', '\uff06')
            for semi in (';', '\uff1b')
        ]
        self.html_entity_regex = re.compile('|'.join(entity_patterns))

    @staticmethod
    def get_desc(lang):
        return "去除文本中的HTML实体" if lang == "zh" else "Remove HTML entities from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Strip HTML entities from ``dataframe[input_key]`` in place.

        Returns:
            ``[input_key]`` — the list of keys this operator wrote.
        """
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            refined_text = self.html_entity_regex.sub('', item)
            if item != refined_text:
                numbers += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {item[:30]}... -> Refined: {refined_text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
                item = refined_text
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
DataFlow/dataflow/operators/refine/GeneralText/html_url_remover_refiner.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from tqdm import tqdm
3
+ from dataflow import get_logger
4
+ from dataflow.core import OperatorABC
5
+ from dataflow.utils.storage import DataFlowStorage
6
+ from dataflow.utils.registry import OPERATOR_REGISTRY
7
+
8
@OPERATOR_REGISTRY.register()
class HtmlUrlRemoverRefiner(OperatorABC):
    """Strip URLs and HTML tags from a text column, writing the result back."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    @staticmethod
    def get_desc(lang: str = "zh"):
        return "去除文本中的URL和HTML标签" if lang == "zh" else "Remove URLs and HTML tags from the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Clean every value under *input_key*; returns the list of written keys."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        modified_count = 0
        cleaned_items = []
        for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            # First drop URLs (including their trailing newlines), then any
            # remaining angle-bracket tags.
            stripped = re.sub(r'https?:\/\/\S+[\r\n]*', '', text, flags=re.MULTILINE)
            stripped = re.sub(r'<.*?>', '', stripped)
            if stripped == text:
                cleaned_items.append(text)
            else:
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {text[:30]}... -> Refined: {stripped[:30]}...")
                cleaned_items.append(stripped)
                modified_count += 1
                self.logger.debug(f"Item modified, total modified so far: {modified_count}")
        self.logger.info(f"Refining Complete. Total modified items: {modified_count}")
        dataframe[self.input_key] = cleaned_items
        storage.write(dataframe)
        return [self.input_key]
DataFlow/dataflow/operators/refine/GeneralText/lowercase_refiner.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from tqdm import tqdm
3
+ from dataflow import get_logger
4
+ from dataflow.core import OperatorABC
5
+ from dataflow.utils.storage import DataFlowStorage
6
+ from dataflow.utils.registry import OPERATOR_REGISTRY
7
+
8
@OPERATOR_REGISTRY.register()
class LowercaseRefiner(OperatorABC):
    """Lowercase every value in a text column."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    @staticmethod
    def get_desc(lang: str = "zh"):
        # Added for consistency: every sibling refiner exposes get_desc.
        return "将文本转换为小写" if lang == "zh" else "Convert the text to lowercase."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Lowercase ``dataframe[input_key]`` in place; returns ``[input_key]``."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            lower_text = item.lower()
            if item != lower_text:
                numbers += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {item[:30]}... -> Refined: {lower_text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
                item = lower_text
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
DataFlow/dataflow/operators/refine/GeneralText/ner_refiner.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from tqdm import tqdm
3
+ from dataflow import get_logger
4
+ from dataflow.core import OperatorABC
5
+ from dataflow.utils.storage import DataFlowStorage
6
+ from dataflow.utils.registry import OPERATOR_REGISTRY
7
+
8
# spaCy entity labels to mask, mapped to their placeholder tags.
# The original literal repeated 15 keys (silently overwritten by the later
# duplicate); each key appears exactly once here. Every value is "[KEY]",
# so runtime behavior is unchanged.
ENTITY_LABELS = {
    "PERSON": "[PERSON]",
    "ORG": "[ORG]",
    "GPE": "[GPE]",
    "LOC": "[LOC]",
    "PRODUCT": "[PRODUCT]",
    "EVENT": "[EVENT]",
    "DATE": "[DATE]",
    "TIME": "[TIME]",
    "MONEY": "[MONEY]",
    "PERCENT": "[PERCENT]",
    "QUANTITY": "[QUANTITY]",
    "ORDINAL": "[ORDINAL]",
    "CARDINAL": "[CARDINAL]",
    "NORP": "[NORP]",
    "FAC": "[FAC]",
    "LAW": "[LAW]",
    "LANGUAGE": "[LANGUAGE]",
    "WORK_OF_ART": "[WORK_OF_ART]",
    "URL": "[URL]",
    "EMAIL": "[EMAIL]",
}
44
+
45
@OPERATOR_REGISTRY.register()
class NERRefiner(OperatorABC):
    """Mask named entities in a text column with placeholder tags (e.g. [PERSON])."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        # NOTE(review): assumes the en_core_web_sm model is installed locally.
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def get_desc(lang: str = "zh"):
        # Added for consistency: every sibling refiner exposes get_desc.
        return "用占位符替换文本中的命名实体" if lang == "zh" else "Replace named entities in the text with placeholder tags."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Replace recognized entities in ``dataframe[input_key]``; returns ``[input_key]``."""
        self.input_key = input_key
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        dataframe = storage.read("dataframe")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            refined_text = item
            doc = self.nlp(refined_text)
            for ent in doc.ents:
                if ent.label_ in ENTITY_LABELS:
                    # Use the configured placeholder rather than re-deriving it,
                    # so ENTITY_LABELS is the single source of truth.
                    refined_text = refined_text.replace(ent.text, ENTITY_LABELS[ent.label_])
            if item != refined_text:
                numbers += 1
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
                item = refined_text
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
DataFlow/dataflow/operators/refine/GeneralText/pii_anonymize_refiner.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
3
+ from presidio_analyzer.nlp_engine import TransformersNlpEngine
4
+ from presidio_analyzer import AnalyzerEngine
5
+ from presidio_anonymizer import AnonymizerEngine
6
+ from dataflow import get_logger
7
+ from dataflow.core import OperatorABC
8
+ from dataflow.utils.storage import DataFlowStorage
9
+ from dataflow.utils.registry import OPERATOR_REGISTRY
10
+
11
@OPERATOR_REGISTRY.register()
class PIIAnonymizeRefiner(OperatorABC):
    """Detect and anonymize PII in a text column using presidio + a transformers NER model."""

    def __init__(self, lang='en', device='cuda', model_cache_dir='./dataflow_cache', model_name='dslim/bert-base-NER'):
        """Load the NER model and build the presidio analyzer/anonymizer engines.

        Args:
            lang: Language code passed to the analyzer.
            device: Torch device for the token-classification model.
            model_cache_dir: HuggingFace cache directory.
            model_name: Token-classification model to load. Bug fix: the
                original constructor hard-coded 'dslim/bert-base-NER' here,
                silently discarding this argument.
        """
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")
        self.lang = lang
        self.device = device
        self.model_cache_dir = model_cache_dir
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
        model_config = [{
            "lang_code": self.lang,
            "model_name": {
                "spacy": "en_core_web_sm",
                "transformers": self.model_name,
            },
        }]

        self.nlp_engine = TransformersNlpEngine(models=model_config)
        self.analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine)
        self.anonymizer = AnonymizerEngine()

    @staticmethod
    def get_desc(lang: str = "zh"):
        # Fixed: the original returned the URL/HTML-removal description,
        # copy-pasted from HtmlUrlRemoverRefiner.
        return "对文本中的个人身份信息(PII)进行匿名化" if lang == "zh" else "Anonymize personally identifiable information (PII) in the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Anonymize PII in ``dataframe[input_key]``; returns ``[input_key]``."""
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        anonymized_count = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            results = self.analyzer.analyze(item, language=self.lang)
            anonymized_text = self.anonymizer.anonymize(item, results)
            if item != anonymized_text.text:
                anonymized_count += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {item[:30]}... -> Refined: {anonymized_text.text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {anonymized_count}")
                item = anonymized_text.text
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {anonymized_count}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]
DataFlow/dataflow/operators/refine/GeneralText/ref_removal_refiner.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from tqdm import tqdm
3
+ from dataflow import get_logger
4
+ from dataflow.core import OperatorABC
5
+ from dataflow.utils.storage import DataFlowStorage
6
+ from dataflow.utils.registry import OPERATOR_REGISTRY
7
+
8
+ @OPERATOR_REGISTRY.register()
9
+ class ReferenceRemoverRefiner(OperatorABC):
10
+ def __init__(self):
11
+ self.logger = get_logger()
12
+ self.logger.info(f"Initializing {self.__class__.__name__}...")
13
+
14
+ @staticmethod
15
+ def get_desc(lang):
16
+ return "删除文本中未闭合的引用标签和引用链接" if lang == "zh" else "Remove unclosed reference tags and citation links from the text."
17
+
18
+ def run(self, storage: DataFlowStorage, input_key: str):
19
+ self.input_key = input_key
20
+ dataframe = storage.read("dataframe")
21
+ self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
22
+ numbers = 0
23
+ # 定义要删除的模式 - 更全面的版本
24
+ # 1. 所有<ref>标签及其内容(包括各种不完整形式)
25
+ ref_pattern = re.compile(
26
+ r'<ref\b[^>]*>.*?</ref>|' # 完整的ref标签
27
+ r'<ref\b[^>]*>[^<]*$|' # 不完整的ref标签(没有闭合)
28
+ r'<ref\b[^>]*>.*?/br' # ref标签后跟/br(如你示例中的情况)
29
+ )
30
+
31
+ # 2. 所有{{cite}}模板及其内容(包括各种不完整形式)
32
+ cite_pattern = re.compile(
33
+ r'\{\{cite\s+\w+\|[^}]*\}\}|' # 完整的cite模板
34
+ r'\{\{cite\s+\w+\|[^}]*$' # 不完整的cite模板(没有闭合)
35
+ )
36
+
37
+ refined_data = []
38
+ for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
39
+ modified = False
40
+ original_text = item
41
+ refined_text = original_text
42
+
43
+ # 删除所有未闭合的ref标签
44
+ refined_text, ref_count = ref_pattern.subn('', refined_text)
45
+
46
+ # 删除所有不完整的cite模板
47
+ refined_text, cite_count = cite_pattern.subn('', refined_text)
48
+
49
+ # 检查是否有任何修改
50
+ if ref_count > 0 or cite_count > 0:
51
+ modified = True
52
+ numbers += 1
53
+ self.logger.debug(f"Item modified, removed {ref_count} ref tags and {cite_count} cite templates")
54
+
55
+ refined_data.append(item)
56
+ if modified:
57
+ numbers += 1
58
+ self.logger.debug(f"Item modified, total modified so far: {numbers}")
59
+ self.logger.info(f"Refining Complete. Total modified items: {numbers}")
60
+ dataframe[self.input_key] = refined_data
61
+ output_file = storage.write(dataframe)
62
+ return [self.input_key]
DataFlow/dataflow/operators/refine/GeneralText/remove_contractions_refiner.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contractions
2
+ from tqdm import tqdm
3
+ from dataflow import get_logger
4
+ from dataflow.core import OperatorABC
5
+ from dataflow.utils.storage import DataFlowStorage
6
+ from dataflow.utils.registry import OPERATOR_REGISTRY
7
+
8
@OPERATOR_REGISTRY.register()
class RemoveContractionsRefiner(OperatorABC):
    """Expand English contractions (e.g. "don't" -> "do not") in a text column."""

    def __init__(self):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__} ...")

    @staticmethod
    def get_desc(lang: str = "zh"):
        # Added for consistency: every sibling refiner exposes get_desc.
        return "展开文本中的英文缩写形式" if lang == "zh" else "Expand English contractions in the text."

    def run(self, storage: DataFlowStorage, input_key: str):
        """Expand contractions in ``dataframe[input_key]``; returns ``[input_key]``."""
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key}...")
        numbers = 0
        refined_data = []
        for item in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            expanded_text = contractions.fix(item)
            if item != expanded_text:
                numbers += 1
                self.logger.debug(f"Modified text for key '{self.input_key}': Original: {item[:30]}... -> Refined: {expanded_text[:30]}...")
                self.logger.debug(f"Item modified, total modified so far: {numbers}")
                item = expanded_text
            refined_data.append(item)
        self.logger.info(f"Refining Complete. Total modified items: {numbers}")
        dataframe[self.input_key] = refined_data
        storage.write(dataframe)
        return [self.input_key]