Upload 11 files
- detree/utils/adverserial/alter_number.py +21 -0
- detree/utils/adverserial/alternative_spelling.py +38 -0
- detree/utils/adverserial/article_deletion.py +24 -0
- detree/utils/adverserial/extend.py +256 -0
- detree/utils/adverserial/homoglyph.py +44 -0
- detree/utils/adverserial/insert_paragraphs.py +29 -0
- detree/utils/adverserial/misspelling.py +89 -0
- detree/utils/adverserial/polish.py +239 -0
- detree/utils/adverserial/upper_lower.py +22 -0
- detree/utils/adverserial/whitespace.py +16 -0
- detree/utils/adverserial/zero_width_space.py +24 -0
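All eleven modules expose a single attack function (AlterNumbersAttack, HomoglyphAttack, and so on). As a minimal sketch of how they might be wired together, the registry and helper below are illustrative assumptions and are not part of this upload:

# Hypothetical dispatcher over the attack functions defined in the files below.
# The ATTACKS registry and apply_attack helper are assumptions for illustration only.
from detree.utils.adverserial.alter_number import AlterNumbersAttack
from detree.utils.adverserial.article_deletion import ArticleDeletionAttack
from detree.utils.adverserial.homoglyph import HomoglyphAttack
from detree.utils.adverserial.upper_lower import UpperLowerFlipAttack
from detree.utils.adverserial.whitespace import WhiteSpaceAttack
from detree.utils.adverserial.zero_width_space import ZeroWidthSpaceAttack

ATTACKS = {
    "alter_number": AlterNumbersAttack,
    "article_deletion": ArticleDeletionAttack,
    "homoglyph": HomoglyphAttack,
    "upper_lower": UpperLowerFlipAttack,
    "whitespace": WhiteSpaceAttack,
    "zero_width_space": ZeroWidthSpaceAttack,
}

def apply_attack(name, text, **kwargs):
    """Look up an attack by name and apply it to the given text."""
    return ATTACKS[name](text, **kwargs)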
detree/utils/adverserial/alter_number.py
ADDED
@@ -0,0 +1,21 @@
import re
import random

number_pattern = re.compile(r'\d+')

def replace_number(match):
    original = match.group()
    while True:
        new_number = ''.join(random.choices('0123456789', k=len(original)))
        if new_number != original:
            return new_number

def AlterNumbersAttack(text):
    return number_pattern.sub(replace_number, text)


if __name__ == "__main__":
    sample_text = "Before the Marian reforms, the Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = AlterNumbersAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)
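Note (not part of the file above): the perturbation modules draw from Python's global random module, so a run can be made repeatable by seeding it before invoking an attack; a minimal sketch:

import random
from detree.utils.adverserial.alter_number import AlterNumbersAttack

random.seed(0)  # fix the global RNG so the digit substitutions are repeatable
print(AlterNumbersAttack("In 1558 and 1563"))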
detree/utils/adverserial/alternative_spelling.py
ADDED
@@ -0,0 +1,38 @@
import argparse
import hashlib
import json
import math
import os
import random
import regex as re


_HERE = os.path.dirname(__file__)
_RESOURCE_PATH = os.path.join(_HERE, "resources", "american_spellings.json")
with open(_RESOURCE_PATH, encoding="utf-8") as fp:
    us_to_gb_spelling = json.load(fp)

def AlternativeSpellingAttack(text):
    matches = list(re.finditer(r"\L<words>", text, words=us_to_gb_spelling.keys()))
    delta = 0
    matches_to_swap = sorted(matches, key=lambda m: m.start())
    for m in matches_to_swap:
        # Get the left and right index of the swap
        left = m.start() + delta
        right = m.end() + delta

        # Get the match and insert it into the text
        gb_spelling = us_to_gb_spelling[m.group()]
        text = text[:left] + gb_spelling + text[right:]

        # Edit delta to account for indexing changes
        delta += len(gb_spelling) - len(m.group())

    return text


if __name__ == "__main__":
    sample_text = "Before the Marian reforms, the Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = AlternativeSpellingAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)
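For context, american_spellings.json is not included in this upload; assuming it maps US spellings to their British equivalents, the running delta offset above keeps later match indices valid even when a replacement is longer than the original word. An illustrative call under that assumption:

from detree.utils.adverserial.alternative_spelling import AlternativeSpellingAttack

# Assuming the resource contains entries such as {"color": "colour", "armor": "armour"}.
print(AlternativeSpellingAttack("The color of the armor"))
# -> "The colour of the armour" (illustrative; the real output depends on the resource contents)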
detree/utils/adverserial/article_deletion.py
ADDED
@@ -0,0 +1,24 @@
import re
import random
import math
articles = ["a ", "an ", "the "]
article_pattern = r'\b(?:' + '|'.join(map(re.escape, articles)) + r')\b'

def ArticleDeletionAttack(text,N=0.5):

    all_indices = [(m.start(), m.group()) for m in re.finditer(article_pattern, text)]
    indices_to_delete = random.sample(all_indices, math.ceil(len(all_indices) * N))
    indices_to_delete = sorted(indices_to_delete)
    result = []
    last_end = 0
    for index, article in indices_to_delete:
        result.append(text[last_end:index])
        last_end = index + len(article)
    result.append(text[last_end:])
    return ''.join(result)

if __name__ == "__main__":
    sample_text = "Before Marian reforms, Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = ArticleDeletionAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)
detree/utils/adverserial/extend.py
ADDED
@@ -0,0 +1,256 @@
import argparse
import hashlib
import math
import os
from vllm import LLM, SamplingParams
import json
from transformers import AutoTokenizer
import random
model_alias_mapping = {
    'chatgpt': 'chatgpt',
    'ChatGPT': 'chatgpt',
    'chatGPT': 'chatgpt',
    'gpt-3.5-trubo': 'gpt-3.5-trubo',
    'GPT4': 'gpt4',
    'gpt4': 'gpt4',
    'text-davinci-002': 'text-davinci-002',
    'text-davinci-003': 'text-davinci-003',
    'davinci': 'text-davinci',
    'gpt1': 'gpt1',
    'gpt2_pytorch': 'gpt2-pytorch',
    'gpt2_large': 'gpt2-large',
    'gpt2_small': 'gpt2-small',
    'gpt2_medium': 'gpt2-medium',
    'gpt2-xl': 'gpt2-xl',
    'GPT2-XL': 'gpt2-xl',
    'gpt2_xl': 'gpt2-xl',
    'gpt2': 'gpt2-xl',
    'gpt3': 'gpt3',
    'GROVER_base': 'grover_base',
    'grover_base': 'grover_base',
    'grover_large': 'grover_large',
    'grover_mega': 'grover_mega',
    'llama2-fine-tuned': 'llama2',
    'opt_125m': 'opt_125m',
    'opt_1.3b': 'opt_1.3b',
    'opt_2.7b': 'opt_2.7b',
    'opt_6.7b': 'opt_6.7b',
    'opt_13b': 'opt_13b',
    'opt_30b': 'opt_30b',
    'opt_350m': 'opt_350m',
    'opt_iml_max_1.3b': 'opt_iml_max_1.3b',
    'opt_iml_30b': 'opt_iml_30b',
    'flan_t5_small': 'flan_t5_small',
    'flan_t5_base': 'flan_t5_base',
    'flan_t5_large': 'flan_t5_large',
    'flan_t5_xl': 'flan_t5_xl',
    'flan_t5_xxl': 'flan_t5_xxl',
    'flan_t5': 'flan_t5_xxl',
    'dolly': 'dolly',
    'GLM130B': 'GLM130B',
    'bloom_7b': 'bloom_7b',
    'bloomz': 'bloomz',
    't0_3b': 't0_3b',
    't0_11b': 't0_11b',
    'gpt_neox': 'gpt_neox',
    'xlm': 'xlm',
    'xlnet_large': 'xlnet_large',
    'xlnet_base': 'xlnet_base',
    'cohere': 'cohere',
    'ctrl': 'ctrl',
    'pplm_gpt2': 'pplm_gpt2',
    'pplm_distil': 'pplm_distil',
    'fair_wmt19': 'fair_wmt19',
    'fair_wmt20': 'fair_wmt20',
    'glm130b': 'GLM130B',
    'jais-30b': 'jais',
    'transfo_xl': 'transfo_xl',
    '7B': '7B',
    '13B': '13B',
    '65B': '65B',
    '30B': '30B',
    'gpt_j': 'gpt_j',
    'mpt': 'mpt',
    'mpt-chat': 'mpt-chat',
    'llama-chat': 'llama-chat',
    'mistral': 'mistral',
    'mistral-chat': 'mistral-chat',
    'cohere-chat': 'cohere-chat',
    'human': 'human',
}


def load_jsonl(file_path):
    out = []
    with open(file_path, mode='r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            now = json.loads(line)
            now['src'] = model_alias_mapping[now['src']]
            out.append(now)
    random.seed(1)
    random.shuffle(out)
    return out

def stable_long_hash(input_string):
    hash_object = hashlib.sha256(input_string.encode())
    hex_digest = hash_object.hexdigest()
    int_hash = int(hex_digest, 16)
    long_long_hash = (int_hash & ((1 << 63) - 1))
    return long_long_hash

#train data gen templates
# templates = ['Here is a piece of text. Please continue writing from where it ends, maintaining the same tone, style, and context while making the continuation coherent and engaging.Input Text:\n{}',
#              'Please expand on the following text, continuing its ideas and maintaining a consistent tone and style. Ensure the expansion is coherent, logical, and enhances the original content.Input Text:\n{}',
#              'I have an incomplete text that I need to complete. Please expand it into a complete text that includes the original text I provided. The original text must keep its formatting (such as capitalization and punctuation) intact.Input Text:\n{}']

#test data gen templates
templates = ["Please continue the following text, expanding on its ideas in a way that maintains a consistent tone and style. The expansion should be coherent, logically structured, and serve to enrich the original content. Avoid using transitional phrases such as 'firstly,' 'secondly,' or 'then.' Instead, opt for smoother transitions that flow naturally from one thought to the next. Use punctuation carefully, particularly minimizing the overuse of commas. Input Text:\n{}",
             "Please continue the following text, ensuring the expansion flows naturally and coherently. Build upon the original ideas, introducing new insights that are logically derived from the premises already established. Use smooth transitions between thoughts, avoiding rigid or formulaic structures. The writing should maintain a refined balance of clarity and elegance, with careful attention to punctuation—favoring periods and semicolons over excessive commas. Make sure the expansion complements and enhances the original tone, with a focus on preserving its spirit while adding depth.Input Text:\n{}",]


def truncate_text(text,tokenizer_ ,max_length=128):

    tokens = tokenizer_.encode(text)
    if len(tokens)//2 > max_length:
        tokens = tokens[:max_length]
    else:
        tokens = tokens[:len(tokens)//2]
    truncated_text = tokenizer_.decode(tokens, skip_special_tokens=True)
    return truncated_text

def gen_extend(data):
    tokenizer_ = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct', trust_remote_code=True, max_length=1024, truncation=True)
    prompts = []
    for item in data:
        now_prompt = random.choice(templates)
        text = truncate_text(now_prompt.format(item['text']),tokenizer_)+'\nPlease include the content of the original text and continue to output it together directly.And do not have additional explanatory words, either at the beginning or at the end.\nOutput Text:\n'
        prompts.append(text)
    output_text = []
    outputs = llm.generate(prompts, sampling_params)
    for i,output in enumerate(outputs):
        now_item = data[i]
        generated_text = output.outputs[0].text
        id = stable_long_hash(generated_text)
        source_id = now_item['id'] if now_item.get('adv_source_id','')=='' else now_item['adv_source_id']
        output_text.append({'text':generated_text,'label':now_item['label'],'src':now_item['src']+'_extend_'+call_name,'id':id,'adv_source_id':source_id})

    return output_text


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--device_num", type=int, default=4)
    parser.add_argument("-d", "--device", type=int, default=0)
    parser.add_argument("--mode", type=str, default='test')
    parser.add_argument("--dataset", type=str, default='Deepfake')
    parser.add_argument(
        "--data-root",
        type=str,
        default="/path/to/RealBench",
        help="Root directory containing the RealBench-style dataset splits.",
    )
    parser.add_argument("--model_name", type=str, default='databricks/dolly-v2-12b')
    #internlm/internlm2_5-7b-chat
    #THUDM/glm-4-9b-chat
    #mistralai/Ministral-8B-Instruct-2410
    #meta-llama/Llama-3.1-8B-Instruct
    #allenai/OLMo-2-1124-7B-Instruct
    #google/gemma-2-9b
    parser.add_argument("--call_name", type=str, default='dollyv2')
    #internlm2_5_7b
    #glm4_9b
    #Ministral_8b
    #Llama_3.1_8B
    #olmo2_7b
    #gemma2_9b
    args = parser.parse_args()
    mode=args.mode
    dataset=args.dataset
    call_name = args.call_name
    device_num = args.device_num
    data_root = os.path.abspath(args.data_root)
    source_path = os.path.join(data_root, dataset, "no_attack", f"{mode}.jsonl")
    data = load_jsonl(source_path)
    print('loading ', source_path)

    each_len = math.ceil(len(data)/device_num)
    st = each_len * args.device
    ed = min(st + each_len, len(data))
    data = data[st:ed]
    # data=data[:5]
    print(f"device {args.device} start from {st} to {ed}",len(data))

    model_name = args.model_name
    # model_name = "mistralai/Ministral-8B-Instruct-2410"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, max_length=1024, truncation=True)
    if 'glm' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
            # enforce_eager=True,
            # Enable the following options if GLM-4-9B-Chat-1M runs out of memory
            # enable_chunked_prefill=True,
            # max_num_batched_tokens=8192
        )
        stop_token_ids = [151329, 151336, 151338]
        sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids)
    elif 'Ministral' in model_name:
        llm = LLM(
            model=model_name,
            tokenizer_mode="mistral",
            config_format="mistral",
            load_format="mistral",
            max_model_len=2048,
        )
        sampling_params = SamplingParams(max_tokens=1024)
    elif 'Llama' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.6,top_p=0.9)
    elif 'OLMo' in model_name or 'internlm' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.9,top_k=50)
    elif 'Qwen' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.7,top_k=20,top_p=0.8,repetition_penalty=1.05)
    elif 'gemma' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=1.0)
    elif 'dolly' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
            tensor_parallel_size=2
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.7)

    output_text = gen_extend(data)
    extend_dir = os.path.join(data_root, dataset, "extend")
    if not os.path.exists(extend_dir):
        os.makedirs(extend_dir)
    target_path = os.path.join(extend_dir, f"{mode}.jsonl")
    with open(target_path, mode='a+', encoding='utf-8') as jsonl_file:
        for item in output_text:
            jsonl_file.write(json.dumps(item,ensure_ascii=False)+'\n')
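As implied by load_jsonl and gen_extend above, each input JSONL line is expected to be an object with at least text, label, src, and id fields, plus an optional adv_source_id; a minimal sketch of a compatible record, with made-up field values:

import json

record = {
    "text": "Some human- or model-written passage ...",
    "label": 1,                  # made-up value; the script only copies it through
    "src": "chatgpt",            # must be a key of model_alias_mapping
    "id": 123456789,
    "adv_source_id": "",         # empty means the record is an original, not an earlier attack output
}
print(json.dumps(record, ensure_ascii=False))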
detree/utils/adverserial/homoglyph.py
ADDED
@@ -0,0 +1,44 @@
import argparse
import hashlib
import json
import math
import os
import random
import regex as re

mapping = {
    "a": ["а"],
    "A": ["А", "Α"],
    "B": ["В", "Β"],
    "e": ["е"],
    "E": ["Е", "Ε"],
    "c": ["с"],
    "p": ["р"],
    "K": ["К", "Κ"],
    "O": ["О", "Ο"],
    "P": ["Р", "Ρ"],
    "M": ["М", "Μ"],
    "H": ["Н", "Η"],
    "T": ["Т", "Τ"],
    "X": ["Х", "Χ"],
    "C": ["С"],
    "y": ["у"],
    "o": ["о"],
    "x": ["х"],
    "I": ["І", "Ι"],
    "i": ["і"],
    "N": ["Ν"],
    "Z": ["Ζ"],
}
pattern = re.compile('|'.join(re.escape(char) for char in mapping.keys()))

def HomoglyphAttack(text):
    return pattern.sub(lambda match: random.choice(mapping[match.group()]), text)

def print_unicode(text):
    return " ".join(str(ord(char)) for char in text)
if __name__ == "__main__":
    sample_text = "Before Marian reforms, Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = HomoglyphAttack(sample_text)
    print("Original text:", print_unicode(sample_text))
    print("Text after replacement:", print_unicode(replaced_text))
detree/utils/adverserial/insert_paragraphs.py
ADDED
@@ -0,0 +1,29 @@
import re
import nltk
import argparse
import hashlib
import json
import math
import os
import random

sentence_end_regex = r'([.!?。?!])(?=\s|$)'

def InsertParagraphsAttack(text,N=0.5):
    sentences = re.split(sentence_end_regex, text)
    sentences_to_alter = math.ceil((len(sentences) - 1) * N)
    indices_to_alter = random.sample(range(1, len(sentences)), sentences_to_alter)
    sorted_indices_to_alter = sorted(indices_to_alter)
    for i in sorted_indices_to_alter:
        if random.random() < 0.5:
            sentences[i] = sentences[i]+"\n\n "
        else:
            sentences[i] = sentences[i]+"\n\n"
    return "".join(sentences)

if __name__ == "__main__":
    sample_text = "Before Marian reforms, Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = InsertParagraphsAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)
detree/utils/adverserial/misspelling.py
ADDED
@@ -0,0 +1,89 @@
import hashlib
import itertools
import json
import os
import random
from pathlib import Path
import re
import nltk
from tqdm import tqdm


def _ensure_nltk_resource(resource_name: str, resource_paths: list[str]) -> None:
    """Download an NLTK resource only if none of the candidate paths exist."""

    for resource_path in resource_paths:
        try:
            nltk.data.find(resource_path)
            return
        except LookupError:
            continue

    nltk.download(resource_name)


_ensure_nltk_resource(
    "punkt_tab",
    [
        "tokenizers/punkt_tab",
        "tokenizers/punkt_tab/english.pickle",
        "tokenizers/punkt",
        "tokenizers/punkt/english.pickle",
    ],
)
_ensure_nltk_resource(
    "averaged_perceptron_tagger_eng",
    [
        "taggers/averaged_perceptron_tagger_eng",
        "taggers/averaged_perceptron_tagger_eng/averaged_perceptron_tagger_eng.pickle",
    ],
)

def merge_dict(dict1, dict2):
    for key in dict1.keys():
        if key in dict2:
            dict1[key] += dict2[key]
    return dict1


sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
sspan = lambda x: sent_tokenizer.span_tokenize(x)
word_tokenizer = nltk.tokenize.NLTKWordTokenizer()
wspan = lambda x: word_tokenizer.span_tokenize(x)
current_directory = Path(os.path.dirname(os.path.abspath(__file__)))
json_path = current_directory / "resources" / "misspelling3.json"
with open(json_path, "r") as f:
    corrections = json.load(f)
json_path = current_directory / "resources" / "misspellings.json"
with open(json_path, "r") as f:
    corrections = merge_dict(corrections, json.load(f))
json_path = current_directory / "resources" / "misspellings2.json"
with open(json_path, "r") as f:
    corrections = merge_dict(corrections, json.load(f))

def get_nltk_span_tokens(text):
    return [(ws + s, we + s) for s, e in sspan(text) for ws, we in wspan(text[s:e])]

def misspell(w):
    misspellings = corrections.get(w.lower())
    choice = random.choice(misspellings)
    return choice.capitalize() if w[0].isupper() else choice

def can_misspell(word):
    return word.lower() in corrections

def MisspellingAttack(text):
    word_spans = get_nltk_span_tokens(text)
    all_spans = list(itertools.pairwise(itertools.chain.from_iterable(word_spans)))
    toks = [text[s:e] for s, e in all_spans]

    candidate_spans = [(i, s, e) for i, (s, e) in enumerate(all_spans) if can_misspell(text[s:e])]
    for i, s, e in candidate_spans:
        toks[i] = misspell(text[s:e])
    return "".join(toks)

if __name__ == "__main__":
    sample_text = "Tap tap tap on the door. Bitsy gathered her bravery, approached the closet's door as she knew she must, face her fear as she did each night. Mother, with her hand still raised and her back turned to me, was lifting a dress out of its hanger and turning it over in her hands. \"What are you doing?\" I asked quietly, my voice trembling just like her own, afraid that if we spoke too loudly at this moment their voices would carry through the wall into mine. \"Mother,\" I whispered again. She stopped what she was doing and faced me across the room. Her eyes were puffy from lack of sleep and her hair had been pulled up off her neck in an untidy bun. The lines around her mouth - the ones so carefully curled by her hairdresser earlier today - were hardening now, becoming more pronounced than they'd ever really looked before. If Lily wasn't there for us anymore, why should any mother be? Why even bother trying when all our children left home one after another without warning or explanation? Hadn't anyone taught them anything about life or love? Did they think only happy things happened inside houses? I stood irresolute, staring blankly ahead while Mother slowly lowered herself onto the floor, then began gently rubbing the side of her cheek against the bottom of the dress being examined. When she felt something wet she looked down at her hands and saw bits of dried blood smeared along one knuckle. It took everything within me not to cry out No! But I didn't want to see her fall apart right here, right now, because surely if she gave way once, soon enough the whole house could go crashing down around us. Instead of saying the word aloud, however, which might force matters further forward, I said nothing but simply went and sat beside her, brushing the strands of her hair away from where they clung to her forehead. As I rubbed her head I tried to imagine how the two of us used to sit together under the same roof, how gentle and loving it had always been between us, and how much happier it made me feel knowing she cared deeply for me, no matter what else came along later. So many thoughts raced through my mind and heart: that the entire household may have fallen apart; that perhaps we never would be able to make things better, or maybe never get past this terrible midlife crisis together. All of these anxious questions passed through my brain, but none rose above the others until I realized that I hadn't moved an inch since sitting down next to her. Now I found myself looking directly into her tear-streaked face, searching earnestly for some clue as to why she'd done such a thing. Perhaps I'd convinced myself that she needed help somehow. In fact, it seemed so obvious now that I'm sure it had occurred to me before, probably during my long walk alone toward my old apartment building. Still, I couldn't say exactly why I wanted to know. What would I do if it became necessary? Would I try to drag her outside, lift her bodily from the front hall rug into the living room, and hold her close until someone arrived who could give her aid? Or would I tell her that she should call 911, let them come and take care of her properly? It doesn't matter now, I thought sadly, feeling strangely detached from myself even though I was intimately involved. Whatever happens will happen. We'll find a way. Maybe tonight, maybe tomorrow, or three months from now, the older girls will return. They're both in college now. One is riding high on a full football scholarship. \nThey won't need money, or begrudge us having any ourselves, anyway. There could be absolutely no harm in giving them your new address, anonymously of course, explaining that you've decided to spend a few quiet years here among family friends, henceforth avoiding contact with either of them. And yet, despite the tremendous relief I supposed that knowledge brought me, I also missed those daughters terribly, desperately wished I'd chosen to intervene sooner rather than letting the situation deteriorate as far as it already had. The Girl Who Ran Away (Chapter 3) 6293 Dear Diary, So far I haven't talked about one very important person in all this business: Father. He hasn't run away, nor has he threatened it, but whenever Mom goes near him he gets nervous, his body tensing up until you can almost hear the springs squeaking beneath him. This morning Edie woke me quite early, telling me breathless that Dad sounded sick and crying, and I rushed downstairs ready to rush out after him. Then I remembered that he had gotten hurt last week, got carted off to the hospital in Torrance for stitches, and stayed overnight. By Sunday afternoon Edie told me he'd come home fine, although limping like hell. For weeks afterwards everyone acted blithely unconcerned with whatever ailment he suffered, without-scandentirely irrelevant. Not that it bothered taking current symptomatically serious symptoms setterfrua recent token looks behind the other members' mutations."
    replaced_text = MisspellingAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)
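One portability note on the file above: itertools.pairwise, used in MisspellingAttack, exists only from Python 3.10 onward; on older interpreters an equivalent helper could be substituted, for example:

from itertools import tee

def pairwise(iterable):
    # Equivalent of itertools.pairwise for Python < 3.10:
    # pairwise([1, 2, 3]) yields (1, 2), (2, 3)
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)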
detree/utils/adverserial/polish.py
ADDED
@@ -0,0 +1,239 @@
import argparse
import hashlib
import math
import os
from vllm import LLM, SamplingParams
import json
from transformers import AutoTokenizer
import random
model_alias_mapping = {
    'chatgpt': 'chatgpt',
    'ChatGPT': 'chatgpt',
    'chatGPT': 'chatgpt',
    'gpt-3.5-trubo': 'gpt-3.5-trubo',
    'GPT4': 'gpt4',
    'gpt4': 'gpt4',
    'text-davinci-002': 'text-davinci-002',
    'text-davinci-003': 'text-davinci-003',
    'davinci': 'text-davinci',
    'gpt1': 'gpt1',
    'gpt2_pytorch': 'gpt2-pytorch',
    'gpt2_large': 'gpt2-large',
    'gpt2_small': 'gpt2-small',
    'gpt2_medium': 'gpt2-medium',
    'gpt2-xl': 'gpt2-xl',
    'GPT2-XL': 'gpt2-xl',
    'gpt2_xl': 'gpt2-xl',
    'gpt2': 'gpt2-xl',
    'gpt3': 'gpt3',
    'GROVER_base': 'grover_base',
    'grover_base': 'grover_base',
    'grover_large': 'grover_large',
    'grover_mega': 'grover_mega',
    'llama2-fine-tuned': 'llama2',
    'opt_125m': 'opt_125m',
    'opt_1.3b': 'opt_1.3b',
    'opt_2.7b': 'opt_2.7b',
    'opt_6.7b': 'opt_6.7b',
    'opt_13b': 'opt_13b',
    'opt_30b': 'opt_30b',
    'opt_350m': 'opt_350m',
    'opt_iml_max_1.3b': 'opt_iml_max_1.3b',
    'opt_iml_30b': 'opt_iml_30b',
    'flan_t5_small': 'flan_t5_small',
    'flan_t5_base': 'flan_t5_base',
    'flan_t5_large': 'flan_t5_large',
    'flan_t5_xl': 'flan_t5_xl',
    'flan_t5_xxl': 'flan_t5_xxl',
    'flan_t5': 'flan_t5_xxl',
    'dolly': 'dolly',
    'GLM130B': 'GLM130B',
    'bloom_7b': 'bloom_7b',
    'bloomz': 'bloomz',
    't0_3b': 't0_3b',
    't0_11b': 't0_11b',
    'gpt_neox': 'gpt_neox',
    'xlm': 'xlm',
    'xlnet_large': 'xlnet_large',
    'xlnet_base': 'xlnet_base',
    'cohere': 'cohere',
    'ctrl': 'ctrl',
    'pplm_gpt2': 'pplm_gpt2',
    'pplm_distil': 'pplm_distil',
    'fair_wmt19': 'fair_wmt19',
    'fair_wmt20': 'fair_wmt20',
    'glm130b': 'GLM130B',
    'jais-30b': 'jais',
    'transfo_xl': 'transfo_xl',
    '7B': '7B',
    '13B': '13B',
    '65B': '65B',
    '30B': '30B',
    'gpt_j': 'gpt_j',
    'mpt': 'mpt',
    'mpt-chat': 'mpt-chat',
    'llama-chat': 'llama-chat',
    'mistral': 'mistral',
    'mistral-chat': 'mistral-chat',
    'cohere-chat': 'cohere-chat',
    'outfox': 'outfox',
    'human': 'human',
}

def load_jsonl(file_path):
    out = []
    with open(file_path, mode='r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            now = json.loads(line)

            now['src'] = model_alias_mapping[now['src']]
            out.append(now)
    random.seed(1)
    random.shuffle(out)
    return out

def stable_long_hash(input_string):
    hash_object = hashlib.sha256(input_string.encode())
    hex_digest = hash_object.hexdigest()
    int_hash = int(hex_digest, 16)
    long_long_hash = (int_hash & ((1 << 63) - 1))
    return long_long_hash

# templates = ['Please refine the following paragraph to improve its flow and clarity. Ensure that the original meaning and structure are preserved, while enhancing sentence construction and expression for better readability:\n{}',
#              'Please refine the sentences in the following paragraph to improve their fluency and clarity. Ensure that the overall content and structure remain unchanged. The focus should be on enhancing sentence construction and expression, ensuring the text flows smoothly and conveys information clearly and accurately:Input Text:\n{}',
#              'Kindly optimize the sentences in the following paragraph to improve readability and coherence. Do not make any changes to the main content or structure of the text. Concentrate on refining the sentence construction and expression to ensure the ideas are presented clearly and logically:Input Text:\n{}',
#              'Please enhance the fluency and clarity of the sentences in the paragraph below. Keep the overall content and structure intact, but focus on optimizing the construction and expression of the sentences to ensure that the text reads smoothly and conveys the intended information accurately:Input Text:\n{}',
#              'Examine the following text and make adjustments to improve the fluency and clarity of the sentences. Do not alter the structure or content of the paragraph. The goal is to improve the expression and flow of the sentences, ensuring the ideas are conveyed clearly and effectively:Input Text:\n{}',
#              'Please analyze the following text for spelling and grammatical inaccuracies, ensuring that any repetitive or improperly chosen words are replaced. Do not make any changes to the sentence order or structure. The goal is to enhance the precision and clarity of the language, maintaining the original sentence framework:Input Text:\n{}',
#              'Review the paragraph below and identify any spelling and grammatical errors. Replace any words that seem redundant, unclear, or incorrectly chosen. The sentence structure must remain intact, with changes being limited only to word choices to improve readability and appropriateness:Input Text:\n{}',
#              'Please optimize the sentences in the following paragraph to enhance fluency and clarity. Do not alter the overall content or structure of the paragraph. Focus on the construction and expression of the sentences, ensuring that the text is coherent and the information is accurate:Input Text:\n{}',
#              'Please polish the following text to make the language more fluent and cohesive, ensuring grammatical accuracy and enhancing the elegance and professionalism of expression:Input Text:\n{}',
#              'Optimize the following text to make sentence structures more varied, enrich vocabulary, and improve readability and appeal:Input Text:\n{}',
#              'Enhance the following text to elevate its expressive quality, add a literary touch, and retain its original meaning:Input Text:\n{}',
#              'Polish the following text to add emotional depth and vivid imagery, with the hallmark of creative writing:Input Text:\n{}',
#              'Reorganize the structure of the following text to make its logic clearer and its flow more coherent:Input Text:\n{}',]

templates = ['Please revise the following paragraph to enhance its fluency and coherence. Focus on improving the transitions between sentences, reinforcing the core argument, and eliminating any redundant or unnecessary content. The goal is to refine the expression and sentence structure, ensuring clarity and precision while maintaining the original meaning and overall structure. The revision should make the text more concise, logical, and engaging.Input Text:\n{}',
             'Please adjust the language style of the following paragraph to make it more informal. Maintain the core meaning and structure while ensuring that the tone aligns with a more casual audience.Input Text:\n{}']

def truncate_text(text,tokenizer_ ,max_length=1024):

    tokens = tokenizer_.encode(text, truncation=True, max_length=max_length)
    truncated_text = tokenizer_.decode(tokens, skip_special_tokens=True)
    return truncated_text

def gen_polish(data):
    tokenizer_ = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct', trust_remote_code=True, max_length=1024, truncation=True)
    prompts = []
    for item in data:
        now_prompt = random.choice(templates)
        text = truncate_text(now_prompt.format(item['text']),tokenizer_)+'\nPlease output the polished text directly.\nOutput Text:\n'
        prompts.append(text)
    output_text = []
    outputs = llm.generate(prompts, sampling_params)
    for i,output in enumerate(outputs):
        now_item = data[i]
        generated_text = output.outputs[0].text
        id = stable_long_hash(generated_text)
        source_id = now_item['id'] if now_item.get('adv_source_id','')=='' else now_item['adv_source_id']
        output_text.append({'text':generated_text,'label':now_item['label'],'src':now_item['src']+'_polish_'+call_name,'id':id,'adv_source_id':source_id})

    return output_text


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--device_num", type=int, default=1)
    parser.add_argument("-d", "--device", type=int, default=0)
    parser.add_argument("--mode", type=str, default='test')
    parser.add_argument("--dataset", type=str, default='OUTFOX')
    parser.add_argument("--model_name", type=str, default='deepseek-ai/DeepSeek-V2-Lite')
    parser.add_argument("--call_name", type=str, default='deepseekv2')
    args = parser.parse_args()
    mode=args.mode
    dataset=args.dataset
    call_name = args.call_name
    device_num = args.device_num
    data=load_jsonl(f'/path/to/RealBench/{dataset}/no_attack/{mode}.jsonl')
    print('loading ',f'/path/to/RealBench/{dataset}/no_attack/{mode}.jsonl')

    each_len = math.ceil(len(data)/device_num)
    st = each_len * args.device
    ed = min(st + each_len, len(data))
    data = data[st:ed]
    # data=data[:5]
    print(f"device {args.device} start from {st} to {ed}",len(data))

    model_name = args.model_name
    # model_name = "mistralai/Ministral-8B-Instruct-2410"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, max_length=1024, truncation=True)
    if 'glm' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
            # enforce_eager=True,
            # Enable the following options if GLM-4-9B-Chat-1M runs out of memory
            # enable_chunked_prefill=True,
            # max_num_batched_tokens=8192
        )
        stop_token_ids = [151329, 151336, 151338]
        sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids)
    elif 'Ministral' in model_name:
        llm = LLM(
            model=model_name,
            tokenizer_mode="mistral",
            config_format="mistral",
            load_format="mistral",
            max_model_len=2048,
        )
        sampling_params = SamplingParams(max_tokens=1024)
    elif 'Llama' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.6,top_p=0.9)
    elif 'OLMo' in model_name or 'internlm' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.9,top_k=50)
    elif 'Qwen' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.7,top_k=20,top_p=0.8,repetition_penalty=1.05)
    elif 'DeepSeek' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
            tensor_parallel_size=2
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=0.3,top_p=0.95)
    elif 'gemma' in model_name:
        llm = LLM(
            model=model_name,
            max_model_len=2048,
            trust_remote_code=True,
        )
        sampling_params = SamplingParams(max_tokens=1024,temperature=1.0)

    output_text = gen_polish(data)
    if os.path.exists(f"/path/to/RealBench/{dataset}/polish")==False:
        os.makedirs(f"/path/to/RealBench/{dataset}/polish")
    target_path = f"/path/to/RealBench/{dataset}/polish/{mode}.jsonl"
    with open(target_path, mode='a+', encoding='utf-8') as jsonl_file:
        for item in output_text:
            jsonl_file.write(json.dumps(item,ensure_ascii=False)+'\n')
detree/utils/adverserial/upper_lower.py
ADDED
@@ -0,0 +1,22 @@
import nltk
import math
import os
import random


tokenizer = nltk.tokenize.NLTKWordTokenizer()

def UpperLowerFlipAttack(text, N=0.5):
    indices = [s for s, e in tokenizer.span_tokenize(text) if text[s].isalpha()]
    num_to_flip = math.ceil(len(indices) * N)
    flip_indices = random.sample(indices, num_to_flip)
    text = list(text)
    for i in flip_indices:
        text[i] = text[i].lower() if text[i].isupper() else text[i].upper()
    return "".join(text)

if __name__ == "__main__":
    sample_text = "Before Marian reforms, Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = UpperLowerFlipAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)
detree/utils/adverserial/whitespace.py
ADDED
@@ -0,0 +1,16 @@
import random

def WhiteSpaceAttack(text,N=0.5):
    texts = text.split(" ")
    spaces_to_alter = int(len(texts) * N)
    indices_to_alter = random.choices(range(len(texts)), k=spaces_to_alter)
    indices_to_alter = sorted(indices_to_alter)
    for i in indices_to_alter:
        texts[i] += " "
    return " ".join(texts)

if __name__ == "__main__":
    sample_text = "Before Marian reforms, Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = WhiteSpaceAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)
detree/utils/adverserial/zero_width_space.py
ADDED
@@ -0,0 +1,24 @@
import argparse
import hashlib
import json
import math
import os
import random
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm
zero_width_space = "\u200b"

def ZeroWidthSpaceAttack(text,N=0.3):
    texts = text.split(" ")
    num_spaces = int(N * len(texts))
    indices_to_alter = random.sample(range(len(texts)), num_spaces)
    indices_to_alter = sorted(indices_to_alter)
    for i in indices_to_alter:
        texts[i] += zero_width_space
    return " ".join(texts)

if __name__ == "__main__":
    sample_text = "Before Marian reforms, Roman military forces were exclusively made up of citizens people of property (3500 sesterces, say about 1750 loaves of bread) and capable of supplying their own reserves with food. In 1558, when Pope Clement IV was forced to abandon his government in favor of a more centralized army alliance, they became a major part of the new federation and by 1563 they "
    replaced_text = ZeroWidthSpaceAttack(sample_text)
    print("Original text:", sample_text)
    print("Text after replacement:", replaced_text)