sudoaza
committed on
Commit
·
3fd70ad
1
Parent(s):
95da9d8
adding final dictionary
Browse files- .gitattributes +1 -0
- README.md +14 -1
- dedupe_passwords.py +38 -0
- rockdich.txt.gz +3 -0
- rya.tgz +2 -2
- translate_final.py +3 -3
.gitattributes
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
rockdich_model.tgz filter=lfs diff=lfs merge=lfs -text
|
| 2 |
rya.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 1 |
rockdich_model.tgz filter=lfs diff=lfs merge=lfs -text
|
| 2 |
rya.tgz filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
rockdich.txt.gz filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -2,4 +2,17 @@
|
|
| 2 |
|
| 3 |
Toolkit to translate password dictionaries into German, but possibly to other languages. Includes tools to generate the training dataset by using local ollama or OpenAI API. Fine tune a llama3 model into the translation task using Unsloth. A script to translate a password dictionary using the finetuned model. And finally a translation into German of rockyou.txt as rockdich.txt. Each script has a description comment in the same file.
|
| 4 |
|
| 5 |
-
See huggingface repo for model. https://huggingface.co/sudoaza/rockdich
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
Toolkit to translate password dictionaries into German, but possibly to other languages. Includes tools to generate the training dataset by using local ollama or OpenAI API. Fine tune a llama3 model into the translation task using Unsloth. A script to translate a password dictionary using the finetuned model. And finally a translation into German of rockyou.txt as rockdich.txt. Each script has a description comment in the same file.
|
| 4 |
|
| 5 |
+
See huggingface repo for model. https://huggingface.co/sudoaza/rockdich
|
| 6 |
+
|
| 7 |
+
## Motivation
|
| 8 |
+
|
| 9 |
+
On the original rockyou.txt dictionary you can find some German passwords but they are a small minority.
|
| 10 |
+
We find 50 examples with variations of "passwort" while there are 4685 occurrences of the English alternative.
|
| 11 |
+
'i.?love.?you' matches 6472 passwords while '(ich)?.?liebe.?dich' matches 117.
|
| 12 |
+
While there are 53k occurrences of passwords including "girl", variations of "mädchen" number only 30 (usually with "a", sometimes "ae" and rarely the original "ä").
|
| 13 |
+
|
| 14 |
+
## Results
|
| 15 |
+
|
| 16 |
+
After translating the dictionary we get 4593 passwords matching "passwort", 50287 matching 'm.?.?dchen' and 32729 matching '(ich)?.?liebe.?dich'. Notably the model translated the passwords with the correct spelling, so removing umlauts or replacing them with their ae/oe/ue alternatives may be good dictionary mutation strategies.
|
| 17 |
+
|
| 18 |
+
Access the dictionary here https://huggingface.co/sudoaza/rockdich/blob/main/rockdich.txt
|
dedupe_passwords.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Remove duplicate passwords that either exist in the original password list, were not translated or were translated to the same password twice."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
|
| 6 |
+
def main():
|
| 7 |
+
parser = argparse.ArgumentParser(description="Remove passwords that exist in the original list and remove deduplicates from the output one.")
|
| 8 |
+
parser.add_argument("-i", "--input_file", required=True, help="Path to the original password list.")
|
| 9 |
+
parser.add_argument("-t", "--translated_file", required=True, help="Path to the translated password list.")
|
| 10 |
+
parser.add_argument("-o", "--output_file", required=True, help="Path to the output deduplicated password list.")
|
| 11 |
+
|
| 12 |
+
args = parser.parse_args()
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
with open(args.input_file, 'r', encoding='latin1') as file:
|
| 16 |
+
original = set(file.readlines())
|
| 17 |
+
with open(args.translated_file, 'r', encoding='latin1') as file:
|
| 18 |
+
translated = file.readlines()
|
| 19 |
+
|
| 20 |
+
final_list = []
|
| 21 |
+
final_set = set()
|
| 22 |
+
|
| 23 |
+
for newpa in tqdm(translated):
|
| 24 |
+
if newpa in original:
|
| 25 |
+
continue
|
| 26 |
+
if newpa in final_set:
|
| 27 |
+
continue
|
| 28 |
+
final_list.append(newpa)
|
| 29 |
+
final_set.add(newpa)
|
| 30 |
+
|
| 31 |
+
with open(args.output_file, 'w', encoding='latin1') as file:
|
| 32 |
+
file.writelines(final_list)
|
| 33 |
+
|
| 34 |
+
except FileNotFoundError:
|
| 35 |
+
print("Some file was not found.")
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
|
| 38 |
+
main()
|
rockdich.txt.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca94b8f7016c42db17f8f33270ccf997a64ee797176f20ece9fd9ae2783d9bb2
|
| 3 |
+
size 15005831
|
rya.tgz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bd9c5dc0c2b6bfe90ae01b14dcc20511fdea73ab188df2b24340b658b73cab1
|
| 3 |
+
size 46012407
|
translate_final.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
from unsloth import FastLanguageModel
|
| 4 |
import torch
|
| 5 |
import argparse
|
| 6 |
-
import tqdm
|
| 7 |
|
| 8 |
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
|
| 9 |
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
|
@@ -78,7 +78,7 @@ def process_batch(batch):
|
|
| 78 |
|
| 79 |
input_tokens = tokenizer(inputs, return_tensors = "pt", padding=True).to("cuda")
|
| 80 |
outputs = model.generate(**input_tokens, max_new_tokens = 64, use_cache = True)
|
| 81 |
-
return tokenizer.batch_decode(outputs)
|
| 82 |
|
| 83 |
BATCH_SIZE = 1000
|
| 84 |
|
|
@@ -95,7 +95,7 @@ def process_file(infile, outfile):
|
|
| 95 |
translated_lines.extend(translated_batch)
|
| 96 |
|
| 97 |
# Write the translated text to another file
|
| 98 |
-
with open(outfile, 'w', encoding='
|
| 99 |
file.writelines(translated_lines)
|
| 100 |
|
| 101 |
except FileNotFoundError:
|
|
|
|
| 3 |
from unsloth import FastLanguageModel
|
| 4 |
import torch
|
| 5 |
import argparse
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
|
| 8 |
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
|
| 9 |
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
|
|
|
| 78 |
|
| 79 |
input_tokens = tokenizer(inputs, return_tensors = "pt", padding=True).to("cuda")
|
| 80 |
outputs = model.generate(**input_tokens, max_new_tokens = 64, use_cache = True)
|
| 81 |
+
return [extract_response(response) for response in tokenizer.batch_decode(outputs)]
|
| 82 |
|
| 83 |
BATCH_SIZE = 1000
|
| 84 |
|
|
|
|
| 95 |
translated_lines.extend(translated_batch)
|
| 96 |
|
| 97 |
# Write the translated text to another file
|
| 98 |
+
with open(outfile, 'w', encoding='latin1') as file:
|
| 99 |
file.writelines(translated_lines)
|
| 100 |
|
| 101 |
except FileNotFoundError:
|