sudoaza
committed on
Commit
·
3fd70ad
1
Parent(s):
95da9d8
adding final dictionary
Browse files- .gitattributes +1 -0
- README.md +14 -1
- dedupe_passwords.py +38 -0
- rockdich.txt.gz +3 -0
- rya.tgz +2 -2
- translate_final.py +3 -3
.gitattributes
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
rockdich_model.tgz filter=lfs diff=lfs merge=lfs -text
|
| 2 |
rya.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 1 |
rockdich_model.tgz filter=lfs diff=lfs merge=lfs -text
|
| 2 |
rya.tgz filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
rockdich.txt.gz filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -2,4 +2,17 @@
|
|
| 2 |
|
| 3 |
Toolkit to translate password dictionaries into German, but possibly to other languages. Includes tools to generate the training dataset by using local ollama or OpenAI API. Fine tune a llama3 model into the translation task using Unsloth. A script to translate a password dictionary using the finetuned model. And finally a translation into German of rockyou.txt as rockdich.txt. Each script has a description comment in the same file.
|
| 4 |
|
| 5 |
-
See huggingface repo for model. https://huggingface.co/sudoaza/rockdich
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
Toolkit to translate password dictionaries into German, but possibly to other languages. Includes tools to generate the training dataset by using local ollama or OpenAI API. Fine tune a llama3 model into the translation task using Unsloth. A script to translate a password dictionary using the finetuned model. And finally a translation into German of rockyou.txt as rockdich.txt. Each script has a description comment in the same file.
|
| 4 |
|
| 5 |
+
See huggingface repo for model. https://huggingface.co/sudoaza/rockdich
|
| 6 |
+
|
| 7 |
+
## Motivation
|
| 8 |
+
|
| 9 |
+
On the original rockyou.txt dictionary you can find some German passwords but they are a small minority.
|
| 10 |
+
We find 50 examples with variations of "passwort" while there are 4685 occurrences of the English alternative.
|
| 11 |
+
'i.?love.?you' matches 6472 passwords while '(ich)?.?liebe.?dich' matches 117.
|
| 12 |
+
While there are 53k occurrences of passwords including "girl", variations of "mädchen" number only 30 (usually with "a", sometimes "ae" and rarely the original "ä").
|
| 13 |
+
|
| 14 |
+
## Results
|
| 15 |
+
|
| 16 |
+
After translating the dictionary we get 4593 passwords matching "passwort", 50287 matching 'm.?.?dchen' and 32729 matching '(ich)?.?liebe.?dich'. Notably the model translated the passwords with the correct spelling, so removing umlauts or replacing them with their ae/oe/ue alternatives may be good dictionary mutation strategies.
|
| 17 |
+
|
| 18 |
+
Access the dictionary here https://huggingface.co/sudoaza/rockdich/blob/main/rockdich.txt
|
dedupe_passwords.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Remove duplicate passwords that either exist in the original password list, were not translated or were translated to the same password twice."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
|
| 6 |
+
def main():
|
| 7 |
+
parser = argparse.ArgumentParser(description="Remove passwords that exist in the original list and remove deduplicates from the output one.")
|
| 8 |
+
parser.add_argument("-i", "--input_file", required=True, help="Path to the original password list.")
|
| 9 |
+
parser.add_argument("-t", "--translated_file", required=True, help="Path to the translated password list.")
|
| 10 |
+
parser.add_argument("-o", "--output_file", required=True, help="Path to the output deduplicated password list.")
|
| 11 |
+
|
| 12 |
+
args = parser.parse_args()
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
with open(args.input_file, 'r', encoding='latin1') as file:
|
| 16 |
+
original = set(file.readlines())
|
| 17 |
+
with open(args.translated_file, 'r', encoding='latin1') as file:
|
| 18 |
+
translated = file.readlines()
|
| 19 |
+
|
| 20 |
+
final_list = []
|
| 21 |
+
final_set = set()
|
| 22 |
+
|
| 23 |
+
for newpa in tqdm(translated):
|
| 24 |
+
if newpa in original:
|
| 25 |
+
continue
|
| 26 |
+
if newpa in final_set:
|
| 27 |
+
continue
|
| 28 |
+
final_list.append(newpa)
|
| 29 |
+
final_set.add(newpa)
|
| 30 |
+
|
| 31 |
+
with open(args.output_file, 'w', encoding='latin1') as file:
|
| 32 |
+
file.writelines(final_list)
|
| 33 |
+
|
| 34 |
+
except FileNotFoundError:
|
| 35 |
+
print("Some file was not found.")
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
|
| 38 |
+
main()
|
rockdich.txt.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca94b8f7016c42db17f8f33270ccf997a64ee797176f20ece9fd9ae2783d9bb2
|
| 3 |
+
size 15005831
|
rya.tgz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bd9c5dc0c2b6bfe90ae01b14dcc20511fdea73ab188df2b24340b658b73cab1
|
| 3 |
+
size 46012407
|
translate_final.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
from unsloth import FastLanguageModel
|
| 4 |
import torch
|
| 5 |
import argparse
|
| 6 |
-
import tqdm
|
| 7 |
|
| 8 |
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
|
| 9 |
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
|
@@ -78,7 +78,7 @@ def process_batch(batch):
|
|
| 78 |
|
| 79 |
input_tokens = tokenizer(inputs, return_tensors = "pt", padding=True).to("cuda")
|
| 80 |
outputs = model.generate(**input_tokens, max_new_tokens = 64, use_cache = True)
|
| 81 |
-
return tokenizer.batch_decode(outputs)
|
| 82 |
|
| 83 |
BATCH_SIZE = 1000
|
| 84 |
|
|
@@ -95,7 +95,7 @@ def process_file(infile, outfile):
|
|
| 95 |
translated_lines.extend(translated_batch)
|
| 96 |
|
| 97 |
# Write the translated text to another file
|
| 98 |
-
with open(outfile, 'w', encoding='
|
| 99 |
file.writelines(translated_lines)
|
| 100 |
|
| 101 |
except FileNotFoundError:
|
|
|
|
| 3 |
from unsloth import FastLanguageModel
|
| 4 |
import torch
|
| 5 |
import argparse
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
|
| 8 |
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
|
| 9 |
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
|
|
|
| 78 |
|
| 79 |
input_tokens = tokenizer(inputs, return_tensors = "pt", padding=True).to("cuda")
|
| 80 |
outputs = model.generate(**input_tokens, max_new_tokens = 64, use_cache = True)
|
| 81 |
+
return [extract_response(response) for response in tokenizer.batch_decode(outputs)]
|
| 82 |
|
| 83 |
BATCH_SIZE = 1000
|
| 84 |
|
|
|
|
| 95 |
translated_lines.extend(translated_batch)
|
| 96 |
|
| 97 |
# Write the translated text to another file
|
| 98 |
+
with open(outfile, 'w', encoding='latin1') as file:
|
| 99 |
file.writelines(translated_lines)
|
| 100 |
|
| 101 |
except FileNotFoundError:
|