| | """Translate a password dictionary using a finetuned model.""" |
| |
|
| | from unsloth import FastLanguageModel |
| | import torch |
| | import argparse |
| | from tqdm import tqdm |
| |
|
# Shared model-loading configuration.
max_seq_length = 2048  # maximum sequence length passed to FastLanguageModel
dtype = torch.float16  # load/compute dtype; fp16 presumably for GPU memory — confirm hardware support
load_in_4bit = True    # load weights 4-bit quantized (bitsandbytes)
| |
|
# NOTE(review): this base-model load and the LoRA wrapper below appear to be
# dead code for this inference script — `model` and `tokenizer` are rebound
# when the finetuned "lora_model" checkpoint is loaded further down, so this
# download/setup work is discarded. Confirm before removing.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",  # 4-bit quantized Llama-3 8B base
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach LoRA adapters (training-style configuration; unused by the
# generation code below for the reason noted above).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",

    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)
| |
|
| | import re |
def extract_response(text):
    """Extract the model's reply from a decoded Alpaca-style generation.

    Args:
        text: Full decoded model output, expected to contain a
            "### Response:" section.

    Returns:
        The response text with the "<|end_of_text|>" marker removed,
        guaranteed to end with a newline.

    Raises:
        ValueError: If no "### Response:" section is present.
    """
    match = re.search(r"### Response:\n(.*?)$", text, re.DOTALL)
    if not match:
        # Original code raised a plain string, which is a TypeError in
        # Python 3 — raise a real exception instead.
        raise ValueError("No response found in the text.")
    response = match.group(1).replace("<|end_of_text|>", "")
    # Ensure a trailing newline; endswith() also handles an empty response,
    # where the original `response[-1]` raised IndexError.
    if not response.endswith("\n"):
        response += "\n"
    return response
| |
|
# Load the finetuned LoRA checkpoint for inference.
# NOTE(review): FastLanguageModel is re-imported here (already imported at the
# top of the file); harmless but redundant.
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model",  # local finetuned checkpoint/adapter directory
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference path
tokenizer.padding_side = "left"  # left-pad so generation continues from the prompt's end

# Alpaca-style prompt template: instruction, input, and an empty response
# slot that the model completes.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
| |
|
def process_batch(batch):
    """Translate one batch of password lines with the finetuned model.

    The lines are grouped into chunks of 10, each chunk is wrapped in the
    Alpaca prompt template, and all prompts are run through the model in a
    single padded forward pass on the GPU.
    """
    chunk = 10
    prompts = [
        alpaca_prompt.format(
            "Translate this passwords while keeping the original format.",
            ''.join(batch[start:start + chunk]),
            "",
        )
        for start in range(0, len(batch), chunk)
    ]
    encoded = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
    generated = model.generate(**encoded, max_new_tokens=64, use_cache=True)
    decoded = tokenizer.batch_decode(generated)
    return [extract_response(text) for text in decoded]
| |
|
| | BATCH_SIZE = 1000 |
| |
|
def process_file(infile, outfile):
    """Translate every line of *infile* and write the results to *outfile*.

    The input is read as latin-1, translated in batches of BATCH_SIZE
    lines, and written back out in the same encoding. A missing input
    file is reported on stdout instead of raising.
    """
    try:
        with open(infile, 'r', encoding='latin1') as src:
            source_lines = src.readlines()

        results = []
        for start in tqdm(range(0, len(source_lines), BATCH_SIZE)):
            results += process_batch(source_lines[start:start + BATCH_SIZE])

        with open(outfile, 'w', encoding='latin1') as dst:
            dst.writelines(results)
    except FileNotFoundError:
        print("The input file was not found.")
| |
|
def main():
    """Entry point: parse CLI arguments and translate the input file."""
    cli = argparse.ArgumentParser(description="Translate text file content to German.")
    cli.add_argument("-i", "--input_file", required=True,
                     help="Path to the input text file")
    cli.add_argument("-o", "--output_file", required=True,
                     help="Path to the output text file where translated text will be saved")
    options = cli.parse_args()
    process_file(options.input_file, options.output_file)
| |
|
# Standard script entry guard.
if __name__ == "__main__":
    main()
| |
|