# NOTE(review): removed page-rendering artifacts ("Spaces:" / "Configuration error")
# that preceded the source; they were not part of the original file.
| """ | |
| batch_selfinstruct_generate.py | |
| run: | |
| python -m generate_instruction generate_instruction_following_data \ | |
| --output_dir ./ \ | |
| --num_instructions_to_generate 10 \ | |
| --model_name="text-davinci-003" \ | |
| """ | |
| import time | |
| import json | |
| import os | |
| import random | |
| import re | |
| import string | |
| from functools import partial | |
| from multiprocessing import Pool | |
| import numpy as np | |
| import tqdm | |
| from rouge_score import rouge_scorer | |
| import utils | |
| import fire | |
def encode_prompt(prompt_instructions):
    """Encode multiple prompt instructions into a single few-shot prompt string.

    Reads the fixed preamble from ``./prompt.txt`` (relative to the CWD), then
    appends each task as a numbered Instruction/Input/Output triple separated
    by ``###`` markers, and ends with the next item's ``Instruction:`` header
    so the model continues the sequence.

    Args:
        prompt_instructions: Iterable of dicts with "instruction", "input",
            and "output" string fields.

    Returns:
        The assembled prompt string.
    """
    # Context manager closes the handle promptly (the original leaked it).
    with open("./prompt.txt") as f:
        prompt = f.read() + "\n"
    for idx, task_dict in enumerate(prompt_instructions):
        instruction = task_dict["instruction"]
        input_text = task_dict["input"]
        output = task_dict["output"]
        # Collapse whitespace and drop a trailing colon from the instruction.
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        # An empty input is rendered with the explicit <noinput> sentinel.
        input_text = "<noinput>" if input_text.lower() == "" else input_text
        prompt += "###\n"
        prompt += f"{idx + 1}. Instruction: {instruction}\n"
        prompt += f"{idx + 1}. Input:\n{input_text}\n"
        prompt += f"{idx + 1}. Output:\n{output}\n"
    prompt += "###\n"
    # Open-ended header for the model to complete.
    prompt += f"{idx + 2}. Instruction:"
    return prompt
# Keywords describing tasks a pure text model cannot perform (images, audio,
# external actions, ...). Hoisted to module scope and compiled once instead of
# rebuilding the list and recompiling one regex per word for every candidate.
_BLACKLIST_WORDS = [
    "image",
    "images",
    "graph",
    "graphs",
    "picture",
    "pictures",
    "file",
    "files",
    "map",
    "maps",
    "draw",
    "plot",
    "go to",
    "video",
    "audio",
    "music",
    "flowchart",
    "diagram",
]
_BLACKLIST_RE = re.compile(r"\b(" + "|".join(_BLACKLIST_WORDS) + r")\b", flags=re.IGNORECASE)


def post_process_gpt3_response(num_prompt_instructions, response):
    """Parse one completion into a list of {"instruction", "input", "output"} dicts.

    The completion is expected to continue the numbered few-shot prompt built
    by ``encode_prompt``, so parsing starts at item
    ``num_prompt_instructions + 1`` and splits candidates on ``###`` markers.

    Filters applied to each candidate:
      * drop the last candidate if decoding stopped for length (likely truncated);
      * drop malformed candidates (not exactly Instruction/Input/Output);
      * drop instructions that are too short (<= 3 words) or too long (> 150);
      * drop instructions mentioning blacklisted, non-text-model tasks;
      * drop instructions starting with "Write a program" (the model tends to
        prepend this, and it is confusing whether to write code or the result);
      * drop instructions starting with punctuation or a non-ASCII character.

    Args:
        num_prompt_instructions: Number of in-context examples in the prompt.
        response: Dict with "text" and "finish_reason" keys, or None.

    Returns:
        List of parsed instruction dicts (possibly empty).
    """
    if response is None:
        return []
    # Re-attach the header that served as the generation prefix so every
    # candidate has a uniform "N. Instruction:" shape.
    raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
    raw_instructions = re.split("###", raw_instructions)
    instructions = []
    for idx, inst in enumerate(raw_instructions):
        # if the decoding stops due to length, the last example is likely truncated so we discard it
        if idx == len(raw_instructions) - 1 and response["finish_reason"] == "length":
            continue
        idx += num_prompt_instructions + 1
        # Raw f-string: the original non-raw pattern relied on deprecated
        # invalid escape sequences ("\." / "\s" in a plain string).
        splitted_data = re.split(rf"{idx}\.\s+(Instruction|Input|Output):", inst)
        if len(splitted_data) != 7:
            continue
        inst_text = splitted_data[2].strip()
        input_text = splitted_data[4].strip()
        input_text = "" if input_text.lower() == "<noinput>" else input_text
        output_text = splitted_data[6].strip()
        # filter out too short or too long instructions
        if len(inst_text.split()) <= 3 or len(inst_text.split()) > 150:
            continue
        # filter based on keywords that are not suitable for language models.
        if _BLACKLIST_RE.search(inst_text):
            continue
        # The model tends to add "write a program" to some existing
        # instructions; it is ambiguous whether to write code or output the
        # result, so filter them out (not a comprehensive programming filter).
        if inst_text.startswith("Write a program"):
            continue
        # filter those starting with punctuation
        if inst_text[0] in string.punctuation:
            continue
        # filter those starting with non-english character
        if not inst_text[0].isascii():
            continue
        instructions.append({"instruction": inst_text, "input": input_text, "output": output_text})
    return instructions
def find_word_in_string(w, s):
    """Case-insensitive whole-word search for ``w`` in ``s``.

    Returns the ``re.Match`` object when found, else ``None`` (usable as a
    boolean).
    """
    pattern = rf"\b({w})\b"
    return re.search(pattern, s, flags=re.IGNORECASE)
def generate_instruction_following_data(
    output_dir="./",
    seed_tasks_path="./seed_tasks.jsonl",
    num_instructions_to_generate=100,
    model_name="text-davinci-003",
    num_prompt_instructions=3,
    request_batch_size=5,
    temperature=1.0,
    top_p=1.0,
    num_cpus=16,
):
    """Bootstrap new instruction data from seed tasks via an OpenAI completion model.

    Repeatedly samples ``num_prompt_instructions`` seed tasks into a few-shot
    prompt, requests completions ``request_batch_size`` prompts at a time,
    parses them with ``post_process_gpt3_response``, and keeps only candidates
    whose max ROUGE-L f-measure against every instruction seen so far is
    <= 0.7. Progress is checkpointed to ``<output_dir>/regen.json`` after each
    batch, and an existing checkpoint is resumed on startup.

    Args:
        output_dir: Directory where ``regen.json`` is written/resumed.
        seed_tasks_path: JSONL file of human-written seed tasks; only the
            first instance of each task is used.
        num_instructions_to_generate: Target count of kept machine instructions.
        model_name: OpenAI completion model to query.
        num_prompt_instructions: Seed examples included in each prompt.
        request_batch_size: Prompts sent per API call.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling parameter forwarded to the API.
        num_cpus: Worker processes used for ROUGE similarity scoring.
    """
    # Context manager so the seed-task handle is closed (the original leaked it).
    with open(seed_tasks_path, "r") as f:
        seed_tasks = [json.loads(l) for l in f]
    seed_instruction_data = [
        {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
        for t in seed_tasks
    ]
    print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")
    os.makedirs(output_dir, exist_ok=True)
    request_idx = 0
    # load the LM-generated instructions (resume from a previous run if present)
    machine_instruction_data = []
    if os.path.exists(os.path.join(output_dir, "regen.json")):
        machine_instruction_data = utils.jload(os.path.join(output_dir, "regen.json"))
        print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
    # now let's generate new instructions!
    progress_bar = tqdm.tqdm(total=num_instructions_to_generate)
    if machine_instruction_data:
        progress_bar.update(len(machine_instruction_data))
    # first we tokenize all the seed instructions and generated machine instructions
    all_instructions = [d["instruction"] for d in seed_instruction_data] + [
        d["instruction"] for d in machine_instruction_data
    ]
    all_instruction_tokens = [scorer._tokenizer.tokenize(inst) for inst in all_instructions]
    while len(machine_instruction_data) < num_instructions_to_generate:
        request_idx += 1
        batch_inputs = []
        for _ in range(request_batch_size):
            # only sampling from the seed tasks
            prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
            prompt = encode_prompt(prompt_instructions)
            batch_inputs.append(prompt)
        decoding_args = utils.OpenAIDecodingArguments(
            temperature=temperature,
            n=1,
            max_tokens=3072,  # hard-code to maximize the length. the requests will be automatically adjusted
            top_p=top_p,
            # Stop once the model starts item 20. The original listed "20."
            # twice; duplicate stop sequences are redundant for the API.
            stop=["\n20", "20."],
        )
        request_start = time.time()
        results = utils.openai_completion(
            prompts=batch_inputs,
            model_name=model_name,
            batch_size=request_batch_size,
            decoding_args=decoding_args,
            logit_bias={"50256": -100},  # prevent the <|endoftext|> token from being generated
        )
        request_duration = time.time() - request_start
        process_start = time.time()
        instruction_data = []
        for result in results:
            new_instructions = post_process_gpt3_response(num_prompt_instructions, result)
            instruction_data += new_instructions
        total = len(instruction_data)
        keep = 0
        # Create the worker pool ONCE per batch: the original spawned a fresh
        # multiprocessing.Pool for every single candidate, paying process
        # startup cost each time for no benefit.
        with Pool(num_cpus) as p:
            for instruction_data_entry in instruction_data:
                # computing similarity with the pre-tokenized instructions
                new_instruction_tokens = scorer._tokenizer.tokenize(instruction_data_entry["instruction"])
                rouge_scores = p.map(
                    partial(rouge_scorer._score_lcs, new_instruction_tokens),
                    all_instruction_tokens,
                )
                rouge_scores = [score.fmeasure for score in rouge_scores]
                most_similar_instructions = {
                    all_instructions[i]: rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
                }
                # discard near-duplicates of anything already collected
                if max(rouge_scores) > 0.7:
                    continue
                keep += 1
                instruction_data_entry["most_similar_instructions"] = most_similar_instructions
                instruction_data_entry["avg_similarity_score"] = float(np.mean(rouge_scores))
                machine_instruction_data.append(instruction_data_entry)
                all_instructions.append(instruction_data_entry["instruction"])
                all_instruction_tokens.append(new_instruction_tokens)
                progress_bar.update(1)
        process_duration = time.time() - process_start
        print(f"Request {request_idx} took {request_duration:.2f}s, processing took {process_duration:.2f}s")
        print(f"Generated {total} instructions, kept {keep} instructions")
        utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json"))
def main(task, **kwargs):
    """Dispatch ``task`` — the name of a function in this module — with ``kwargs``.

    Raises:
        KeyError: If ``task`` is not a name defined at module level.
    """
    task_fn = globals()[task]
    task_fn(**kwargs)
| if __name__ == "__main__": | |
| fire.Fire(main) | |