Miranda2023 commited on
Commit
82bc1c9
·
1 Parent(s): a1fcc16

app create

Browse files
app.py CHANGED
@@ -1,13 +1,56 @@
1
  import gradio as gr
 
 
2
  from langchain.llms.openai import OpenAI
3
-
4
  import os
5
- os.environ["OPENAI_API_KEY"] = "<REDACTED — leaked secret removed; rotate this credential>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- llm = OpenAI(temperature=0.9)
 
8
 
9
- def openai(input):
10
- return llm(input)
11
 
12
- iface = gr.Interface(fn=openai, inputs="textbox", outputs="textbox")
13
- iface.launch()
 
1
  import gradio as gr
2
+ from selfinstruct.instance import instance_main
3
+ import openai
4
  from langchain.llms.openai import OpenAI
 
5
  import os
6
+ import json
7
+ # os.environ["OPENAI_API_KEY"] = "<REDACTED>"  # never commit real API keys; read from the environment instead
8
+
9
def OpenaiChat(key, input, temperature):
    """Send a single-turn prompt to the OpenAI chat completion API.

    Args:
        key: API key supplied by the UI; when empty, falls back to the
            OPENAI_API_KEY environment variable.
        input: the user prompt to send.
        temperature: sampling temperature forwarded to the API.

    Returns:
        The ``choices`` list of the API response serialized as a JSON string.
    """
    print(input)
    # SECURITY FIX: the fallback used to be a hard-coded API key committed to
    # source control; read it from the environment instead.
    openai.api_key = key if key else os.environ.get("OPENAI_API_KEY", "")
    MODEL = "gpt-3.5-turbo"
    response = openai.ChatCompletion.create(
        model=MODEL,
        messages=[
            {"role": "user", "content": input}
        ],
        temperature=temperature
    )
    return json.dumps(response["choices"], ensure_ascii=False)
23
+
24
+
25
# Gradio UI: one tab runs the self-instruct instance-generation pipeline,
# the other is a minimal ChatGPT playground. Event wiring is at the bottom.
with gr.Blocks() as demo:
    gr.Markdown("# Self-instruct Prompt")
    with gr.Row():
        # API key shared by both tabs; forwarded to the click handlers.
        key = gr.Textbox(label="Input OpenAI key", type="password")
    with gr.Tab("Generate instance"):
        with gr.Row():
            with gr.Column():
                # NOTE(review): this dropdown is not passed to any callback —
                # the engine appears to be fixed elsewhere; confirm intent.
                gr.Dropdown(["davinci"], value=["davinci"], multiselect=False, label="Model")
                text_input = gr.TextArea(lines=3, label="Input instruction:", info="格式:一行一个instruction")
            with gr.Column():
                text_output = gr.JSON(lines=10, label="Output instance:")
                text_button = gr.Button("Generate")
        gr.Markdown("## 3 Steps to generate")
        with gr.Row():
            # Intermediate results of the 3-step pipeline, surfaced for inspection.
            text_step1 = gr.TextArea(lines=3, label="Step1 identify if classifacrion task:", )
            text_step2 = gr.TextArea(lines=3, label="Step2 generate instance:", )
            text_step3 = gr.TextArea(lines=3, label="Step3 Prepare for finetune:", )

    with gr.Tab("Simple ChatGPT"):
        gr.Dropdown(["GPT3.5"], value=["GPT3.5"], multiselect=False, label="Model")
        temperature = gr.Slider(0, 1.0, value=0.5, label="Temperature", info="Choose betwen 0 and 1")
        ChatGPT_input = gr.TextArea(lines=3, label="Input:")
        ChatGPT_output = gr.TextArea(lines=3, label="Output:")
        ChatGPT_button = gr.Button("Run")

    # with gr.Accordion("Open for More!"):
    #     gr.Markdown("Look at me...")

    # instance_main returns a 4-tuple (instances, classify flags, raw
    # responses, jsonl strings) mapped onto the four output widgets in order.
    text_button.click(instance_main, inputs=[text_input, key], outputs=[text_output, text_step1, text_step2, text_step3])
    ChatGPT_button.click(OpenaiChat, inputs=[key, ChatGPT_input, temperature], outputs=ChatGPT_output)

demo.launch()
 
selfinstruct/gpt3_api.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import tqdm
3
+ import os
4
+ import random
5
+ import openai
6
+ from datetime import datetime
7
+ import argparse
8
+ import time
9
+
10
+
11
def make_requests(
    engine, prompts, max_tokens, temperature, top_p,
    frequency_penalty, presence_penalty, stop_sequences, logprobs, n, best_of, retries=3, api_key=None, organization=None
):
    """Query the OpenAI Completion endpoint with retry and backoff.

    Args:
        engine: GPT-3 engine name (e.g. "davinci").
        prompts: one prompt string, or a list of prompts sent as one batched call.
        max_tokens, temperature, top_p, frequency_penalty, presence_penalty,
        stop_sequences, logprobs, n, best_of: forwarded to
            openai.Completion.create.
        retries: number of retries after an OpenAIError before giving up.
        api_key: optional key overriding the global openai.api_key.
        organization: optional OpenAI organization id.

    Returns:
        A list of dicts with keys "prompt", "response", "created_at".
        For a list of prompts, one dict per prompt holding its slice of the
        returned choices; "response" is None when every retry failed.
    """
    response = None
    target_length = max_tokens
    if api_key is not None:
        openai.api_key = api_key
    if organization is not None:
        openai.organization = organization
    retry_cnt = 0
    backoff_time = 30
    while retry_cnt <= retries:
        try:
            response = openai.Completion.create(
                engine=engine,
                prompt=prompts,
                max_tokens=target_length,
                temperature=temperature,
                top_p=top_p,
                frequency_penalty=frequency_penalty,
                presence_penalty=presence_penalty,
                stop=stop_sequences,
                logprobs=logprobs,
                n=n,
                best_of=best_of,
            )
            break
        except openai.error.OpenAIError as e:
            print(f"OpenAIError: {e}.")
            if "Please reduce your prompt" in str(e):
                # Prompt too long for the context window: shrink the
                # completion budget and retry.
                target_length = int(target_length * 0.8)
                print(f"Reducing target length to {target_length}, retrying...")
            else:
                # Rate limit / transient error: back off with growing delay.
                print(f"Retrying in {backoff_time} seconds...")
                time.sleep(backoff_time)
                backoff_time *= 1.5
            retry_cnt += 1

    if isinstance(prompts, list):
        results = []
        for j, prompt in enumerate(prompts):
            # Each prompt owns `n` consecutive entries of response["choices"].
            data = {
                "prompt": prompt,
                "response": {"choices": response["choices"][j * n: (j + 1) * n]} if response else None,
                "created_at": str(datetime.now()),
            }
            results.append(data)
        return results
    else:
        data = {
            "prompt": prompts,
            "response": response,
            "created_at": str(datetime.now()),
        }
        return [data]
67
+
68
+
69
def parse_args():
    """Define and parse the command-line interface for batch GPT-3 querying."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--input_file", type=str,
                    help="The input file that contains the prompts to GPT3.")
    ap.add_argument("--output_file", type=str,
                    help="The output file to save the responses from GPT3.")
    ap.add_argument("--engine", type=str,
                    help="The openai GPT3 engine to use.")
    ap.add_argument("--max_tokens", default=500, type=int,
                    help="The max_tokens parameter of GPT3.")
    ap.add_argument("--temperature", default=0.7, type=float,
                    help="The temprature of GPT3.")
    ap.add_argument("--top_p", default=0.5, type=float,
                    help="The `top_p` parameter of GPT3.")
    ap.add_argument("--frequency_penalty", default=0, type=float,
                    help="The `frequency_penalty` parameter of GPT3.")
    ap.add_argument("--presence_penalty", default=0, type=float,
                    help="The `presence_penalty` parameter of GPT3.")
    ap.add_argument("--stop_sequences", default=["\n\n"], nargs="+",
                    help="The `stop_sequences` parameter of GPT3.")
    ap.add_argument("--logprobs", default=5, type=int,
                    help="The `logprobs` parameter of GPT3")
    ap.add_argument("--n", type=int,
                    help="The `n` parameter of GPT3. The number of responses to generate.")
    ap.add_argument("--best_of", type=int,
                    help="The `best_of` parameter of GPT3. The beam size on the GPT3 server.")
    ap.add_argument("--use_existing_responses", action="store_true",
                    help="Whether to use existing responses from the output file if it exists.")
    ap.add_argument("--request_batch_size", default=20, type=int,
                    help="The number of requests to send to GPT3 at a time.")
    return ap.parse_args()
150
+
151
+
152
if __name__ == "__main__":
    random.seed(123)
    args = parse_args()
    os.makedirs(os.path.dirname(args.output_file), exist_ok=True)

    # Reuse cached responses from a previous run when requested, keyed by prompt.
    existing_responses = {}
    if os.path.exists(args.output_file) and args.use_existing_responses:
        with open(args.output_file, "r") as fin:
            for line in fin:
                data = json.loads(line)
                existing_responses[data["prompt"]] = data

    # Load prompts: jsonl files carry them under "prompt"; plain-text files
    # hold one (possibly \n-escaped) prompt per line.
    with open(args.input_file, "r") as fin:
        if args.input_file.endswith(".jsonl"):
            all_prompts = [json.loads(line)["prompt"] for line in fin]
        else:
            # BUG FIX: this used to assign to `all_prompt` (missing "s"),
            # leaving `all_prompts` undefined for non-jsonl inputs and
            # raising NameError below.
            all_prompts = [line.strip().replace("\\n", "\n") for line in fin]

    with open(args.output_file, "w") as fout:
        for i in tqdm.tqdm(range(0, len(all_prompts), args.request_batch_size)):
            batch_prompts = all_prompts[i: i + args.request_batch_size]
            if all(p in existing_responses for p in batch_prompts):
                # Whole batch already answered: copy cached rows verbatim.
                for p in batch_prompts:
                    fout.write(json.dumps(existing_responses[p]) + "\n")
            else:
                results = make_requests(
                    engine=args.engine,
                    prompts=batch_prompts,
                    max_tokens=args.max_tokens,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    frequency_penalty=args.frequency_penalty,
                    presence_penalty=args.presence_penalty,
                    stop_sequences=args.stop_sequences,
                    logprobs=args.logprobs,
                    n=args.n,
                    best_of=args.best_of,
                )
                for data in results:
                    fout.write(json.dumps(data) + "\n")
selfinstruct/identify_clf_or_not.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import tqdm
5
+ import re
6
+ import argparse
7
+ import pandas as pd
8
+ from collections import OrderedDict
9
+ from gpt3_api import make_requests as make_gpt3_requests
10
+ from templates.clf_task_template import template_1
11
+
12
+
13
# Fixed seed so any sampling in this script is reproducible across runs.
random.seed(42)


# Prompt templates for the "is this a classification task?" query.
# Only template_1 exists today; the dict enables selection via --template.
templates = {
    "template_1": template_1
}
19
+
20
def parse_args():
    """Define and parse the CLI for the classification-detection stage."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--batch_dir", type=str, required=True,
                    help="The directory where the batch is stored.")
    ap.add_argument("--num_instructions", type=int,
                    help="if specified, only generate instance input for this many instructions")
    ap.add_argument("--template", type=str, default="template_1",
                    help="Which template to use. Currently only `template_1` is supported.")
    ap.add_argument("--engine", type=str, default="davinci",
                    help="The engine to use.")
    ap.add_argument("--request_batch_size", type=int, default=5,
                    help="The number of requests to send in a batch.")
    ap.add_argument("--api_key", type=str,
                    help="The API key to use. If not specified, the key will be read from the environment variable `OPENAI_API_KEY`.")
    ap.add_argument("--organization", type=str,
                    help="The organization to use. If not specified, the default organization id will be used.")
    return ap.parse_args()
63
+
64
+
65
if __name__ == '__main__':
    args = parse_args()

    # Instructions produced by the previous pipeline stage.
    with open(os.path.join(args.batch_dir, "machine_generated_instructions.jsonl")) as fin:
        lines = fin.readlines()
        if args.num_instructions is not None:
            lines = lines[:args.num_instructions]

    output_path = os.path.join(args.batch_dir, f"is_clf_or_not_{args.engine}_{args.template}.jsonl")

    # Resume support: index previously classified instructions so their
    # requests are not re-sent.
    existing_requests = {}
    if os.path.exists(output_path):
        with open(output_path) as fin:
            for line in tqdm.tqdm(fin):
                try:
                    data = json.loads(line)
                    existing_requests[data["instruction"]] = data
                except (json.JSONDecodeError, KeyError):
                    # BUG FIX: was a bare `except: pass`, which also swallowed
                    # KeyboardInterrupt and real bugs. Only skip malformed or
                    # incomplete cache lines.
                    continue
    print(f"Loaded {len(existing_requests)} existing requests")

    progress_bar = tqdm.tqdm(total=len(lines))
    with open(output_path, "w") as fout:
        for batch_idx in range(0, len(lines), args.request_batch_size):
            batch = [json.loads(line) for line in lines[batch_idx: batch_idx + args.request_batch_size]]
            if all(d["instruction"] in existing_requests for d in batch):
                # Whole batch cached: re-emit the stored classifications.
                for d in batch:
                    data = existing_requests[d["instruction"]]
                    data = OrderedDict(
                        (k, data[k]) for k in ["instruction", "is_classification"]
                    )
                    fout.write(json.dumps(data, ensure_ascii=False) + "\n")
            else:
                prefix = templates[args.template]
                prompts = [prefix + " " + d["instruction"].strip() + "\n" + "Is it classification?" for d in batch]
                results = make_gpt3_requests(
                    engine=args.engine,
                    prompts=prompts,
                    max_tokens=3,
                    temperature=0,
                    top_p=0,
                    frequency_penalty=0,
                    presence_penalty=0,
                    stop_sequences=["\n", "Task"],
                    logprobs=1,
                    n=1,
                    best_of=1,
                    api_key=args.api_key,
                    organization=args.organization)
                for i in range(len(batch)):
                    data = batch[i]
                    # Empty string marks a failed API request for this prompt.
                    if results[i]["response"] is not None:
                        data["is_classification"] = results[i]["response"]["choices"][0]["text"]
                    else:
                        data["is_classification"] = ""
                    data = OrderedDict(
                        (k, data[k]) for k in ["instruction", "is_classification"]
                    )
                    fout.write(json.dumps(data, ensure_ascii=False) + "\n")
            progress_bar.update(len(batch))
selfinstruct/instance.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import tqdm
5
+ import re
6
+ import argparse
7
+ import pandas as pd
8
+ from collections import OrderedDict
9
+ from .gpt3_api import make_requests as make_gpt3_requests
10
+ from .templates.instance_gen_template import output_first_template_for_clf, input_first_template_for_gen
11
+ from .templates.clf_task_template import template_1
12
+
13
# Fixed seed so template sampling is reproducible across runs.
random.seed(42)

# GPT-3 engine used for every request issued from this module.
engine = "davinci"

# SECURITY FIX: the default key used to be a hard-coded secret committed to
# source control. Read it from the environment instead; instance_main() may
# still override it with a key supplied through the UI. (The previous
# module-level `global api_key` statement was a no-op and has been removed.)
api_key = os.environ.get("OPENAI_API_KEY", "")
18
+
19
+ # def parse_args():
20
+ # parser = argparse.ArgumentParser()
21
+ # parser.add_argument(
22
+ # "--batch_dir",
23
+ # type=str,
24
+ # required=True,
25
+ # help="The directory where the batch is stored.",
26
+ # )
27
+ # parser.add_argument(
28
+ # "--input_file",
29
+ # type=str,
30
+ # default="machine_generated_instructions.jsonl"
31
+ # )
32
+ # parser.add_argument(
33
+ # "--output_file",
34
+ # type=str,
35
+ # default="machine_generated_instances.jsonl",
36
+ # )
37
+ # parser.add_argument(
38
+ # "--num_instructions",
39
+ # type=int,
40
+ # help="if specified, only generate instance input for this many instructions",
41
+ # )
42
+ # parser.add_argument(
43
+ # "--max_instances_to_generate",
44
+ # type=int,
45
+ # default=5,
46
+ # help="The max number of instances to generate for each instruction.",
47
+ # )
48
+ # parser.add_argument(
49
+ # "--generation_tasks_only",
50
+ # action="store_true",
51
+ # help="If specified, only do for generation tasks.",
52
+ # )
53
+ # parser.add_argument(
54
+ # "--classification_tasks_only",
55
+ # action="store_true",
56
+ # help="If specified, only do for classification tasks.",
57
+ # )
58
+ # parser.add_argument(
59
+ # "--engine",
60
+ # type=str,
61
+ # default="davinci",
62
+ # help="The engine to use."
63
+ # )
64
+ # parser.add_argument(
65
+ # "--request_batch_size",
66
+ # type=int,
67
+ # default=5,
68
+ # help="The number of requests to send in a batch."
69
+ # )
70
+ # parser.add_argument(
71
+ # "--api_key",
72
+ # type=str,
73
+ # help="The API key to use. If not specified, the key will be read from the environment variable OPENAI_API_KEY."
74
+ # )
75
+ # parser.add_argument(
76
+ # "--organization",
77
+ # type=str,
78
+ # help="The organization to use. If not specified, the default organization id will be used."
79
+ # )
80
+ # return parser.parse_args()
81
+
82
def if_classify(instructions):
    """Ask GPT-3 whether each instruction is a classification task.

    Args:
        instructions: list of instruction strings.

    Returns:
        A list parallel to `instructions` containing True / False, or the
        string "Unknown" when the API request failed for that prompt.
    """
    prefix = template_1
    prompts = [prefix + " " + instruct.strip() + "\n" + "Is it classification?" for instruct in instructions]
    results = make_gpt3_requests(
        engine=engine,
        prompts=prompts,
        max_tokens=3,
        temperature=0,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
        stop_sequences=["\n", "Task"],
        logprobs=1,
        n=1,
        best_of=1,
        api_key=api_key)
    classify_res = []
    for i in range(len(prompts)):
        if results[i]["response"] is not None:
            answer = results[i]["response"]["choices"][0]["text"]
            # BUG FIX: completions frequently carry leading whitespace and
            # varying case; the old exact match against ["Yes", "yes", "YES"]
            # silently classified " Yes" as non-classification.
            classify_res.append(answer.strip().lower() == "yes")
        else:
            print("**分类出错,", results[i])
            classify_res.append("Unknown")
    return classify_res
109
+
110
def filter_duplicate_instances(instances):
    """Collapse duplicates; reject the whole set on conflicting outputs.

    If two instances share the same non-empty input but disagree on the
    output, the generations are considered unreliable and an empty list is
    returned. Otherwise exact duplicate triples are collapsed via a set.
    """
    for idx in range(1, len(instances)):
        cur = instances[idx]
        # Empty inputs are allowed to repeat with different outputs.
        if cur[1] == "":
            continue
        for prev in instances[:idx]:
            if cur[1] == prev[1] and cur[2] != prev[2]:
                return []
    return list(set(instances))
126
+
127
def filter_invalid_instances(instances):
    """Drop degenerate (instruction, input, output) triples.

    Removes instances whose output is empty, whose input equals the output,
    or whose input/output ends with a colon (usually a truncated generation).
    """
    def _valid(inst):
        inp, out = inst[1], inst[2]
        if out == "" or inp == out:
            return False
        # A trailing colon usually marks an incomplete generation.
        return not (inp.strip().endswith(":") or out.strip().endswith(":"))

    return [inst for inst in instances if _valid(inst)]
141
+
142
def encode_instance(instruction, input, output, random_template=True):
    """Render one (instruction, input, output) triple as a prompt/completion pair.

    With random_template=True a template is sampled — from one pool when the
    input is non-empty and another when it is empty. Otherwise a fixed
    "instruction\\n\\ninput\\n\\n" / "output<|endoftext|>" encoding is used.

    Returns:
        Dict with "prompt" and "completion" plus the stripped raw fields.
    """
    encoding_templates_w_input = [
        ("{instruction}\nInput: {input}\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\nInput: {input}\n\nOutput:", " {output}<|endoftext|>"),
        ("Task: {instruction}\nInput: {input}\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\n{input}\n\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\n{input}\n\n", "{output}<|endoftext|>"),
        ("{instruction}\n{input}\n\n", "{output}<|endoftext|>"),
        ("Task: {instruction}\n\n{input}\n\n", "{output}<|endoftext|>"),
    ]
    encoding_templates_wo_input = [
        ("{instruction} Output:", " {output}<|endoftext|>"),
        ("{instruction}\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n", "{output}<|endoftext|>"),
        ("{instruction}\n\n", "{output}<|endoftext|>"),
        ("Task: {instruction}\n\n", "{output}<|endoftext|>"),
    ]
    instruction_s = instruction.strip()
    input_s = input.strip()
    output_s = output.strip()
    if random_template:
        if input_s != "":
            prompt_tpl, completion_tpl = random.choice(encoding_templates_w_input)
            prompt = prompt_tpl.format(instruction=instruction_s, input=input_s)
        else:
            prompt_tpl, completion_tpl = random.choice(encoding_templates_wo_input)
            prompt = prompt_tpl.format(instruction=instruction_s)
        completion = completion_tpl.format(output=output_s)
    else:
        prompt = instruction_s + "\n\n" + input_s + "\n\n"
        completion = output_s + "<|endoftext|>"

    return {
        "prompt": prompt,
        "completion": completion,
        "instruction": instruction_s,
        "input": input_s,
        "output": output_s,
    }
181
+
182
def parse_input_output(response_text):
    """Split one raw generation into (input, output) strings.

    Text before the first "Output:" marker becomes the input, text after it
    the output; with no marker the whole text is the output. Any additional
    "Input:"-marked pair after the output is discarded, and a leading
    "Input:" label is stripped from the input.
    """
    output_marker = r"Output\s*\d*\s*:"
    input_marker = r"Input\s*\d*\s*:"
    if re.findall(output_marker, response_text):
        parts = re.split(output_marker, response_text)
        inst_input, inst_output = parts[0].strip(), parts[1].strip()
    else:
        inst_input, inst_output = "", response_text.strip()
    # Keep only the first pair when the model emitted several.
    if re.findall(input_marker, inst_output):
        inst_output = re.split(input_marker, inst_output)[0].strip()
    # Drop the "Input:" label from the input text.
    inst_input = re.sub(r"^Input\s*\d*\s*:", "", inst_input).strip()
    return inst_input, inst_output
195
+
196
def parse_instances_for_generation_task(raw_text, instruction, response_metadata):
    """Extract instances for a generation-style task from raw GPT-3 text.

    Handles both "Example N"-separated blocks and a single "Input:/Output:"
    pair; returns [] when neither format is present. Invalid and duplicate
    instances are filtered out.
    """
    raw_text = raw_text.strip()
    instruction = instruction.strip()
    instances = []
    if re.findall(r"Example\s?\d*\.?", raw_text):
        for chunk in re.split(r"Example\s?\d*\.?", raw_text):
            chunk = chunk.strip()
            if not chunk:
                continue
            inst_input, inst_output = parse_input_output(chunk)
            instances.append((instruction, inst_input.strip(), inst_output.strip()))
    elif re.findall(r"Output\s*\d*\s*:", raw_text):
        # Assume a single input/output pair in this case.
        inst_input, inst_output = parse_input_output(raw_text)
        instances.append((instruction, inst_input.strip(), inst_output.strip()))
    else:
        return []
    # When generation was cut off by max_tokens, the last instance is
    # likely truncated — drop it.
    if response_metadata["response"]["choices"][0]["finish_reason"] == "length":
        instances = instances[:-1]

    return filter_duplicate_instances(filter_invalid_instances(instances))
218
+
219
def parse_instances_for_classification_task(raw_text, instruction, response_metadata):
    """Extract instances for a classification task from raw GPT-3 text.

    The text must contain "Class label:" sections; in each, the first line
    is the label and any remaining lines form the input. Invalid and
    duplicate instances are filtered out.
    """
    if "Class label:" not in raw_text:
        return []
    instruction = instruction.strip()
    instances = []
    for section in raw_text.split("Class label:")[1:]:
        section = section.strip()
        fields = section.split("\n", 1)
        if len(fields) == 2:
            # Label on the first line, input on the rest.
            class_label, input_text = fields[0].strip(), fields[1].strip()
        elif len(fields) == 1:
            # Only a label, no input text.
            class_label, input_text = fields[0].strip(), ""
        else:
            raise ValueError("Invalid instance text: {}".format(section))
        instances.append((instruction, input_text, class_label))

    # When generation was cut off by max_tokens, drop the likely-truncated
    # final instance.
    if response_metadata["response"]["choices"][0]["finish_reason"] == "length":
        instances = instances[:-1]
    return filter_duplicate_instances(filter_invalid_instances(instances))
246
+
247
def generate_instance(inputs):
    """Generate instances for each instruction with GPT-3.

    Classification tasks use the output-first template; everything else
    (including instructions whose classification failed with "Unknown")
    uses the input-first template.

    Returns:
        (results, classify_res): raw per-prompt API results and the
        classification flags from if_classify.
    """
    classify_res = if_classify(inputs)
    prompts = []
    for i in range(len(inputs)):
        # BUG FIX: if_classify returns booleans (or "Unknown"), not the
        # strings "Yes"/"yes"/"YES" the old membership test compared
        # against — so the classification template was never selected.
        if classify_res[i] is True:
            prompt = output_first_template_for_clf + " " + inputs[i].strip() + "\n"
            prompts.append(prompt)
        else:
            prompt = input_first_template_for_gen + " " + inputs[i].strip() + "\n"
            prompts.append(prompt)
    results = make_gpt3_requests(
        engine=engine,
        prompts=prompts,
        # because the clf template is longer, we need to decrease the max_tokens
        max_tokens=350,
        temperature=0,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=1.5,
        stop_sequences=["Task:"],
        logprobs=1,
        n=1,
        best_of=1,
        api_key=api_key)
    return results, classify_res
273
+
274
def prepare_finetune(inputs):
    """Generate instances for every instruction and flatten them into records.

    Returns a 4-tuple consumed by the Gradio UI:
        results1         - list of {"instruction", "input", "output"} dicts
        classify_res     - per-instruction classification flags
        instance_outputs - raw GPT-3 responses (debug view)
        results2         - the same records serialized as JSON strings
    """
    instance_outputs, classify_res = generate_instance(inputs)
    training_instances = []
    results1, results2 = [], []
    for i in range(len(inputs)):
        # NOTE(review): assumes instance_outputs[i]["response"] is never None;
        # a failed API request would raise TypeError here — confirm upstream.
        if classify_res[i]:
            task_instances = parse_instances_for_classification_task(instance_outputs[i]["response"]["choices"][0]["text"],
                                                                     inputs[i].strip(), instance_outputs[i])
        else:
            task_instances = parse_instances_for_generation_task(instance_outputs[i]["response"]["choices"][0]["text"],
                                                                 inputs[i].strip(), instance_outputs[i])
        # we only allow max 5 instances per task
        task_instances = random.sample(task_instances, min(len(task_instances), 5))

        if not task_instances:
            continue

        training_instances += task_instances

    for instance in training_instances:
        results1.append({
            "instruction": instance[0],
            "input": instance[1],
            "output": instance[2],
        })
        results2.append(json.dumps({
            "instruction": instance[0],
            "input": instance[1],
            "output": instance[2],
        }, ensure_ascii=False))
    return results1, classify_res, instance_outputs, results2
305
+
306
def instance_main(inputs, key):
    """Gradio entry point: run the full self-instruct pipeline.

    Args:
        inputs: newline-separated instructions from the textbox.
        key: optional OpenAI API key supplied through the UI.

    Returns:
        The 4-tuple produced by prepare_finetune.
    """
    # BUG FIX: without `global`, the assignment below created a local
    # variable and the user-supplied key was silently ignored by the
    # request helpers that read the module-level api_key.
    global api_key
    if key:
        api_key = key
    inputs = inputs.split('\n')
    print("***", inputs)
    return prepare_finetune(inputs)
312
+
313
+ # instance_main()
selfinstruct/prepare_for_finetuning.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import glob
5
+ import re
6
+ import random
7
+ import tqdm
8
+ import pandas as pd
9
+
10
+
11
+ random.seed(123)
12
+
13
+
14
def parse_args():
    """Define and parse the CLI for converting instances into finetuning data."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--instance_files", nargs="+",
                    default=["data/batch_221203/machine_generated_instances.jsonl"],
                    type=str,
                    help="The input files that contain the machine generated instances.")
    ap.add_argument("--classification_type_files", nargs="+",
                    default=["data/batch_221203/is_clf_or_not_davinci_template_1.jsonl"])
    ap.add_argument("--output_dir",
                    default="data/gpt3_generations/batch_221203/finetuning/",
                    type=str,
                    help="The output dir to save the cleaned version of the generated instances, so that it can be used for GPT3 finetuning.")
    ap.add_argument("--num_instructions", type=int,
                    help="The number of instructions to load.")
    ap.add_argument("--include_seed_tasks", action="store_true",
                    help="Whether to include the seed human-written instances in the finetuning data.")
    ap.add_argument("--seed_tasks_path", type=str, required=True,
                    default="data/seed_tasks.jsonl",
                    help="The path to the seed data.")
    return ap.parse_args()
52
+
53
+
54
def encode_instance(instruction, input, output, random_template=True):
    """Render one (instruction, input, output) triple as a prompt/completion pair.

    With random_template=True a template is sampled — from one pool when the
    input is non-empty and another when it is empty. Otherwise a fixed
    "instruction\\n\\ninput\\n\\n" / "output<|endoftext|>" encoding is used.

    Returns:
        Dict with "prompt" and "completion" plus the stripped raw fields.
    """
    encoding_templates_w_input = [
        ("{instruction}\nInput: {input}\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\nInput: {input}\n\nOutput:", " {output}<|endoftext|>"),
        ("Task: {instruction}\nInput: {input}\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\n{input}\n\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\n{input}\n\n", "{output}<|endoftext|>"),
        ("{instruction}\n{input}\n\n", "{output}<|endoftext|>"),
        ("Task: {instruction}\n\n{input}\n\n", "{output}<|endoftext|>"),
    ]
    encoding_templates_wo_input = [
        ("{instruction} Output:", " {output}<|endoftext|>"),
        ("{instruction}\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n\nOutput:", " {output}<|endoftext|>"),
        ("{instruction}\n", "{output}<|endoftext|>"),
        ("{instruction}\n\n", "{output}<|endoftext|>"),
        ("Task: {instruction}\n\n", "{output}<|endoftext|>"),
    ]
    instruction_s = instruction.strip()
    input_s = input.strip()
    output_s = output.strip()
    if random_template:
        if input_s != "":
            prompt_tpl, completion_tpl = random.choice(encoding_templates_w_input)
            prompt = prompt_tpl.format(instruction=instruction_s, input=input_s)
        else:
            prompt_tpl, completion_tpl = random.choice(encoding_templates_wo_input)
            prompt = prompt_tpl.format(instruction=instruction_s)
        completion = completion_tpl.format(output=output_s)
    else:
        prompt = instruction_s + "\n\n" + input_s + "\n\n"
        completion = output_s + "<|endoftext|>"

    return {
        "prompt": prompt,
        "completion": completion,
        "instruction": instruction_s,
        "input": input_s,
        "output": output_s,
    }
93
+
94
+
95
def parse_input_output(response_text):
    """Split one raw generation into (input, output) strings.

    Text before the first "Output:" marker becomes the input, text after it
    the output; with no marker the whole text is the output. Any additional
    "Input:"-marked pair after the output is discarded, and a leading
    "Input:" label is stripped from the input.
    """
    output_marker = r"Output\s*\d*\s*:"
    input_marker = r"Input\s*\d*\s*:"
    if re.findall(output_marker, response_text):
        parts = re.split(output_marker, response_text)
        inst_input, inst_output = parts[0].strip(), parts[1].strip()
    else:
        inst_input, inst_output = "", response_text.strip()
    # Keep only the first pair when the model emitted several.
    if re.findall(input_marker, inst_output):
        inst_output = re.split(input_marker, inst_output)[0].strip()
    # Drop the "Input:" label from the input text.
    inst_input = re.sub(r"^Input\s*\d*\s*:", "", inst_input).strip()
    return inst_input, inst_output
108
+
109
+
110
def filter_duplicate_instances(instances):
    """Collapse duplicates; reject the whole set on conflicting outputs.

    If two instances share the same non-empty input but disagree on the
    output, the generations are considered unreliable and an empty list is
    returned. Otherwise exact duplicate triples are collapsed via a set.
    """
    for idx in range(1, len(instances)):
        cur = instances[idx]
        # Empty inputs are allowed to repeat with different outputs.
        if cur[1] == "":
            continue
        for prev in instances[:idx]:
            if cur[1] == prev[1] and cur[2] != prev[2]:
                return []
    return list(set(instances))
126
+
127
def filter_invalid_instances(instances):
    """Drop degenerate (instruction, input, output) triples.

    Removes instances whose output is empty, whose input equals the output,
    or whose input/output ends with a colon (usually a truncated generation).
    """
    def _valid(inst):
        inp, out = inst[1], inst[2]
        if out == "" or inp == out:
            return False
        # A trailing colon usually marks an incomplete generation.
        return not (inp.strip().endswith(":") or out.strip().endswith(":"))

    return [inst for inst in instances if _valid(inst)]
141
+
142
def parse_instances_for_generation_task(raw_text, instruction, response_metadata):
    """Extract instances for a generation-style task from raw GPT-3 text.

    Handles both "Example N"-separated blocks and a single "Input:/Output:"
    pair; returns [] when neither format is present. Invalid and duplicate
    instances are filtered out.
    """
    raw_text = raw_text.strip()
    instruction = instruction.strip()
    instances = []
    if re.findall(r"Example\s?\d*\.?", raw_text):
        for chunk in re.split(r"Example\s?\d*\.?", raw_text):
            chunk = chunk.strip()
            if not chunk:
                continue
            inst_input, inst_output = parse_input_output(chunk)
            instances.append((instruction, inst_input.strip(), inst_output.strip()))
    elif re.findall(r"Output\s*\d*\s*:", raw_text):
        # Assume a single input/output pair in this case.
        inst_input, inst_output = parse_input_output(raw_text)
        instances.append((instruction, inst_input.strip(), inst_output.strip()))
    else:
        return []
    # When generation was cut off by max_tokens, the last instance is
    # likely truncated — drop it.
    if response_metadata["response"]["choices"][0]["finish_reason"] == "length":
        instances = instances[:-1]

    return filter_duplicate_instances(filter_invalid_instances(instances))
164
+
165
def parse_instances_for_classification_task(raw_text, instruction, response_metadata):
    """Parse the raw completion of a classification task.

    The completion is expected as repeated "Class label:" sections, each
    containing the label on its first line and (optionally) the input text on
    the following lines.

    Args:
        raw_text: raw model completion text.
        instruction: the task instruction the completion belongs to.
        response_metadata: the API response metadata; its finish_reason is
            used to detect truncated generations.

    Returns:
        List of (instruction, input, output) tuples after dropping invalid
        and duplicate instances; [] when no "Class label:" marker is present.
    """
    if "Class label:" not in raw_text:
        return []
    instances = []
    instance_texts = raw_text.split("Class label:")[1:]
    for instance_text in instance_texts:
        instance_text = instance_text.strip()
        fields = instance_text.split("\n", 1)
        if len(fields) == 2:
            # first line is the class label, the remainder is the input
            class_label = fields[0].strip()
            input_text = fields[1].strip()
        else:
            # str.split("\n", 1) always yields at least one field, so a single
            # field is a bare class label with no input text (the previous
            # "raise ValueError" branch here was unreachable)
            class_label = fields[0].strip()
            input_text = ""
        instances.append((instruction.strip(), input_text.strip(), class_label.strip()))

    # if the generation stopped because of length, the last instance is
    # likely truncated, so drop it
    if response_metadata["response"]["choices"][0]["finish_reason"] == "length":
        instances = instances[:-1]
    instances = filter_invalid_instances(instances)
    instances = filter_duplicate_instances(instances)
    return instances
192
+
193
+
194
+ if __name__ == "__main__":
195
+ args = parse_args()
196
+
197
+ training_instances = []
198
+
199
+ generated_tasks = []
200
+ for instance_file in args.instance_files:
201
+ with open(instance_file) as fin:
202
+ for line in fin:
203
+ generated_tasks.append(json.loads(line))
204
+ print(f"Loaded {len(generated_tasks)} raw generated tasks")
205
+
206
+ task_clf_types = {}
207
+ for file in args.classification_type_files:
208
+ with open(file) as fin:
209
+ for line in fin:
210
+ data = json.loads(line)
211
+ task_clf_types[data["instruction"]] = data["is_classification"].strip() in ["Yes", "yes", "YES"]
212
+
213
+ for task in tqdm.tqdm(generated_tasks):
214
+ # get instruction
215
+ instruction = task["instruction"]
216
+ task["is_classification"] = task_clf_types[instruction]
217
+
218
+ # get the instances
219
+ if task["is_classification"]:
220
+ task_instances = parse_instances_for_classification_task(task["raw_instances"], instruction, task["instance_metadata"])
221
+ else:
222
+ task_instances = parse_instances_for_generation_task(task["raw_instances"], instruction, task["instance_metadata"])
223
+
224
+ # we only allow max 5 instances per task
225
+ task_instances = random.sample(task_instances, min(len(task_instances), 5))
226
+
227
+ if not task_instances:
228
+ continue
229
+
230
+ training_instances += task_instances
231
+
232
+
233
+ os.makedirs(args.output_dir, exist_ok=True)
234
+ with open(os.path.join(args.output_dir, "all_generated_instances.jsonl"), "w") as fout:
235
+ for instance in training_instances:
236
+ fout.write(json.dumps({
237
+ "instruction": instance[0],
238
+ "input": instance[1],
239
+ "output": instance[2],
240
+ }) + "\n")
241
+ print(f"Saved {len(training_instances)} instances")
242
+ unique_instructions = set([it[0] for it in training_instances])
243
+ print(f"Unique instructions: {len(unique_instructions)}")
244
+ clf_instructions = [instruction for instruction in unique_instructions if task_clf_types[instruction]]
245
+ print(f"Classification instructions: {len(clf_instructions)}")
246
+ non_clf_instructions = [instruction for instruction in unique_instructions if not task_clf_types[instruction]]
247
+ print(f"Non-classification instructions: {len(non_clf_instructions)}")
248
+
249
+ if args.num_instructions is not None:
250
+ print(f"Sampling {args.num_instructions} instructions")
251
+ sampled_instructions = random.sample(unique_instructions, args.num_instructions)
252
+ training_instances = [it for it in training_instances if it[0] in sampled_instructions]
253
+ print(f"Only using {len(training_instances)} instances for these sampled instructions.")
254
+ with open(os.path.join(args.output_dir, f"sampled_generated_instances_{args.num_instructions}.jsonl"), "w") as fout:
255
+ for instance in training_instances:
256
+ fout.write(json.dumps({
257
+ "instruction": instance[0],
258
+ "input": instance[1],
259
+ "output": instance[2],
260
+ }) + "\n")
261
+
262
+ if args.include_seed_tasks:
263
+ seed_tasks = [json.loads(l) for l in open(args.seed_tasks_path, "r")]
264
+ for task in seed_tasks:
265
+ for instance in task["instances"]:
266
+ training_instances.append((task["instruction"], instance["input"], instance["output"]))
267
+ print(f"Included {len(seed_tasks)} seed tasks")
268
+
269
+ # get the prompt and completion for training gpt3
270
+ gpt3_instances = []
271
+ for instance in training_instances:
272
+ # get input and do preprocessing
273
+ inst_input = instance[1]
274
+ # for some tasks, we check whether the input contains colon, and if so, we remove the part before the colon
275
+ if random.random() < 0.5:
276
+ colon_words = re.findall(r"(\w+):", inst_input)
277
+ # if only one colon is found, we assume the instance only have one input and we remove the field name before the colon
278
+ if len(set(colon_words)) == 1:
279
+ inst_input = inst_input.split(":", 1)[1].strip()
280
+ else:
281
+ inst_input = inst_input.strip()
282
+ # we also replace two consecutive new lines with one new line half of the time
283
+ inst_input = inst_input.replace("\n\n", "\n")
284
+
285
+ gpt3_instances.append(encode_instance(instance[0], inst_input, instance[2]))
286
+
287
+ # remove duplicates
288
+ filtered_instances = []
289
+ prompt_completion_set = set()
290
+ for instance in gpt3_instances:
291
+ instance_pair = (instance["prompt"], instance["completion"])
292
+ if instance_pair not in prompt_completion_set:
293
+ prompt_completion_set.add((instance["prompt"], instance["completion"]))
294
+ filtered_instances.append(instance)
295
+ gpt3_instances = filtered_instances
296
+
297
+ # shuffle
298
+ random.shuffle(gpt3_instances)
299
+ with open(os.path.join(args.output_dir, f"gpt3_finetuning_data_{len(gpt3_instances)}.jsonl"), "w") as fout:
300
+ for instance in gpt3_instances:
301
+ fout.write(json.dumps({
302
+ "prompt": instance["prompt"],
303
+ "completion": instance["completion"],
304
+ }) + "\n")
selfinstruct/simple_instance.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import tqdm
5
+ import re
6
+ import argparse
7
+ import pandas as pd
8
+ from collections import OrderedDict
9
+ from gpt3_api import make_requests as make_gpt3_requests
10
+ from templates.instance_gen_template import output_first_template_for_clf, input_first_template_for_gen
11
+
12
+ random.seed(42)
13
+
14
def parse_args(argv=None):
    """Parse command-line options for machine instance generation.

    Args:
        argv: optional list of argument strings. Defaults to None, in which
            case argparse falls back to sys.argv[1:], so existing callers
            (``parse_args()``) are unaffected; passing an explicit list makes
            the function unit-testable.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_dir",
        type=str,
        required=True,
        help="The directory where the batch is stored.",
    )
    parser.add_argument(
        "--input_file",
        type=str,
        default="machine_generated_instructions.jsonl"
    )
    parser.add_argument(
        "--output_file",
        type=str,
        default="machine_generated_instances.jsonl",
    )
    parser.add_argument(
        "--num_instructions",
        type=int,
        help="if specified, only generate instance input for this many instructions",
    )
    parser.add_argument(
        "--max_instances_to_generate",
        type=int,
        default=5,
        help="The max number of instances to generate for each instruction.",
    )
    parser.add_argument(
        "--generation_tasks_only",
        action="store_true",
        help="If specified, only do for generation tasks.",
    )
    parser.add_argument(
        "--classification_tasks_only",
        action="store_true",
        help="If specified, only do for classification tasks.",
    )
    parser.add_argument(
        "--engine",
        type=str,
        default="davinci",
        help="The engine to use."
    )
    parser.add_argument(
        "--request_batch_size",
        type=int,
        default=5,
        help="The number of requests to send in a batch."
    )
    parser.add_argument(
        "--api_key",
        type=str,
        help="The API key to use. If not specified, the key will be read from the environment variable OPENAI_API_KEY."
    )
    parser.add_argument(
        "--organization",
        type=str,
        help="The organization to use. If not specified, the default organization id will be used."
    )
    return parser.parse_args(argv)
76
+
77
+
78
if __name__ == '__main__':
    # Script entry point: for each machine-generated instruction, query GPT-3
    # for example instances, resuming from any previously written output file.
    args = parse_args()

    # load instructions (one JSON object per line), optionally truncated
    with open(os.path.join(args.batch_dir, args.input_file)) as fin:
        lines = fin.readlines()
        if args.num_instructions is not None:
            lines = lines[:args.num_instructions]
    tasks = []
    for line in lines:
        data = json.loads(line)
        # older files stored request metadata under "metadata"; normalize the key
        if "metadata" in data:
            data["instruction_metadata"] = data["metadata"]
            del data["metadata"]
        tasks.append(data)

    # map instruction -> bool: was it judged a classification task?
    # NOTE(review): the judgments filename is hard-coded to the davinci
    # template-1 output — confirm it matches the upstream classification step.
    task_clf_types = {}
    with open(os.path.join(args.batch_dir, "is_clf_or_not_davinci_template_1.jsonl")) as fin:
        for line in fin:
            data = json.loads(line)
            task_clf_types[data["instruction"]] = data["is_classification"].strip() in ["Yes", "yes", "YES"]

    if args.classification_tasks_only:
        tasks = [task for task in tasks if task_clf_types[task["instruction"]]]

    if args.generation_tasks_only:
        tasks = [task for task in tasks if not task_clf_types[task["instruction"]]]

    # reload earlier results so already-processed instructions are not re-billed
    output_path = os.path.join(args.batch_dir, args.output_file)
    existing_requests = {}
    if os.path.exists(output_path):
        with open(output_path) as fin:
            for line in tqdm.tqdm(fin):
                try:
                    data = json.loads(line)
                    existing_requests[data["instruction"]] = data
                # NOTE(review): bare except silently skips malformed lines —
                # narrowing to json.JSONDecodeError/KeyError would avoid
                # masking unrelated errors (e.g. KeyboardInterrupt).
                except:
                    pass
        print(f"Loaded {len(existing_requests)} existing requests")

    progress_bar = tqdm.tqdm(total=len(tasks))
    # the output file is rewritten from scratch; cached entries are re-emitted
    with open(output_path, "w") as fout:
        for batch_idx in range(0, len(tasks), args.request_batch_size):
            batch = tasks[batch_idx: batch_idx + args.request_batch_size]
            if all(d["instruction"] in existing_requests for d in batch):
                # every instruction in this batch is cached: copy results through
                for d in batch:
                    data = existing_requests[d["instruction"]]
                    # keep only the stable output fields, in a fixed key order
                    data = OrderedDict(
                        (k, data[k]) for k in \
                            ["instruction", "raw_instances", "instance_metadata"]
                        )
                    fout.write(json.dumps(data, ensure_ascii=False) + "\n")
            else:
                # build one prompt per task; classification tasks use the
                # output-first template, generation tasks the input-first one
                prompts = []
                for task in batch:
                    if task_clf_types[task["instruction"]]:
                        prompt = output_first_template_for_clf + " " + task["instruction"].strip() + "\n"
                        prompts.append(prompt)
                    else:
                        prompt = input_first_template_for_gen + " " + task["instruction"].strip() + "\n"
                        prompts.append(prompt)
                print("prompts", prompts)
                results = make_gpt3_requests(
                    engine=args.engine,
                    prompts=prompts,
                    # because the clf template is longer, we need to decrease the max_tokens
                    max_tokens=300 if any(task_clf_types[task["instruction"]] for task in batch) else 350,
                    temperature=0,
                    top_p=0,
                    frequency_penalty=0,
                    presence_penalty=1.5,
                    # stop once the model starts an extra example or a new task
                    stop_sequences=[f"Example {args.max_instances_to_generate + 1}", "Task:"],
                    logprobs=1,
                    n=1,
                    best_of=1,
                    api_key=args.api_key,
                    organization=args.organization)
                for i in range(len(batch)):
                    data = batch[i]
                    data["instance_metadata"] = results[i]
                    # a None response means the request failed; store empty text
                    if results[i]["response"] is not None:
                        data["raw_instances"] = results[i]["response"]["choices"][0]["text"]
                    else:
                        data["raw_instances"] = ""
                    data = OrderedDict(
                        (k, data[k]) for k in \
                            ["instruction", "raw_instances", "instance_metadata"]
                        )
                    fout.write(json.dumps(data, ensure_ascii=False) + "\n")
            progress_bar.update(len(batch))
selfinstruct/templates/clf_task_template.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Few-shot prompt asking the model whether a task is a classification task
# (i.e. one with a finite set of output labels). Ends with a dangling "Task:"
# — presumably the caller appends the instruction to be judged and reads the
# model's "Is it classification? Yes/No" continuation; confirm against the
# classification step that consumes this template.
template_1 = '''Can the following task be regarded as a classification task with finite output labels?

Task: Given my personality and the job, tell me if I would be suitable.
Is it classification? Yes

Task: Give me an example of a time when you had to use your sense of humor.
Is it classification? No

Task: Replace the placeholders in the given text with appropriate named entities.
Is it classification? No

Task: Fact checking - tell me if the statement is true, false, or unknown, based on your knowledge and common sense.
Is it classification? Yes

Task: Return the SSN number for the person.
Is it classification? No

Task: Detect if the Reddit thread contains hate speech.
Is it classification? Yes

Task: Analyze the sentences below to identify biases.
Is it classification? No

Task: Select the longest sentence in terms of the number of words in the paragraph, output the sentence index.
Is it classification? Yes

Task: Find out the toxic word or phrase in the sentence.
Is it classification? No

Task: Rank these countries by their population.
Is it classification? No

Task: You are provided with a news article, and you need to identify all the categories that this article belongs to. Possible categories include: Music, Sports, Politics, Tech, Finance, Basketball, Soccer, Tennis, Entertainment, Digital Game, World News. Output its categories one by one, seperated by comma.
Is it classification? Yes

Task: Given the name of an exercise, explain how to do it.
Is it classification? No

Task: Select the oldest person from the list.
Is it classification? Yes

Task: Find the four smallest perfect numbers.
Is it classification? No

Task: Does the information in the document supports the claim? You can answer "Support" or "Unsupport".
Is it classification? Yes

Task: Create a detailed budget for the given hypothetical trip.
Is it classification? No

Task: Given a sentence, detect if there is any potential stereotype in it. If so, you should explain the stereotype. Else, output no.
Is it classification? No

Task: Explain the following idiom to me, and try to give me some examples.
Is it classification? No

Task: Is there anything I can eat for a breakfast that doesn't include eggs, yet includes protein, and has roughly 700-1000 calories?
Is it classification? No

Task: Answer the following multiple choice question. Select A, B, C, or D for the final answer.
Is it classification? Yes

Task: Decide whether the syllogism is logically sound.
Is it classification? Yes

Task: How can individuals and organizations reduce unconscious bias?
Is it classification? No

Task: What are some things you can do to de-stress?
Is it classification? No

Task: Find out the largest one from a set of numbers. Output the number directly.
Is it classification? Yes

Task: Replace the <mask> token in the text with proper words that are consistent with the context. You can use multiple words for each <mask> token.
Is it classification? No

Task: Write a cover letter based on the given facts.
Is it classification? No

Task: Identify the pos tag of the word in the given sentence.
Is it classification? Yes

Task: Write a program to compute the sum of integers from k to n.
Is it classification? No

Task: In this task, you need to compare the meaning of the two sentences and tell if they are the same. Output yes or no.
Is it classification? Yes

Task: To make the pairs have the same analogy, write the fourth word.
Is it classification? No

Task: Given a set of numbers, find all possible subsets that sum to a given number.
Is it classification? No

Task:'''
selfinstruct/templates/instance_gen_template.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Output-first few-shot prompt for *classification* tasks: for a given task
# definition the model generates a "Class label:" line first, then the input
# that would receive that label. simple_instance.py appends the target
# instruction after the trailing "Task:". The string is part of the model
# prompt — do not reformat or "fix" its wording.
output_first_template_for_clf = '''Given the classification task definition and the class labels, generate an input that corresponds to each of the class labels. If the task doesn't require input, just generate possible class labels.

Task: Classify the sentiment of the sentence into positive, negative, or mixed.
Class label: mixed
Sentence: I enjoy the flavor of the restaurant but their service is too slow.
Class label: Positive
Sentence: I had a great day today. The weather was beautiful and I spent time with friends and family.
Class label: Negative
Sentence: I was really disappointed by the latest superhero movie. I would not recommend it to anyone.

Task: Given a dialogue, classify whether the user is satisfied with the service. You should respond with "Satisfied" or "Unsatisfied".
Class label: Satisfied
Dialogue:
- Agent: Thank you for your feedback. We will work to improve our service in the future.
- Customer: I am happy with the service you provided. Thank you for your help.
Class label: Unsatisfied
Dialogue:
- Agent: I am sorry we will cancel that order for you, and you will get a refund within 7 business days.
- Customer: oh that takes too long. I want you to take quicker action on this.

Task: Given some political opinions, classify whether the person belongs to Democrats or Republicans.
Class label: Democrats
Opinion: I believe that everyone should have access to quality healthcare regardless of their income level.
Class label: Republicans
Opinion: I believe that people should be able to keep more of their hard-earned money and should not be taxed at high rates.

Task: Tell me if the following email is a promotion email or not.
Class label: Promotion
Email: Check out our amazing new sale! We've got discounts on all of your favorite products.
Class label: Not Promotion
Email: We hope you are doing well. Let us know if you need any help.

Task: Detect if the Reddit thread contains hate speech.
Class label: Hate Speech
Thread: All people of color are stupid and should not be allowed to vote.
Class label: Not Hate Speech
Thread: The best way to cook a steak on the grill.

Task: Does the information in the document supports the claim? You can answer "Support" or "Unsupport".
Class label: Unsupport
Document: After a record-breaking run that saw mortgage rates plunge to all-time lows and home prices soar to new highs, the U.S. housing market finally is slowing. While demand and price gains are cooling, any correction is likely to be a modest one, housing economists and analysts say. No one expects price drops on the scale of the declines experienced during the Great Recession.
Claim: The US housing market is going to crash soon.
Class label: Support
Document: The U.S. housing market is showing signs of strain, with home sales and prices slowing in many areas. Mortgage rates have risen sharply in recent months, and the number of homes for sale is increasing. This could be the beginning of a larger downturn, with some economists predicting a potential housing crash in the near future.
Claim: The US housing market is going to crash soon.

Task: Answer the following multiple-choice question. Select A, B, C, or D for the final answer.
Class label: C
Question: What is the capital of Germany?
A. London
B. Paris
C. Berlin
D. Rome
Class label: D
Question: What is the largest planet in our solar system?
A) Earth
B) Saturn
C) Mars
D) Jupiter
Class label: A
Question: What is the process by which plants make their own food through photosynthesis?
A) Respiration
B) Fermentation
C) Digestion
D) Metabolism
Class label: B
Question: Who wrote the novel "The Great Gatsby"?
A) Ernest Hemingway
B) F. Scott Fitzgerald
C) J.D. Salinger
D) Mark Twain

Task: You need to read a code and detect if there is a syntax error or not. Output true if there is an error, output false if there is not.
Class label: true
Code:
def quick_sort(arr):
    if len(arr) < 2
        return arr
Class label: False
Code:
def calculate_average(numbers):
    total = 0
    for number in numbers:
        total += number
    return total / len(numbers)

Task: You are provided with a news article, and you need to identify all the categories that this article belongs to. Possible categories include Sports and Politics. Output its categories one by one, separated by a comma.
Class label: Sports
Article: The Golden State Warriors have won the NBA championship for the second year in a row.
Class label: Politics
Article: The United States has withdrawn from the Paris Climate Agreement.
Class label: Politics, Sports
Article: The government has proposed cutting funding for youth sports programs.

Task: Given a credit card statement, the cardholder's spending habits, and the account balance, classify whether the cardholder is at risk of defaulting on their payments or not.
Class label: At risk
Credit card statement: Purchases at high-end clothing stores and luxury hotels.
Cardholder's spending habits: Frequent purchases at luxury brands and high-end establishments.
Account balance: Over the credit limit and multiple missed payments.
Class label: Not at risk
Credit card statement: Purchases at grocery stores and gas stations.
Cardholder's spending habits: Regular purchases for necessary expenses and occasional dining out.
Account balance: Slightly below the credit limit and no missed payments.

Task: Given a social media post, the hashtags used, and a topic. classify whether the post is relevant to the topic or not.
Class label: Relevant
Post: I can't believe the government is still not taking action on climate change. It's time for us to take matters into our own hands.
Hashtags: #climatechange #actnow
Topic: Climate change
Class label: Not relevant
Post: I just bought the new iPhone and it is amazing!
Hashtags: #apple #technology
Topic: Travel

Task: The answer will be 'yes' if the provided sentence contains an explicit mention that answers the given question. Otherwise, answer 'no'.
Class label: Yes
Sentence: Jack played basketball for an hour after school.
Question: How long did Jack play basketball?
Class label: No
Sentence: The leaders of the Department of Homeland Security now appear before 88 committees and subcommittees of Congress.
Question: How often are they required to appear?

Task: Tell me what's the second largest city by population in Canada.
Class label: Montreal

Task: Classifying different types of mathematical equations, such as linear, and quadratic equations, based on the coefficients and terms in the equation.
Class label: Linear equation
Equation: y = 2x + 5
Class label: Quadratic equation
Equation: y = x^2 - 4x + 3

Task: Tell me the first number of the given list.
Class label: 1
List: 1, 2, 3
Class label: 2
List: 2, 9, 10

Task: Which of the following is not an input type? (a) number (b) date (c) phone number (d) email address (e) all of these are valid inputs.
Class label: (e)

Task:'''
142
+
143
# Input-first few-shot prompt for *generation* (non-classification) tasks:
# the model produces "Example N" sections with Input/Output pairs (or a bare
# Output when no input is needed). simple_instance.py appends the target
# instruction after the trailing "Task:". The string — including its quirks,
# e.g. "Confucious" — is part of the model prompt; do not edit its content.
input_first_template_for_gen = '''Come up with examples for the following tasks. Try to generate multiple examples when possible. If the task doesn't require additional input, you can generate the output directly.

Task: Which exercises are best for reducing belly fat at home?
Output:
- Lying Leg Raises
- Leg In And Out
- Plank
- Side Plank
- Sit-ups

Task: Extract all the country names in the paragraph, list them separated by commas.
Example 1
Paragraph: Dr. No is the sixth novel by the English author Ian Fleming to feature his British Secret Service agent James Bond. Written at Fleming's Goldeneye estate in Jamaica, it was first published in the United Kingdom by Jonathan Cape in 1958. In the novel Bond looks into the disappearance in Jamaica of two fellow MI6 operatives who had been investigating Doctor No. Bond travels to No's Caribbean island and meets Honeychile Rider, who is there to collect shells. They are captured and taken to a luxurious facility carved into a mountain. The character of Doctor No, the son of a German missionary and a Chinese woman, was influenced by Sax Rohmer's Fu Manchu stories. Dr. No was the first of Fleming's novels to face widespread negative reviews in Britain, but it was received more favourably in the United States.
Output: English, British, Jamaica, the United Kingdom, German, Chinese, Britain, the United States.

Task: Converting 85 F to Celsius.
Output: 85°F = 29.44°C

Task: Sort the given list ascendingly.
Example 1
List: [10, 92, 2, 5, -4, 92, 5, 101]
Output: [-4, 2, 5, 5, 10, 92, 92, 101]
Example 2
Input 2 - List: [9.99, 10, -5, -1000, 5e6, 999]
Output: [-1000, -5, 9.99, 10, 999, 5e6]

Task: Suggest a better and more professional rephrasing of the following sentence.
Example 1
Sentence: This house is surprisingly not constructed very well, and you probably need more money to fix it after you buy it. If you ask me, I would suggest you to consider other candidates.
Output: This house does not seem to be constructed well, so you may need to spend more money to fix it after you purchase it. I would suggest that you look at other properties.
Example 2
Sentence: Just so you know, we did an experiment last week and found really surprising results - language model can improve itself!
Output: Our experiments last week demonstrated surprising results, proving that the language model can improve itself.

Task: Read the following paragraph and answer a math question about the paragraph. You need to write out the calculation for getting the final answer.
Example 1
Paragraph: Gun violence in the United States results in tens of thousands of deaths and injuries annually, and was the leading cause of death for children 19 and younger in 2020. In 2018, the most recent year for which data are available as of 2021, the Centers for Disease Control and Prevention's (CDC) National Center for Health Statistics reports 38,390 deaths by firearm, of which 24,432 were by suicide. The rate of firearm deaths per 100,000 people rose from 10.3 per 100,000 in 1999 to 12 per 100,000 in 2017, with 109 people dying per day or about 14,542 homicides in total, being 11.9 per 100,000 in 2018. In 2010, there were 19,392 firearm-related suicides, and 11,078 firearm-related homicides in the U.S. In 2010, 358 murders were reported involving a rifle while 6,009 were reported involving a handgun; another 1,939 were reported with an unspecified type of firearm. In 2011, a total of 478,400 fatal and nonfatal violent crimes were committed with a firearm.
Question: How many more firearm-related deaths were there in 2018 compared to 2010?
Output:
38390 - (19392 + 11078) = 38390 - 30470 = 7920.
So, in 2018, there were 7920 more deaths by firearm than in 2010.

Task: Write Python code to solve this leetcode problem.
Example 1
Problem: You are given two non-empty linked lists representing two non-negative integers. The digits are stored in reverse order, and each of their nodes contains a single digit. Add the two numbers and return the sum as a linked list. You may assume the two numbers do not contain any leading zero, except the number 0 itself.
Output:
class Solution(object):
    def addTwoNumbers(self, l1, l2):
        carry = 0
        root = n = ListNode(0)
        while l1 or l2 or carry:
            v1 = v2 = 0
            if l1:
                v1 = l1.val
                l1 = l1.next
            if l2:
                v2 = l2.val
                l2 = l2.next
            carry, val = divmod(v1+v2+carry, 10)
            n.next = ListNode(val)
            n = n.next
        return root.next

Task: Solve the equation and find the value of X. Show your steps.
Example 1
Equation: 10X + 5 = 10
Output: 10X = 5, X = 0.5
Example 2
Equation: X + Y + 120 = 100
Output: X + Y = -20, X = -20 - Y

Task: Write a program to compute the sum of integers from k to n.
Output:
def sum(k, n):
    sum = 0
    for i in range(k, n+1):
        sum += i
    return sum

Task: Select the oldest person from the given list.
Example 1
List: George Washington, Confucius, Michael Jordan, Michelangelo
Output: Confucious
Example 2
List: Alan Turing, Geoffrey Hinton, Yann LeCun, Yoshua Bengio
Output: Alan Turing

Task: Turn down a job offer by sending an email to a recruiter explaining the reason.
Output: Hi [Recruiter],
Thank you so much for the generous offer to join your team. As we discussed, I’ve admired the company for a number of years, and am a proud endorser of its products. However, after further consideration of where I currently am in my career, I’ve decided to accept an offer at another company.
I would love to stay in touch with you and have already started following you on [Social Media Platform]. Again, thank you so much for your time and consideration.
Thanks again,
[Your Name]

Task:'''