| | import os |
| | import random |
| | import re |
| | import uuid |
| |
|
| | import pandas as pd |
| | from datasets import load_dataset |
| | from tqdm import tqdm |
| |
|
| |
|
def prepare_default_dataset_causal_language_modeling(path):
    """Build OASST2-based instruction-tuning parquet files under *path*.

    Downloads OpenAssistant/oasst2, joins each assistant message with its
    prompter parent to recover the instruction text, and writes four train
    files that differ only in filtering:

    - ``train_full.pq``: top-ranked (rank 0) English answers only
    - ``train_full_allrank.pq``: all English answers
    - ``train_full_multilang.pq``: top-ranked answers, all languages
    - ``train_full_multilang_allrank.pq``: everything

    Parameters
    ----------
    path : str
        Directory the parquet files are written into (must exist).

    Returns
    -------
    pd.DataFrame
        The rank-0 English subset with columns
        instruction / output / id / parent_id / lang / rank.
    """
    ds = load_dataset("OpenAssistant/oasst2")
    train = ds["train"].to_pandas()
    val = ds["validation"].to_pandas()

    df = pd.concat([train, val], axis=0).reset_index(drop=True)

    df_assistant = df[(df.role == "assistant")].copy()
    df_prompter = df[(df.role == "prompter")].copy()
    df_prompter = df_prompter.set_index("message_id")
    df_assistant["output"] = df_assistant["text"].values

    # Look up each assistant message's prompter parent: its text becomes the
    # instruction, and its own parent_id threads multi-turn conversations.
    inputs = []
    parent_ids = []
    for _, row in df_assistant.iterrows():
        parent = df_prompter.loc[row.parent_id]  # renamed: `input` shadowed the builtin
        inputs.append(parent.text)
        parent_ids.append(parent.parent_id)

    df_assistant["instruction"] = inputs
    df_assistant["parent_id"] = parent_ids

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id", "lang", "rank"]
    ].rename(columns={"message_id": "id"})

    # Hoist the shared column list and filter masks instead of repeating the
    # boolean expressions for each of the four exports.
    export_cols = ["instruction", "output", "id", "parent_id"]
    is_top_rank = df_assistant["rank"] == 0.0
    is_english = df_assistant["lang"] == "en"

    df_assistant[is_top_rank & is_english][export_cols].to_parquet(
        os.path.join(path, "train_full.pq"), index=False
    )

    df_assistant[is_english][export_cols].to_parquet(
        os.path.join(path, "train_full_allrank.pq"), index=False
    )

    df_assistant[is_top_rank][export_cols].to_parquet(
        os.path.join(path, "train_full_multilang.pq"), index=False
    )

    df_assistant[export_cols].to_parquet(
        os.path.join(path, "train_full_multilang_allrank.pq"), index=False
    )

    return df_assistant[is_top_rank & is_english]
| |
|
| |
|
def prepare_default_dataset_dpo_modeling() -> pd.DataFrame:
    """Return the Intel/orca_dpo_pairs train split as a pandas DataFrame."""
    return load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas()
| |
|
| |
|
def extract_anthropic_prompt(prompt_and_response):
    """Return the prompt prefix of *prompt_and_response*.

    The prefix runs up to and including the final ``"\\n\\nAssistant:"``
    marker; the string must contain that marker.
    """
    search_term = "\n\nAssistant:"
    marker_pos = prompt_and_response.rfind(search_term)
    assert (
        marker_pos != -1
    ), f"Prompt and response does not contain '{search_term}'"
    cut = marker_pos + len(search_term)
    return prompt_and_response[:cut]
| |
|
| |
|
| | def _parse_row(prompt_and_response): |
| | """Extract the anthropic prompt from a prompt and response pair.""" |
| | search_term = "\n\nAssistant:" |
| | search_term_idx = prompt_and_response["chosen"].rfind(search_term) |
| | assert ( |
| | search_term_idx != -1 |
| | ), f"Prompt and response does not contain '{search_term}'" |
| | prompt = prompt_and_response["chosen"][: search_term_idx + len(search_term)] |
| |
|
| | chosen_response = prompt_and_response["chosen"][len(prompt) :] |
| | rejected_response = prompt_and_response["rejected"][len(prompt) :] |
| |
|
| | return prompt, chosen_response, rejected_response |
| |
|
| |
|
| | def _split_up_prompt(prompt): |
| | human_texts = re.findall( |
| | r"\n\nHuman:(.*?)(?=(\n\nAssistant:|$))", prompt, flags=re.DOTALL |
| | ) |
| | assistant_texts = re.findall( |
| | r"\n\nAssistant:(.*?)(?=(\n\nHuman:|$))", prompt, flags=re.DOTALL |
| | ) |
| | human_texts = [text[0].strip() for text in human_texts] |
| | assistant_texts = [text[0].strip() for text in assistant_texts] |
| |
|
| | assert len(human_texts) == len(assistant_texts), prompt |
| | dialogue = list(zip(human_texts, assistant_texts)) |
| | return dialogue |
| |
|
| |
|
def prepare_hh_dpo_modeling(split: str) -> pd.DataFrame:
    """Load and flatten the Anthropic/hh-rlhf *split* for DPO training.

    Each conversation is expanded into one row per (human, assistant) turn,
    linked by deterministic UUIDs via ``parent_id``; the final turn carries
    the chosen/rejected response pair, earlier turns reuse their own
    assistant reply for both. Rows with an empty rejected response are
    dropped.

    Adapted from
    https://github.com/eric-mitchell/direct-preference-optimization/blob/main/preference_datasets.py
    """
    dataset = load_dataset("Anthropic/hh-rlhf", split=split)
    # Seeded RNG keeps the generated UUIDs reproducible across runs.
    rnd = random.Random()
    rnd.seed(123)
    frames = []
    for row in tqdm(dataset):
        prompt, chosen_response, rejected_response = _parse_row(row)
        if not rejected_response:
            continue

        turns = []
        parent_uuid = None
        for human_text, assistant_text in _split_up_prompt(prompt):
            turn_uuid = str(uuid.UUID(int=rnd.getrandbits(128), version=4))
            turns.append(
                [human_text, assistant_text, turn_uuid, parent_uuid, None, None]
            )
            parent_uuid = turn_uuid

        # Attach the preference pair to the conversation's final turn.
        turns[-1][-2] = chosen_response
        turns[-1][-1] = rejected_response
        frames.append(
            pd.DataFrame(
                turns,
                columns=[
                    "instruction",
                    "output",
                    "id",
                    "parent_id",
                    "chosen_response",
                    "rejected_response",
                ],
            )
        )
    df = pd.concat(frames).reset_index(drop=True)
    # Intermediate turns have no preference pair: fall back to their own reply.
    df["chosen_response"] = df["chosen_response"].fillna(df["output"])
    df["rejected_response"] = df["rejected_response"].fillna(df["output"])
    del df["output"]
    return df
| |
|