IRIS-FLOWER-CLASSIFICATION-using-machine-learning-models
/
transformers
/examples
/research_projects
/rag
/parse_dpr_relevance_data.py
| """ | |
| This script reads DPR retriever training data and parses each datapoint. We save a line per datapoint. | |
| Each line consists of the query followed by a tab-separated list of Wikipedia page titles constituting | |
| positive contexts for a given query. | |
| """ | |
| import argparse | |
| import json | |
| from tqdm import tqdm | |
| def main(): | |
| parser = argparse.ArgumentParser() | |
| # Required parameters | |
| parser.add_argument( | |
| "--src_path", | |
| type=str, | |
| default="biencoder-nq-dev.json", | |
| help="Path to raw DPR training data", | |
| ) | |
| parser.add_argument( | |
| "--evaluation_set", | |
| type=str, | |
| help="where to store parsed evaluation_set file", | |
| ) | |
| parser.add_argument( | |
| "--gold_data_path", | |
| type=str, | |
| help="where to store parsed gold_data_path file", | |
| ) | |
| args = parser.parse_args() | |
| with open(args.src_path, "r") as src_file, open(args.evaluation_set, "w") as eval_file, open( | |
| args.gold_data_path, "w" | |
| ) as gold_file: | |
| dpr_records = json.load(src_file) | |
| for dpr_record in tqdm(dpr_records): | |
| question = dpr_record["question"] | |
| contexts = [context["title"] for context in dpr_record["positive_ctxs"]] | |
| eval_file.write(question + "\n") | |
| gold_file.write("\t".join(contexts) + "\n") | |
| if __name__ == "__main__": | |
| main() | |