"""Load the LLMSQL 1.0 (pickle) and LLMSQL 2.0 (JSONL) data into module-level dicts."""

import json

import pandas as pd


def load_jsonl(path: str) -> list[dict]:
    """Load a JSONL file into a list of dicts.

    Lines that are empty or contain only whitespace are skipped; every other
    line is parsed with ``json.loads``.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        The parsed objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


## LLMSQL 1.0
# NOTE(security): pd.read_pickle unpickles arbitrary objects and can execute
# code embedded in the file. Only point these paths at trusted files.
tables_llmsql_1 = pd.read_pickle("llmsql_1.0/llmsql_tables.pkl")
questions_splits_llmsql_1 = pd.read_pickle("llmsql_1.0/llmsql_formatted.pkl")

# split name -> {"first": smallest id, "last": largest id, "count": number of ids},
# where ids are ordered by their integer value.
split_info_llmsql_1 = {}
# int(question id) -> question payload, merged across all splits. If the same
# id occurs in several splits, the split iterated last wins.
questions_llmsql_1 = {}

# NOTE(review): the type of the pickled object is not visible here, so the
# explicit .keys()/.items() calls are kept instead of bare iteration.
for split_name in questions_splits_llmsql_1.keys():
    split_questions = questions_splits_llmsql_1[split_name]
    # Ids are ordered numerically, not lexicographically.
    sorted_keys = sorted(split_questions.keys(), key=int)
    # An empty split raises IndexError here.
    split_info_llmsql_1[split_name] = {
        "first": sorted_keys[0],
        "last": sorted_keys[-1],
        "count": len(sorted_keys),
    }
    for question_id, items in split_questions.items():
        questions_llmsql_1[int(question_id)] = items

# Backward-compatible name: the script used to build `split_info` by loading
# and processing the same pickles a second time. Copy (rather than alias) so
# mutating one structure still cannot affect the other.
split_info = {name: dict(info) for name, info in split_info_llmsql_1.items()}

## LLMSQL 2.0
tables_list_llmsql_2 = load_jsonl("llmsql_2.0/tables.jsonl")
questions_list_llmsql_2 = load_jsonl("llmsql_2.0/questions.jsonl")

# question_id -> question record; on duplicate ids the last record wins.
questions_llmsql_2 = {
    question["question_id"]: question for question in questions_list_llmsql_2
}

# table_id -> table record; on duplicate ids the last record wins.
tables_llmsql_2 = {table["table_id"]: table for table in tables_list_llmsql_2}