# llmsql-interactive-q-a / dataset_generator.py
# Uploaded by: pihull
# Commit: "Add dataset files using LFS" (bc3f5f2)
import json
import pandas as pd
def load_jsonl(path: str) -> list[dict]:
    """Read *path* as JSON Lines and return one dict per non-blank line.

    Blank (whitespace-only) lines are skipped; every other line must be a
    complete JSON object.
    """
    records: list[dict] = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            if raw_line.strip():
                records.append(json.loads(raw_line))
    return records
## LLMSQL 1.0
# The 1.0 release ships as two pickled objects:
#   llmsql_tables.pkl    -> table metadata
#   llmsql_formatted.pkl -> {split_name: {question_id (str): question item}}
tables_llmsql_1 = pd.read_pickle("llmsql_1.0/llmsql_tables.pkl")
questions_splits_llmsql_1 = pd.read_pickle("llmsql_1.0/llmsql_formatted.pkl")

# Build, per split, a summary of the numeric id range plus a flat
# {int(question_id): item} index across all splits.
split_info_llmsql_1 = dict()
questions_llmsql_1 = dict()
for split_name, split_items in questions_splits_llmsql_1.items():
    sorted_keys = sorted(split_items.keys(), key=int)
    split_info_llmsql_1[split_name] = {
        "first": sorted_keys[0],
        "last": sorted_keys[-1],
        "count": len(sorted_keys),
    }
    for question_id, items in split_items.items():
        questions_llmsql_1[int(question_id)] = items

# NOTE(review): the original script repeated the identical load-and-index
# pass a second time, filling a dict named `split_info` with the same
# contents. Keep that name as a copy so any downstream reader of either
# variable still works, without re-reading the pickles.
split_info = dict(split_info_llmsql_1)
## LLMSQL 2.0
# The 2.0 release ships as JSON Lines files; index each record list by its
# id field for O(1) lookup.
tables_list_llmsql_2 = load_jsonl("llmsql_2.0/tables.jsonl")
questions_list_llmsql_2 = load_jsonl("llmsql_2.0/questions.jsonl")
questions_llmsql_2 = {
    question["question_id"]: question for question in questions_list_llmsql_2
}
tables_llmsql_2 = {table["table_id"]: table for table in tables_list_llmsql_2}