| import os |
| import string |
| import sqlite3 |
| import nltk.corpus |
| import re |
| |
|
|
| STOPWORDS = set(nltk.corpus.stopwords.words('english')) |
| PUNKS = set(a for a in string.punctuation) |
|
|
|
|
| |
| def compute_schema_linking(question, column, table): |
|
|
| def partial_match(x_list, y_list): |
| x_str = " ".join(x_list) |
| y_str = " ".join(y_list) |
| if x_str in STOPWORDS or x_str in PUNKS: |
| return False |
| if re.match(rf"\b{re.escape(x_str)}\b", y_str): |
| assert x_str in y_str |
| return True |
| else: |
| return False |
|
|
| def exact_match(x_list, y_list): |
| x_str = " ".join(x_list) |
| y_str = " ".join(y_list) |
| if x_str == y_str: |
| return True |
| else: |
| return False |
|
|
| q_col_match = dict() |
| q_tab_match = dict() |
|
|
| col_id2list = dict() |
| for col_id, col_item in enumerate(column): |
| if col_id == 0: |
| continue |
| col_id2list[col_id] = col_item |
| |
| tab_id2list = dict() |
| for tab_id, tab_item in enumerate(table): |
| tab_id2list[tab_id] = tab_item |
|
|
| |
| n = 5 |
| while n > 0: |
| for i in range(len(question) - n + 1): |
| n_gram_list = question[i:i + n] |
| n_gram = " ".join(n_gram_list) |
| if len(n_gram.strip()) == 0: |
| continue |
| |
| for col_id in col_id2list: |
| if exact_match(n_gram_list, col_id2list[col_id]): |
| for q_id in range(i, i+n): |
| q_col_match[f"{q_id},{col_id}"] = "CEM" |
| for tab_id in tab_id2list: |
| if exact_match(n_gram_list, tab_id2list[tab_id]): |
| for q_id in range(i, i+n): |
| q_tab_match[f"{q_id},{tab_id}"] = "TEM" |
|
|
| |
| for col_id in col_id2list: |
| if partial_match(n_gram_list, col_id2list[col_id]): |
| for q_id in range(i, i+n): |
| if f"{q_id},{col_id}" not in q_col_match: |
| q_col_match[f"{q_id},{col_id}"] = "CPM" |
| for tab_id in tab_id2list: |
| if partial_match(n_gram_list, tab_id2list[tab_id]): |
| for q_id in range(i, i+n): |
| if f"{q_id},{tab_id}" not in q_tab_match: |
| q_tab_match[f"{q_id},{tab_id}"] = "TPM" |
| n -= 1 |
| return {"q_col_match": q_col_match, "q_tab_match": q_tab_match } |
|
|
|
|
| def compute_cell_value_linking(tokens, schema, db_dir): |
|
|
| def isnumber(word): |
| try: |
| float(word) |
| return True |
| except: |
| return False |
|
|
| def db_word_match(word, column, table, db_path): |
| conn = sqlite3.connect(db_path) |
| cursor = conn.cursor() |
|
|
| p_str = f"select {column} from {table} where {column} like '{word} %' or {column} like '% {word}' or {column} like '% {word} %' or {column} like '{word}'" |
| try: |
| cursor.execute(p_str) |
| p_res = cursor.fetchall() |
| if len(p_res) == 0: |
| return False |
| else: |
| return p_res |
| except: |
| return False |
|
|
| db_name = schema.db_id |
| db_path = os.path.join(db_dir, db_name, db_name + '.sqlite') |
|
|
| num_date_match = {} |
| cell_match = {} |
|
|
| for q_id, word in enumerate(tokens): |
| if len(word.strip()) == 0: |
| continue |
| if word in STOPWORDS or word in PUNKS: |
| continue |
|
|
| num_flag = isnumber(word) |
|
|
| for col_id, column in enumerate(schema.columns): |
| if col_id == 0: |
| assert column.orig_name == "*" |
| continue |
| |
| |
| if num_flag: |
| if column.type in ["number", "time"]: |
| num_date_match[f"{q_id},{col_id}"] = column.type.upper() |
| else: |
| ret = db_word_match(word, column.orig_name, column.table.orig_name, db_path) |
| if ret: |
| |
| cell_match[f"{q_id},{col_id}"] = "CELLMATCH" |
|
|
| cv_link = {"num_date_match": num_date_match, "cell_match" : cell_match, "normalized_token": tokens} |
| return cv_link |