Spaces:
Paused
Paused
Commit ·
305d669
1
Parent(s): 749f953
update
Browse files
- app.py +0 -66
- schema_item_filter.py +23 -12
app.py
CHANGED
|
@@ -1,69 +1,3 @@
|
|
| 1 |
-
# import streamlit as st
|
| 2 |
-
# from text2sql import ChatBot
|
| 3 |
-
# from langdetect import detect
|
| 4 |
-
# from utils.translate_utils import translate_zh_to_en
|
| 5 |
-
# from utils.db_utils import add_a_record
|
| 6 |
-
# from langdetect.lang_detect_exception import LangDetectException
|
| 7 |
-
|
| 8 |
-
# # Initialize chatbot and other variables
|
| 9 |
-
# text2sql_bot = ChatBot()
|
| 10 |
-
# baidu_api_token = None
|
| 11 |
-
|
| 12 |
-
# # Define database schemas for demonstration
|
| 13 |
-
# db_schemas = {
|
| 14 |
-
# "singer": """
|
| 15 |
-
# CREATE TABLE "singer" (
|
| 16 |
-
# "Singer_ID" int,
|
| 17 |
-
# "Name" text,
|
| 18 |
-
# "Birth_Year" real,
|
| 19 |
-
# "Net_Worth_Millions" real,
|
| 20 |
-
# "Citizenship" text,
|
| 21 |
-
# PRIMARY KEY ("Singer_ID")
|
| 22 |
-
# );
|
| 23 |
-
|
| 24 |
-
# CREATE TABLE "song" (
|
| 25 |
-
# "Song_ID" int,
|
| 26 |
-
# "Title" text,
|
| 27 |
-
# "Singer_ID" int,
|
| 28 |
-
# "Sales" real,
|
| 29 |
-
# "Highest_Position" real,
|
| 30 |
-
# PRIMARY KEY ("Song_ID"),
|
| 31 |
-
# FOREIGN KEY ("Singer_ID") REFERENCES "singer"("Singer_ID")
|
| 32 |
-
# );
|
| 33 |
-
# """,
|
| 34 |
-
# # Add other schemas as needed
|
| 35 |
-
# }
|
| 36 |
-
|
| 37 |
-
# # Streamlit UI
|
| 38 |
-
# st.title("Text-to-SQL Chatbot")
|
| 39 |
-
# st.sidebar.header("Select a Database")
|
| 40 |
-
|
| 41 |
-
# # Sidebar for selecting a database
|
| 42 |
-
# selected_db = st.sidebar.selectbox("Choose a database:", list(db_schemas.keys()))
|
| 43 |
-
|
| 44 |
-
# # Display the selected schema
|
| 45 |
-
# st.sidebar.text_area("Database Schema", db_schemas[selected_db], height=600)
|
| 46 |
-
|
| 47 |
-
# # User input section
|
| 48 |
-
# question = st.text_input("Enter your question:")
|
| 49 |
-
# db_id = selected_db # Use selected database for DB ID
|
| 50 |
-
|
| 51 |
-
# if question:
|
| 52 |
-
# add_a_record(question, db_id)
|
| 53 |
-
|
| 54 |
-
# try:
|
| 55 |
-
# if baidu_api_token is not None and detect(question) != "en":
|
| 56 |
-
# print("Before translation:", question)
|
| 57 |
-
# question = translate_zh_to_en(question, baidu_api_token)
|
| 58 |
-
# print("After translation:", question)
|
| 59 |
-
# except LangDetectException as e:
|
| 60 |
-
# print("Language detection error:", str(e))
|
| 61 |
-
|
| 62 |
-
# predicted_sql = text2sql_bot.get_response(question, db_id)
|
| 63 |
-
# st.write(f"**Database:** {db_id}")
|
| 64 |
-
# st.write(f"**Predicted SQL query:** {predicted_sql}")
|
| 65 |
-
|
| 66 |
-
|
| 67 |
import streamlit as st
|
| 68 |
from text2sql import ChatBot
|
| 69 |
from transformers import (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from text2sql import ChatBot
|
| 3 |
from transformers import (
|
schema_item_filter.py
CHANGED
|
@@ -238,15 +238,25 @@ def lista_contains_listb(lista, listb):
|
|
| 238 |
class SchemaItemClassifierInference():
|
| 239 |
def __init__(self, model_save_path):
|
| 240 |
set_seed(42)
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
|
|
|
|
|
|
| 244 |
self.model = SchemaItemClassifier(model_save_path, "test")
|
| 245 |
self.model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
def predict_one(self, sample):
|
| 248 |
-
encoder_input_ids, encoder_input_attention_mask, column_name_token_indices,\
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
with torch.no_grad():
|
| 252 |
model_outputs = self.model(
|
|
@@ -258,15 +268,16 @@ class SchemaItemClassifierInference():
|
|
| 258 |
)
|
| 259 |
|
| 260 |
table_logits = model_outputs["batch_table_name_cls_logits"][0]
|
| 261 |
-
table_pred_probs = torch.nn.functional.softmax(table_logits, dim
|
| 262 |
-
|
| 263 |
column_logits = model_outputs["batch_column_info_cls_logits"][0]
|
| 264 |
-
column_pred_probs = torch.nn.functional.softmax(column_logits, dim
|
| 265 |
|
| 266 |
splitted_column_pred_probs = []
|
| 267 |
-
#
|
| 268 |
for table_id, column_num in enumerate(column_num_in_each_table):
|
| 269 |
-
splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]):
|
|
|
|
| 270 |
column_pred_probs = splitted_column_pred_probs
|
| 271 |
|
| 272 |
result_dict = dict()
|
|
@@ -329,9 +340,9 @@ class SchemaItemClassifierInference():
|
|
| 329 |
print(column_coverage_results)
|
| 330 |
|
| 331 |
if __name__ == "__main__":
|
| 332 |
-
dataset_name = "bird_with_evidence"
|
| 333 |
# dataset_name = "bird"
|
| 334 |
-
|
| 335 |
sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
|
| 336 |
import json
|
| 337 |
dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))
|
|
|
|
| 238 |
class SchemaItemClassifierInference():
|
| 239 |
def __init__(self, model_save_path):
|
| 240 |
set_seed(42)
|
| 241 |
+
|
| 242 |
+
# Load tokenizer from Hugging Face
|
| 243 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_save_path, add_prefix_space=True)
|
| 244 |
+
|
| 245 |
+
# Load the model from Hugging Face or local path
|
| 246 |
self.model = SchemaItemClassifier(model_save_path, "test")
|
| 247 |
self.model.eval()
|
| 248 |
+
|
| 249 |
+
# Move model to GPU if available, otherwise stay on CPU
|
| 250 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 251 |
+
self.model = self.model.to(self.device) # Move model to the selected device
|
| 252 |
|
| 253 |
def predict_one(self, sample):
|
| 254 |
+
encoder_input_ids, encoder_input_attention_mask, column_name_token_indices, \
|
| 255 |
+
table_name_token_indices, column_num_in_each_table = prepare_inputs_and_labels(sample, self.tokenizer)
|
| 256 |
+
|
| 257 |
+
# Ensure all tensors are moved to the same device as the model
|
| 258 |
+
encoder_input_ids = encoder_input_ids.to(self.device)
|
| 259 |
+
encoder_input_attention_mask = encoder_input_attention_mask.to(self.device)
|
| 260 |
|
| 261 |
with torch.no_grad():
|
| 262 |
model_outputs = self.model(
|
|
|
|
| 268 |
)
|
| 269 |
|
| 270 |
table_logits = model_outputs["batch_table_name_cls_logits"][0]
|
| 271 |
+
table_pred_probs = torch.nn.functional.softmax(table_logits, dim=1)[:, 1].cpu().tolist()
|
| 272 |
+
|
| 273 |
column_logits = model_outputs["batch_column_info_cls_logits"][0]
|
| 274 |
+
column_pred_probs = torch.nn.functional.softmax(column_logits, dim=1)[:, 1].cpu().tolist()
|
| 275 |
|
| 276 |
splitted_column_pred_probs = []
|
| 277 |
+
# Split predicted column probs into each table
|
| 278 |
for table_id, column_num in enumerate(column_num_in_each_table):
|
| 279 |
+
splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]):
|
| 280 |
+
sum(column_num_in_each_table[:table_id]) + column_num])
|
| 281 |
column_pred_probs = splitted_column_pred_probs
|
| 282 |
|
| 283 |
result_dict = dict()
|
|
|
|
| 340 |
print(column_coverage_results)
|
| 341 |
|
| 342 |
if __name__ == "__main__":
|
| 343 |
+
# dataset_name = "bird_with_evidence"
|
| 344 |
# dataset_name = "bird"
|
| 345 |
+
dataset_name = "spider"
|
| 346 |
sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
|
| 347 |
import json
|
| 348 |
dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))
|