Spaces:

Roxanne-WANG
/

LangSQL

Paused

App Files Community

Roxanne-WANG committed on Apr 20, 2025

Commit

305d669

1 Parent(s): 749f953

update

Browse files

Files changed (2) hide show

app.py +0 -66
schema_item_filter.py +23 -12

app.py CHANGED Viewed

@@ -1,69 +1,3 @@
-# import streamlit as st
-# from text2sql import ChatBot
-# from langdetect import detect
-# from utils.translate_utils import translate_zh_to_en
-# from utils.db_utils import add_a_record
-# from langdetect.lang_detect_exception import LangDetectException
-# # Initialize chatbot and other variables
-# text2sql_bot = ChatBot()
-# baidu_api_token = None
-# # Define database schemas for demonstration
-# db_schemas = {
-#     "singer": """
-#     CREATE TABLE "singer" (
-#         "Singer_ID" int,
-#         "Name" text,
-#         "Birth_Year" real,
-#         "Net_Worth_Millions" real,
-#         "Citizenship" text,
-#         PRIMARY KEY ("Singer_ID")
-#     );
-#     CREATE TABLE "song" (
-#         "Song_ID" int,
-#         "Title" text,
-#         "Singer_ID" int,
-#         "Sales" real,
-#         "Highest_Position" real,
-#         PRIMARY KEY ("Song_ID"),
-#         FOREIGN KEY ("Singer_ID") REFERENCES "singer"("Singer_ID")
-#     );
-#     """,
-#     # Add other schemas as needed
-# }
-# # Streamlit UI
-# st.title("Text-to-SQL Chatbot")
-# st.sidebar.header("Select a Database")
-# # Sidebar for selecting a database
-# selected_db = st.sidebar.selectbox("Choose a database:", list(db_schemas.keys()))
-# # Display the selected schema
-# st.sidebar.text_area("Database Schema", db_schemas[selected_db], height=600)
-# # User input section
-# question = st.text_input("Enter your question:")
-# db_id = selected_db  # Use selected database for DB ID
-# if question:
-#     add_a_record(question, db_id)
-#     try:
-#         if baidu_api_token is not None and detect(question) != "en":
-#             print("Before translation:", question)
-#             question = translate_zh_to_en(question, baidu_api_token)
-#             print("After translation:", question)
-#     except LangDetectException as e:
-#         print("Language detection error:", str(e))
-#     predicted_sql = text2sql_bot.get_response(question, db_id)
-#     st.write(f"**Database:** {db_id}")
-#     st.write(f"**Predicted SQL query:** {predicted_sql}")
 import streamlit as st
 from text2sql import ChatBot
 from transformers import (

 import streamlit as st
 from text2sql import ChatBot
 from transformers import (

schema_item_filter.py CHANGED Viewed

@@ -238,15 +238,25 @@ def lista_contains_listb(lista, listb):
 class SchemaItemClassifierInference():
     def __init__(self, model_save_path):
         set_seed(42)
-        # load tokenizer from Hugging Face
-        self.tokenizer = AutoTokenizer.from_pretrained(model_save_path, add_prefix_space = True)
-        # load model directly from Hugging Face
         self.model = SchemaItemClassifier(model_save_path, "test")
         self.model.eval()
     def predict_one(self, sample):
-        encoder_input_ids, encoder_input_attention_mask, column_name_token_indices,\
-            table_name_token_indices, column_num_in_each_table = prepare_inputs_and_labels(sample, self.tokenizer)
         with torch.no_grad():
             model_outputs = self.model(
@@ -258,15 +268,16 @@ class SchemaItemClassifierInference():
             )
         table_logits = model_outputs["batch_table_name_cls_logits"][0]
-        table_pred_probs = torch.nn.functional.softmax(table_logits, dim = 1)[:, 1].cpu().tolist()
         column_logits = model_outputs["batch_column_info_cls_logits"][0]
-        column_pred_probs = torch.nn.functional.softmax(column_logits, dim = 1)[:, 1].cpu().tolist()
         splitted_column_pred_probs = []
-        # split predicted column probs into each table
         for table_id, column_num in enumerate(column_num_in_each_table):
-            splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]): sum(column_num_in_each_table[:table_id]) + column_num])
         column_pred_probs = splitted_column_pred_probs
         result_dict = dict()
@@ -329,9 +340,9 @@ class SchemaItemClassifierInference():
         print(column_coverage_results)
 if __name__ == "__main__":
-    dataset_name = "bird_with_evidence"
     # dataset_name = "bird"
-    # dataset_name = "spider"
     sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
     import json
     dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))

 class SchemaItemClassifierInference():
     def __init__(self, model_save_path):
         set_seed(42)
+        # Load tokenizer from Hugging Face
+        self.tokenizer = AutoTokenizer.from_pretrained(model_save_path, add_prefix_space=True)
+        # Load the model from Hugging Face or local path
         self.model = SchemaItemClassifier(model_save_path, "test")
         self.model.eval()
+        # Move model to GPU if available, otherwise stay on CPU
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = self.model.to(self.device)  # Move model to the selected device
     def predict_one(self, sample):
+        encoder_input_ids, encoder_input_attention_mask, column_name_token_indices, \
+        table_name_token_indices, column_num_in_each_table = prepare_inputs_and_labels(sample, self.tokenizer)
+        # Ensure all tensors are moved to the same device as the model
+        encoder_input_ids = encoder_input_ids.to(self.device)
+        encoder_input_attention_mask = encoder_input_attention_mask.to(self.device)
         with torch.no_grad():
             model_outputs = self.model(
             )
         table_logits = model_outputs["batch_table_name_cls_logits"][0]
+        table_pred_probs = torch.nn.functional.softmax(table_logits, dim=1)[:, 1].cpu().tolist()
         column_logits = model_outputs["batch_column_info_cls_logits"][0]
+        column_pred_probs = torch.nn.functional.softmax(column_logits, dim=1)[:, 1].cpu().tolist()
         splitted_column_pred_probs = []
+        # Split predicted column probs into each table
         for table_id, column_num in enumerate(column_num_in_each_table):
+            splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]):
+                                                                sum(column_num_in_each_table[:table_id]) + column_num])
         column_pred_probs = splitted_column_pred_probs
         result_dict = dict()
         print(column_coverage_results)
 if __name__ == "__main__":
+    # dataset_name = "bird_with_evidence"
     # dataset_name = "bird"
+    dataset_name = "spider"
     sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
     import json
     dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))