|
|
--- |
|
|
library_name: transformers |
|
|
tags: |
|
|
- text-to-SQL |
|
|
- SQL |
|
|
- code-generation |
|
|
- NLQ-to-SQL |
|
|
- text2SQL |
|
|
datasets: |
|
|
- salmane11/BanQies |
|
|
language: |
|
|
- en |
|
|
base_model: |
|
|
- bigcode/starcoder2-3b |
|
|
--- |
|
|
|
|
|
# BanQL-3B |
|
|
|
|
|
## Model Description |
|
|
|
|
|
BanQL is a family of Code LLMs dedicated solely for the text-to-SQL task in the Financial domain. |
|
|
|
|
|
The checkpoint included in this repository is based on [bigcode/starcoder2-3b](https://huggingface.co/bigcode/starcoder2-3b) and further finetuned on [BanQies](https://huggingface.co/datasets/salmane11/BanQies), a dataset generated using [SelectCraft](https://github.com/ezzini/SelectCraft) composed of NLQ-SQL pairs from the financial domain. |
|
|
|
|
|
|
|
|
## Finetuning Procedure |
|
|
|
|
|
BanQL was fine-tuned using PEFT (Parameter-Efficient Fine-Tuning) techniques, specifically LoRA (Low-Rank Adaptation) adapters. |
|
|
|
|
|
## Intended Use and Limitations |
|
|
|
|
|
The model was designed as a use case to prove the efficiency of SelectCraft in generating large-scale good quality domain-specific text-to-SQL datasets. The model is mainly finetuned on the database schemas displayed above. The prompt format is defined below. |
|
|
|
|
|
## How to Use |
|
|
|
|
|
Example 1: Loans_DB |
|
|
|
|
|
```python |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
|
|
|
device="cuda" |
|
|
tokenizer = AutoTokenizer.from_pretrained("salmane11/BanQL-3b") |
|
|
model = AutoModelForCausalLM.from_pretrained("salmane11/BanQL-3b").to(device) |
|
|
|
|
|
input_text = """ |
|
|
CREATE TABLE Loans { |
|
|
loan_id number, |
|
|
client_id number, |
|
|
budget real, |
|
|
duration number, |
|
|
interest real, |
|
|
status varchar |
|
|
} |
|
|
|
|
|
CREATE TABLE Clients { |
|
|
client_id number, |
|
|
first_name varchar, |
|
|
last_name varchar, |
|
|
email varchar, |
|
|
city varchar, |
|
|
year_of_birth number |
|
|
} |
|
|
|
|
|
CREATE TABLE Accounts { |
|
|
account_id number, |
|
|
client_id number, |
|
|
balance real, |
|
|
type varchar |
|
|
} |
|
|
|
|
|
CREATE TABLE Deposits{ |
|
|
deposit_id number, |
|
|
account_id number, |
|
|
source varchar, |
|
|
amount real |
|
|
} |
|
|
|
|
|
-- Using valid SQLite, answer the following question for the tables provided above. |
|
|
|
|
|
-- What is the duration and budget of the loan id 16342? |
|
|
|
|
|
SELECT""" |
|
|
|
|
|
encoding = tokenizer.encode_plus(input_text,pad_to_max_length=True, return_tensors="pt").to(device) |
|
|
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device) |
|
|
|
|
|
|
|
|
outputs = model.generate( |
|
|
input_ids=input_ids, attention_mask=attention_masks, |
|
|
max_length=512, |
|
|
do_sample=True, |
|
|
top_k=120, |
|
|
top_p=0.95, |
|
|
early_stopping=True, |
|
|
) |
|
|
line = tokenizer.decode(outputs[0], skip_special_tokens=True,clean_up_tokenization_spaces=True) |
|
|
query_begining = line.find("SELECT") |
|
|
print(line[query_begining:]) |
|
|
``` |
|
|
|
|
|
Example 2: Transactions_DB |
|
|
|
|
|
```python |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
|
|
|
device="cuda" |
|
|
tokenizer = AutoTokenizer.from_pretrained("salmane11/BanQL-3b") |
|
|
model = AutoModelForCausalLM.from_pretrained("salmane11/BanQL-3b").to(device) |
|
|
|
|
|
input_text = """ |
|
|
CREATE TABLE Transactions { |
|
|
transaction_id number, |
|
|
timestamp_id number, |
|
|
primary_contract_id number, |
|
|
client_id number, |
|
|
beneficiary_id number, |
|
|
transaction_amount real, |
|
|
is_fraudulent boolean, |
|
|
product_family_code varchar, |
|
|
amount_currency varchar |
|
|
} |
|
|
|
|
|
CREATE TABLE Beneficiary { |
|
|
beneficiary_id number, |
|
|
bank_branch_id number, |
|
|
country_name varchar, |
|
|
country_code varchar, |
|
|
} |
|
|
|
|
|
CREATE TABLE Source { |
|
|
primary_contract_id number, |
|
|
client_id number, |
|
|
counterparty_bank_branch_id number, |
|
|
counterparty_donor_id number, |
|
|
} |
|
|
|
|
|
CREATE TABLE Time{ |
|
|
timestamp_id number, |
|
|
week_number number, |
|
|
day_number number, |
|
|
hour_number number, |
|
|
day_name varchar, |
|
|
year number, |
|
|
month_number number |
|
|
} |
|
|
|
|
|
-- Using valid SQLite, answer the following question for the tables provided above. |
|
|
|
|
|
-- How many transactions for the client id 15482? |
|
|
|
|
|
SELECT""" |
|
|
|
|
|
|
|
|
encoding = tokenizer.encode_plus(input_text,pad_to_max_length=True, return_tensors="pt").to(device) |
|
|
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device) |
|
|
|
|
|
|
|
|
outputs = model.generate( |
|
|
input_ids=input_ids, attention_mask=attention_masks, |
|
|
max_length=512, |
|
|
do_sample=True, |
|
|
top_k=120, |
|
|
top_p=0.95, |
|
|
early_stopping=True, |
|
|
) |
|
|
line = tokenizer.decode(outputs[0], skip_special_tokens=True,clean_up_tokenization_spaces=True) |
|
|
query_begining = line.find("SELECT") |
|
|
print(line[query_begining:]) |
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
## Cite our work |
|
|
|
|
|
Citation |