{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Finetune DeepSeek Coder 1.3B for NBA + Tennis Kaggle Databases SQLite Generation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import torch\n", "from datasets import Dataset\n", "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, EarlyStoppingCallback, PreTrainedTokenizer\n", "from torch.utils.data import DataLoader\n", "import sys\n", "from peft import LoraConfig, get_peft_model, TaskType\n", "from huggingface_hub import snapshot_download\n", "import os\n", "import re\n", "import contextlib #helps make pip silent\n", "import sys\n", "import os\n", "import numpy as np\n", "with contextlib.redirect_stdout(sys.__stdout__), contextlib.redirect_stderr(sys.__stderr__):\n", " %pip install datasets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Define constants for using google colab or local runs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "is_google_colab = False\n", "use_bnb = True" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Establish read and write paths" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "current_read_path = \"./\"\n", "current_write_path = \"./\"\n", "\n", "def read_path(rel_path):\n", " return os.path.join(current_read_path, rel_path)\n", "\n", "def write_path(rel_path):\n", " return os.path.join(current_write_path, rel_path)\n", "\n", "if is_google_colab:\n", " from google.colab import drive\n", " drive.mount('/content/drive')\n", " current_write_path = \"/content/drive/MyDrive/sql_gen\"\n", "\n", " hugging_face_path = snapshot_download(\n", " repo_id=\"USC-Applied-NLP-Group/SQL-Generation\",\n", " repo_type=\"model\",\n", " allow_patterns=[\"train-data/*\", \"deepseek-coder-1.3b-instruct/*\"], \n", " )\n", " sys.path.append(hugging_face_path)\n", " current_read_path = hugging_face_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## First define prompt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from utils.prompts.nba_prompt import input_text as input_prompt\n", "\n", "print(len(input_prompt))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data and convert to Dataset object tokenized by the DeepSeek model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Model output directories\n", "MODEL_DIR = write_path(\"finetuned-model-16\")\n", "VAL_OUTPUT = write_path(\"val-16.hf\")\n", "\n", "# Load dataset\n", "df_train = pd.read_csv(read_path(\"training-data/train_set.tsv\"), sep='\\t')\n", "df_test = pd.read_csv(read_path(\"training-data/test_set.tsv\"), sep='\\t')\n", "\n", "# Fix any spacing issues\n", "df_train.applymap(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)\n", "df_test.applymap(lambda x: re.sub(r'\\s+', ' ', x) if isinstance(x, str) else x)\n", "\n", "# Display dataset info\n", "print(f\"Total train dataset examples: {len(df_train)}\")\n", "print(df_train.head())\n", "print(f\"Total test dataset examples: {len(df_test)}\")\n", "print(df_test.head())\n", "# Load tokenizer\n", "model_name = read_path(\"deepseek-coder-1.3b-instruct\")\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "\n", "# Enable 8-bit quantization for lower memory usage\n", "bnb_config = None\n", "if use_bnb:\n", " bnb_config = BitsAndBytesConfig(\n", " load_in_8bit=True, \n", " bnb_8bit_compute_dtype=torch.float16\n", " )\n", "\n", "# Load model with quantization\n", "device_name = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "device = torch.device(device_name)\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_name, \n", " quantization_config=bnb_config,\n", " device_map=device\n", ")\n", "\n", "# Add a custom stop token (can be anything that won’t show up in your data)\n", "special_token = \"<|endofsql|>\"\n", "\n", "# Only add if it doesn’t already exist\n", "print(\"adding special token\")\n", "print(len(tokenizer))\n", "tokenizer.add_special_tokens({\"additional_special_tokens\": [special_token]})\n", "tokenizer.eos_token = special_token\n", "model.resize_token_embeddings(len(tokenizer))\n", "print(len(tokenizer))\n", "\n", "tokenizer.truncation_side = \"left\"\n", "\n", "def format_deepseek_chat(example, tokenizer, special_token=\"<|endofsql|>\"):\n", " # Manually build the prompt as one flat string\n", " prompt = f\"{input_prompt}{example['natural_query']}\\n\"\n", " completion = f\"SQLite:\\n{example['sql_query']}{special_token}\"\n", "\n", " full_text = prompt + completion\n", " tokenized = tokenizer(\n", " full_text,\n", " truncation=True,\n", " padding=\"max_length\",\n", " max_length=3156, # or whatever your model can handle\n", " )\n", "\n", " # Mask out prompt tokens in the labels\n", " prompt_len = len(tokenizer(prompt, truncation=True)[\"input_ids\"])\n", " labels = tokenized[\"input_ids\"][:]\n", " labels[:prompt_len] = [-100] * prompt_len\n", " tokenized[\"labels\"] = labels\n", "\n", " return tokenized\n", "\n", "# Build dataset dict\n", "train_dataset_dict = {\n", " \"natural_query\": df_train[\"natural_query\"].tolist(),\n", " \"sql_query\": df_train[\"sql_query\"].tolist(),\n", "}\n", "\n", "\n", "val_dataset_dict = {\n", " \"natural_query\": df_test[\"natural_query\"].tolist(),\n", " \"sql_query\": df_test[\"sql_query\"].tolist(),\n", "}\n", "\n", "# Create HuggingFace Dataset\n", "train_dataset = Dataset.from_dict(train_dataset_dict)\n", "val_dataset = Dataset.from_dict(val_dataset_dict)\n", "\n", "# Apply formatting\n", "train_dataset = train_dataset.map(\n", " lambda x: format_deepseek_chat(x, tokenizer),\n", " remove_columns=[\"natural_query\", \"sql_query\"]\n", ")\n", "\n", "val_dataset = val_dataset.map(\n", " lambda x: format_deepseek_chat(x, tokenizer),\n", " remove_columns=[\"natural_query\", \"sql_query\"]\n", ")\n", "\n", "del df_train, df_test, train_dataset_dict, val_dataset_dict\n", "\n", "\n", "for v in val_dataset:\n", " print(v)\n", " print(len(v['input_ids']))\n", " break\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load model and define training arguments" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define LoRA configuration\n", "lora_config = LoraConfig(\n", " r=16, # Rank of LoRA matrices (adjust for memory vs. accuracy)\n", " lora_alpha=32, # Scaling factor\n", " lora_dropout=0.0, # Dropout for regularization\n", " bias=\"none\",\n", " task_type=TaskType.CAUSAL_LM,\n", " target_modules=[\n", " \"q_proj\",\n", " \"k_proj\",\n", " \"v_proj\",\n", " \"o_proj\",\n", " \"gate_proj\",\n", " \"up_proj\",\n", " \"down_proj\"\n", " ]\n", ")\n", "\n", "# Wrap model with LoRA adapters\n", "model = get_peft_model(model, lora_config)\n", "model = model.to(device)\n", "model.print_trainable_parameters() # Show trainable parameters count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup model trainer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "training_args = TrainingArguments(\n", " output_dir=MODEL_DIR,\n", " evaluation_strategy=\"epoch\", # Evaluate at the end of each epoch\n", " save_strategy=\"epoch\", # Save model every epoch\n", " per_device_train_batch_size=1, # LoRA allows higher batch size\n", " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=16,\n", " num_train_epochs=10, # Increase if needed\n", " learning_rate=5e-5, # Higher LR since we're only training LoRA layers\n", " weight_decay=0.001,\n", " logging_steps=50, # Print loss every 50 steps\n", " save_total_limit=2, # Keep last 4 checkpoints\n", " bf16=True if torch.cuda.is_available() else False,\n", " push_to_hub=False,\n", " load_best_model_at_end=True,\n", " metric_for_best_model=\"eval_loss\",\n", " greater_is_better=False\n", ")\n", "\n", "# Trainer setup\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=val_dataset,\n", " tokenizer=tokenizer,\n", " callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run fine-tuning and save model weights when complete" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Run training\n", "trainer.train()\n", "\n", "# Merge LoRA adapters with the base model before saving\n", "model = model.merge_and_unload()\n", "model.save_pretrained(MODEL_DIR)\n", "tokenizer.save_pretrained(MODEL_DIR)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Try inference using fine-tuned model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, torch_dtype=torch.bfloat16, device_map=device)\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\n", "\n", "# Prepare query with the same prompt\n", "input_text = \"How many points do the Los Angeles Lakers average at home?\"\n", "message = [{'role': 'user', 'content': input_prompt + input_text}]\n", "inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n", "\n", "# Generate SQL query\n", "outputs = model.generate(\n", " inputs,\n", " max_new_tokens=256,\n", " eos_token_id=tokenizer.convert_tokens_to_ids(\"<|endofsql|>\")\n", ")\n", "model_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n", "\n", "print(\"Generated SQL:\", model_output)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save validation set to disk" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "val_dataset.save_to_disk(VAL_OUTPUT)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test logic for obtaining original prompt and SQLite" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sqlite3 as sql\n", "\n", "prompt_length = len(input_prompt)\n", "\n", "print(prompt_length)\n", "\n", "# Create connection to sqlite3 database\n", "connection = sql.connect(read_path('nba-data/nba.sqlite'))\n", "cursor = connection.cursor()\n", "\n", "for v in val_dataset:\n", " full_example = tokenizer.decode(v[\"input_ids\"], skip_special_tokens=True)\n", " user_prompt = full_example[:prompt_length]\n", " question, sql_query = full_example[prompt_length:].split(\"SQLite:\\n\")\n", " print(question)\n", " print(sql_query)\n", " cursor.execute(sql_query)\n", " rows = cursor.fetchall()\n", " for row in rows:\n", " print(row)\n", " break" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run evaluation over entire validation set" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import math\n", "import random\n", "\n", "def compare_result(sample_query, model_output):\n", " # Clean model output to only have the query output\n", " if model_output[0:8] == \"SQLite:\\n\":\n", " query = model_output[8:]\n", " elif model_output[0:8] == \"SQLite: \":\n", " query = model_output[8:]\n", " elif model_output[0:7] == \"SQLite:\":\n", " query = model_output[7:]\n", " elif model_output[0:5] == \"SQL:\\n\":\n", " query = model_output[5:]\n", " elif model_output[0:5] == \"SQL: \":\n", " query = model_output[5:]\n", " elif model_output[0:4] == \"SQL:\":\n", " query = model_output[4:]\n", " else:\n", " query = model_output\n", "\n", " # Clean any excess text after the query semicolon\n", " for i in range(len(query)):\n", " if query[i] == \";\":\n", " query = query[:i+1]\n", " break\n", "\n", " # Get sample and model result\n", " cursor.execute(sample_query)\n", " sample_result = [str(item) for tup in cursor.fetchall() for item in tup]\n", "\n", " try:\n", " cursor.execute(query)\n", " except:\n", " return False, False, False\n", " model_result = [str(item) for tup in cursor.fetchall() for item in tup]\n", "\n", " print(sample_result)\n", " print(model_result)\n", "\n", " # Strip all whitespace before comparing queries since there may be differences in spacing, newlines, tabs, etc.\n", " query = query.replace(\" \", \"\").replace(\"\\n\", \"\").replace(\"\\t\", \"\")\n", " sample_query = sample_query.replace(\" \", \"\").replace(\"\\n\", \"\").replace(\"\\t\", \"\")\n", " query_match = (query == sample_query)\n", "\n", " # If the queries match, the results clearly also match\n", " if query_match:\n", " return True, True, True\n", "\n", " # Try to execute query, if it fails, then this is a failure of the model\n", " try:\n", " for r in sample_result:\n", " for res in model_result:\n", " try:\n", " if math.isclose(float(r), float(res), abs_tol=0.5):\n", " return True, False, True\n", " except:\n", " if r in res or res in r:\n", " return True, False, True\n", " return True, False, False\n", " except:\n", " return True, False, False\n", " \n", "num_valid = 0\n", "num_sql_matched = 0\n", "num_result_matched = 0\n", "\n", "for v in val_dataset:\n", " # Obtain sample natural language question and sql_query\n", " #v = val_dataset[random.randint(0, len(val_dataset) - 1)]\n", " full_example = tokenizer.decode(v[\"input_ids\"], skip_special_tokens=True)\n", " user_prompt = full_example[:prompt_length]\n", " question, sql_query = full_example[prompt_length:].split(\"SQLite:\\n\")\n", " #print(question)\n", " #print(sql_query)\n", "\n", " # Obtain model output\n", " input_text = \"How many points to the Los Angeles Lakers average at home?\"\n", " message = [{'role': 'user', 'content': input_prompt + question}]\n", " inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n", "\n", " # Generate SQL query\n", " outputs = model.generate(\n", " inputs,\n", " max_new_tokens=256,\n", " eos_token_id=tokenizer.convert_tokens_to_ids(\"<|endofsql|>\")\n", " )\n", " model_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n", "\n", " print(sql_query)\n", " print(model_output.split(\";\")[0])\n", " #print()\n", " #print(model_output)\n", " result = compare_result(sql_query, model_output)\n", " print(\"Statement valid? \" + str(result[0]))\n", " print(\"SQLite matched? \" + str(result[1]))\n", " print(\"Result matched? \" + str(result[2]))\n", " print()\n", " print()\n", "\n", " if result[0]:\n", " num_valid += 1\n", " if result[1]:\n", " num_sql_matched += 1\n", " if result[2]:\n", " num_result_matched += 1\n", "\n", "print(\"Percent valid: \" + str(num_valid / len(val_dataset)))\n", "print(\"Percent SQLite matched: \" + str(num_sql_matched / len(val_dataset)))\n", "print(\"Percent result matched: \" + str(num_result_matched / len(val_dataset)))\n", "\n", "# break" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test validation set only on short queries" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "num_valid = 0\n", "num_sql_matched = 0\n", "num_result_matched = 0\n", "counter = 0\n", "for v in val_dataset:\n", " # Obtain sample natural language question and sql_query\n", " #v = val_dataset[random.randint(0, len(val_dataset) - 1)]\n", " full_example = tokenizer.decode(v[\"input_ids\"], skip_special_tokens=True)\n", " user_prompt = full_example[:prompt_length]\n", " question, sql_query = full_example[prompt_length:].split(\"SQLite:\\n\")\n", " #print(question)\n", " #print(sql_query)\n", "\n", " if len(sql_query) <= 90:\n", " # Obtain model output\n", " input_text = \"How many points to the Los Angeles Lakers average at home?\"\n", " message = [{'role': 'user', 'content': input_prompt + question}]\n", " inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n", "\n", " # Generate SQL query\n", " outputs = model.generate(\n", " inputs,\n", " max_new_tokens=256,\n", " eos_token_id=tokenizer.convert_tokens_to_ids(\"<|endofsql|>\")\n", " )\n", " model_output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n", "\n", " print(sql_query)\n", " print(model_output.split(\";\")[0])\n", " #print()\n", " #print(model_output)\n", " result = compare_result(sql_query, model_output)\n", " print(\"Statement valid? \" + str(result[0]))\n", " print(\"SQLite matched? \" + str(result[1]))\n", " print(\"Result matched? \" + str(result[2]))\n", " print()\n", " print()\n", " counter += 1\n", "\n", " if result[0]:\n", " num_valid += 1\n", " if result[1]:\n", " num_sql_matched += 1\n", " if result[2]:\n", " num_result_matched += 1\n", "\n", "print(\"Percent valid: \" + str(num_valid / counter))\n", "print(\"Percent SQLite matched: \" + str(num_sql_matched / counter))\n", "print(\"Percent result matched: \" + str(num_result_matched / counter))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 2 }