Spaces:

othdu
/

AgriQA-assistant

Runtime error

File size: 27,084 Bytes

571aca4

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49cab397",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import logging\n",
    "import os\n",
    "import random\n",
    "from typing import Any, Dict, List, Tuple\n",
    "\n",
    "import bitsandbytes as bnb\n",
    "import pandas as pd\n",
    "import torch\n",
    "import yaml\n",
    "from datasets import Dataset, load_dataset\n",
    "from peft import (\n",
    "    LoraConfig,\n",
    "    TaskType,\n",
    "    get_peft_model,\n",
    "    prepare_model_for_kbit_training,\n",
    ")\n",
    "from tqdm import tqdm\n",
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    AutoTokenizer,\n",
    "    BitsAndBytesConfig,\n",
    "    DataCollatorForLanguageModeling,\n",
    "    EarlyStoppingCallback,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f275305b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cuda\n",
      "GPU: NVIDIA GeForce RTX 3070 Ti Laptop GPU\n",
      "Memory: 8.6 GB\n"
     ]
    }
   ],
   "source": [
    "# Check GPU availability\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "print(f\"Using device: {device}\")\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
    "    print(f\"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c2334fd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset identifier handed to load_dataset() in the loading cell.\n",
    "dataset_name = \"shchoi83/agriQA\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "849f5ace",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint identifier handed to the tokenizer and model from_pretrained() calls.\n",
    "model_name = \"Qwen/Qwen1.5-1.8B-Chat\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6fca4e6e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Repo card metadata block was not found. Setting CardData to empty.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset downloaded successfully!\n",
      "Dataset size: 174930 samples\n",
      "Dataset columns: ['questions', 'answers', 'text']\n"
     ]
    }
   ],
   "source": [
    "dataset = load_dataset(dataset_name)\n",
    "print(f\"Dataset downloaded successfully!\")\n",
    "print(f\"Dataset size: {len(dataset['train'])} samples\")\n",
    "print(f\"Dataset columns: {dataset['train'].column_names}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1757f71e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample data:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'questions': ['asking about the control measure of flower drop problem in his coconut plant'],\n",
       " 'answers': ['suggested him to apply fertilizer in recommended dose like urea-600gm,ssp-1kg,mop-700gm,borax-25gm twice in a year(march/april and september/october)followed by trench method and also suggested him to spray planofix@1teaspoonful per20-25 liter of water.'],\n",
       " 'text': ['Below are questions and answers about agriculture. Make sure you answer the questions. ###Q:asking about the control measure of flower drop problem in his coconut plant###A:suggested him to apply fertilizer in recommended dose like urea-600gm,ssp-1kg,mop-700gm,borax-25gm twice in a year(march/april and september/october)followed by trench method and also suggested him to spray planofix@1teaspoonful per20-25 liter of water.']}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show one record. Slicing with [1:2] returns the row at index 1 as a dict that maps\n",
    "# each column name to a single-element list (see the output of this cell).\n",
    "print(\"Sample data:\")\n",
    "sample = dataset['train'][1:2]\n",
    "sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b42f819e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'asked about the cultivation practice of bitter gourd'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "questions = [item['questions'] for item in dataset['train']]\n",
    "answers = [item['answers'] for item in dataset['train']]\n",
    "questions[45]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "648b0b87",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Data statistics:\n",
      "Average question length: 43.6 characters\n",
      "Average answer length: 76.0 characters\n",
      "Min question length: 1 characters\n",
      "Max question length: 10239 characters\n"
     ]
    }
   ],
   "source": [
    "print(f\"\\nData statistics:\")\n",
    "print(f\"Average question length: {sum(len(q) for q in questions) / len(questions):.1f} characters\")\n",
    "print(f\"Average answer length: {sum(len(a) for a in answers) / len(answers):.1f} characters\")\n",
    "print(f\"Min question length: {min(len(q) for q in questions)} characters\")\n",
    "print(f\"Max question length: {max(len(q) for q in questions)} characters\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6f89f722",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenizer for `model_name`; preprocess_for_chat() uses its chat template to format samples.\n",
    "# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run;\n",
    "# confirm it is actually required for this model.\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "231b96e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocessing functions\n",
    "def clean_text(text: str) -> str:\n",
    "    if not text or not isinstance(text, str):\n",
    "        return \"\"\n",
    "    \n",
    "    # basic cleaning\n",
    "    text = text.strip()\n",
    "    text = text.replace('\\n', ' ').replace('\\r', ' ')\n",
    "    text = ' '.join(text.split())  # remove extra whitespace\n",
    "    \n",
    "    return text\n",
    "\n",
    "def filter_qa_pairs(question: str, answer: str) -> bool:\n",
    "    # remove pairs with very short or very long responses\n",
    "    if len(answer) < 10 or len(answer) > 2000:\n",
    "        return False\n",
    "    \n",
    "    # remove pairs with short questions\n",
    "    if len(question) < 5:\n",
    "        return False\n",
    "    \n",
    "    # more lenient filtering for agri content\n",
    "    english_chars = sum(1 for c in question + answer if c.isascii())\n",
    "    total_chars = len(question + answer)\n",
    "    \n",
    "    if total_chars > 0 and english_chars / total_chars < 0.7:\n",
    "        return False\n",
    "    \n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "5ed30e18",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_for_chat(question: str, answer: str, tokenizer) -> str:\n",
    "    #Format question-answer pairs for chat model training\n",
    "    if tokenizer and hasattr(tokenizer, 'apply_chat_template'):\n",
    "        try:\n",
    "            messages = [\n",
    "                {\"role\": \"system\", \"content\": \"You are a helpful agricultural assistant. Provide accurate and practical advice for farming, crop management, livestock care, and agricultural practices.\"},\n",
    "                {\"role\": \"user\", \"content\": question},\n",
    "                {\"role\": \"assistant\", \"content\": answer}\n",
    "            ]\n",
    "            formatted_text = tokenizer.apply_chat_template(\n",
    "                messages,\n",
    "                tokenize=False,\n",
    "                add_generation_prompt=False\n",
    "            )\n",
    "            return formatted_text\n",
    "        except Exception as e:\n",
    "            print(f\"Failed to use chat template: {e}. Using fallback format.\")\n",
    "    \n",
    "    # Fallback format for Qwen1.5-Chat\n",
    "    formatted_text = f\"<|im_start|>system\\nYou are a helpful agricultural assistant. Provide accurate and practical advice for farming, crop management, livestock care, and agricultural practices.<|im_end|>\\n<|im_start|>user\\n{question}<|im_end|>\\n<|im_start|>assistant\\n{answer}<|im_end|>\"\n",
    "    return formatted_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f11911f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accumulators filled by the sample-processing loop.\n",
    "formatted_data = []  # chat-formatted training strings\n",
    "raw_data = []  # dicts holding the cleaned question, cleaned answer and original 'text'\n",
    "filtered_count = 0  # number of pairs rejected by filter_qa_pairs()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5e2ba2bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing samples: 100%|██████████| 174930/174930 [57:41<00:00, 50.54it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing completed!\n",
      "Total samples: 174930\n",
      "Filtered out: 1768 samples\n",
      "Formatted samples: 173162\n",
      "Success rate: 99.0%\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "for item in tqdm(dataset['train'], desc=\"Processing samples\"):\n",
    "    question = clean_text(item['questions'])\n",
    "    answer = clean_text(item['answers'])\n",
    "    \n",
    "    # Filter quality\n",
    "    if not filter_qa_pairs(question, answer):\n",
    "        filtered_count += 1\n",
    "        continue\n",
    "    \n",
    "    # Format for training\n",
    "    formatted_text = preprocess_for_chat(question, answer, tokenizer)\n",
    "    formatted_data.append(formatted_text)\n",
    "    \n",
    "    # Keep raw data for analysis\n",
    "    raw_data.append({\n",
    "        'question': question,\n",
    "        'answer': answer,\n",
    "        'text': item.get('text', '')\n",
    "    })\n",
    "\n",
    "print(f\"\\nProcessing completed!\")\n",
    "print(f\"Total samples: {len(dataset['train'])}\")\n",
    "print(f\"Filtered out: {filtered_count} samples\")\n",
    "print(f\"Formatted samples: {len(formatted_data)}\")\n",
    "print(f\"Success rate: {len(formatted_data)/len(dataset['train'])*100:.1f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c0e03833",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train/Validation split:\n",
      "Training samples: 155846\n",
      "Validation samples: 17316\n",
      "Total samples: 173162\n"
     ]
    }
   ],
   "source": [
    "val_ratio = 0.1\n",
    "val_size = int(len(formatted_data) * val_ratio)\n",
    "val_data = formatted_data[:val_size]\n",
    "train_data = formatted_data[val_size:]\n",
    "print(f\"Train/Validation split:\")\n",
    "print(f\"Training samples: {len(train_data)}\")\n",
    "print(f\"Validation samples: {len(val_data)}\")\n",
    "print(f\"Total samples: {len(train_data) + len(val_data)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9bb68cd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap each list of formatted strings in a Dataset with a single \"text\" column.\n",
    "train_dataset = Dataset.from_dict({\"text\": train_data})\n",
    "val_dataset = Dataset.from_dict({\"text\": val_data})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e5f46e82",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Datasets saved as 'train_data.txt' and 'val_data.txt'.\n"
     ]
    }
   ],
   "source": [
    "# Save as plain text files\n",
    "with open(\"train_data.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "    for line in train_data:\n",
    "        f.write(line + \"\\n\")\n",
    "\n",
    "with open(\"val_data.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "    for line in val_data:\n",
    "        f.write(line + \"\\n\")\n",
    "\n",
    "print(\"Datasets saved as 'train_data.txt' and 'val_data.txt'.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb6dc918",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist both Dataset objects to the directories \"train_dataset1\" and \"val_dataset1\"\n",
    "# (relative to the current working directory).\n",
    "train_dataset.save_to_disk(\"train_dataset1\")\n",
    "val_dataset.save_to_disk(\"val_dataset1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "ccf369fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(train_file: str, val_file: str) -> tuple:\n",
    "    \"\"\"Load the train and validation text files as Datasets with a \"text\" column.\n",
    "\n",
    "    The files hold chat-formatted samples. Every sample contains newlines\n",
    "    between its chat turns, so it occupies several physical lines; reading\n",
    "    one sample per line would split each conversation into fragments. Lines\n",
    "    are therefore regrouped: a new sample begins at every line equal to\n",
    "    \"<|im_start|>system\", and the lines of a sample are re-joined with \"\\n\".\n",
    "    A file that contains no such line is read as one sample per non-empty line.\n",
    "\n",
    "    NOTE: this function reuses the name of `datasets.load_dataset`, which is\n",
    "    imported in the first cell, and shadows it once this cell has run.\n",
    "\n",
    "    Args:\n",
    "        train_file: Path of the UTF-8 training text file.\n",
    "        val_file: Path of the UTF-8 validation text file.\n",
    "\n",
    "    Returns:\n",
    "        The tuple (train_dataset, val_dataset).\n",
    "    \"\"\"\n",
    "    sample_start = \"<|im_start|>system\"\n",
    "\n",
    "    def load_text_file(file_path: str) -> list:\n",
    "        # Stripped, non-empty physical lines of the file\n",
    "        with open(file_path, 'r', encoding='utf-8') as f:\n",
    "            lines = [line.strip() for line in f if line.strip()]\n",
    "\n",
    "        # No sample marker anywhere: keep the one-sample-per-line reading\n",
    "        if sample_start not in lines:\n",
    "            return lines\n",
    "\n",
    "        samples = []\n",
    "        current = []\n",
    "        for line in lines:\n",
    "            # A marker line closes the sample collected so far and opens the next\n",
    "            if line == sample_start and current:\n",
    "                samples.append(\"\\n\".join(current))\n",
    "                current = []\n",
    "            current.append(line)\n",
    "        samples.append(\"\\n\".join(current))\n",
    "        return samples\n",
    "\n",
    "    train_texts = load_text_file(train_file)\n",
    "    train_dataset = Dataset.from_dict({\"text\": train_texts})\n",
    "\n",
    "    val_texts = load_text_file(val_file)\n",
    "    val_dataset = Dataset.from_dict({\"text\": val_texts})\n",
    "\n",
    "    return train_dataset, val_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "896e25d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that holds train_data.txt and val_data.txt. Set the AGRIQA_DATA_DIR\n",
    "# environment variable to use another location; without it the original\n",
    "# machine-specific path is used.\n",
    "data_dir = os.environ.get(\"AGRIQA_DATA_DIR\", r\"N:\\agriQA\\data\")\n",
    "train_path = os.path.join(data_dir, \"train_data.txt\")\n",
    "val_path = os.path.join(data_dir, \"val_data.txt\")\n",
    "\n",
    "train_dataset, val_dataset = load_dataset(train_path, val_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "0e374cc7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file config.json from cache at C:\\Users\\ACER\\.cache\\huggingface\\hub\\models--Qwen--Qwen1.5-1.8B-Chat\\snapshots\\e482ee3f73c375a627a16fdf66fd0c8279743ca6\\config.json\n",
      "Model config Qwen2Config {\n",
      "  \"_name_or_path\": \"Qwen/Qwen1.5-1.8B-Chat\",\n",
      "  \"architectures\": [\n",
      "    \"Qwen2ForCausalLM\"\n",
      "  ],\n",
      "  \"attention_dropout\": 0.0,\n",
      "  \"bos_token_id\": 151643,\n",
      "  \"eos_token_id\": 151645,\n",
      "  \"hidden_act\": \"silu\",\n",
      "  \"hidden_size\": 2048,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 5504,\n",
      "  \"max_position_embeddings\": 32768,\n",
      "  \"max_window_layers\": 21,\n",
      "  \"model_type\": \"qwen2\",\n",
      "  \"num_attention_heads\": 16,\n",
      "  \"num_hidden_layers\": 24,\n",
      "  \"num_key_value_heads\": 16,\n",
      "  \"rms_norm_eps\": 1e-06,\n",
      "  \"rope_scaling\": null,\n",
      "  \"rope_theta\": 1000000.0,\n",
      "  \"sliding_window\": 32768,\n",
      "  \"tie_word_embeddings\": false,\n",
      "  \"torch_dtype\": \"float16\",\n",
      "  \"transformers_version\": \"4.49.0\",\n",
      "  \"use_cache\": true,\n",
      "  \"use_sliding_window\": false,\n",
      "  \"vocab_size\": 151936\n",
      "}\n",
      "\n",
      "loading weights file model.safetensors from cache at C:\\Users\\ACER\\.cache\\huggingface\\hub\\models--Qwen--Qwen1.5-1.8B-Chat\\snapshots\\e482ee3f73c375a627a16fdf66fd0c8279743ca6\\model.safetensors\n",
      "Instantiating Qwen2ForCausalLM model under default dtype torch.float16.\n",
      "Generate config GenerationConfig {\n",
      "  \"bos_token_id\": 151643,\n",
      "  \"eos_token_id\": 151645\n",
      "}\n",
      "\n",
      "All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n",
      "\n",
      "All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen1.5-1.8B-Chat.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n",
      "loading configuration file generation_config.json from cache at C:\\Users\\ACER\\.cache\\huggingface\\hub\\models--Qwen--Qwen1.5-1.8B-Chat\\snapshots\\e482ee3f73c375a627a16fdf66fd0c8279743ca6\\generation_config.json\n",
      "Generate config GenerationConfig {\n",
      "  \"bos_token_id\": 151643,\n",
      "  \"do_sample\": true,\n",
      "  \"eos_token_id\": [\n",
      "    151645,\n",
      "    151643\n",
      "  ],\n",
      "  \"pad_token_id\": 151643,\n",
      "  \"repetition_penalty\": 1.1,\n",
      "  \"top_p\": 0.8\n",
      "}\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model loaded successfully!\n",
      "Model device: cuda:0\n"
     ]
    }
   ],
   "source": [
    "use_4bit = False  \n",
    "\n",
    "if use_4bit:\n",
    "    print(\"Loading with 4-bit quantization\")\n",
    "    model = AutoModelForCausalLM.from_pretrained(\n",
    "        model_name,\n",
    "        quantization_config=bnb.BitsAndBytesConfig(\n",
    "            load_in_4bit=True,\n",
    "            bnb_4bit_compute_dtype=torch.float16,\n",
    "            bnb_4bit_quant_type=\"nf4\",\n",
    "            bnb_4bit_use_double_quant=True,\n",
    "        ),\n",
    "        device_map=\"auto\",\n",
    "        trust_remote_code=True\n",
    "    )\n",
    "    \n",
    "    # prepare model for k-bit training\n",
    "    model = prepare_model_for_kbit_training(model)\n",
    "else:\n",
    "    # Load model without device_map to avoid meta tensor issues\n",
    "    model = AutoModelForCausalLM.from_pretrained(\n",
    "        model_name,\n",
    "        torch_dtype=torch.float16,\n",
    "        trust_remote_code=True\n",
    "    )\n",
    "    # Move model to device manually\n",
    "    model = model.to(device)\n",
    "\n",
    "print(\"Model loaded successfully!\")\n",
    "print(f\"Model device: {next(model.parameters()).device}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "423993c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trainable params: 14,991,360 || all params: 1,851,820,032 || trainable%: 0.8095\n"
     ]
    }
   ],
   "source": [
    "lora_config = LoraConfig(\n",
    "    r=16,  # LoRA rank\n",
    "    lora_alpha=32,  # alpha parameter\n",
    "    lora_dropout=0.1,  # LoRA dropout\n",
    "    target_modules=[  # Target modules for LoRA\n",
    "        \"q_proj\",\n",
    "        \"k_proj\",\n",
    "        \"v_proj\",\n",
    "        \"o_proj\",\n",
    "        \"gate_proj\",\n",
    "        \"up_proj\",\n",
    "        \"down_proj\"\n",
    "    ],\n",
    "    bias=\"none\",  \n",
    "    task_type=TaskType.CAUSAL_LM  # Task type for PEFT\n",
    ")\n",
    "\n",
    "# apply to the model\n",
    "model = get_peft_model(model, lora_config)\n",
    "\n",
    "model.print_trainable_parameters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "6abd7234",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same call as the earlier tokenizer cell; rebinds `tokenizer` for the training section\n",
    "# (presumably so this part can run without re-executing the preprocessing cells).\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "4c09bd63",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_tokenize_function(tokenizer):\n",
    "    \"\"\"Build the batched tokenisation callback used with `Dataset.map`.\n",
    "\n",
    "    Args:\n",
    "        tokenizer: Callable tokenizer applied to the batch's \"text\" list.\n",
    "\n",
    "    Returns:\n",
    "        A function that takes a batch dict with a \"text\" key and returns the\n",
    "        tokenizer's output for it (unpadded, truncated to 2048 tokens).\n",
    "    \"\"\"\n",
    "    def tokenize_function(examples):\n",
    "        tokenized = tokenizer(\n",
    "            examples[\"text\"],\n",
    "            truncation=True,\n",
    "            padding=False,  # padding is left to the data collator, per batch\n",
    "            max_length=2048,\n",
    "            return_tensors=None,  # Return lists, not tensors\n",
    "            # No attention mask is stored in the dataset; presumably it is\n",
    "            # created when the collator pads a batch - confirm.\n",
    "            return_attention_mask=False\n",
    "        )\n",
    "\n",
    "        # Deliberately no 'labels' column: DataCollatorForLanguageModeling(mlm=False),\n",
    "        # which this notebook trains with, derives labels from the padded input_ids\n",
    "        # itself. A per-example 'labels' list of varying length is not padded by that\n",
    "        # collator and makes the conversion of the batch to tensors fail.\n",
    "        return tokenized\n",
    "\n",
    "    return tokenize_function\n",
    "\n",
    "# Create the tokenize function with access to the tokenizer\n",
    "tokenize_function = create_tokenize_function(tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "add99cd6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e1277ea5aa9d42598f9c647f265c62b9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Tokenizing training dataset (num_proc=4):   0%|          | 0/935076 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e0b72f885d2d438d8a85c28f7cb2106b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Tokenizing validation dataset:   0%|          | 0/103896 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Tokenize training dataset\n",
    "train_tokenized = train_dataset.map(\n",
    "    tokenize_function,\n",
    "    batched=True,\n",
    "    num_proc=4,\n",
    "    remove_columns=train_dataset.column_names,\n",
    "    desc=\"Tokenizing training dataset\"\n",
    ")\n",
    "\n",
    "# Tokenize validation dataset\n",
    "val_tokenized = val_dataset.map(\n",
    "    tokenize_function,\n",
    "    batched=True,\n",
    "    num_proc=1,\n",
    "    remove_columns=val_dataset.column_names,\n",
    "    desc=\"Tokenizing validation dataset\"\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "26e33c06",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PyTorch: setting up devices\n"
     ]
    }
   ],
   "source": [
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./models/agriqa-assistant\",  # Output directory\n",
    "    num_train_epochs=3,  # Number of training epochs\n",
    "    per_device_train_batch_size=4,  # Batch size per device\n",
    "    per_device_eval_batch_size=4,  # Evaluation batch size\n",
    "    gradient_accumulation_steps=8,  # Gradient accumulation steps\n",
    "    learning_rate=2e-4,  # Learning rate\n",
    "    weight_decay=0.01,  # Weight decay\n",
    "    warmup_steps=100,  # Warmup steps\n",
    "    logging_steps=10,  # Logging steps\n",
    "    save_steps=500,  # Save steps\n",
    "    eval_steps=500,  # Evaluation steps\n",
    "    eval_strategy=\"steps\",  # Evaluation strategy\n",
    "    save_strategy=\"steps\",  # Save strategy\n",
    "    save_total_limit=3,  # Save total limit\n",
    "    load_best_model_at_end=True,  # Load best model at end\n",
    "    metric_for_best_model=\"eval_loss\",  # Metric for best model\n",
    "    greater_is_better=False,  # Greater is better\n",
    "    dataloader_pin_memory=False,  # Dataloader pin memory\n",
    "    remove_unused_columns=False,  # Remove unused columns\n",
    "    fp16=True,  # Use fp16\n",
    "    dataloader_num_workers=4,  # Dataloader num workers\n",
    "    report_to=\"none\",  # Report to (set to \"wandb\" if using wandb)\n",
    "    run_name=\"agriqa-assistant-finetune\",  # Run name\n",
    "    log_level=\"info\",  # Log level\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "76a043dc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using auto half precision backend\n"
     ]
    }
   ],
   "source": [
    "# Data collator\n",
    "data_collator = DataCollatorForLanguageModeling(\n",
    "    tokenizer=tokenizer,\n",
    "    mlm=False,\n",
    "    pad_to_multiple_of=8,  # Pad to multiple of 8 for efficiency\n",
    "    return_tensors=\"pt\",  # Return PyTorch tensors\n",
    ")\n",
    "\n",
    "# Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=train_tokenized,\n",
    "    eval_dataset=val_tokenized,\n",
    "    data_collator=data_collator,\n",
    "    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8365dcf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run training, then write the model to the trainer's output directory\n",
    "train_result = trainer.train()\n",
    "trainer.save_model()\n",
    "\n",
    "# Log the training metrics, write them to disk and save the trainer state\n",
    "metrics = train_result.metrics\n",
    "for record_metrics in (trainer.log_metrics, trainer.save_metrics):\n",
    "    record_metrics(\"train\", metrics)\n",
    "trainer.save_state()\n",
    "\n",
    "print(\"Training completed successfully!\")\n",
    "print(f\"Training metrics: {metrics}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a82805e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same directory string as TrainingArguments(output_dir=...); keep the two in sync.\n",
    "output_dir = \"./models/agriqa-assistant\"\n",
    "\n",
    "# Write the tokenizer files into the model directory\n",
    "tokenizer.save_pretrained(output_dir)\n",
    "print(f\"Tokenizer saved to {output_dir}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67901450",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-written summary of the run, stored as model_config.json next to the model.\n",
    "# The values below are literals: they repeat the LoRA and training settings used\n",
    "# earlier and must be edited by hand if those settings change.\n",
    "lora_summary = {\n",
    "    'r': 16,\n",
    "    'lora_alpha': 32,\n",
    "    'lora_dropout': 0.1,\n",
    "    'target_modules': [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
    "    'bias': \"none\",\n",
    "    'task_type': \"CAUSAL_LM\",\n",
    "}\n",
    "\n",
    "generation_summary = {\n",
    "    'max_new_tokens': 512,\n",
    "    'do_sample': True,\n",
    "    'temperature': 0.7,\n",
    "    'top_p': 0.9,\n",
    "    'top_k': 50,\n",
    "    'repetition_penalty': 1.1,\n",
    "}\n",
    "\n",
    "training_summary = {\n",
    "    'num_train_epochs': 3,\n",
    "    'learning_rate': 2e-4,\n",
    "    'batch_size': 4,\n",
    "    'gradient_accumulation_steps': 8,\n",
    "}\n",
    "\n",
    "model_config = {\n",
    "    'base_model': model_name,\n",
    "    'lora_config': lora_summary,\n",
    "    'generation_config': generation_summary,\n",
    "    'training_config': training_summary,\n",
    "}\n",
    "\n",
    "# Serialise with two-space indentation\n",
    "config_path = os.path.join(output_dir, 'model_config.json')\n",
    "with open(config_path, 'w') as f:\n",
    "    f.write(json.dumps(model_config, indent=2))\n",
    "\n",
    "print(f\"Model configuration saved to {config_path}\")\n",
    "print(f\"Model saved to {output_dir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12ad9ed9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}