# Check GPU availability: use CUDA when torch reports a usable GPU, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# On CUDA, report the name and total memory (bytes / 1e9, shown as GB) of GPU 0.
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Identifiers passed later to load_dataset(...) and from_pretrained(...).
dataset_name = "shchoi83/agriQA"
model_name = "Qwen/Qwen1.5-1.8B-Chat"
# Load the dataset identified by `dataset_name` and summarise its 'train' split.
dataset = load_dataset(dataset_name)
train_split = dataset['train']

print("Dataset downloaded successfully!")
print(f"Dataset size: {len(train_split)} samples")
print(f"Dataset columns: {train_split.column_names}")
# Peek at a single record. Slicing the split returns a dict mapping each
# column name to a list of values (here a one-element list per column).
print("Sample data:")
sample = dataset['train'][1:2]
sample  # last expression of its cell, so the notebook displays it

# Pull the question and answer columns out of the train split as Python lists.
questions = [item['questions'] for item in dataset['train']]
answers = [item['answers'] for item in dataset['train']]
questions[45]  # last expression of its cell, so the notebook displays it

# Character-length statistics, computed from per-item lengths gathered once.
question_lengths = [len(q) for q in questions]
answer_lengths = [len(a) for a in answers]

print(f"\nData statistics:")
print(f"Average question length: {sum(question_lengths) / len(question_lengths):.1f} characters")
print(f"Average answer length: {sum(answer_lengths) / len(answer_lengths):.1f} characters")
print(f"Min question length: {min(question_lengths)} characters")
print(f"Max question length: {max(question_lengths)} characters")

# Tokenizer for the chat model named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Data preprocessing functions

# System prompt shared by the chat-template path and the manual ChatML fallback.
_SYSTEM_PROMPT = (
    "You are a helpful agricultural assistant. Provide accurate and practical "
    "advice for farming, crop management, livestock care, and agricultural practices."
)


def clean_text(text: str) -> str:
    """Normalise whitespace in ``text``.

    Returns "" for falsy or non-string input. Otherwise every run of
    whitespace (newlines and carriage returns included) becomes a single
    space and leading/trailing whitespace is dropped.
    """
    if not (text and isinstance(text, str)):
        return ""

    # str.split() without a separator splits on any whitespace run and ignores
    # leading/trailing whitespace, so one split/join does all the cleaning.
    return " ".join(text.split())


def filter_qa_pairs(question: str, answer: str) -> bool:
    """Decide whether a question/answer pair is kept for training.

    A pair is rejected when the answer is shorter than 10 or longer than 2000
    characters, when the question is shorter than 5 characters, or when fewer
    than 70% of the characters of question + answer are ASCII.

    Returns:
        True to keep the pair, False to drop it.
    """
    # remove pairs with very short or very long responses
    if not (10 <= len(answer) <= 2000):
        return False

    # remove pairs with short questions
    if len(question) < 5:
        return False

    # more lenient filtering for agri content: require mostly-ASCII text
    combined = question + answer
    if not combined:
        return True
    ascii_share = sum(ch.isascii() for ch in combined) / len(combined)
    return ascii_share >= 0.7


def preprocess_for_chat(question: str, answer: str, tokenizer) -> str:
    """Format a question/answer pair as one chat-model training string.

    When ``tokenizer`` is truthy and exposes ``apply_chat_template``, the
    system/user/assistant conversation is rendered through it (as text, with
    no generation prompt). If that call raises, the error is printed and the
    hand-written ChatML layout below is used instead; the same layout is used
    when no usable tokenizer is given.
    """
    if tokenizer and hasattr(tokenizer, 'apply_chat_template'):
        conversation = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
        try:
            return tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        except Exception as e:
            # Best effort: report the problem and fall through to the fallback.
            print(f"Failed to use chat template: {e}. Using fallback format.")

    # Fallback format for Qwen1.5-Chat
    return (
        f"<|im_start|>system\n{_SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{question}<|im_end|>\n"
        f"<|im_start|>assistant\n{answer}<|im_end|>"
    )
# Accumulators for the preprocessing pass.
formatted_data = []   # chat-formatted training strings
raw_data = []         # cleaned question/answer plus the dataset's own 'text' field
filtered_count = 0    # pairs rejected by filter_qa_pairs

for item in tqdm(dataset['train'], desc="Processing samples"):
    question = clean_text(item['questions'])
    answer = clean_text(item['answers'])

    if filter_qa_pairs(question, answer):
        # Format for training
        formatted_text = preprocess_for_chat(question, answer, tokenizer)
        formatted_data.append(formatted_text)

        # Keep raw data for analysis
        raw_data.append({
            'question': question,
            'answer': answer,
            'text': item.get('text', '')
        })
    else:
        # Filter quality: count the rejected pair and move on
        filtered_count += 1

total_samples = len(dataset['train'])
print(f"\nProcessing completed!")
print(f"Total samples: {total_samples}")
print(f"Filtered out: {filtered_count} samples")
print(f"Formatted samples: {len(formatted_data)}")
print(f"Success rate: {len(formatted_data)/total_samples*100:.1f}%")
# Hold out the first `val_ratio` share of the formatted samples for validation.
# NOTE: the split is positional (a leading slice); nothing is shuffled here.
val_ratio = 0.1
val_size = int(len(formatted_data) * val_ratio)
val_data = formatted_data[:val_size]
train_data = formatted_data[val_size:]
print(f"Train/Validation split:")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Total samples: {len(train_data) + len(val_data)}")

train_dataset = Dataset.from_dict({"text": train_data})
val_dataset = Dataset.from_dict({"text": val_data})


def _write_samples(file_path: str, samples: list) -> None:
    """Write ``samples`` to ``file_path``, exactly one sample per physical line.

    Each sample is JSON-encoded so that the newlines inside a chat-formatted
    sample are escaped instead of breaking the record across several lines.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")


def _decode_sample(line: str) -> str:
    """Return the sample stored on ``line``.

    Lines written by ``_write_samples`` are JSON strings and are decoded back
    to the original text. Any other line is returned unchanged, so plain
    one-sample-per-line text files still load.
    """
    if line.startswith('"'):
        try:
            decoded = json.loads(line)
        except ValueError:
            return line
        if isinstance(decoded, str):
            return decoded
    return line


# Save as text files, one (JSON-encoded) sample per line.
# The chat-formatted samples contain "\n" between turns. Writing them raw and
# reading the file back line by line turned every sample into several separate
# rows (the later tokenization run reports 935076 rows for 155846 training
# samples). Files produced by the earlier raw format must be regenerated.
_write_samples("train_data.txt", train_data)
_write_samples("val_data.txt", val_data)

print("Datasets saved as 'train_data.txt' and 'val_data.txt'.")

# Save datasets to disk
train_dataset.save_to_disk("train_dataset1")
val_dataset.save_to_disk("val_dataset1")


def load_dataset(train_file: str, val_file: str) -> tuple:
    """Load the training and validation text files as ``Dataset`` objects.

    NOTE(review): this definition shadows ``datasets.load_dataset`` imported at
    the top of the notebook. After this cell runs, re-running the cell that
    downloads the Hub dataset will call this function instead. The name is kept
    so existing calls keep working.

    Args:
        train_file: Path of the training text file.
        val_file: Path of the validation text file.

    Returns:
        ``(train_dataset, val_dataset)``, each with a single "text" column
        holding one sample per non-blank line of the corresponding file.
    """
    def load_text_file(file_path: str) -> list:
        samples = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    samples.append(_decode_sample(line))
        return samples

    train_texts = load_text_file(train_file)
    train_dataset = Dataset.from_dict({"text": train_texts})

    val_texts = load_text_file(val_file)
    val_dataset = Dataset.from_dict({"text": val_texts})

    return train_dataset, val_dataset
# Directory holding train_data.txt / val_data.txt. Set AGRIQA_DATA_DIR to
# point somewhere else; the default is the location this notebook used so far.
data_dir = os.environ.get("AGRIQA_DATA_DIR", r"N:\agriQA\data")
train_path = os.path.join(data_dir, "train_data.txt")
val_path = os.path.join(data_dir, "val_data.txt")

# `load_dataset` here is the two-argument text-file loader defined in this
# notebook, not the Hub loader imported from `datasets`.
train_dataset, val_dataset = load_dataset(train_path, val_path)
# Toggle 4-bit (NF4) quantized loading; False loads the float16 weights as-is.
use_4bit = False

if use_4bit:
    # BitsAndBytesConfig is provided by transformers. The bitsandbytes package
    # itself does not expose it, so `bnb.BitsAndBytesConfig` raises
    # AttributeError as soon as this branch is enabled.
    from transformers import BitsAndBytesConfig

    print("Loading with 4-bit quantization")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        ),
        device_map="auto",
        trust_remote_code=True
    )

    # prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)
else:
    # Load model without device_map to avoid meta tensor issues
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        trust_remote_code=True
    )
    # Move model to device manually
    model = model.to(device)

print("Model loaded successfully!")
print(f"Model device: {next(model.parameters()).device}")
# LoRA adapter configuration applied to the attention and MLP projections.
lora_config = LoraConfig(
    r=16,              # LoRA rank
    lora_alpha=32,     # alpha parameter
    lora_dropout=0.1,  # LoRA dropout
    target_modules=[   # Target modules for LoRA
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    bias="none",
    task_type=TaskType.CAUSAL_LM  # Task type for PEFT
)

# apply to the model
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


def create_tokenize_function(tokenizer, max_length: int = 2048):
    """Build a tokenization callable bound to ``tokenizer``.

    Args:
        tokenizer: Callable tokenizer; it is invoked on ``examples["text"]``.
        max_length: Truncation limit handed to the tokenizer (default 2048,
            the value that was previously hard-coded).

    Returns:
        A function taking ``examples`` (a mapping with a "text" entry) and
        returning the tokenizer output with an added "labels" entry equal to
        "input_ids". Presumably intended for ``Dataset.map`` — confirm at the
        call site.
    """
    def tokenize_function(examples):
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding=False,                # no padding at tokenization time
            max_length=max_length,
            return_tensors=None,          # Return lists, not tensors
            return_attention_mask=False   # Do NOT return an attention mask
        )

        # Ensure labels are the same as input_ids.
        # .copy() is shallow: for a batch, the outer list is new but the
        # per-example token lists are shared with input_ids.
        tokenized['labels'] = tokenized['input_ids'].copy()

        return tokenized

    return tokenize_function


# Create the tokenize function with access to the tokenizer
tokenize_function = create_tokenize_function(tokenizer)
"model_id": "e1277ea5aa9d42598f9c647f265c62b9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Tokenizing training dataset (num_proc=4): 0%| | 0/935076 [00:00