{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n", "Unsloth: Failed to patch Gemma3ForConditionalGeneration.\n", "🦥 Unsloth Zoo will now patch everything to make training faster!\n", "INFO 05-29 15:16:51 [importing.py:53] Triton module has been replaced with a placeholder.\n", "INFO 05-29 15:16:51 [__init__.py:239] Automatically detected platform cuda.\n" ] } ], "source": [ "import os\n", "import json\n", "import csv\n", "import re\n", "import emoji\n", "import html\n", "import os\n", "import random\n", "from unsloth import FastLanguageModel # Tool for loading the model\n", "from datasets import Dataset\n", "from trl import SFTTrainer # For supervised fine-tuning\n", "from transformers import TrainingArguments # For configuring training parameters\n", "from unsloth import is_bfloat16_supported # Check if bfloat16 is supported\n", "import math\n", "import torch\n", "from tqdm import tqdm\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Evaluation metrics\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Data Preprocessing" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def process_tweets(csv_file, label):\n", " tweets_with_labels = []\n", " # Regular expressions for removing URLs and line breaks\n", " url_pattern = re.compile(r'https?://\\S+|www\\.\\S+')\n", " newline_pattern = re.compile(r'\\n')\n", " \n", " # Check if file exists\n", " if not os.path.exists(csv_file):\n", " print(f\"File not found: {csv_file}\")\n", " return tweets_with_labels # Return empty list or handle error\n", "\n", " try:\n", " with open(csv_file, mode='r', encoding='utf-8') as file:\n", " reader = csv.DictReader(file)\n", " for row in reader:\n", " tweet_text = row.get('Tweet')\n", " if tweet_text:\n", " # Decode HTML entities, e.g., & → &\n", " cleaned_text = html.unescape(tweet_text)\n", "\n", " # Remove mentions (@user)\n", " cleaned_text = re.sub(r'@\\w+', '', cleaned_text)\n", " \n", " # Remove leading hashtags\n", " cleaned_text = re.sub(r'^(#\\w+\\s*)+', '', cleaned_text)\n", "\n", " # Remove URLs and line breaks\n", " cleaned_text = url_pattern.sub(r'', cleaned_text)\n", " cleaned_text = newline_pattern.sub(r' ', cleaned_text) # Replace newlines with space\n", " \n", " # Remove emojis using emoji library\n", " cleaned_text = emoji.replace_emoji(cleaned_text, '')\n", " \n", " # Remove remaining non-ASCII characters\n", " cleaned_text = re.sub(r'[^\\x00-\\x7F]+', '', cleaned_text)\n", "\n", " cleaned_text = cleaned_text.strip() # Remove leading/trailing spaces\n", " \n", " # Add to list if not empty\n", " if cleaned_text:\n", " tweets_with_labels.append({\"text\": cleaned_text, \"label\": label})\n", " except Exception as e:\n", " print(f\"Error processing file {csv_file}: {e}\")\n", " \n", " return tweets_with_labels" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Process all CSV files in a given directory\n", "def process_directory_tweets(directory_path, label):\n", " processed_data = []\n", " print(f\"Processing directory: {directory_path}\")\n", " if not os.path.isdir(directory_path):\n", " print(f\"Directory not found: {directory_path}\")\n", " return processed_data\n", "\n", " try:\n", " csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]\n", " for file_name in csv_files:\n", " file_path = os.path.join(directory_path, file_name)\n", " print(f\"Processing file: {file_path}\")\n", " processed_data_single_file = process_tweets(file_path, label)\n", " processed_data.extend(processed_data_single_file)\n", " except Exception as e:\n", " print(f\"Error processing directory {directory_path}: {e}\")\n", " \n", " return processed_data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing directory: Datasets/Negative CSV\n", "Processing file: Datasets/Negative CSV/hatewomen.csv\n", "Processing file: Datasets/Negative CSV/hatemen.csv\n", "Processing file: Datasets/Negative CSV/menaretrash.csv\n", "Processing file: Datasets/Negative CSV/Misandry.csv\n", "Processing file: Datasets/Negative CSV/womenarebaddrivers.csv\n", "Processing file: Datasets/Negative CSV/womenarestupid.csv\n", "Processing directory: Datasets/Positive CSV\n", "Processing file: Datasets/Positive CSV/equalityofsexes.csv\n", "Processing file: Datasets/Positive CSV/breastfeeding.csv\n", "Processing file: Datasets/Positive CSV/genderpaygap.csv\n", "Processing file: Datasets/Positive CSV/lgbtq.csv\n", "Cleaned data saved to cleaned_trained_data.json\n", "Total processed tweets: 1885\n" ] } ], "source": [ "# Paths to folders containing CSV files\n", "negative_data_directory = 'Datasets/Negative CSV'\n", "positive_data_directory = 'Datasets/Positive CSV'\n", "\n", "all_processed_data = []\n", "\n", "# Process negative samples\n", "all_processed_data.extend(process_directory_tweets(negative_data_directory, 1))\n", "\n", "# Process positive samples\n", "all_processed_data.extend(process_directory_tweets(positive_data_directory, 0))\n", "\n", "# Shuffle all processed data\n", "random.shuffle(all_processed_data)\n", "\n", "# Save cleaned data to JSON file\n", "output_json_file = 'cleaned_trained_data.json'\n", "with open(output_json_file, 'w', encoding='utf-8') as f:\n", " json.dump(all_processed_data, f, ensure_ascii=False, indent=4)\n", "\n", "print(f\"Cleaned data saved to {output_json_file}\")\n", "\n", "# Print total number of processed tweets\n", "print(f\"Total processed tweets: {len(all_processed_data)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2.Train the model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.1 Evaluate function" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Model evaluation function\n", "def evaluate_model(model, tokenizer, evaluation_data, instruction):\n", " \"\"\"Evaluate model performance using accuracy, precision, recall, and F1 score (binary labels)\"\"\"\n", " model.eval()\n", " true_labels_list = []\n", " pred_labels_list = []\n", " \n", " print(\"\\nEvaluating model...\")\n", " with torch.no_grad():\n", " for item in tqdm(evaluation_data, desc=\"Evaluating\"):\n", " tweet_text = item.get(\"text\")\n", " true_label_numeric = item.get(\"label\")\n", " \n", " if tweet_text is None or true_label_numeric is None:\n", " continue\n", " \n", " prompt = f\"Instruction: {instruction}\\nInput: {tweet_text}\\nOutput:\"\n", " inputs = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=2048)\n", " inputs = {k: v.to(model.device) for k, v in inputs.items()} \n", " \n", " outputs = model.generate(\n", " **inputs,\n", " max_new_tokens=10,\n", " pad_token_id=tokenizer.pad_token_id,\n", " eos_token_id=tokenizer.eos_token_id\n", " )\n", " \n", " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", " \n", " try:\n", " output_start = response.rindex(\"Output:\")\n", " output_text = response[output_start + 7:].strip()\n", " pred_label = output_text.split()[0].strip()\n", " if pred_label not in ['1', '0']:\n", " pred_label = ''\n", "\n", " except (ValueError, IndexError):\n", " pred_label = \"\"\n", " \n", " true_labels_list.append(str(true_label_numeric))\n", " if pred_label == '1':\n", " pred_label_for_list = '1'\n", " elif pred_label == '0':\n", " pred_label_for_list = '0'\n", " else:\n", " if true_label_numeric == 1:\n", " pred_label_for_list = '0'\n", " else:\n", " pred_label_for_list = '1'\n", "\n", " pred_labels_list.append(pred_label_for_list)\n", " \n", " # Calculate evaluation metrics\n", " if len(true_labels_list) > 0:\n", " true_labels_list = [str(label) for label in true_labels_list]\n", " pred_labels_list = [str(label) for label in pred_labels_list]\n", "\n", " accuracy = accuracy_score(true_labels_list, pred_labels_list)\n", " try:\n", " precision = precision_score(true_labels_list, pred_labels_list, pos_label='1', zero_division=0)\n", " except ValueError:\n", " precision = 0.0\n", " try:\n", " recall = recall_score(true_labels_list, pred_labels_list, pos_label='1', zero_division=0)\n", " except ValueError:\n", " recall = 0.0\n", " try:\n", " f1 = f1_score(true_labels_list, pred_labels_list, pos_label='1', zero_division=0)\n", " except ValueError:\n", " f1 = 0.0\n", "\n", " print(f\"Total samples: {len(true_labels_list)}\")\n", " print(f\"Accuracy: {accuracy:.4f}\")\n", " print(f\"Precision: {precision:.4f}\")\n", " print(f\"Recall: {recall:.4f}\")\n", " print(f\"F1 Score: {f1:.4f}\")\n", " else:\n", " accuracy = 0.0\n", " print(\"\\nNo valid evaluation data found.\")\n", " \n", " return accuracy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.2 Prepare Train Data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "==((====))== Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.\n", " \\\\ /| NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.65 GB. Platform: Linux.\n", "O^O/ \\_/ \\ Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0\n", "\\ / Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]\n", " \"-____-\" Free license: http://github.com/unslothai/unsloth\n", "Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.\n" ] } ], "source": [ "# Load cleaned JSON data\n", "with open(\"cleaned_trained_data.json\", \"r\", encoding='utf-8') as f:\n", " all_data = json.load(f)\n", "\n", "# Split into training and test set (90% training, 10% test)\n", "train_size = int(len(all_data) * 0.9)\n", "train_data = all_data[:train_size]\n", "test_data = all_data[train_size:]\n", "\n", "# Load pretrained model and tokenizer\n", "max_seq_length = 2048\n", "dtype = None\n", "load_in_4bit = True \n", "\n", "model, tokenizer= FastLanguageModel.from_pretrained(\n", " model_name = \"Qwen2.5-0.5B-Instruct\",\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit\n", ")\n", "EOS_TOKEN = tokenizer.eos_token\n", "# Instruction: output 1 or 0\n", "instruction = \"Determine whether the following sentence contains gender discrimination. Reply with \\\"1\\\" for yes and \\\"0\\\" for no. Only reply 1 or 0.\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Prepare training data\n", "instruction_texts = [] \n", "for item in train_data:\n", " tweet_text = item.get(\"text\")\n", " label_numeric = item.get(\"label\")\n", " \n", " if tweet_text is not None and label_numeric is not None:\n", " formatted_text = f\"Instruction: {instruction}\\nInput: {tweet_text}\\nOutput: {label_numeric}\" + EOS_TOKEN\n", " instruction_texts.append(formatted_text)\n", "\n", "dataset = {\"text\": instruction_texts}\n", "dataset_length = len(instruction_texts)\n", "dataset = Dataset.from_dict(dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.3 Evaluate the original model" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Evaluating model before fine-tuning...\n", "\n", "Evaluating model...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating: 100%|██████████| 189/189 [01:05<00:00, 2.91it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Total samples: 189\n", "Accuracy: 0.3598\n", "Precision: 0.3820\n", "Recall: 0.3400\n", "F1 Score: 0.3598\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "0.35978835978835977" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Evaluate model before fine-tuning\n", "print(\"Evaluating model before fine-tuning...\")\n", "evaluate_model(model, tokenizer, test_data, instruction)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.4 Set training parameters and start training" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Unsloth 2025.3.19 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "997a050dc2cc45e483c9a9e294560b31", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Unsloth: Tokenizing [\"text\"] (num_proc=8): 0%| | 0/1696 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1\n", " \\\\ /| Num examples = 1,696 | Num Epochs = 4 | Total steps = 81\n", "O^O/ \\_/ \\ Batch size per device = 32 | Gradient accumulation steps = 2\n", "\\ / Data Parallel GPUs = 1 | Total batch size (32 x 2 x 1) = 64\n", " \"-____-\" Trainable parameters = 8,798,208/5,000,000,000 (0.18% trained)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Unsloth: Will smartly offload gradients to save VRAM!\n" ] }, { "data": { "text/html": [ "\n", "
| Step | \n", "Training Loss | \n", "
|---|---|
| 1 | \n", "4.146500 | \n", "
| 2 | \n", "4.192900 | \n", "
| 3 | \n", "3.768500 | \n", "
| 4 | \n", "3.255000 | \n", "
| 5 | \n", "2.900000 | \n", "
| 6 | \n", "2.549800 | \n", "
| 7 | \n", "2.242000 | \n", "
| 8 | \n", "2.203500 | \n", "
| 9 | \n", "2.146200 | \n", "
| 10 | \n", "2.018900 | \n", "
| 11 | \n", "1.989000 | \n", "
| 12 | \n", "1.857400 | \n", "
| 13 | \n", "2.118600 | \n", "
| 14 | \n", "1.811500 | \n", "
| 15 | \n", "1.952500 | \n", "
| 16 | \n", "1.812100 | \n", "
| 17 | \n", "2.013800 | \n", "
| 18 | \n", "2.018100 | \n", "
| 19 | \n", "1.921000 | \n", "
| 20 | \n", "2.001900 | \n", "
| 21 | \n", "2.025800 | \n", "
| 22 | \n", "2.135500 | \n", "
| 23 | \n", "2.134300 | \n", "
| 24 | \n", "1.941200 | \n", "
| 25 | \n", "1.799700 | \n", "
| 26 | \n", "1.916200 | \n", "
| 27 | \n", "1.850200 | \n", "
| 28 | \n", "1.803900 | \n", "
| 29 | \n", "1.737700 | \n", "
| 30 | \n", "1.795500 | \n", "
| 31 | \n", "1.783700 | \n", "
| 32 | \n", "1.767300 | \n", "
| 33 | \n", "1.645700 | \n", "
| 34 | \n", "1.844800 | \n", "
| 35 | \n", "1.869500 | \n", "
| 36 | \n", "1.774400 | \n", "
| 37 | \n", "1.738200 | \n", "
| 38 | \n", "1.716500 | \n", "
| 39 | \n", "1.789400 | \n", "
| 40 | \n", "1.832900 | \n", "
| 41 | \n", "1.721500 | \n", "
| 42 | \n", "1.842900 | \n", "
| 43 | \n", "1.798900 | \n", "
| 44 | \n", "1.740200 | \n", "
| 45 | \n", "1.504900 | \n", "
| 46 | \n", "1.657100 | \n", "
| 47 | \n", "1.746300 | \n", "
| 48 | \n", "1.871300 | \n", "
| 49 | \n", "1.618300 | \n", "
| 50 | \n", "1.792900 | \n", "
| 51 | \n", "1.688400 | \n", "
| 52 | \n", "1.628700 | \n", "
| 53 | \n", "1.735000 | \n", "
| 54 | \n", "1.637300 | \n", "
| 55 | \n", "1.551300 | \n", "
| 56 | \n", "1.483200 | \n", "
| 57 | \n", "1.799100 | \n", "
| 58 | \n", "1.477300 | \n", "
| 59 | \n", "1.582500 | \n", "
| 60 | \n", "1.582300 | \n", "
| 61 | \n", "1.439400 | \n", "
| 62 | \n", "1.627500 | \n", "
| 63 | \n", "1.364500 | \n", "
| 64 | \n", "1.684800 | \n", "
| 65 | \n", "1.543600 | \n", "
| 66 | \n", "1.651500 | \n", "
| 67 | \n", "1.550500 | \n", "
| 68 | \n", "1.679900 | \n", "
| 69 | \n", "1.590700 | \n", "
| 70 | \n", "1.656000 | \n", "
| 71 | \n", "1.589100 | \n", "
| 72 | \n", "1.766400 | \n", "
| 73 | \n", "1.619300 | \n", "
| 74 | \n", "1.521700 | \n", "
| 75 | \n", "1.523600 | \n", "
| 76 | \n", "1.524700 | \n", "
| 77 | \n", "1.558300 | \n", "
| 78 | \n", "1.625100 | \n", "
| 79 | \n", "1.596000 | \n", "
| 80 | \n", "1.478600 | \n", "
| 81 | \n", "1.586800 | \n", "
"
],
"text/plain": [
"