tdickson17
/

Text_Summarization

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2025-08-10T15:22:21.391963Z",
+     "start_time": "2025-08-10T15:22:21.389220Z"
+    }
+   },
+   "source": [
+    "# import pandas as pd\n",
+    "# import torch\n",
+    "# from transformers import T5Tokenizer\n",
+    "# import pandas as pd\n",
+    "# from torch.utils.data import DataLoader, TensorDataset\n",
+    "# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "# \n",
+    "# import numpy as np\n",
+    "# from transformers import T5Tokenizer\n"
+   ],
+   "outputs": [],
+   "execution_count": 12
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "",
+   "id": "18d7838a0a2b47f0"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2025-08-10T15:22:21.416790Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "# df = pd.read_parquet(\"press_releases_all_with_CAP_issues.parquet\")",
+   "id": "3318aa3e574f90cf",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "# df = df[['title', 'text']]",
+   "id": "f3816d3ecce5a8e0",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "# df = df.head(10000)",
+   "id": "2cc68e87814bc931",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "# df['title'].fillna('', inplace=True)",
+   "id": "8f3c1efe99f9dcdf",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "# df['title'] = df['title'].replace('', 'No Title')  ",
+   "id": "3d4322138b08d0f5",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "# print(df.isna().sum())",
+   "id": "393b3b45b339c991",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "# df.to_parquet('press_releases_consolidated.parquet', engine='pyarrow')",
+   "id": "4561d51aa9a63bba",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-08-10T15:39:06.429249Z",
+     "start_time": "2025-08-10T15:39:06.123602Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import pandas as pd\n",
+    "# Consolidated title/text frame (the parquet written by the commented-out preprocessing cells).\n",
+    "PRESS_RELEASES_PATH = 'press_releases_consolidated.parquet'\n",
+    "df = pd.read_parquet(PRESS_RELEASES_PATH)"
+   ],
+   "id": "3f9ca20cb8190e2a",
+   "outputs": [],
+   "execution_count": 1
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-08-10T15:39:14.393933Z",
+     "start_time": "2025-08-10T15:39:12.502613Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader, random_split\n",
+    "import torch\n",
+    "from transformers import T5Tokenizer\n",
+    "\n",
+    "# Prefer the GPU whenever torch can see one, otherwise stay on the CPU.\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "\n",
+    "# The same pretrained tokenizer encodes both the inputs and the targets.\n",
+    "tokenizer = T5Tokenizer.from_pretrained('t5-small')\n",
+    "\n",
+    "# Token-length caps for inputs (text) and targets (title); modify accordingly\n",
+    "MAX_INPUT_LENGTH = 512\n",
+    "MAX_TARGET_LENGTH = 128\n",
+    "\n",
+    "class SummarizationDataset(Dataset):\n",
+    "    \"\"\"Pairs each row's 'text' (model input) with its 'title' (target).\n",
+    "\n",
+    "    Rows are tokenized on access and padded/truncated to fixed lengths.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, dataframe, tokenizer, max_input_length=MAX_INPUT_LENGTH, max_target_length=MAX_TARGET_LENGTH):\n",
+    "        \"\"\"Store the dataframe, the tokenizer and the two length limits.\"\"\"\n",
+    "        self.data = dataframe\n",
+    "        self.tokenizer = tokenizer\n",
+    "        self.max_input_length = max_input_length\n",
+    "        self.max_target_length = max_target_length\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        \"\"\"Number of rows in the wrapped dataframe.\"\"\"\n",
+    "        return len(self.data)\n",
+    "\n",
+    "    def _encode(self, string, limit):\n",
+    "        \"\"\"Tokenize one string, padded and truncated to `limit` tokens.\"\"\"\n",
+    "        return self.tokenizer(string, padding='max_length', truncation=True, max_length=limit, return_tensors='pt')\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        \"\"\"Return a dict with 'input_ids', 'attention_mask' and 'labels' for row `idx`.\"\"\"\n",
+    "        row = self.data.iloc[idx]\n",
+    "        source_enc = self._encode(row['text'], self.max_input_length)\n",
+    "        target_enc = self._encode(row['title'], self.max_target_length)\n",
+    "\n",
+    "        # return_tensors='pt' adds a leading batch axis of size 1; squeeze it away.\n",
+    "        labels = target_enc['input_ids'].squeeze(0)\n",
+    "        # Padding positions in the target become -100 (the label value the T5 loss ignores).\n",
+    "        labels[labels == self.tokenizer.pad_token_id] = -100\n",
+    "\n",
+    "        return {\n",
+    "            'input_ids': source_enc['input_ids'].squeeze(0),\n",
+    "            'attention_mask': source_enc['attention_mask'].squeeze(0),\n",
+    "            'labels': labels\n",
+    "        }\n",
+    "\n",
+    "dataset = SummarizationDataset(df, tokenizer)\n",
+    "\n",
+    "# 80/20 train/validation split. The seeded generator keeps the partition identical\n",
+    "# across kernel restarts, so validation metrics from different runs are comparable.\n",
+    "SPLIT_SEED = 42\n",
+    "train_size = int(0.8 * len(dataset))\n",
+    "val_size = len(dataset) - train_size\n",
+    "train_dataset, val_dataset = random_split(\n",
+    "    dataset,\n",
+    "    [train_size, val_size],\n",
+    "    generator=torch.Generator().manual_seed(SPLIT_SEED)\n",
+    ")\n",
+    "\n",
+    "# Only the training loader shuffles; the validation loader keeps a fixed order.\n",
+    "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
+    "val_dataloader = DataLoader(val_dataset, batch_size=8)\n",
+    "\n"
+   ],
+   "id": "22604924094a8cd3",
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
+     ]
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-08-10T21:47:41.277658Z",
+     "start_time": "2025-08-10T15:39:15.673627Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import torch\n",
+    "from transformers import T5ForConditionalGeneration\n",
+    "from torch.optim import Adam\n",
+    "from torch.utils.data import DataLoader\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import evaluate\n",
+    "\n",
+    "# Pick the device first so the freshly loaded model can be moved straight onto it.\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "\n",
+    "model = T5ForConditionalGeneration.from_pretrained('t5-small')\n",
+    "model.to(device)\n",
+    "\n",
+    "# Plain Adam over every model parameter.\n",
+    "optimizer = Adam(model.parameters(), lr=5e-5)\n",
+    "\n",
+    "# Must run before `def evaluate()` below rebinds the name `evaluate`.\n",
+    "rouge = evaluate.load(\"rouge\")\n",
+    "\n",
+    "def train():\n",
+    "    \"\"\"Run one pass over `train_dataloader` and return the mean batch loss.\"\"\"\n",
+    "    model.train()\n",
+    "    running_loss = 0\n",
+    "    for batch in train_dataloader:\n",
+    "        # Move the three tensors the model consumes onto the training device.\n",
+    "        model_inputs = {\n",
+    "            key: batch[key].to(device)\n",
+    "            for key in ('input_ids', 'attention_mask', 'labels')\n",
+    "        }\n",
+    "\n",
+    "        # Because labels are passed, the model output carries the loss.\n",
+    "        loss = model(**model_inputs).loss\n",
+    "        running_loss += loss.item()\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "        optimizer.zero_grad()\n",
+    "\n",
+    "    return running_loss / len(train_dataloader)\n",
+    "\n",
+    "def evaluate():\n",
+    "    \"\"\"Compute the mean validation loss and ROUGE over `val_dataloader`.\n",
+    "\n",
+    "    Every sequence of every batch is decoded and scored. Earlier versions decoded\n",
+    "    only index 0 of each batch, so ROUGE covered a fraction of the validation set.\n",
+    "\n",
+    "    NOTE: this definition rebinds the module-level name `evaluate` (the imported\n",
+    "    metrics package), so `rouge` has to be loaded before this `def` executes.\n",
+    "\n",
+    "    Returns:\n",
+    "        tuple: (mean_loss, rouge_result), where rouge_result is the value returned\n",
+    "        by `rouge.compute` for the collected predictions and references.\n",
+    "    \"\"\"\n",
+    "    model.eval()\n",
+    "    total_loss = 0\n",
+    "    all_preds = []\n",
+    "    all_labels = []\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        for batch in val_dataloader:\n",
+    "            input_ids = batch['input_ids'].to(device)\n",
+    "            attention_mask = batch['attention_mask'].to(device)\n",
+    "            labels = batch['labels'].to(device)\n",
+    "\n",
+    "            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n",
+    "            total_loss += outputs.loss.item()\n",
+    "\n",
+    "            # Put the pad id back where the dataset wrote -100 so the whole batch can be\n",
+    "            # decoded in one call; skip_special_tokens then drops the padding again.\n",
+    "            reference_ids = labels.masked_fill(labels == -100, tokenizer.pad_token_id)\n",
+    "            references = tokenizer.batch_decode(\n",
+    "                reference_ids.cpu(),\n",
+    "                skip_special_tokens=True,\n",
+    "                clean_up_tokenization_spaces=True\n",
+    "            )\n",
+    "\n",
+    "            try:\n",
+    "                summary_ids = model.generate(\n",
+    "                    input_ids=input_ids,\n",
+    "                    attention_mask=attention_mask,\n",
+    "                    max_length=MAX_TARGET_LENGTH,\n",
+    "                    num_beams=8,\n",
+    "                    early_stopping=True\n",
+    "                )\n",
+    "                predictions = tokenizer.batch_decode(\n",
+    "                    summary_ids.cpu(),\n",
+    "                    skip_special_tokens=True,\n",
+    "                    clean_up_tokenization_spaces=True\n",
+    "                )\n",
+    "            except Exception as e:\n",
+    "                # Best effort: report the failure and score the batch as blank predictions\n",
+    "                # against its real references instead of aborting the evaluation.\n",
+    "                print(f\"Error during generation: {e}\")\n",
+    "                predictions = [\" \"] * len(references)\n",
+    "\n",
+    "            all_preds.extend(predictions)\n",
+    "            all_labels.extend(references)\n",
+    "\n",
+    "    # Blank strings are replaced by a single space before they reach the metric.\n",
+    "    all_preds = [p if p.strip() else \" \" for p in all_preds]\n",
+    "    all_labels = [l if l.strip() else \" \" for l in all_labels]\n",
+    "\n",
+    "    rouge_result = rouge.compute(predictions=all_preds, references=all_labels)\n",
+    "\n",
+    "    return total_loss / len(val_dataloader), rouge_result\n",
+    "\n",
+    "\n",
+    "# Number of passes over the training set, and the lowest validation loss seen so far.\n",
+    "epochs = 15\n",
+    "best_val_loss = float('inf')\n",
+    "\n",
+    "for epoch in range(epochs):\n",
+    "    print(f\"Epoch {epoch + 1}/{epochs}\")\n",
+    "\n",
+    "    train_loss = train()\n",
+    "    print(f\"Training Loss: {train_loss:.4f}\")\n",
+    "\n",
+    "    val_loss, rouge_result = evaluate()\n",
+    "    print(f\"Validation Loss: {val_loss:.4f}\")\n",
+    "    print(f\"ROUGE Scores: {rouge_result}\")\n",
+    "\n",
+    "    # Checkpoint only when the validation loss reaches a new minimum.\n",
+    "    if not val_loss < best_val_loss:\n",
+    "        continue\n",
+    "\n",
+    "    best_val_loss = val_loss\n",
+    "    # Model and tokenizer are written side by side into one per-epoch directory.\n",
+    "    checkpoint_dir = f\"best_model_epoch_{epoch + 1}\"\n",
+    "    model.save_pretrained(checkpoint_dir)\n",
+    "    tokenizer.save_pretrained(checkpoint_dir)\n"
+   ],
+   "id": "2041549aaa86af9f",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/15\n",
+      "Training Loss: 2.3327\n",
+      "Validation Loss: 1.9963\n",
+      "ROUGE Scores: {'rouge1': 0.21808722374319384, 'rouge2': 0.1182736024791169, 'rougeL': 0.19976099496233557, 'rougeLsum': 0.19920689338385827}\n",
+      "Epoch 2/15\n",
+      "Training Loss: 2.1164\n",
+      "Validation Loss: 1.9190\n",
+      "ROUGE Scores: {'rouge1': 0.24314444230564494, 'rouge2': 0.14001878402499457, 'rougeL': 0.2237854024840728, 'rougeLsum': 0.22246462572576908}\n",
+      "Epoch 3/15\n",
+      "Training Loss: 2.0179\n",
+      "Validation Loss: 1.8727\n",
+      "ROUGE Scores: {'rouge1': 0.23564530968156083, 'rouge2': 0.13669895563342216, 'rougeL': 0.21725589526977998, 'rougeLsum': 0.2151015219135301}\n",
+      "Epoch 4/15\n",
+      "Training Loss: 1.9257\n",
+      "Validation Loss: 1.8389\n",
+      "ROUGE Scores: {'rouge1': 0.23937899093803855, 'rouge2': 0.13888041555479988, 'rougeL': 0.21854222551451663, 'rougeLsum': 0.21721511685962552}\n",
+      "Epoch 5/15\n",
+      "Training Loss: 1.8781\n",
+      "Validation Loss: 1.8102\n",
+      "ROUGE Scores: {'rouge1': 0.2412030325505815, 'rouge2': 0.1373245465699872, 'rougeL': 0.22158876960762192, 'rougeLsum': 0.21964406824128718}\n",
+      "Epoch 6/15\n",
+      "Training Loss: 1.8266\n",
+      "Validation Loss: 1.8030\n",
+      "ROUGE Scores: {'rouge1': 0.24693945766624123, 'rouge2': 0.13859814431515555, 'rougeL': 0.22609207133571282, 'rougeLsum': 0.22456133662136685}\n",
+      "Epoch 7/15\n",
+      "Training Loss: 1.7831\n",
+      "Validation Loss: 1.7842\n",
+      "ROUGE Scores: {'rouge1': 0.24995693123364204, 'rouge2': 0.13730760003890233, 'rougeL': 0.22966043449504253, 'rougeLsum': 0.22839320529835103}\n",
+      "Epoch 8/15\n",
+      "Training Loss: 1.7398\n",
+      "Validation Loss: 1.7843\n",
+      "ROUGE Scores: {'rouge1': 0.24797510003323764, 'rouge2': 0.13919083038634567, 'rougeL': 0.22646443435896133, 'rougeLsum': 0.22558282591894607}\n",
+      "Epoch 9/15\n",
+      "Training Loss: 1.7068\n",
+      "Validation Loss: 1.7860\n",
+      "ROUGE Scores: {'rouge1': 0.25390876204792084, 'rouge2': 0.13814393342112263, 'rougeL': 0.231234438215985, 'rougeLsum': 0.2311260176829176}\n",
+      "Epoch 10/15\n",
+      "Training Loss: 1.6779\n",
+      "Validation Loss: 1.7854\n",
+      "ROUGE Scores: {'rouge1': 0.25411363403331366, 'rouge2': 0.14468888317851958, 'rougeL': 0.2354872641812709, 'rougeLsum': 0.23342210178892542}\n",
+      "Epoch 11/15\n",
+      "Training Loss: 1.6413\n",
+      "Validation Loss: 1.7642\n",
+      "ROUGE Scores: {'rouge1': 0.2679774072064855, 'rouge2': 0.14667787569965263, 'rougeL': 0.24705660369839066, 'rougeLsum': 0.2454144686019869}\n",
+      "Epoch 12/15\n",
+      "Training Loss: 1.6075\n",
+      "Validation Loss: 1.7712\n",
+      "ROUGE Scores: {'rouge1': 0.268361111086107, 'rouge2': 0.15128550708369404, 'rougeL': 0.24768429614360232, 'rougeLsum': 0.24575241584538624}\n",
+      "Epoch 13/15\n",
+      "Training Loss: 1.5857\n",
+      "Validation Loss: 1.7618\n",
+      "ROUGE Scores: {'rouge1': 0.28096384664011065, 'rouge2': 0.1595810134136424, 'rougeL': 0.2575870112336856, 'rougeLsum': 0.25663783533294626}\n",
+      "Epoch 14/15\n",
+      "Training Loss: 1.5552\n",
+      "Validation Loss: 1.7620\n",
+      "ROUGE Scores: {'rouge1': 0.2833173462582747, 'rouge2': 0.1648174970170761, 'rougeL': 0.2615026211543109, 'rougeLsum': 0.2600381314435784}\n",
+      "Epoch 15/15\n",
+      "Training Loss: 1.5316\n",
+      "Validation Loss: 1.7716\n",
+      "ROUGE Scores: {'rouge1': 0.2782139285308772, 'rouge2': 0.1606118164438922, 'rougeL': 0.2581515139790868, 'rougeLsum': 0.2571149575053421}\n"
+     ]
+    }
+   ],
+   "execution_count": 4
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "",
+   "id": "c8d5f56240932910",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "",
+   "id": "3cecb16d8154a783",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-08-11T23:22:29.491880Z",
+     "start_time": "2025-08-11T23:22:28.364057Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import torch\n",
+    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
+    "\n",
+    "# Hub repository that provides both the tokenizer and the seq2seq weights.\n",
+    "model_id = \"tdickson17/Text_Summarization\"\n",
+    "\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "\n",
+    "tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)\n",
+    "\n",
+    "# Load the weights, then move the model onto the selected device.\n",
+    "model = AutoModelForSeq2SeqLM.from_pretrained(model_id)\n",
+    "model = model.to(device)\n",
+    "\n",
+    "def generate_summary(\n",
+    "    text,\n",
+    "    model=model,\n",
+    "    tokenizer=tok,\n",
+    "    device=device,\n",
+    "    max_new_tokens=128,\n",
+    "    min_new_tokens=20,\n",
+    "    num_beams=4\n",
+    "):\n",
+    "    \"\"\"Summarize `text` with beam search and return the decoded string.\n",
+    "\n",
+    "    Args:\n",
+    "        text: Source string. 'summarize: ' is prepended unless the text already\n",
+    "            starts with 'summarize:' (checked case-insensitively).\n",
+    "        model: Model whose `generate` method produces the output ids.\n",
+    "        tokenizer: Tokenizer used for both encoding and decoding.\n",
+    "        device: Device the encoded inputs are moved to.\n",
+    "        max_new_tokens: Upper bound on the number of generated tokens.\n",
+    "        min_new_tokens: Lower bound on the number of generated tokens.\n",
+    "        num_beams: Beam width passed to `generate`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The first generated sequence, decoded with special tokens removed.\n",
+    "    \"\"\"\n",
+    "    # T5 often uses a task prefix; keep it if your model expects one\n",
+    "    already_prefixed = text.lower().startswith(\"summarize:\")\n",
+    "    prompt = text if already_prefixed else \"summarize: \" + text\n",
+    "\n",
+    "    encoded = tokenizer(prompt, return_tensors=\"pt\", truncation=True).to(device)\n",
+    "\n",
+    "    # Decoding settings are gathered in one place and forwarded to generate().\n",
+    "    generation_options = dict(\n",
+    "        max_new_tokens=max_new_tokens,\n",
+    "        min_new_tokens=min_new_tokens,\n",
+    "        num_beams=num_beams,\n",
+    "        no_repeat_ngram_size=3,\n",
+    "        early_stopping=True\n",
+    "    )\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        generated = model.generate(**encoded, **generation_options)\n",
+    "\n",
+    "    return tokenizer.decode(generated[0], skip_special_tokens=True)\n",
+    "\n",
+    "# Sample passage used to smoke-test the summarizer end to end.\n",
+    "input_text = \"At Susquehanna, we approach quantitative finance with a deep commitment to scientific rigor and innovation. Our research leverages vast and diverse datasets, applying cutting-edge machine learning to uncover actionable insights and driving data-informed decisions from predictive modeling to strategic execution. Today, Susquehanna has over 3,000 employees in 17+ global locations. While we have grown in size and expanded our reach, our collaborative culture and love for gaming remains.\"\n",
+    "\n",
+    "summary = generate_summary(input_text)\n",
+    "print(\"Summary:\", summary)\n"
+   ],
+   "id": "add7d5e5d17e708b",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Summary: quantitative finance is driven by scientific rigor and innovation. Susquehanna has over 3,000 employees.\n"
+     ]
+    }
+   ],
+   "execution_count": 15
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "",
+   "id": "976fd3465f63b737"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}