{ "cells": [
{ "cell_type": "code", "execution_count": 1, "id": "e028a78e", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:33:45.203715Z", "iopub.status.busy": "2024-10-26T12:33:45.203302Z", "iopub.status.idle": "2024-10-26T12:34:04.776829Z", "shell.execute_reply": "2024-10-26T12:34:04.775630Z" }, "id": "y73ZVTnK0YCQ", "papermill": { "duration": 19.582191, "end_time": "2024-10-26T12:34:04.779343", "exception": false, "start_time": "2024-10-26T12:33:45.197152", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
 "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
 "%pip install rouge sumy rouge_score datasets -qq"
] },
{ "cell_type": "code", "execution_count": 2, "id": "efb01005", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:04.791156Z", "iopub.status.busy": "2024-10-26T12:34:04.790818Z", "iopub.status.idle": "2024-10-26T12:34:08.386003Z", "shell.execute_reply": "2024-10-26T12:34:08.385203Z" }, "id": "NBHSsTcyyoef", "papermill": { "duration": 3.603978, "end_time": "2024-10-26T12:34:08.388454", "exception": false, "start_time": "2024-10-26T12:34:04.784476", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
 "import os\n",
 "import pandas as pd\n",
 "import sys\n",
 "import nltk\n",
 "from rouge import Rouge\n",
 "from datasets import load_dataset\n",
 "from sumy.nlp.stemmers import Stemmer\n",
 "from sumy.nlp.tokenizers import Tokenizer\n",
 "from sumy.parsers.plaintext import PlaintextParser\n",
 "from sumy.summarizers.kl import KLSummarizer\n",
 "from sumy.summarizers.lsa import LsaSummarizer\n",
 "from sumy.summarizers.lex_rank import LexRankSummarizer\n",
 "from sumy.summarizers.sum_basic import SumBasicSummarizer\n",
 "from sumy.summarizers.text_rank import TextRankSummarizer"
] },
{ "cell_type": "code", "execution_count": 3, "id": "b0b7c513", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:08.399571Z", "iopub.status.busy": "2024-10-26T12:34:08.399079Z", "iopub.status.idle": "2024-10-26T12:34:11.926694Z", "shell.execute_reply":
"2024-10-26T12:34:11.925741Z" }, "id": "myJUj_R7-uXL", "papermill": { "duration": 3.535373, "end_time": "2024-10-26T12:34:11.928820", "exception": false, "start_time": "2024-10-26T12:34:08.393447", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "99e5ef8ff5bc4ee8a84127dd82efe116", "version_major": 2, "version_minor": 0 }, "text/plain": [ "README.md: 0%| | 0.00/368 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "924cf9a0d563431d81521baf2078f8d9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "train.csv: 0%| | 0.00/36.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ce22f82394154ee383873818bf330908", "version_major": 2, "version_minor": 0 }, "text/plain": [ "test.csv: 0%| | 0.00/17.7M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e890c18d50c44f978c10fcbfaa9d7f11", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0%| | 0/2058 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3f005554d55e47e2ba443dc891d4b0e8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating test split: 0%| | 0/1015 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "dataset = load_dataset(\"d0r1h/ILC\", split='test')"
] },
{ "cell_type": "code", "execution_count": 4, "id": "eedac49d", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:11.942831Z", "iopub.status.busy": "2024-10-26T12:34:11.942469Z", "iopub.status.idle": "2024-10-26T12:34:11.949195Z", "shell.execute_reply": "2024-10-26T12:34:11.948272Z" }, "id": "RYJyqp-Z-5hV", "outputId": "fd79fe6a-9c5a-4f11-995b-a5a19640d8d3", "papermill": { "duration": 0.016454, "end_time": "2024-10-26T12:34:11.951249", "exception": false, "start_time": "2024-10-26T12:34:11.934795", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['Title', 'Summary', 'Case'],\n", " num_rows: 1015\n", "})" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "dataset"
] },
{ "cell_type": "code", "execution_count": 5, "id": "33809db2", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:11.964246Z", "iopub.status.busy": "2024-10-26T12:34:11.963924Z", "iopub.status.idle": "2024-10-26T12:34:12.048940Z", "shell.execute_reply": "2024-10-26T12:34:12.048147Z" }, "id": "J4rmm7voyoUk", "outputId": "a986658d-d987-4475-dd7e-63c7087d7b2f", "papermill": { "duration": 0.094143, "end_time": "2024-10-26T12:34:12.051285", "exception": false, "start_time": "2024-10-26T12:34:11.957142", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
 "# Size of the evaluation subset and the seed that makes it the same on every run.\n",
 "SAMPLE_SIZE = 300\n",
 "RANDOM_SEED = 42\n",
 "\n",
 "df = pd.DataFrame(dataset)\n",
 "\n",
 "# Without random_state each run would draw (and score) a different set of cases.\n",
 "df = df.sample(SAMPLE_SIZE, random_state=RANDOM_SEED)"
] },
{ "cell_type": "code", "execution_count": 6, "id": "23baf647", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:12.065263Z", "iopub.status.busy": "2024-10-26T12:34:12.064923Z", "iopub.status.idle": "2024-10-26T12:34:12.208359Z", "shell.execute_reply": "2024-10-26T12:34:12.207406Z" }, "id": "Kle1TB8pyoDb", "outputId": "3fce24d0-dcfd-4756-d352-48c366ea1155", "papermill": { "duration":
0.152826, "end_time": "2024-10-26T12:34:12.210670", "exception": false, "start_time": "2024-10-26T12:34:12.057844", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /usr/share/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [
 "nltk.download(\"punkt\")\n",
 "rouge = Rouge()"
] },
{ "cell_type": "code", "execution_count": 7, "id": "359b61e7", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:12.224138Z", "iopub.status.busy": "2024-10-26T12:34:12.223352Z", "iopub.status.idle": "2024-10-26T12:34:12.228392Z", "shell.execute_reply": "2024-10-26T12:34:12.227508Z" }, "id": "SRkBvDPnyoBg", "outputId": "43424be1-a0a2-4642-c64d-f214b1dcef3a", "papermill": { "duration": 0.014117, "end_time": "2024-10-26T12:34:12.230610", "exception": false, "start_time": "2024-10-26T12:34:12.216493", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3000\n", "10000\n" ] } ], "source": [
 "# NOTE(review): the limit is raised before any summarizer runs; presumably\n",
 "# needed for very long cases - TODO confirm which step requires it.\n",
 "print(sys.getrecursionlimit())\n",
 "sys.setrecursionlimit(10000)\n",
 "print(sys.getrecursionlimit())"
] },
{ "cell_type": "code", "execution_count": 8, "id": "d0245dcc", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:12.243768Z", "iopub.status.busy": "2024-10-26T12:34:12.243205Z", "iopub.status.idle": "2024-10-26T12:34:12.248590Z", "shell.execute_reply": "2024-10-26T12:34:12.247753Z" }, "id": "O5Jk5LR1yn_i", "papermill": { "duration": 0.014078, "end_time": "2024-10-26T12:34:12.250596", "exception": false, "start_time": "2024-10-26T12:34:12.236518", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
 "def summarize(text, sumarizer, SENTENCES_COUNT):\n",
 "    \"\"\"Build an extractive summary of `text` and return it as one string.\n",
 "\n",
 "    Args:\n",
 "        text: Plain-text document to summarize.\n",
 "        sumarizer: Callable invoked as sumarizer(document, SENTENCES_COUNT);\n",
 "            this notebook passes sumy summarizer instances.\n",
 "        SENTENCES_COUNT: Forwarded unchanged as the summarizer's second argument.\n",
 "\n",
 "    Returns:\n",
 "        The items yielded by the summarizer, each converted with str() and\n",
 "        joined with single spaces (an empty string if nothing is yielded).\n",
 "    \"\"\"\n",
 "    document = PlaintextParser(text, Tokenizer(\"en\")).document\n",
 "    return \" \".join(\n",
 "        str(sentence) for sentence in sumarizer(document, SENTENCES_COUNT)\n",
 "    )"
] },
{ "cell_type": "code", "execution_count": 9, "id": "4cf755a5", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:34:12.263278Z", "iopub.status.busy": "2024-10-26T12:34:12.263001Z", "iopub.status.idle": "2024-10-26T12:51:43.519475Z", "shell.execute_reply": "2024-10-26T12:51:43.517884Z" }, "id": "ZDm1gEE1yn7p", "papermill": { "duration": 1051.267032, "end_time": "2024-10-26T12:51:43.523381", "exception": false, "start_time": "2024-10-26T12:34:12.256349", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
 "# Sentence budget shared by every algorithm, so the summaries are comparable.\n",
 "SUMMARY_SENTENCES = 3\n",
 "\n",
 "# Output column -> summarizer class; insertion order fixes the column order.\n",
 "SUMMARIZERS = {\n",
 "    \"LexRankSummary\": LexRankSummarizer,\n",
 "    \"KLSummary\": KLSummarizer,\n",
 "    \"TextRankSummary\": TextRankSummarizer,\n",
 "    \"SumBasicSummary\": SumBasicSummarizer,\n",
 "    \"LsaSummary\": LsaSummarizer,\n",
 "}\n",
 "\n",
 "for column, summarizer_cls in SUMMARIZERS.items():\n",
 "    # cls is bound as a default argument so each lambda keeps its own class;\n",
 "    # a fresh summarizer instance is created for every document.\n",
 "    df[column] = df[\"Case\"].map(\n",
 "        lambda case, cls=summarizer_cls: summarize(case, cls(), SUMMARY_SENTENCES)\n",
 "    )"
] },
{ "cell_type": "code", "execution_count": 10, "id": "de7283ad", "metadata": { "execution": { "iopub.execute_input": "2024-10-26T12:51:43.551606Z", "iopub.status.busy": "2024-10-26T12:51:43.551001Z", "iopub.status.idle": "2024-10-26T12:51:43.590519Z", "shell.execute_reply": "2024-10-26T12:51:43.589141Z" }, "id": "T1XLZsDzyn1_", "outputId": "418f6d0a-2ebf-4ecc-9050-929720c12764", "papermill": { "duration": 0.057838, "end_time": "2024-10-26T12:51:43.594515", "exception": false, "start_time": "2024-10-26T12:51:43.536677", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
| \n", " | Summary | \n", "LexRankSummary | \n", "KLSummary | \n", "TextRankSummary | \n", "SumBasicSummary | \n", "LsaSummary | \n", "
|---|---|---|---|---|---|---|
| 294 | \n", "When the wife is being physically as well as m... | \n", "It is to be noted that the appellant married t... | \n", "7. 8. r n s5 7 https: www.mhc.tn.gov.in judis ... | \n", "The case of the prosecution is that the appell... | \n", "The case of the prosecution is that the appell... | \n", "Crl.A.No.1821IN THE HIGH COURT OF JUDICATURE A... | \n", "
| 616 | \n", "Courts are not allowed to issue orders on a sy... | \n", "They were admitted in 1st year Professional MB... | \n", "However Glocal Medical College did not conduct... | \n", "They were admitted in 1st year Professional MB... | \n", "411 of 2017. The review petitioners of 2016. I... | \n", "Nos 3103731038 of 2016 may appear in the exam... | \n", "
| 659 | \n", "The man alleged of causing injury to the infor... | \n", "Learned counsel submitted that the petitioner ... | \n", "Sikandra District ... Petitioner s The State o... | \n", "Having considered the facts and circumstances ... | \n", "Further it was submitted that only Upendra Kum... | \n", "The petitioner apprehends arrest in connection... | \n", "