{ "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import numpy as np \n", "import nltk\n", "import os\n", "import sklearn\n", "import parquet" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "data1 = pd.read_parquet(\"news-00000-of-00007-0ff1ec222cd690f2.parquet\")\n", "data2 = pd.read_parquet(\"news-00001-of-00007-7c273f5de9017dc5.parquet\")\n", "data3 = pd.read_parquet(\"telegram_blogs-00000-of-00001-80087cf60adbe6d4.parquet\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Extract the 'text' column from each DataFrame\n", "texts1 = data1['text']\n", "texts2 = data2['text']\n", "texts3 = data3['text']\n", "\n", "# Concatenate the 'text' columns from all three DataFrames\n", "all_texts = pd.concat([texts1, texts2, texts3], ignore_index=True)\n", "data = pd.DataFrame(all_texts, columns=['text'])\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 643523.000000\n", "mean 1225.167071\n", "std 2613.174490\n", "min 0.000000\n", "25% 271.000000\n", "50% 689.000000\n", "75% 1362.000000\n", "max 299171.000000\n", "Name: text, dtype: float64\n" ] } ], "source": [ "# Calculate the length of each text entry\n", "text_lengths = data['text'].str.len()\n", "\n", "# Display the distribution of text lengths\n", "length_distribution = text_lengths.describe()\n", "\n", "# Print the distribution\n", "print(length_distribution)\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "import random\n", "import string\n", "\n", "def random_letter():\n", " \"\"\"Returns a random alphanumeric character.\"\"\"\n", " return random.choice(string.ascii_letters + string.digits)\n", "\n", "def replace_random_letters(word, pct=0.15):\n", " \"\"\"Replaces random letters in a word with a given probability.\"\"\"\n", " if random.random() < pct:\n", " char_pos = random.choice(range(len(word)))\n", " return word[:char_pos] + random_letter() + word[char_pos + 1:]\n", " else:\n", " return word\n", "\n", "def misspell_text(text, pct=0.15, last_letter_error_pct=0.20):\n", " \"\"\"Generates a misspelled version of the input text.\"\"\"\n", " words = text.split()\n", " misspelled_words = [replace_random_letters(word, pct) for word in words]\n", " \n", " # Apply last letter error with a different probability\n", " for i, word in enumerate(misspelled_words):\n", " if random.random() < last_letter_error_pct:\n", " if len(word) > 1: # Ensure word has more than 1 character\n", " misspelled_words[i] = word[:-1] + random_letter()\n", " \n", " return ' '.join(misspelled_words)\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "data.rename(columns={'text':'ground_truth'}, inplace=True)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'Index' object has no attribute '_format_flat'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/IPython/core/formatters.py:343\u001b[0m, in \u001b[0;36mBaseFormatter.__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 341\u001b[0m method \u001b[38;5;241m=\u001b[39m get_real_method(obj, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_method)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m method \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 344\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py:1053\u001b[0m, in \u001b[0;36m_repr_html_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1036\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 1037\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mshape\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mtuple\u001b[39m[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mint\u001b[39m]:\n\u001b[1;32m 1038\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1039\u001b[0m \u001b[38;5;124;03m Return a tuple representing the dimensionality of the DataFrame.\u001b[39;00m\n\u001b[1;32m 1040\u001b[0m \n\u001b[1;32m 1041\u001b[0m \u001b[38;5;124;03m See Also\u001b[39;00m\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;124;03m --------\u001b[39;00m\n\u001b[1;32m 1043\u001b[0m \u001b[38;5;124;03m ndarray.shape : Tuple of array dimensions.\u001b[39;00m\n\u001b[1;32m 1044\u001b[0m \n\u001b[1;32m 1045\u001b[0m \u001b[38;5;124;03m Examples\u001b[39;00m\n\u001b[1;32m 1046\u001b[0m \u001b[38;5;124;03m --------\u001b[39;00m\n\u001b[1;32m 1047\u001b[0m \u001b[38;5;124;03m >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})\u001b[39;00m\n\u001b[1;32m 1048\u001b[0m \u001b[38;5;124;03m >>> df.shape\u001b[39;00m\n\u001b[1;32m 1049\u001b[0m \u001b[38;5;124;03m (2, 2)\u001b[39;00m\n\u001b[1;32m 1050\u001b[0m \n\u001b[1;32m 1051\u001b[0m \u001b[38;5;124;03m >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],\u001b[39;00m\n\u001b[1;32m 1052\u001b[0m \u001b[38;5;124;03m ... 'col3': [5, 6]})\u001b[39;00m\n\u001b[0;32m-> 1053\u001b[0m \u001b[38;5;124;03m >>> df.shape\u001b[39;00m\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;124;03m (2, 3)\u001b[39;00m\n\u001b[1;32m 1055\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1056\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex), \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns)\n", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/format.py:1102\u001b[0m, in \u001b[0;36mto_html\u001b[0;34m(self, buf, encoding, classes, notebook, border, table_id, render_links)\u001b[0m\n\u001b[1;32m 1079\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mformat_array\u001b[39m(\n\u001b[1;32m 1080\u001b[0m values: ArrayLike,\n\u001b[1;32m 1081\u001b[0m formatter: Callable \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1090\u001b[0m fallback_formatter: Callable \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1091\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[1;32m 1092\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1093\u001b[0m \u001b[38;5;124;03m Format an array for printing.\u001b[39;00m\n\u001b[1;32m 1094\u001b[0m \n\u001b[1;32m 1095\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;124;03m ----------\u001b[39;00m\n\u001b[1;32m 1097\u001b[0m \u001b[38;5;124;03m values : np.ndarray or ExtensionArray\u001b[39;00m\n\u001b[1;32m 1098\u001b[0m \u001b[38;5;124;03m formatter\u001b[39;00m\n\u001b[1;32m 1099\u001b[0m \u001b[38;5;124;03m float_format\u001b[39;00m\n\u001b[1;32m 1100\u001b[0m \u001b[38;5;124;03m na_rep\u001b[39;00m\n\u001b[1;32m 1101\u001b[0m \u001b[38;5;124;03m digits\u001b[39;00m\n\u001b[0;32m-> 1102\u001b[0m \u001b[38;5;124;03m space\u001b[39;00m\n\u001b[1;32m 1103\u001b[0m \u001b[38;5;124;03m justify\u001b[39;00m\n\u001b[1;32m 1104\u001b[0m \u001b[38;5;124;03m decimal\u001b[39;00m\n\u001b[1;32m 1105\u001b[0m \u001b[38;5;124;03m leading_space : bool, optional, default True\u001b[39;00m\n\u001b[1;32m 1106\u001b[0m \u001b[38;5;124;03m Whether the array should be formatted with a leading space.\u001b[39;00m\n\u001b[1;32m 1107\u001b[0m \u001b[38;5;124;03m When an array as a column of a Series or DataFrame, we do want\u001b[39;00m\n\u001b[1;32m 1108\u001b[0m \u001b[38;5;124;03m the leading space to pad between columns.\u001b[39;00m\n\u001b[1;32m 1109\u001b[0m \n\u001b[1;32m 1110\u001b[0m \u001b[38;5;124;03m When formatting an Index subclass\u001b[39;00m\n\u001b[1;32m 1111\u001b[0m \u001b[38;5;124;03m (e.g. IntervalIndex._get_values_for_csv), we don't want the\u001b[39;00m\n\u001b[1;32m 1112\u001b[0m \u001b[38;5;124;03m leading space since it should be left-aligned.\u001b[39;00m\n\u001b[1;32m 1113\u001b[0m \u001b[38;5;124;03m fallback_formatter\u001b[39;00m\n\u001b[1;32m 1114\u001b[0m \n\u001b[1;32m 1115\u001b[0m \u001b[38;5;124;03m Returns\u001b[39;00m\n\u001b[1;32m 1116\u001b[0m \u001b[38;5;124;03m -------\u001b[39;00m\n\u001b[1;32m 1117\u001b[0m \u001b[38;5;124;03m List[str]\u001b[39;00m\n\u001b[1;32m 1118\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1119\u001b[0m fmt_klass: \u001b[38;5;28mtype\u001b[39m[_GenericArrayFormatter]\n\u001b[1;32m 1120\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mis_np_dtype(values\u001b[38;5;241m.\u001b[39mdtype, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:88\u001b[0m, in \u001b[0;36mHTMLFormatter.to_string\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mto_string\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[0;32m---> 88\u001b[0m lines \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 89\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m lines):\n\u001b[1;32m 90\u001b[0m lines \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mstr\u001b[39m(x) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m lines]\n", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:644\u001b[0m, in \u001b[0;36mNotebookFormatter.render\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 642\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m