{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d77bdb7b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total rows: 752\n", " Name Number\n", "0 zahidur 1781133333\n", "2 MD Jahidul Islam 1727074508\n", "4 Shafiqul Islam 1716603589\n", "6 Mujahidor Rahamanhalo 1337103616\n", "8 Jahid Hasan 1766435938\n" ] } ], "source": [ "import pandas as pd\n", "import warnings\n", "\n", "# warnings.filterwarnings(\"ignore\", category=UserWarning)\n", "\n", "file = \"df.xlsx\"\n", "\n", "df = pd.read_excel(\n", " file,\n", " sheet_name=None,\n", " dtype=str,\n", " engine=\"openpyxl\"\n", ")\n", "\n", "combined_df = pd.concat(df.values(), ignore_index=True)\n", "\n", "combined_df.columns = combined_df.columns.str.strip()\n", "df = combined_df.dropna(how=\"all\")\n", "\n", "print(\"Total rows:\", len(df))\n", "print(df.head())" ] }, { "cell_type": "code", "execution_count": 2, "id": "76e2e1f0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 752 entries, 0 to 1258\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype\n", "--- ------ -------------- -----\n", " 0 Name 751 non-null str \n", " 1 Number 752 non-null str \n", "dtypes: str(2)\n", "memory usage: 17.6 KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 3, "id": "3dc2adc0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Name True\n", "Number False\n", "dtype: bool" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().any() # check for missing values" ] }, { "cell_type": "code", "execution_count": 4, "id": "116b9212", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Empty DataFrame\n", "Columns: [Name, Number]\n", "Index: []\n" ] } ], "source": [ "# Filter rows where Number is null\n", "missing_number = df[df[\"Number\"].isna()]\n", "print(missing_number)" ] }, { "cell_type": "code", "execution_count": 5, "id": "1cd12680", "metadata": {}, "outputs": [], "source": [ "df.dropna(inplace=True) # remove rows with missing values" ] }, { "cell_type": "code", "execution_count": 6, "id": "ce33dead", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Name False\n", "Number False\n", "dtype: bool" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().any() # check for missing values in each column" ] }, { "cell_type": "code", "execution_count": 7, "id": "18faac6b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number\n", "6 8\n", "1819228188 2\n", "1816720138 2\n", "Name: count, dtype: int64\n" ] } ], "source": [ "duplicate_numbers = df[\"Number\"].value_counts()\n", "duplicate_numbers = duplicate_numbers[duplicate_numbers > 1]\n", "print(duplicate_numbers)" ] }, { "cell_type": "code", "execution_count": 8, "id": "32d94a22", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Name Number\n", "267 5 6\n", "322 5 6\n", "363 5 6\n", "442 5 6\n", "533 5 6\n", "634 5 6\n", "735 5 6\n", "828 5 6\n", "1101 Anisur Rahman Sinha 1819228188\n", "1136 Anisur Rahman Sinha 1819228188\n", "1170 A. Salam Choudhury 1816720138\n", "1198 Shaheer Choudhury 1816720138\n" ] } ], "source": [ "duplicates = df[df.duplicated(subset=[\"Number\"], keep=False)]\n", "print(duplicates.head(50))" ] }, { "cell_type": "code", "execution_count": 9, "id": "0f5af544", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Before: 751\n", "After : 742\n", "Removed: 9\n" ] } ], "source": [ "before = len(df)\n", "df = df.drop_duplicates(subset=[\"Number\"], keep=\"last\")\n", "after = len(df)\n", "\n", "print(\"Before:\", before)\n", "print(\"After :\", after)\n", "print(\"Removed:\", before - after)" ] }, { "cell_type": "code", "execution_count": 10, "id": "6c1e477b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 01781133333\n", "2 01727074508\n", "4 01716603589\n", "6 01337103616\n", "8 01766435938\n", "Name: Number, dtype: str\n" ] } ], "source": [ "# Ensure Number column is string\n", "df['Number'] = df['Number'].astype(str).str.strip()\n", "\n", "# Add leading 0 if not already present\n", "df['Number'] = df['Number'].apply(lambda x: '0' + x if not x.startswith('0') else x)\n", "\n", "print(df['Number'].head())" ] }, { "cell_type": "code", "execution_count": 11, "id": "a61f5286", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'Name': 'zahidur', 'Number': '01781133333'}, {'Name': 'MD Jahidul Islam', 'Number': '01727074508'}, {'Name': 'Shafiqul Islam', 'Number': '01716603589'}, {'Name': 'Mujahidor Rahamanhalo', 'Number': '01337103616'}, {'Name': 'Jahid Hasan', 'Number': '01766435938'}]\n" ] } ], "source": [ "# Remove leading and trailing spaces from Name\n", "df[\"Name\"] = df[\"Name\"].str.strip()\n", "\n", "# Optional: remove extra spaces inside name too (multiple spaces → single space)\n", "df[\"Name\"] = df[\"Name\"].str.replace(r'\\s+', ' ', regex=True)\n", "\n", "# Convert to list of dict\n", "result = df.rename(columns={\n", " \"Name\": \"Name\",\n", " \"Number\": \"Number\"\n", "}).to_dict(orient=\"records\")\n", "\n", "print(result[:5])" ] }, { "cell_type": "code", "execution_count": 12, "id": "aa5aecb4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'Name': 'zahidur', 'Number': '01781133333'}, {'Name': 'MD Jahidul Islam', 'Number': '01727074508'}, {'Name': 'Shafiqul Islam', 'Number': '01716603589'}, {'Name': 'Mujahidor Rahamanhalo', 'Number': '01337103616'}, {'Name': 'Jahid Hasan', 'Number': '01766435938'}]\n", "Total valid numbers: 731\n" ] } ], "source": [ "import re\n", "pattern = r\"01[3-9]\\d{8}\"\n", "res_dec = []\n", "for i in result:\n", " txt = i[\"Number\"]\n", " matches = re.findall(pattern, txt, re.MULTILINE)\n", " if len(matches) > 1:\n", " for j in matches:\n", " res_dec.append({\"Name\": i[\"Name\"], \"Number\": j})\n", " elif len(matches) == 1:\n", " res_dec.append({\"Name\": i[\"Name\"], \"Number\": matches[0]})\n", "print(res_dec[:5])\n", "print(\"Total valid numbers:\", len(res_dec))" ] }, { "cell_type": "code", "execution_count": 13, "id": "4755835a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Duplicate numbers found:\n" ] } ], "source": [ "# 🔹 Find duplicate numbers\n", "seen = set()\n", "duplicates = []\n", "\n", "for contact in res_dec:\n", " num = contact['Number']\n", " if num in seen:\n", " duplicates.append(contact)\n", " else:\n", " seen.add(num)\n", "\n", "print(\"Duplicate numbers found:\")\n", "for dup in duplicates:\n", " print(dup)" ] }, { "cell_type": "code", "execution_count": 14, "id": "1c62c0de", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "After removing duplicates:\n", "731\n" ] } ], "source": [ "unique_contacts = []\n", "seen = set()\n", "\n", "for contact in res_dec:\n", " num = contact['Number']\n", " if num not in seen:\n", " unique_contacts.append(contact)\n", " seen.add(num)\n", "\n", "print(\"After removing duplicates:\")\n", "print(len(unique_contacts))" ] }, { "cell_type": "code", "execution_count": 15, "id": "78c1adec", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "with open(\"contacts.json\", \"w\", encoding=\"utf-8\") as f:\n", " json.dump(unique_contacts, f, ensure_ascii=False, indent=4)" ] }, { "cell_type": "code", "execution_count": 2, "id": "fb76d1ed", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total entries in JSON: 563\n" ] } ], "source": [ "import json\n", "with open(\"done.json\", \"r\", encoding=\"utf-8\") as f:\n", " done = json.load(f)\n", "print(\"Total entries in JSON:\", len(done))" ] }, { "cell_type": "code", "execution_count": 3, "id": "5c1c2d31", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total successful entries: 362\n" ] } ], "source": [ "counter = 0\n", "for i in done:\n", " if i['Status'] == \"success\":\n", " counter = counter + 1\n", "print(\"Total successful entries:\", counter)" ] }, { "cell_type": "code", "execution_count": null, "id": "fefde580", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Excel file created successfully!\n" ] } ], "source": [ "import json\n", "import pandas as pd\n", "\n", "\n", "# Convert to DataFrame\n", "df = pd.DataFrame(done)\n", "\n", "# Save to Excel\n", "df.to_excel(\"done.xlsx\", index=False)\n", "\n", "print(\"Excel file created successfully!\")" ] }, { "cell_type": "code", "execution_count": null, "id": "53a2f96f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "wp", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.13" } }, "nbformat": 4, "nbformat_minor": 5 }