{ "cells": [ { "cell_type": "markdown", "id": "8d1fae73", "metadata": {}, "source": [ "This notebook aims to map the manually extracted bools with the chunked data so we can have a more varied negative class." ] }, { "cell_type": "code", "execution_count": 15, "id": "9ced7f63", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from fuzzywuzzy import process\n", "from fuzzywuzzy import fuzz\n", "\n", "from tqdm.notebook import tqdm, IProgress " ] }, { "cell_type": "code", "execution_count": 16, "id": "e06e72c9", "metadata": {}, "outputs": [], "source": [ "CHUNK_TEXT_COLUMN = 'text'\n", "QUOTE_TEXT_COLUMN = 'Quote or table'" ] }, { "cell_type": "code", "execution_count": 17, "id": "e467b9cd", "metadata": {}, "outputs": [], "source": [ "# Load the chunked PDF text and the extracted quotes; exact duplicate rows are dropped from both frames.\n", "try:\n", "    chunked_pdfs_df = pd.read_excel('../../etl/20250708_sentences_pdf_extraction_results.xlsx', sheet_name='Sheet1').drop_duplicates()\n", "    extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name='Prev_Finance').drop_duplicates()\n", "except Exception as e:\n", "    # Chain the original exception so the root cause (missing file, wrong sheet name, ...) stays in the traceback.\n", "    raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\") from e\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "62866717", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n", " 'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Australia',\n", " 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',\n", " 'Belarus', 'Belize', 'Benin', 'Bhutan',\n", " 'Bolivia_Plurinational_State_of', 'Bosnia_and_Herzegovina',\n", " 'Botswana', 'Brazil', 'Brunei_Darussalam', 'Burkina_Faso',\n", " 'Burundi', 'Cabo_Verde', 'Cambodia', 'Cameroon', 'Canada',\n", " 'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',\n", " 'Comoros', 'Congo', 'Cook_Islands', 'Costa_Rica', 'Cuba',\n", " 'Côte_dIvoire', 'Democratic_Republic_of_the_Congo', 'Djibouti',\n", " 'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',\n", " 
'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Eswatini',\n", " 'Ethiopia', 'European_Union_EU', 'Fiji', 'Gabon', 'Gambia',\n", " 'Georgia', 'Ghana', 'Grenada', 'Guatemala', 'Guinea',\n", " 'Guinea-Bissau', 'Guyana', 'Haiti', 'Holy_See', 'Honduras',\n", " 'Iceland', 'India', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan',\n", " 'Kazakhstan', 'Kenya', 'Kuwait', 'Kyrgyzstan',\n", " 'Lao_Peoples_Democratic_Republic', 'Lebanon', 'Liberia',\n", " 'Liechtenstein', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',\n", " 'Mali', 'Marshall_Islands', 'Mauritania', 'Mauritius', 'Mexico',\n", " 'Micronesia_Federated_States_of', 'Monaco', 'Mongolia',\n", " 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',\n", " 'Nauru', 'Nepal', 'New_Zealand', 'Nicaragua', 'Niger', 'Nigeria',\n", " 'Niue', 'North_Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau',\n", " 'Panama', 'Papua_New_Guinea', 'Peru', 'Philippines', 'Qatar',\n", " 'Republic_of_Korea', 'Republic_of_Moldova', 'Russian_Federation',\n", " 'Rwanda', 'Saint_Kitts_and_Nevis', 'Saint_Lucia',\n", " 'Saint_Vincent_and_the_Grenadines', 'Samoa', 'San_Marino',\n", " 'Sao_Tome_and_Principe', 'Saudi_Arabia', 'Senegal', 'Serbia',\n", " 'Seychelles', 'Sierra_Leone', 'Singapore', 'Solomon_Islands',\n", " 'Somalia', 'South_Africa', 'South_Sudan', 'Sri_Lanka',\n", " 'State_of_Palestine', 'Sudan', 'Suriname', 'Switzerland',\n", " 'Syrian_Arab_Republic', 'Tajikistan', 'Thailand', 'Timor-Leste',\n", " 'Togo', 'Tonga', 'Trinidad_and_Tobago', 'Tunisia', 'Turkmenistan',\n", " 'Tuvalu', 'Türkiye', 'Uganda', 'Ukraine', 'United_Arab_Emirates',\n", " 'United_Kingdom_of_Great_Britain_and_Northern_Ireland',\n", " 'United_Republic_of_Tanzania', 'United_States_of_America',\n", " 'Uruguay', 'Uzbekistan', 'Vanuatu',\n", " 'Venezuela_Bolivarian_Republic_of', 'Viet_Nam', 'Zambia',\n", " 'Zimbabwe'], dtype=object)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunked_pdfs_df['country'].unique()" ] }, { 
"cell_type": "code", "execution_count": 19, "id": "281d0e3e", "metadata": {}, "outputs": [], "source": [ "# Reduce each 'Source' label to its first word, then expand that word to the\n", "# country key used in chunked_pdfs_df['country'] (e.g. 'Antigua' ->\n", "# 'Antigua_and_Barbuda') so the labels are comparable with the chunk country keys.\n", "chunk_countries = {c for c in chunked_pdfs_df['country'].unique() if isinstance(c, str)}\n", "\n", "\n", "def _to_chunk_country(first_word):\n", "    # NaN / non-string labels and exact country keys pass through untouched.\n", "    if not isinstance(first_word, str) or first_word in chunk_countries:\n", "        return first_word\n", "    candidates = [c for c in chunk_countries if c.startswith(first_word + '_')]\n", "    # Rewrite only on an unambiguous match; otherwise keep the bare first word as before.\n", "    return candidates[0] if len(candidates) == 1 else first_word\n", "\n", "\n", "extracted_quotes_df['Source'] = (\n", "    extracted_quotes_df['Source'].str.split(' ').str[0].map(_to_chunk_country)\n", ")" ] }, { "cell_type": "code", "execution_count": 20, "id": "a4516eaa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Afghanistan', 'Antigua', 'Bahamas', 'Barbados', 'Bhutan',\n", " 'Bosnia', 'Egypt', 'Bangladesh', 'Eswatini', 'Fiji', 'Grenada',\n", " 'Guyana', 'Indonesia', 'Kazakhstan', 'Kiribati', 'Liberia',\n", " 'Micronesia', 'Belize', 'Malaysia', 'Mongolia', 'Nepal',\n", " 'Pakistan', 'Solomon', 'Trinidad', nan], dtype=object)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extracted_quotes_df['Source'].unique()" ] }, { "cell_type": "markdown", "id": "7e118852", "metadata": {}, "source": [ "# Filter chunked pdfs to manually extracted data to reduce amount of fuzzy matches\n", "\n", "Can be further optimized by iterating through country / file name" ] }, { "cell_type": "code", "execution_count": 21, "id": "0f1beca6", "metadata": {}, "outputs": [], "source": [ "# chunked_pdfs_df = chunked_pdfs_df[chunked_pdfs_df['country'].isin(extracted_quotes_df['Source'].unique())]" ] }, { "cell_type": "code", "execution_count": 22, "id": "48d2f1d5", "metadata": {}, "outputs": [], "source": [ "if CHUNK_TEXT_COLUMN not in chunked_pdfs_df.columns:\n", " raise ValueError(f\"Error: Chunk text column '{CHUNK_TEXT_COLUMN}' not found in 'chunked_pdfs_df'.\")\n", "if QUOTE_TEXT_COLUMN not in extracted_quotes_df.columns:\n", " raise ValueError(f\"Error: Quote text column '{QUOTE_TEXT_COLUMN}' not found in 'extracted_quotes_df'.\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "bdc824af", "metadata": {}, "outputs": [], "source": [ "# all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n", "# chunked_pdfs_df['is_target_quote'] = 0\n", "# chunked_pdfs_df['matched_quote'] = None\n", "# chunked_pdfs_df['match_score'] = 
0\n", "\n", "# # Iterate through each chunk with a progress bar\n", "# # tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n", "# # For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n", "# for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n", "# chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n", "\n", "# # Find the best matching quote and its score\n", "# best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n", "\n", "# if best_match_tuple:\n", "# best_match_quote = best_match_tuple[0]\n", "# match_score = best_match_tuple[1]\n", "\n", "# if match_score >= FUZZY_MATCH_THRESHOLD:\n", "# chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n", "# chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n", "# chunked_pdfs_df.loc[index, 'match_score'] = match_score\n", "# print(\"Fuzzy matching complete.\")" ] }, { "cell_type": "markdown", "id": "d509674b", "metadata": {}, "source": [ "# Further optimized fuzzy match" ] }, { "cell_type": "code", "execution_count": 29, "id": "e3ea4dce", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "85a3712ec33542bb811b04c8ea1ffcde", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing Countries: 0%| | 0/24 [00:00= FUZZY_MATCH_THRESHOLD:\n", " chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n", " chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n", " chunked_pdfs_df.loc[index, 'match_score'] = match_score\n", "\n", "print(\"Fuzzy matching complete.\")" ] }, { "cell_type": "code", "execution_count": 30, "id": "9c0e0d8d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countryfilenamefilepathindicated_pagechunk_numtextis_target_quotematched_quotematch_score
0AfghanistanAfghanistan_First_NDC.pdf../data/raw/pdfs\\Afghanistan\\Afghanistan_First...111 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...1Target Years: \\n2020 to 2030 \\nContribution Ty...94
1AfghanistanAfghanistan_First_NDC.pdf../data/raw/pdfs\\Afghanistan\\Afghanistan_First...12Financial Needs: Total: USD 17.405 billion  A...1Target Years: \\n2020 to 2030 \\nContribution Ty...71
2AfghanistanAfghanistan_First_NDC.pdf../data/raw/pdfs\\Afghanistan\\Afghanistan_First...232 1. Afghanistan’s National Circumstances and ...0None0
3AfghanistanAfghanistan_First_NDC.pdf../data/raw/pdfs\\Afghanistan\\Afghanistan_First...24Afghanistan remains one of the poorest countri...0None0
4AfghanistanAfghanistan_First_NDC.pdf../data/raw/pdfs\\Afghanistan\\Afghanistan_First...25Despite these challenges, Afghanistan can rema...0None0
..............................
53027ZimbabweZimbabwe_NDC30_Country_Statement.pdf../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...404945.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th...0None0
53028ZimbabweZimbabwe_NDC30_Country_Statement.pdf../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...40495Adaptation and applicable mitigation actions w...0None0
53029ZimbabweZimbabwe_NDC30_Country_Statement.pdf../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...40496The enhanced integration of climate change int...0None0
53030ZimbabweZimbabwe_NDC30_Country_Statement.pdf../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...4149735 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT.0None0
53031ZimbabweZimbabwe_NDC30_Country_Statement.pdf../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...4249836 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...0None0
\n", "

53032 rows × 9 columns

\n", "
" ], "text/plain": [ " country filename \\\n", "0 Afghanistan Afghanistan_First_NDC.pdf \n", "1 Afghanistan Afghanistan_First_NDC.pdf \n", "2 Afghanistan Afghanistan_First_NDC.pdf \n", "3 Afghanistan Afghanistan_First_NDC.pdf \n", "4 Afghanistan Afghanistan_First_NDC.pdf \n", "... ... ... \n", "53027 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n", "53028 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n", "53029 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n", "53030 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n", "53031 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n", "\n", " filepath indicated_page \\\n", "0 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n", "1 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n", "2 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n", "3 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n", "4 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n", "... ... ... \n", "53027 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n", "53028 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n", "53029 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n", "53030 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 41 \n", "53031 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 42 \n", "\n", " chunk_num text \\\n", "0 1 1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat... \n", "1 2 Financial Needs: Total: USD 17.405 billion  A... \n", "2 3 2 1. Afghanistan’s National Circumstances and ... \n", "3 4 Afghanistan remains one of the poorest countri... \n", "4 5 Despite these challenges, Afghanistan can rema... \n", "... ... ... \n", "53027 494 5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th... \n", "53028 495 Adaptation and applicable mitigation actions w... \n", "53029 496 The enhanced integration of climate change int... \n", "53030 497 35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT. \n", "53031 498 36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr... 
\n", "\n", " is_target_quote matched_quote \\\n", "0 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n", "1 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n", "2 0 None \n", "3 0 None \n", "4 0 None \n", "... ... ... \n", "53027 0 None \n", "53028 0 None \n", "53029 0 None \n", "53030 0 None \n", "53031 0 None \n", "\n", " match_score \n", "0 94 \n", "1 71 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "53027 0 \n", "53028 0 \n", "53029 0 \n", "53030 0 \n", "53031 0 \n", "\n", "[53032 rows x 9 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunked_pdfs_df" ] }, { "cell_type": "code", "execution_count": 31, "id": "45a6b88d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 53032 entries, 0 to 53031\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 country 53032 non-null object\n", " 1 filename 53032 non-null object\n", " 2 filepath 53032 non-null object\n", " 3 indicated_page 53032 non-null int64 \n", " 4 chunk_num 53032 non-null int64 \n", " 5 text 53032 non-null object\n", " 6 is_target_quote 53032 non-null int64 \n", " 7 matched_quote 180 non-null object\n", " 8 match_score 53032 non-null int64 \n", "dtypes: int64(4), object(5)\n", "memory usage: 3.6+ MB\n" ] } ], "source": [ "chunked_pdfs_df.info()" ] }, { "cell_type": "code", "execution_count": 33, "id": "9d7038e9", "metadata": {}, "outputs": [], "source": [ "chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 28, "id": "76b51ab6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/c/Users/Derik/Desktop/NDC_Scraper/Classification Model/tf_idf_lr_model\n" ] } ], "source": [ "!pwd" ] } ], "metadata": { "kernelspec": { "display_name": "NDC_extraction_ENV", "language": "python", "name": "python3" }, "language_info": { 
"codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 5 }