{ "cells": [ { "cell_type": "markdown", "id": "8d1fae73", "metadata": {}, "source": [ "This notebook aims to map the manually extracted bools with the chunked data so we can have a more varied negative class." ] }, { "cell_type": "code", "execution_count": 15, "id": "9ced7f63", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from fuzzywuzzy import process\n", "from fuzzywuzzy import fuzz\n", "\n", "from tqdm.notebook import tqdm, IProgress " ] }, { "cell_type": "code", "execution_count": 16, "id": "e06e72c9", "metadata": {}, "outputs": [], "source": [ "CHUNK_TEXT_COLUMN = 'text'\n", "QUOTE_TEXT_COLUMN = 'Quote or table'" ] }, { "cell_type": "code", "execution_count": 17, "id": "e467b9cd", "metadata": {}, "outputs": [], "source": [ "try:\n", " chunked_pdfs_df = pd.read_excel('../../etl/20250708_sentences_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n", " extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n", "except Exception as e:\n", " raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "62866717", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n", " 'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Australia',\n", " 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',\n", " 'Belarus', 'Belize', 'Benin', 'Bhutan',\n", " 'Bolivia_Plurinational_State_of', 'Bosnia_and_Herzegovina',\n", " 'Botswana', 'Brazil', 'Brunei_Darussalam', 'Burkina_Faso',\n", " 'Burundi', 'Cabo_Verde', 'Cambodia', 'Cameroon', 'Canada',\n", " 'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',\n", " 'Comoros', 'Congo', 'Cook_Islands', 'Costa_Rica', 'Cuba',\n", " 'Côte_dIvoire', 'Democratic_Republic_of_the_Congo', 'Djibouti',\n", " 'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',\n", " 
'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Eswatini',\n", " 'Ethiopia', 'European_Union_EU', 'Fiji', 'Gabon', 'Gambia',\n", " 'Georgia', 'Ghana', 'Grenada', 'Guatemala', 'Guinea',\n", " 'Guinea-Bissau', 'Guyana', 'Haiti', 'Holy_See', 'Honduras',\n", " 'Iceland', 'India', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan',\n", " 'Kazakhstan', 'Kenya', 'Kuwait', 'Kyrgyzstan',\n", " 'Lao_Peoples_Democratic_Republic', 'Lebanon', 'Liberia',\n", " 'Liechtenstein', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',\n", " 'Mali', 'Marshall_Islands', 'Mauritania', 'Mauritius', 'Mexico',\n", " 'Micronesia_Federated_States_of', 'Monaco', 'Mongolia',\n", " 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',\n", " 'Nauru', 'Nepal', 'New_Zealand', 'Nicaragua', 'Niger', 'Nigeria',\n", " 'Niue', 'North_Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau',\n", " 'Panama', 'Papua_New_Guinea', 'Peru', 'Philippines', 'Qatar',\n", " 'Republic_of_Korea', 'Republic_of_Moldova', 'Russian_Federation',\n", " 'Rwanda', 'Saint_Kitts_and_Nevis', 'Saint_Lucia',\n", " 'Saint_Vincent_and_the_Grenadines', 'Samoa', 'San_Marino',\n", " 'Sao_Tome_and_Principe', 'Saudi_Arabia', 'Senegal', 'Serbia',\n", " 'Seychelles', 'Sierra_Leone', 'Singapore', 'Solomon_Islands',\n", " 'Somalia', 'South_Africa', 'South_Sudan', 'Sri_Lanka',\n", " 'State_of_Palestine', 'Sudan', 'Suriname', 'Switzerland',\n", " 'Syrian_Arab_Republic', 'Tajikistan', 'Thailand', 'Timor-Leste',\n", " 'Togo', 'Tonga', 'Trinidad_and_Tobago', 'Tunisia', 'Turkmenistan',\n", " 'Tuvalu', 'Türkiye', 'Uganda', 'Ukraine', 'United_Arab_Emirates',\n", " 'United_Kingdom_of_Great_Britain_and_Northern_Ireland',\n", " 'United_Republic_of_Tanzania', 'United_States_of_America',\n", " 'Uruguay', 'Uzbekistan', 'Vanuatu',\n", " 'Venezuela_Bolivarian_Republic_of', 'Viet_Nam', 'Zambia',\n", " 'Zimbabwe'], dtype=object)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunked_pdfs_df['country'].unique()" ] }, { 
"cell_type": "code", "execution_count": 19, "id": "281d0e3e", "metadata": {}, "outputs": [], "source": [ "extracted_quotes_df['Source'] = extracted_quotes_df['Source'].str.split(' ').str[0]" ] }, { "cell_type": "code", "execution_count": 20, "id": "a4516eaa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Afghanistan', 'Antigua', 'Bahamas', 'Barbados', 'Bhutan',\n", " 'Bosnia', 'Egypt', 'Bangladesh', 'Eswatini', 'Fiji', 'Grenada',\n", " 'Guyana', 'Indonesia', 'Kazakhstan', 'Kiribati', 'Liberia',\n", " 'Micronesia', 'Belize', 'Malaysia', 'Mongolia', 'Nepal',\n", " 'Pakistan', 'Solomon', 'Trinidad', nan], dtype=object)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extracted_quotes_df['Source'].unique()" ] }, { "cell_type": "markdown", "id": "7e118852", "metadata": {}, "source": [ "# Filter chunked pdfs to manually extracted data to reduce amount of fuzzy matches\n", "\n", "Can be further optimized by iterating through country / file name" ] }, { "cell_type": "code", "execution_count": 21, "id": "0f1beca6", "metadata": {}, "outputs": [], "source": [ "# chunked_pdfs_df = chunked_pdfs_df[chunked_pdfs_df['country'].isin(extracted_quotes_df['Source'].unique())]" ] }, { "cell_type": "code", "execution_count": 22, "id": "48d2f1d5", "metadata": {}, "outputs": [], "source": [ "if CHUNK_TEXT_COLUMN not in chunked_pdfs_df.columns:\n", " raise ValueError(f\"Error: Chunk text column '{CHUNK_TEXT_COLUMN}' not found in 'chunked_pdfs_df'.\")\n", "if QUOTE_TEXT_COLUMN not in extracted_quotes_df.columns:\n", " raise ValueError(f\"Error: Quote text column '{QUOTE_TEXT_COLUMN}' not found in 'extracted_quotes_df'.\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "bdc824af", "metadata": {}, "outputs": [], "source": [ "# all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n", "# chunked_pdfs_df['is_target_quote'] = 0\n", "# chunked_pdfs_df['matched_quote'] = None\n", "# chunked_pdfs_df['match_score'] = 
0\n", "\n", "# # Iterate through each chunk with a progress bar\n", "# # tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n", "# # For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n", "# for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n", "# chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n", "\n", "# # Find the best matching quote and its score\n", "# best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n", "\n", "# if best_match_tuple:\n", "# best_match_quote = best_match_tuple[0]\n", "# match_score = best_match_tuple[1]\n", "\n", "# if match_score >= FUZZY_MATCH_THRESHOLD:\n", "# chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n", "# chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n", "# chunked_pdfs_df.loc[index, 'match_score'] = match_score\n", "# print(\"Fuzzy matching complete.\")" ] }, { "cell_type": "markdown", "id": "d509674b", "metadata": {}, "source": [ "# Further optimized fuzzy match" ] }, { "cell_type": "code", "execution_count": 29, "id": "e3ea4dce", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "85a3712ec33542bb811b04c8ea1ffcde", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing Countries: 0%| | 0/24 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. 
# Minimum token_set_ratio score (0-100) to accept a chunk as a target quote.
FUZZY_MATCH_THRESHOLD = 70

# Initialize the label columns once so every unmatched row is well-defined.
chunked_pdfs_df['is_target_quote'] = 0
chunked_pdfs_df['matched_quote'] = None
chunked_pdfs_df['match_score'] = 0

# fix: hoisted out of the per-country loop — this normalization of the chunk
# countries is loop-invariant and was previously recomputed for every source.
normalized_chunk_countries = chunked_pdfs_df['country'].str.replace('_', ' ').str.lower()

unique_sources = extracted_quotes_df['Source'].dropna().unique()

for source_country in tqdm(unique_sources, desc="Processing Countries"):
    # Quotes attributed to this country. fix: drop NaN cells and coerce to str
    # so process.extractOne never receives a float candidate.
    country_specific_quotes_df = extracted_quotes_df[extracted_quotes_df['Source'] == source_country]
    all_quotes_for_country = country_specific_quotes_df[QUOTE_TEXT_COLUMN].dropna().astype(str).tolist()

    if not all_quotes_for_country:
        continue  # no usable quotes for this country

    # Substring match on purpose: the truncated source ("Trinidad") must hit the
    # full chunk country ("trinidad and tobago"). fix: regex=False so the name
    # is matched literally rather than compiled as a regex pattern.
    # NOTE(review): a short source such as "Niger" would also match "Nigeria";
    # none of the current sources collide — verify if the sheet grows.
    normalized_source_country = source_country.replace('_', ' ').lower()
    matching_chunk_indices = chunked_pdfs_df[
        normalized_chunk_countries.str.contains(normalized_source_country, regex=False, na=False)
    ].index

    if matching_chunk_indices.empty:
        continue  # no chunks for this country

    # Score only the chunks belonging to the current country.
    for index in matching_chunk_indices:
        chunk_text = str(chunked_pdfs_df.loc[index, CHUNK_TEXT_COLUMN])

        # Best quote for this chunk within the country-specific candidates.
        best_match_tuple = process.extractOne(
            chunk_text, all_quotes_for_country, scorer=fuzz.token_set_ratio
        )

        if best_match_tuple:
            best_match_quote, match_score = best_match_tuple[0], best_match_tuple[1]
            if match_score >= FUZZY_MATCH_THRESHOLD:
                chunked_pdfs_df.loc[index, 'is_target_quote'] = 1
                chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote
                chunked_pdfs_df.loc[index, 'match_score'] = match_score

print("Fuzzy matching complete.")
"cell_type": "code", "execution_count": 30, "id": "9c0e0d8d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | country | \n", "filename | \n", "filepath | \n", "indicated_page | \n", "chunk_num | \n", "text | \n", "is_target_quote | \n", "matched_quote | \n", "match_score | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "Afghanistan | \n", "Afghanistan_First_NDC.pdf | \n", "../data/raw/pdfs\\Afghanistan\\Afghanistan_First... | \n", "1 | \n", "1 | \n", "1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat... | \n", "1 | \n", "Target Years: \\n2020 to 2030 \\nContribution Ty... | \n", "94 | \n", "
| 1 | \n", "Afghanistan | \n", "Afghanistan_First_NDC.pdf | \n", "../data/raw/pdfs\\Afghanistan\\Afghanistan_First... | \n", "1 | \n", "2 | \n", "Financial Needs: Total: USD 17.405 billion A... | \n", "1 | \n", "Target Years: \\n2020 to 2030 \\nContribution Ty... | \n", "71 | \n", "
| 2 | \n", "Afghanistan | \n", "Afghanistan_First_NDC.pdf | \n", "../data/raw/pdfs\\Afghanistan\\Afghanistan_First... | \n", "2 | \n", "3 | \n", "2 1. Afghanistan’s National Circumstances and ... | \n", "0 | \n", "None | \n", "0 | \n", "
| 3 | \n", "Afghanistan | \n", "Afghanistan_First_NDC.pdf | \n", "../data/raw/pdfs\\Afghanistan\\Afghanistan_First... | \n", "2 | \n", "4 | \n", "Afghanistan remains one of the poorest countri... | \n", "0 | \n", "None | \n", "0 | \n", "
| 4 | \n", "Afghanistan | \n", "Afghanistan_First_NDC.pdf | \n", "../data/raw/pdfs\\Afghanistan\\Afghanistan_First... | \n", "2 | \n", "5 | \n", "Despite these challenges, Afghanistan can rema... | \n", "0 | \n", "None | \n", "0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 53027 | \n", "Zimbabwe | \n", "Zimbabwe_NDC30_Country_Statement.pdf | \n", "../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... | \n", "40 | \n", "494 | \n", "5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th... | \n", "0 | \n", "None | \n", "0 | \n", "
| 53028 | \n", "Zimbabwe | \n", "Zimbabwe_NDC30_Country_Statement.pdf | \n", "../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... | \n", "40 | \n", "495 | \n", "Adaptation and applicable mitigation actions w... | \n", "0 | \n", "None | \n", "0 | \n", "
| 53029 | \n", "Zimbabwe | \n", "Zimbabwe_NDC30_Country_Statement.pdf | \n", "../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... | \n", "40 | \n", "496 | \n", "The enhanced integration of climate change int... | \n", "0 | \n", "None | \n", "0 | \n", "
| 53030 | \n", "Zimbabwe | \n", "Zimbabwe_NDC30_Country_Statement.pdf | \n", "../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... | \n", "41 | \n", "497 | \n", "35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT. | \n", "0 | \n", "None | \n", "0 | \n", "
| 53031 | \n", "Zimbabwe | \n", "Zimbabwe_NDC30_Country_Statement.pdf | \n", "../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... | \n", "42 | \n", "498 | \n", "36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr... | \n", "0 | \n", "None | \n", "0 | \n", "
53032 rows × 9 columns
\n", "