{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8d1fae73",
   "metadata": {},
   "source": [
    "This notebook aims to map the manually extracted bools with the chunked data so we can have a more varied negative class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9ced7f63",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from fuzzywuzzy import process\n",
    "from fuzzywuzzy import fuzz\n",
    "\n",
    "from tqdm.notebook import tqdm, IProgress "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e06e72c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "CHUNK_TEXT_COLUMN = 'text'\n",
    "QUOTE_TEXT_COLUMN = 'Quote or table'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e467b9cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    chunked_pdfs_df = pd.read_excel('../../etl/20250708_sentences_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n",
    "    extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
    "except Exception as e:\n",
    "    raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "62866717",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n",
       "       'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Australia',\n",
       "       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',\n",
       "       'Belarus', 'Belize', 'Benin', 'Bhutan',\n",
       "       'Bolivia_Plurinational_State_of', 'Bosnia_and_Herzegovina',\n",
       "       'Botswana', 'Brazil', 'Brunei_Darussalam', 'Burkina_Faso',\n",
       "       'Burundi', 'Cabo_Verde', 'Cambodia', 'Cameroon', 'Canada',\n",
       "       'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',\n",
       "       'Comoros', 'Congo', 'Cook_Islands', 'Costa_Rica', 'Cuba',\n",
       "       'Côte_dIvoire', 'Democratic_Republic_of_the_Congo', 'Djibouti',\n",
       "       'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',\n",
       "       'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Eswatini',\n",
       "       'Ethiopia', 'European_Union_EU', 'Fiji', 'Gabon', 'Gambia',\n",
       "       'Georgia', 'Ghana', 'Grenada', 'Guatemala', 'Guinea',\n",
       "       'Guinea-Bissau', 'Guyana', 'Haiti', 'Holy_See', 'Honduras',\n",
       "       'Iceland', 'India', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan',\n",
       "       'Kazakhstan', 'Kenya', 'Kuwait', 'Kyrgyzstan',\n",
       "       'Lao_Peoples_Democratic_Republic', 'Lebanon', 'Liberia',\n",
       "       'Liechtenstein', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',\n",
       "       'Mali', 'Marshall_Islands', 'Mauritania', 'Mauritius', 'Mexico',\n",
       "       'Micronesia_Federated_States_of', 'Monaco', 'Mongolia',\n",
       "       'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',\n",
       "       'Nauru', 'Nepal', 'New_Zealand', 'Nicaragua', 'Niger', 'Nigeria',\n",
       "       'Niue', 'North_Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau',\n",
       "       'Panama', 'Papua_New_Guinea', 'Peru', 'Philippines', 'Qatar',\n",
       "       'Republic_of_Korea', 'Republic_of_Moldova', 'Russian_Federation',\n",
       "       'Rwanda', 'Saint_Kitts_and_Nevis', 'Saint_Lucia',\n",
       "       'Saint_Vincent_and_the_Grenadines', 'Samoa', 'San_Marino',\n",
       "       'Sao_Tome_and_Principe', 'Saudi_Arabia', 'Senegal', 'Serbia',\n",
       "       'Seychelles', 'Sierra_Leone', 'Singapore', 'Solomon_Islands',\n",
       "       'Somalia', 'South_Africa', 'South_Sudan', 'Sri_Lanka',\n",
       "       'State_of_Palestine', 'Sudan', 'Suriname', 'Switzerland',\n",
       "       'Syrian_Arab_Republic', 'Tajikistan', 'Thailand', 'Timor-Leste',\n",
       "       'Togo', 'Tonga', 'Trinidad_and_Tobago', 'Tunisia', 'Turkmenistan',\n",
       "       'Tuvalu', 'Türkiye', 'Uganda', 'Ukraine', 'United_Arab_Emirates',\n",
       "       'United_Kingdom_of_Great_Britain_and_Northern_Ireland',\n",
       "       'United_Republic_of_Tanzania', 'United_States_of_America',\n",
       "       'Uruguay', 'Uzbekistan', 'Vanuatu',\n",
       "       'Venezuela_Bolivarian_Republic_of', 'Viet_Nam', 'Zambia',\n",
       "       'Zimbabwe'], dtype=object)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunked_pdfs_df['country'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "281d0e3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "extracted_quotes_df['Source'] = extracted_quotes_df['Source'].str.split(' ').str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a4516eaa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Afghanistan', 'Antigua', 'Bahamas', 'Barbados', 'Bhutan',\n",
       "       'Bosnia', 'Egypt', 'Bangladesh', 'Eswatini', 'Fiji', 'Grenada',\n",
       "       'Guyana', 'Indonesia', 'Kazakhstan', 'Kiribati', 'Liberia',\n",
       "       'Micronesia', 'Belize', 'Malaysia', 'Mongolia', 'Nepal',\n",
       "       'Pakistan', 'Solomon', 'Trinidad', nan], dtype=object)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extracted_quotes_df['Source'].unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e118852",
   "metadata": {},
   "source": [
    "# Filter chunked pdfs to manually extracted data to reduce amount of fuzzy matches\n",
    "\n",
    "Can be further optimized by iterating through country / file name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "0f1beca6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# chunked_pdfs_df = chunked_pdfs_df[chunked_pdfs_df['country'].isin(extracted_quotes_df['Source'].unique())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "48d2f1d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "if CHUNK_TEXT_COLUMN not in chunked_pdfs_df.columns:\n",
    "    raise ValueError(f\"Error: Chunk text column '{CHUNK_TEXT_COLUMN}' not found in 'chunked_pdfs_df'.\")\n",
    "if QUOTE_TEXT_COLUMN not in extracted_quotes_df.columns:\n",
    "    raise ValueError(f\"Error: Quote text column '{QUOTE_TEXT_COLUMN}' not found in 'extracted_quotes_df'.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "bdc824af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
    "# chunked_pdfs_df['is_target_quote'] = 0\n",
    "# chunked_pdfs_df['matched_quote'] = None\n",
    "# chunked_pdfs_df['match_score'] = 0\n",
    "\n",
    "# # Iterate through each chunk with a progress bar\n",
    "# # tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n",
    "# # For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
    "# for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
    "#     chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
    "\n",
    "#     # Find the best matching quote and its score\n",
    "#     best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n",
    "\n",
    "#     if best_match_tuple:\n",
    "#         best_match_quote = best_match_tuple[0]\n",
    "#         match_score = best_match_tuple[1]\n",
    "\n",
    "#         if match_score >= FUZZY_MATCH_THRESHOLD:\n",
    "#             chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
    "#             chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
    "#             chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
    "# print(\"Fuzzy matching complete.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d509674b",
   "metadata": {},
   "source": [
    "# Further optimized fuzzy match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "e3ea4dce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "85a3712ec33542bb811b04c8ea1ffcde",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Processing Countries:   0%|          | 0/24 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '---.']\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fuzzy matching complete.\n"
     ]
    }
   ],
   "source": [
    "FUZZY_MATCH_THRESHOLD = 70\n",
    "chunked_pdfs_df['is_target_quote'] = 0\n",
    "chunked_pdfs_df['matched_quote'] = None\n",
    "chunked_pdfs_df['match_score'] = 0\n",
    "\n",
    "unique_sources = extracted_quotes_df['Source'].dropna().unique()\n",
    "\n",
    "for source_country in tqdm(unique_sources, desc=\"Processing Countries\"):\n",
    "    # Filter quotes for the current source_country\n",
    "    country_specific_quotes_df = extracted_quotes_df[extracted_quotes_df['Source'] == source_country]\n",
    "    all_quotes_for_country = country_specific_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
    "\n",
    "    if not all_quotes_for_country:\n",
    "        continue # Skip if no quotes for this country\n",
    "    normalized_chunk_countries = chunked_pdfs_df['country'].str.replace('_', ' ').str.lower()\n",
    "    normalized_source_country = source_country.replace('_', ' ').lower()\n",
    "    matching_chunk_indices = chunked_pdfs_df[normalized_chunk_countries.str.contains(normalized_source_country, na=False)].index\n",
    "\n",
    "    if matching_chunk_indices.empty:\n",
    "        continue # Skip if no chunks for this country\n",
    "\n",
    "    # Iterate through only the relevant chunks for the current country\n",
    "    for index in matching_chunk_indices:\n",
    "        chunk_text = str(chunked_pdfs_df.loc[index, CHUNK_TEXT_COLUMN])\n",
    "\n",
    "        # Find the best matching quote and its score within the country-specific quotes\n",
    "        best_match_tuple = process.extractOne(chunk_text, all_quotes_for_country, scorer=fuzz.token_set_ratio)\n",
    "\n",
    "        if best_match_tuple:\n",
    "            best_match_quote = best_match_tuple[0]\n",
    "            match_score = best_match_tuple[1]\n",
    "\n",
    "            if match_score >= FUZZY_MATCH_THRESHOLD:\n",
    "                chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
    "                chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
    "                chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
    "\n",
    "print(\"Fuzzy matching complete.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "9c0e0d8d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>filename</th>\n",
       "      <th>filepath</th>\n",
       "      <th>indicated_page</th>\n",
       "      <th>chunk_num</th>\n",
       "      <th>text</th>\n",
       "      <th>is_target_quote</th>\n",
       "      <th>matched_quote</th>\n",
       "      <th>match_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>Afghanistan_First_NDC.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
       "      <td>1</td>\n",
       "      <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
       "      <td>94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>Afghanistan_First_NDC.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>Financial Needs: Total: USD 17.405 billion  A...</td>\n",
       "      <td>1</td>\n",
       "      <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
       "      <td>71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>Afghanistan_First_NDC.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2 1. Afghanistan’s National Circumstances and ...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>Afghanistan_First_NDC.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>Afghanistan remains one of the poorest countri...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>Afghanistan_First_NDC.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>Despite these challenges, Afghanistan can rema...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53027</th>\n",
       "      <td>Zimbabwe</td>\n",
       "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
       "      <td>40</td>\n",
       "      <td>494</td>\n",
       "      <td>5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53028</th>\n",
       "      <td>Zimbabwe</td>\n",
       "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
       "      <td>40</td>\n",
       "      <td>495</td>\n",
       "      <td>Adaptation and applicable mitigation actions w...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53029</th>\n",
       "      <td>Zimbabwe</td>\n",
       "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
       "      <td>40</td>\n",
       "      <td>496</td>\n",
       "      <td>The enhanced integration of climate change int...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53030</th>\n",
       "      <td>Zimbabwe</td>\n",
       "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
       "      <td>41</td>\n",
       "      <td>497</td>\n",
       "      <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT.</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53031</th>\n",
       "      <td>Zimbabwe</td>\n",
       "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
       "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
       "      <td>42</td>\n",
       "      <td>498</td>\n",
       "      <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
       "      <td>0</td>\n",
       "      <td>None</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>53032 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           country                              filename  \\\n",
       "0      Afghanistan             Afghanistan_First_NDC.pdf   \n",
       "1      Afghanistan             Afghanistan_First_NDC.pdf   \n",
       "2      Afghanistan             Afghanistan_First_NDC.pdf   \n",
       "3      Afghanistan             Afghanistan_First_NDC.pdf   \n",
       "4      Afghanistan             Afghanistan_First_NDC.pdf   \n",
       "...            ...                                   ...   \n",
       "53027     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
       "53028     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
       "53029     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
       "53030     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
       "53031     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
       "\n",
       "                                                filepath  indicated_page  \\\n",
       "0      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
       "1      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
       "2      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               2   \n",
       "3      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               2   \n",
       "4      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               2   \n",
       "...                                                  ...             ...   \n",
       "53027  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
       "53028  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
       "53029  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
       "53030  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              41   \n",
       "53031  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              42   \n",
       "\n",
       "       chunk_num                                               text  \\\n",
       "0              1  1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...   \n",
       "1              2  Financial Needs: Total: USD 17.405 billion  A...   \n",
       "2              3  2 1. Afghanistan’s National Circumstances and ...   \n",
       "3              4  Afghanistan remains one of the poorest countri...   \n",
       "4              5  Despite these challenges, Afghanistan can rema...   \n",
       "...          ...                                                ...   \n",
       "53027        494  5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th...   \n",
       "53028        495  Adaptation and applicable mitigation actions w...   \n",
       "53029        496  The enhanced integration of climate change int...   \n",
       "53030        497            35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT.   \n",
       "53031        498  36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...   \n",
       "\n",
       "       is_target_quote                                      matched_quote  \\\n",
       "0                    1  Target Years: \\n2020 to 2030 \\nContribution Ty...   \n",
       "1                    1  Target Years: \\n2020 to 2030 \\nContribution Ty...   \n",
       "2                    0                                               None   \n",
       "3                    0                                               None   \n",
       "4                    0                                               None   \n",
       "...                ...                                                ...   \n",
       "53027                0                                               None   \n",
       "53028                0                                               None   \n",
       "53029                0                                               None   \n",
       "53030                0                                               None   \n",
       "53031                0                                               None   \n",
       "\n",
       "       match_score  \n",
       "0               94  \n",
       "1               71  \n",
       "2                0  \n",
       "3                0  \n",
       "4                0  \n",
       "...            ...  \n",
       "53027            0  \n",
       "53028            0  \n",
       "53029            0  \n",
       "53030            0  \n",
       "53031            0  \n",
       "\n",
       "[53032 rows x 9 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunked_pdfs_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "45a6b88d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 53032 entries, 0 to 53031\n",
      "Data columns (total 9 columns):\n",
      " #   Column           Non-Null Count  Dtype \n",
      "---  ------           --------------  ----- \n",
      " 0   country          53032 non-null  object\n",
      " 1   filename         53032 non-null  object\n",
      " 2   filepath         53032 non-null  object\n",
      " 3   indicated_page   53032 non-null  int64 \n",
      " 4   chunk_num        53032 non-null  int64 \n",
      " 5   text             53032 non-null  object\n",
      " 6   is_target_quote  53032 non-null  int64 \n",
      " 7   matched_quote    180 non-null    object\n",
      " 8   match_score      53032 non-null  int64 \n",
      "dtypes: int64(4), object(5)\n",
      "memory usage: 3.6+ MB\n"
     ]
    }
   ],
   "source": [
    "chunked_pdfs_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "9d7038e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "76b51ab6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/c/Users/Derik/Desktop/NDC_Scraper/Classification Model/tf_idf_lr_model\n"
     ]
    }
   ],
   "source": [
    "!pwd"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "NDC_extraction_ENV",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}