Upload TF-IDF Logistic Regression baseline model

Browse files

Files changed (3) hide show

fuzzy_match_training_data.ipynb +297 -178
fuzzy_matched_chunks.csv +2 -2
model_pipeline.joblib +2 -2

fuzzy_match_training_data.ipynb CHANGED Viewed

@@ -10,19 +10,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
    "id": "9ced7f63",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\Derik\\anaconda3\\envs\\NDC_extraction_ENV\\lib\\site-packages\\fuzzywuzzy\\fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
-      "  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
-     ]
-    }
-   ],
    "source": [
     "import pandas as pd\n",
     "from fuzzywuzzy import process\n",
@@ -33,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "e06e72c9",
    "metadata": {},
    "outputs": [],
@@ -44,15 +35,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "e467b9cd",
    "metadata": {},
    "outputs": [],
    "source": [
-    "FUZZY_MATCH_THRESHOLD = 85\n",
-    "\n",
     "try:\n",
-    "    chunked_pdfs_df = pd.read_excel('../../etl/20250409_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n",
     "    extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
     "except Exception as e:\n",
     "    raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
@@ -60,7 +49,122 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "48d2f1d5",
    "metadata": {},
    "outputs": [],
@@ -73,29 +177,59 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "8063a230",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "bdc824af",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "703b233b5adf4465825b90883d1dcafe",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "Fuzzy Matching Chunks:   0%|          | 0/60128 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -105,22 +239,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ')']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ';']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ',']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '/']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
-      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n"
      ]
     },
     {
@@ -132,33 +251,49 @@
     }
    ],
    "source": [
     "chunked_pdfs_df['is_target_quote'] = 0\n",
     "chunked_pdfs_df['matched_quote'] = None\n",
     "chunked_pdfs_df['match_score'] = 0\n",
     "\n",
-    "# Iterate through each chunk with a progress bar\n",
-    "# tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n",
-    "# For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
-    "for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
-    "    chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
     "\n",
-    "    # Find the best matching quote and its score\n",
-    "    best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n",
     "\n",
-    "    if best_match_tuple:\n",
-    "        best_match_quote = best_match_tuple[0]\n",
-    "        match_score = best_match_tuple[1]\n",
     "\n",
-    "        if match_score >= FUZZY_MATCH_THRESHOLD:\n",
-    "            chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
-    "            chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
-    "            chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
     "print(\"Fuzzy matching complete.\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "id": "9c0e0d8d",
    "metadata": {},
    "outputs": [
@@ -189,9 +324,6 @@
        "      <th>indicated_page</th>\n",
        "      <th>chunk_num</th>\n",
        "      <th>text</th>\n",
-       "      <th>contains_thematic_scope</th>\n",
-       "      <th>contains_coverage</th>\n",
-       "      <th>contains_Granularity</th>\n",
        "      <th>is_target_quote</th>\n",
        "      <th>matched_quote</th>\n",
        "      <th>match_score</th>\n",
@@ -206,12 +338,9 @@
        "      <td>1</td>\n",
        "      <td>1</td>\n",
        "      <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -220,40 +349,31 @@
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
        "      <td>1</td>\n",
        "      <td>2</td>\n",
-       "      <td>its Intended Nationally Determined Contributio...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>Afghanistan</td>\n",
        "      <td>Afghanistan_First_NDC.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
-       "      <td>1</td>\n",
        "      <td>3</td>\n",
-       "      <td>atural resource management, agriculture, waste...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
-       "      <td>98</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>Afghanistan</td>\n",
        "      <td>Afghanistan_First_NDC.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
-       "      <td>1</td>\n",
        "      <td>4</td>\n",
-       "      <td>ss as usual (BAU) 2030 scenario, conditional o...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
@@ -263,12 +383,9 @@
        "      <td>Afghanistan</td>\n",
        "      <td>Afghanistan_First_NDC.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
-       "      <td>1</td>\n",
        "      <td>5</td>\n",
-       "      <td>or Afghanistan showing 13.6% relative reductio...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
@@ -284,88 +401,70 @@
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>60123</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>40</td>\n",
-       "      <td>845</td>\n",
-       "      <td>ILDING, EDUCATION, TRAINING AND AWARENESS The ...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>60124</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>40</td>\n",
-       "      <td>846</td>\n",
-       "      <td>ious sectors. The enhanced integration of clim...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>60125</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>40</td>\n",
-       "      <td>847</td>\n",
-       "      <td>pacity building and innovation. In addition, t...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>60126</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>41</td>\n",
-       "      <td>848</td>\n",
-       "      <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>60127</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>42</td>\n",
-       "      <td>849</td>\n",
        "      <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>60128 rows × 12 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
@@ -376,81 +475,68 @@
        "3      Afghanistan             Afghanistan_First_NDC.pdf   \n",
        "4      Afghanistan             Afghanistan_First_NDC.pdf   \n",
        "...            ...                                   ...   \n",
-       "60123     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
-       "60124     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
-       "60125     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
-       "60126     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
-       "60127     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
        "\n",
        "                                                filepath  indicated_page  \\\n",
        "0      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
        "1      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
-       "2      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
-       "3      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
-       "4      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
        "...                                                  ...             ...   \n",
-       "60123  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
-       "60124  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
-       "60125  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
-       "60126  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              41   \n",
-       "60127  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              42   \n",
        "\n",
        "       chunk_num                                               text  \\\n",
        "0              1  1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...   \n",
-       "1              2  its Intended Nationally Determined Contributio...   \n",
-       "2              3  atural resource management, agriculture, waste...   \n",
-       "3              4  ss as usual (BAU) 2030 scenario, conditional o...   \n",
-       "4              5  or Afghanistan showing 13.6% relative reductio...   \n",
        "...          ...                                                ...   \n",
-       "60123        845  ILDING, EDUCATION, TRAINING AND AWARENESS The ...   \n",
-       "60124        846  ious sectors. The enhanced integration of clim...   \n",
-       "60125        847  pacity building and innovation. In addition, t...   \n",
-       "60126        848             35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT   \n",
-       "60127        849  36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...   \n",
-       "\n",
-       "       contains_thematic_scope  contains_coverage  contains_Granularity  \\\n",
-       "0                          NaN                NaN                   NaN   \n",
-       "1                          NaN                NaN                   NaN   \n",
-       "2                          NaN                NaN                   NaN   \n",
-       "3                          NaN                NaN                   NaN   \n",
-       "4                          NaN                NaN                   NaN   \n",
-       "...                        ...                ...                   ...   \n",
-       "60123                      NaN                NaN                   NaN   \n",
-       "60124                      NaN                NaN                   NaN   \n",
-       "60125                      NaN                NaN                   NaN   \n",
-       "60126                      NaN                NaN                   NaN   \n",
-       "60127                      NaN                NaN                   NaN   \n",
        "\n",
        "       is_target_quote                                      matched_quote  \\\n",
-       "0                    0                                               None   \n",
-       "1                    0                                               None   \n",
-       "2                    1  Target Years: \\n2020 to 2030 \\nContribution Ty...   \n",
        "3                    0                                               None   \n",
        "4                    0                                               None   \n",
        "...                ...                                                ...   \n",
-       "60123                0                                               None   \n",
-       "60124                0                                               None   \n",
-       "60125                0                                               None   \n",
-       "60126                0                                               None   \n",
-       "60127                0                                               None   \n",
        "\n",
        "       match_score  \n",
-       "0                0  \n",
-       "1                0  \n",
-       "2               98  \n",
        "3                0  \n",
        "4                0  \n",
        "...            ...  \n",
-       "60123            0  \n",
-       "60124            0  \n",
-       "60125            0  \n",
-       "60126            0  \n",
-       "60127            0  \n",
        "\n",
-       "[60128 rows x 12 columns]"
       ]
      },
-     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -461,17 +547,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "id": "9d7038e9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "id": "76b51ab6",
    "metadata": {},
    "outputs": [

   },
   {
    "cell_type": "code",
+   "execution_count": 15,
    "id": "9ced7f63",
    "metadata": {},
+   "outputs": [],
    "source": [
     "import pandas as pd\n",
     "from fuzzywuzzy import process\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
    "id": "e06e72c9",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 17,
    "id": "e467b9cd",
    "metadata": {},
    "outputs": [],
    "source": [
     "try:\n",
+    "    chunked_pdfs_df = pd.read_excel('../../etl/20250708_sentences_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n",
     "    extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
     "except Exception as e:\n",
     "    raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
   },
   {
    "cell_type": "code",
+   "execution_count": 18,
+   "id": "62866717",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n",
+       "       'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Australia',\n",
+       "       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',\n",
+       "       'Belarus', 'Belize', 'Benin', 'Bhutan',\n",
+       "       'Bolivia_Plurinational_State_of', 'Bosnia_and_Herzegovina',\n",
+       "       'Botswana', 'Brazil', 'Brunei_Darussalam', 'Burkina_Faso',\n",
+       "       'Burundi', 'Cabo_Verde', 'Cambodia', 'Cameroon', 'Canada',\n",
+       "       'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',\n",
+       "       'Comoros', 'Congo', 'Cook_Islands', 'Costa_Rica', 'Cuba',\n",
+       "       'Côte_dIvoire', 'Democratic_Republic_of_the_Congo', 'Djibouti',\n",
+       "       'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',\n",
+       "       'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Eswatini',\n",
+       "       'Ethiopia', 'European_Union_EU', 'Fiji', 'Gabon', 'Gambia',\n",
+       "       'Georgia', 'Ghana', 'Grenada', 'Guatemala', 'Guinea',\n",
+       "       'Guinea-Bissau', 'Guyana', 'Haiti', 'Holy_See', 'Honduras',\n",
+       "       'Iceland', 'India', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan',\n",
+       "       'Kazakhstan', 'Kenya', 'Kuwait', 'Kyrgyzstan',\n",
+       "       'Lao_Peoples_Democratic_Republic', 'Lebanon', 'Liberia',\n",
+       "       'Liechtenstein', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',\n",
+       "       'Mali', 'Marshall_Islands', 'Mauritania', 'Mauritius', 'Mexico',\n",
+       "       'Micronesia_Federated_States_of', 'Monaco', 'Mongolia',\n",
+       "       'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',\n",
+       "       'Nauru', 'Nepal', 'New_Zealand', 'Nicaragua', 'Niger', 'Nigeria',\n",
+       "       'Niue', 'North_Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau',\n",
+       "       'Panama', 'Papua_New_Guinea', 'Peru', 'Philippines', 'Qatar',\n",
+       "       'Republic_of_Korea', 'Republic_of_Moldova', 'Russian_Federation',\n",
+       "       'Rwanda', 'Saint_Kitts_and_Nevis', 'Saint_Lucia',\n",
+       "       'Saint_Vincent_and_the_Grenadines', 'Samoa', 'San_Marino',\n",
+       "       'Sao_Tome_and_Principe', 'Saudi_Arabia', 'Senegal', 'Serbia',\n",
+       "       'Seychelles', 'Sierra_Leone', 'Singapore', 'Solomon_Islands',\n",
+       "       'Somalia', 'South_Africa', 'South_Sudan', 'Sri_Lanka',\n",
+       "       'State_of_Palestine', 'Sudan', 'Suriname', 'Switzerland',\n",
+       "       'Syrian_Arab_Republic', 'Tajikistan', 'Thailand', 'Timor-Leste',\n",
+       "       'Togo', 'Tonga', 'Trinidad_and_Tobago', 'Tunisia', 'Turkmenistan',\n",
+       "       'Tuvalu', 'Türkiye', 'Uganda', 'Ukraine', 'United_Arab_Emirates',\n",
+       "       'United_Kingdom_of_Great_Britain_and_Northern_Ireland',\n",
+       "       'United_Republic_of_Tanzania', 'United_States_of_America',\n",
+       "       'Uruguay', 'Uzbekistan', 'Vanuatu',\n",
+       "       'Venezuela_Bolivarian_Republic_of', 'Viet_Nam', 'Zambia',\n",
+       "       'Zimbabwe'], dtype=object)"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chunked_pdfs_df['country'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "281d0e3e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the first space-delimited token of each Source value.\n",
+    "first_source_token = extracted_quotes_df['Source'].str.split(' ').str.get(0)\n",
+    "extracted_quotes_df['Source'] = first_source_token"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "a4516eaa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['Afghanistan', 'Antigua', 'Bahamas', 'Barbados', 'Bhutan',\n",
+       "       'Bosnia', 'Egypt', 'Bangladesh', 'Eswatini', 'Fiji', 'Grenada',\n",
+       "       'Guyana', 'Indonesia', 'Kazakhstan', 'Kiribati', 'Liberia',\n",
+       "       'Micronesia', 'Belize', 'Malaysia', 'Mongolia', 'Nepal',\n",
+       "       'Pakistan', 'Solomon', 'Trinidad', nan], dtype=object)"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "extracted_quotes_df['Source'].unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e118852",
+   "metadata": {},
+   "source": [
+    "# Filter the chunked PDFs to countries present in the manually extracted data to reduce the number of fuzzy matches\n",
+    "\n",
+    "This can be further optimized by iterating per country / file name."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "0f1beca6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# chunked_pdfs_df = chunked_pdfs_df[chunked_pdfs_df['country'].isin(extracted_quotes_df['Source'].unique())]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
    "id": "48d2f1d5",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 23,
+   "id": "bdc824af",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
+    "# chunked_pdfs_df['is_target_quote'] = 0\n",
+    "# chunked_pdfs_df['matched_quote'] = None\n",
+    "# chunked_pdfs_df['match_score'] = 0\n",
+    "\n",
+    "# # Iterate through each chunk with a progress bar\n",
+    "# # tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n",
+    "# # For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
+    "# for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
+    "#     chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
+    "\n",
+    "#     # Find the best matching quote and its score\n",
+    "#     best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n",
+    "\n",
+    "#     if best_match_tuple:\n",
+    "#         best_match_quote = best_match_tuple[0]\n",
+    "#         match_score = best_match_tuple[1]\n",
+    "\n",
+    "#         if match_score >= FUZZY_MATCH_THRESHOLD:\n",
+    "#             chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
+    "#             chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
+    "#             chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
+    "# print(\"Fuzzy matching complete.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d509674b",
+   "metadata": {},
+   "source": [
+    "# Further optimized fuzzy match: compare each chunk only against quotes from its own country"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 29,
+   "id": "e3ea4dce",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "85a3712ec33542bb811b04c8ea1ffcde",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
+       "Processing Countries:   0%|          | 0/24 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '---.']\n"
      ]
     },
     {
     }
    ],
    "source": [
+    "FUZZY_MATCH_THRESHOLD = 70\n",
     "chunked_pdfs_df['is_target_quote'] = 0\n",
     "chunked_pdfs_df['matched_quote'] = None\n",
     "chunked_pdfs_df['match_score'] = 0\n",
     "\n",
+    "unique_sources = extracted_quotes_df['Source'].dropna().unique()\n",
     "\n",
+    "for source_country in tqdm(unique_sources, desc=\"Processing Countries\"):\n",
+    "    # Filter quotes for the current source_country\n",
+    "    country_specific_quotes_df = extracted_quotes_df[extracted_quotes_df['Source'] == source_country]\n",
+    "    all_quotes_for_country = country_specific_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
     "\n",
+    "    if not all_quotes_for_country:\n",
+    "        continue # Skip if no quotes for this country\n",
+    "    normalized_chunk_countries = chunked_pdfs_df['country'].str.replace('_', ' ').str.lower()\n",
+    "    normalized_source_country = source_country.replace('_', ' ').lower()\n",
+    "    matching_chunk_indices = chunked_pdfs_df[normalized_chunk_countries.str.contains(normalized_source_country, na=False)].index\n",
+    "\n",
+    "    if matching_chunk_indices.empty:\n",
+    "        continue # Skip if no chunks for this country\n",
+    "\n",
+    "    # Iterate through only the relevant chunks for the current country\n",
+    "    for index in matching_chunk_indices:\n",
+    "        chunk_text = str(chunked_pdfs_df.loc[index, CHUNK_TEXT_COLUMN])\n",
+    "\n",
+    "        # Find the best matching quote and its score within the country-specific quotes\n",
+    "        best_match_tuple = process.extractOne(chunk_text, all_quotes_for_country, scorer=fuzz.token_set_ratio)\n",
+    "\n",
+    "        if best_match_tuple:\n",
+    "            best_match_quote = best_match_tuple[0]\n",
+    "            match_score = best_match_tuple[1]\n",
+    "\n",
+    "            if match_score >= FUZZY_MATCH_THRESHOLD:\n",
+    "                chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
+    "                chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
+    "                chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
     "\n",
     "print(\"Fuzzy matching complete.\")"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 30,
    "id": "9c0e0d8d",
    "metadata": {},
    "outputs": [
        "      <th>indicated_page</th>\n",
        "      <th>chunk_num</th>\n",
        "      <th>text</th>\n",
        "      <th>is_target_quote</th>\n",
        "      <th>matched_quote</th>\n",
        "      <th>match_score</th>\n",
        "      <td>1</td>\n",
        "      <td>1</td>\n",
        "      <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
+       "      <td>94</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
        "      <td>1</td>\n",
        "      <td>2</td>\n",
+       "      <td>Financial Needs: Total: USD 17.405 billion  A...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
+       "      <td>71</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>Afghanistan</td>\n",
        "      <td>Afghanistan_First_NDC.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
+       "      <td>2</td>\n",
        "      <td>3</td>\n",
+       "      <td>2 1. Afghanistan’s National Circumstances and ...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>Afghanistan</td>\n",
        "      <td>Afghanistan_First_NDC.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
+       "      <td>2</td>\n",
        "      <td>4</td>\n",
+       "      <td>Afghanistan remains one of the poorest countri...</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "      <td>Afghanistan</td>\n",
        "      <td>Afghanistan_First_NDC.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
+       "      <td>2</td>\n",
        "      <td>5</td>\n",
+       "      <td>Despite these challenges, Afghanistan can rema...</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>53027</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>40</td>\n",
+       "      <td>494</td>\n",
+       "      <td>5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th...</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>53028</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>40</td>\n",
+       "      <td>495</td>\n",
+       "      <td>Adaptation and applicable mitigation actions w...</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>53029</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>40</td>\n",
+       "      <td>496</td>\n",
+       "      <td>The enhanced integration of climate change int...</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>53030</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>41</td>\n",
+       "      <td>497</td>\n",
+       "      <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT.</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>53031</th>\n",
        "      <td>Zimbabwe</td>\n",
        "      <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
        "      <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
        "      <td>42</td>\n",
+       "      <td>498</td>\n",
        "      <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
        "      <td>0</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>53032 rows × 9 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
        "3      Afghanistan             Afghanistan_First_NDC.pdf   \n",
        "4      Afghanistan             Afghanistan_First_NDC.pdf   \n",
        "...            ...                                   ...   \n",
+       "53027     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
+       "53028     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
+       "53029     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
+       "53030     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
+       "53031     Zimbabwe  Zimbabwe_NDC30_Country_Statement.pdf   \n",
        "\n",
        "                                                filepath  indicated_page  \\\n",
        "0      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
        "1      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               1   \n",
+       "2      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               2   \n",
+       "3      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               2   \n",
+       "4      ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...               2   \n",
        "...                                                  ...             ...   \n",
+       "53027  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
+       "53028  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
+       "53029  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              40   \n",
+       "53030  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              41   \n",
+       "53031  ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...              42   \n",
        "\n",
        "       chunk_num                                               text  \\\n",
        "0              1  1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...   \n",
+       "1              2  Financial Needs: Total: USD 17.405 billion  A...   \n",
+       "2              3  2 1. Afghanistan’s National Circumstances and ...   \n",
+       "3              4  Afghanistan remains one of the poorest countri...   \n",
+       "4              5  Despite these challenges, Afghanistan can rema...   \n",
        "...          ...                                                ...   \n",
+       "53027        494  5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th...   \n",
+       "53028        495  Adaptation and applicable mitigation actions w...   \n",
+       "53029        496  The enhanced integration of climate change int...   \n",
+       "53030        497            35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT.   \n",
+       "53031        498  36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...   \n",
        "\n",
        "       is_target_quote                                      matched_quote  \\\n",
+       "0                    1  Target Years: \\n2020 to 2030 \\nContribution Ty...   \n",
+       "1                    1  Target Years: \\n2020 to 2030 \\nContribution Ty...   \n",
+       "2                    0                                               None   \n",
        "3                    0                                               None   \n",
        "4                    0                                               None   \n",
        "...                ...                                                ...   \n",
+       "53027                0                                               None   \n",
+       "53028                0                                               None   \n",
+       "53029                0                                               None   \n",
+       "53030                0                                               None   \n",
+       "53031                0                                               None   \n",
        "\n",
        "       match_score  \n",
+       "0               94  \n",
+       "1               71  \n",
+       "2                0  \n",
        "3                0  \n",
        "4                0  \n",
        "...            ...  \n",
+       "53027            0  \n",
+       "53028            0  \n",
+       "53029            0  \n",
+       "53030            0  \n",
+       "53031            0  \n",
        "\n",
+       "[53032 rows x 9 columns]"
       ]
      },
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 31,
+   "id": "45a6b88d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 53032 entries, 0 to 53031\n",
+      "Data columns (total 9 columns):\n",
+      " #   Column           Non-Null Count  Dtype \n",
+      "---  ------           --------------  ----- \n",
+      " 0   country          53032 non-null  object\n",
+      " 1   filename         53032 non-null  object\n",
+      " 2   filepath         53032 non-null  object\n",
+      " 3   indicated_page   53032 non-null  int64 \n",
+      " 4   chunk_num        53032 non-null  int64 \n",
+      " 5   text             53032 non-null  object\n",
+      " 6   is_target_quote  53032 non-null  int64 \n",
+      " 7   matched_quote    180 non-null    object\n",
+      " 8   match_score      53032 non-null  int64 \n",
+      "dtypes: int64(4), object(5)\n",
+      "memory usage: 3.6+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "chunked_pdfs_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
    "id": "9d7038e9",
    "metadata": {},
    "outputs": [],
    "source": [
+    "chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv', index=False)"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 28,
    "id": "76b51ab6",
    "metadata": {},
    "outputs": [

fuzzy_matched_chunks.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:783c1ee7c7b2ef9a44592e0f7e96e0b290ce88b2337eec0a42460e9ceb0c32fa
-size 23764705

 version https://git-lfs.github.com/spec/v1
+oid sha256:a60783cdeb77e1be04350f2fbcd2875aa963e4727d476e04f75b4193a7535b46
+size 27197827

model_pipeline.joblib CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed71ae5894cef561e61385a960a73c3faa377f1a71f547c861acf915dd52c614
-size 141529984

 version https://git-lfs.github.com/spec/v1
+oid sha256:4a5d00bca324c492ee18d6e52ee6e39a7ec62efbdb9d91cea61a46e29c73be06
+size 184283856