Upload TF-IDF Logistic Regression baseline model
Browse files- fuzzy_match_training_data.ipynb +297 -178
- fuzzy_matched_chunks.csv +2 -2
- model_pipeline.joblib +2 -2
fuzzy_match_training_data.ipynb
CHANGED
|
@@ -10,19 +10,10 @@
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
-
"execution_count":
|
| 14 |
"id": "9ced7f63",
|
| 15 |
"metadata": {},
|
| 16 |
-
"outputs": [
|
| 17 |
-
{
|
| 18 |
-
"name": "stderr",
|
| 19 |
-
"output_type": "stream",
|
| 20 |
-
"text": [
|
| 21 |
-
"c:\\Users\\Derik\\anaconda3\\envs\\NDC_extraction_ENV\\lib\\site-packages\\fuzzywuzzy\\fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
|
| 22 |
-
" warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
|
| 23 |
-
]
|
| 24 |
-
}
|
| 25 |
-
],
|
| 26 |
"source": [
|
| 27 |
"import pandas as pd\n",
|
| 28 |
"from fuzzywuzzy import process\n",
|
|
@@ -33,7 +24,7 @@
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"cell_type": "code",
|
| 36 |
-
"execution_count":
|
| 37 |
"id": "e06e72c9",
|
| 38 |
"metadata": {},
|
| 39 |
"outputs": [],
|
|
@@ -44,15 +35,13 @@
|
|
| 44 |
},
|
| 45 |
{
|
| 46 |
"cell_type": "code",
|
| 47 |
-
"execution_count":
|
| 48 |
"id": "e467b9cd",
|
| 49 |
"metadata": {},
|
| 50 |
"outputs": [],
|
| 51 |
"source": [
|
| 52 |
-
"FUZZY_MATCH_THRESHOLD = 85\n",
|
| 53 |
-
"\n",
|
| 54 |
"try:\n",
|
| 55 |
-
" chunked_pdfs_df = pd.read_excel('../../etl/
|
| 56 |
" extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
|
| 57 |
"except Exception as e:\n",
|
| 58 |
" raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
|
|
@@ -60,7 +49,122 @@
|
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"cell_type": "code",
|
| 63 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
"id": "48d2f1d5",
|
| 65 |
"metadata": {},
|
| 66 |
"outputs": [],
|
|
@@ -73,29 +177,59 @@
|
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"cell_type": "code",
|
| 76 |
-
"execution_count":
|
| 77 |
-
"id": "
|
| 78 |
"metadata": {},
|
| 79 |
"outputs": [],
|
| 80 |
"source": [
|
| 81 |
-
"all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
]
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"cell_type": "code",
|
| 86 |
-
"execution_count":
|
| 87 |
-
"id": "
|
| 88 |
"metadata": {},
|
| 89 |
"outputs": [
|
| 90 |
{
|
| 91 |
"data": {
|
| 92 |
"application/vnd.jupyter.widget-view+json": {
|
| 93 |
-
"model_id": "
|
| 94 |
"version_major": 2,
|
| 95 |
"version_minor": 0
|
| 96 |
},
|
| 97 |
"text/plain": [
|
| 98 |
-
"
|
| 99 |
]
|
| 100 |
},
|
| 101 |
"metadata": {},
|
|
@@ -105,22 +239,7 @@
|
|
| 105 |
"name": "stderr",
|
| 106 |
"output_type": "stream",
|
| 107 |
"text": [
|
| 108 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '
|
| 109 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 110 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ';']\n",
|
| 111 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 112 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 113 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ',']\n",
|
| 114 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n",
|
| 115 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '/']\n",
|
| 116 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 117 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 118 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 119 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 120 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 121 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 122 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
|
| 123 |
-
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n"
|
| 124 |
]
|
| 125 |
},
|
| 126 |
{
|
|
@@ -132,33 +251,49 @@
|
|
| 132 |
}
|
| 133 |
],
|
| 134 |
"source": [
|
|
|
|
| 135 |
"chunked_pdfs_df['is_target_quote'] = 0\n",
|
| 136 |
"chunked_pdfs_df['matched_quote'] = None\n",
|
| 137 |
"chunked_pdfs_df['match_score'] = 0\n",
|
| 138 |
"\n",
|
| 139 |
-
"
|
| 140 |
-
"# tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n",
|
| 141 |
-
"# For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
|
| 142 |
-
"for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
|
| 143 |
-
" chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
|
| 144 |
"\n",
|
| 145 |
-
"
|
| 146 |
-
"
|
|
|
|
|
|
|
| 147 |
"\n",
|
| 148 |
-
" if
|
| 149 |
-
"
|
| 150 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
"\n",
|
| 152 |
-
" if match_score >= FUZZY_MATCH_THRESHOLD:\n",
|
| 153 |
-
" chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
|
| 154 |
-
" chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
|
| 155 |
-
" chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
|
| 156 |
"print(\"Fuzzy matching complete.\")"
|
| 157 |
]
|
| 158 |
},
|
| 159 |
{
|
| 160 |
"cell_type": "code",
|
| 161 |
-
"execution_count":
|
| 162 |
"id": "9c0e0d8d",
|
| 163 |
"metadata": {},
|
| 164 |
"outputs": [
|
|
@@ -189,9 +324,6 @@
|
|
| 189 |
" <th>indicated_page</th>\n",
|
| 190 |
" <th>chunk_num</th>\n",
|
| 191 |
" <th>text</th>\n",
|
| 192 |
-
" <th>contains_thematic_scope</th>\n",
|
| 193 |
-
" <th>contains_coverage</th>\n",
|
| 194 |
-
" <th>contains_Granularity</th>\n",
|
| 195 |
" <th>is_target_quote</th>\n",
|
| 196 |
" <th>matched_quote</th>\n",
|
| 197 |
" <th>match_score</th>\n",
|
|
@@ -206,12 +338,9 @@
|
|
| 206 |
" <td>1</td>\n",
|
| 207 |
" <td>1</td>\n",
|
| 208 |
" <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
|
| 209 |
-
" <td>
|
| 210 |
-
" <td>
|
| 211 |
-
" <td>
|
| 212 |
-
" <td>0</td>\n",
|
| 213 |
-
" <td>None</td>\n",
|
| 214 |
-
" <td>0</td>\n",
|
| 215 |
" </tr>\n",
|
| 216 |
" <tr>\n",
|
| 217 |
" <th>1</th>\n",
|
|
@@ -220,40 +349,31 @@
|
|
| 220 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 221 |
" <td>1</td>\n",
|
| 222 |
" <td>2</td>\n",
|
| 223 |
-
" <td>
|
| 224 |
-
" <td>
|
| 225 |
-
" <td>
|
| 226 |
-
" <td>
|
| 227 |
-
" <td>0</td>\n",
|
| 228 |
-
" <td>None</td>\n",
|
| 229 |
-
" <td>0</td>\n",
|
| 230 |
" </tr>\n",
|
| 231 |
" <tr>\n",
|
| 232 |
" <th>2</th>\n",
|
| 233 |
" <td>Afghanistan</td>\n",
|
| 234 |
" <td>Afghanistan_First_NDC.pdf</td>\n",
|
| 235 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 236 |
-
" <td>
|
| 237 |
" <td>3</td>\n",
|
| 238 |
-
" <td>
|
| 239 |
-
" <td>
|
| 240 |
-
" <td>
|
| 241 |
-
" <td>
|
| 242 |
-
" <td>1</td>\n",
|
| 243 |
-
" <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
|
| 244 |
-
" <td>98</td>\n",
|
| 245 |
" </tr>\n",
|
| 246 |
" <tr>\n",
|
| 247 |
" <th>3</th>\n",
|
| 248 |
" <td>Afghanistan</td>\n",
|
| 249 |
" <td>Afghanistan_First_NDC.pdf</td>\n",
|
| 250 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 251 |
-
" <td>
|
| 252 |
" <td>4</td>\n",
|
| 253 |
-
" <td>
|
| 254 |
-
" <td>NaN</td>\n",
|
| 255 |
-
" <td>NaN</td>\n",
|
| 256 |
-
" <td>NaN</td>\n",
|
| 257 |
" <td>0</td>\n",
|
| 258 |
" <td>None</td>\n",
|
| 259 |
" <td>0</td>\n",
|
|
@@ -263,12 +383,9 @@
|
|
| 263 |
" <td>Afghanistan</td>\n",
|
| 264 |
" <td>Afghanistan_First_NDC.pdf</td>\n",
|
| 265 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 266 |
-
" <td>
|
| 267 |
" <td>5</td>\n",
|
| 268 |
-
" <td>
|
| 269 |
-
" <td>NaN</td>\n",
|
| 270 |
-
" <td>NaN</td>\n",
|
| 271 |
-
" <td>NaN</td>\n",
|
| 272 |
" <td>0</td>\n",
|
| 273 |
" <td>None</td>\n",
|
| 274 |
" <td>0</td>\n",
|
|
@@ -284,88 +401,70 @@
|
|
| 284 |
" <td>...</td>\n",
|
| 285 |
" <td>...</td>\n",
|
| 286 |
" <td>...</td>\n",
|
| 287 |
-
" <td>...</td>\n",
|
| 288 |
-
" <td>...</td>\n",
|
| 289 |
-
" <td>...</td>\n",
|
| 290 |
" </tr>\n",
|
| 291 |
" <tr>\n",
|
| 292 |
-
" <th>
|
| 293 |
" <td>Zimbabwe</td>\n",
|
| 294 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 295 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 296 |
" <td>40</td>\n",
|
| 297 |
-
" <td>
|
| 298 |
-
" <td>
|
| 299 |
-
" <td>NaN</td>\n",
|
| 300 |
-
" <td>NaN</td>\n",
|
| 301 |
-
" <td>NaN</td>\n",
|
| 302 |
" <td>0</td>\n",
|
| 303 |
" <td>None</td>\n",
|
| 304 |
" <td>0</td>\n",
|
| 305 |
" </tr>\n",
|
| 306 |
" <tr>\n",
|
| 307 |
-
" <th>
|
| 308 |
" <td>Zimbabwe</td>\n",
|
| 309 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 310 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 311 |
" <td>40</td>\n",
|
| 312 |
-
" <td>
|
| 313 |
-
" <td>
|
| 314 |
-
" <td>NaN</td>\n",
|
| 315 |
-
" <td>NaN</td>\n",
|
| 316 |
-
" <td>NaN</td>\n",
|
| 317 |
" <td>0</td>\n",
|
| 318 |
" <td>None</td>\n",
|
| 319 |
" <td>0</td>\n",
|
| 320 |
" </tr>\n",
|
| 321 |
" <tr>\n",
|
| 322 |
-
" <th>
|
| 323 |
" <td>Zimbabwe</td>\n",
|
| 324 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 325 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 326 |
" <td>40</td>\n",
|
| 327 |
-
" <td>
|
| 328 |
-
" <td>
|
| 329 |
-
" <td>NaN</td>\n",
|
| 330 |
-
" <td>NaN</td>\n",
|
| 331 |
-
" <td>NaN</td>\n",
|
| 332 |
" <td>0</td>\n",
|
| 333 |
" <td>None</td>\n",
|
| 334 |
" <td>0</td>\n",
|
| 335 |
" </tr>\n",
|
| 336 |
" <tr>\n",
|
| 337 |
-
" <th>
|
| 338 |
" <td>Zimbabwe</td>\n",
|
| 339 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 340 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 341 |
" <td>41</td>\n",
|
| 342 |
-
" <td>
|
| 343 |
-
" <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT
|
| 344 |
-
" <td>NaN</td>\n",
|
| 345 |
-
" <td>NaN</td>\n",
|
| 346 |
-
" <td>NaN</td>\n",
|
| 347 |
" <td>0</td>\n",
|
| 348 |
" <td>None</td>\n",
|
| 349 |
" <td>0</td>\n",
|
| 350 |
" </tr>\n",
|
| 351 |
" <tr>\n",
|
| 352 |
-
" <th>
|
| 353 |
" <td>Zimbabwe</td>\n",
|
| 354 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 355 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 356 |
" <td>42</td>\n",
|
| 357 |
-
" <td>
|
| 358 |
" <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
|
| 359 |
-
" <td>NaN</td>\n",
|
| 360 |
-
" <td>NaN</td>\n",
|
| 361 |
-
" <td>NaN</td>\n",
|
| 362 |
" <td>0</td>\n",
|
| 363 |
" <td>None</td>\n",
|
| 364 |
" <td>0</td>\n",
|
| 365 |
" </tr>\n",
|
| 366 |
" </tbody>\n",
|
| 367 |
"</table>\n",
|
| 368 |
-
"<p>
|
| 369 |
"</div>"
|
| 370 |
],
|
| 371 |
"text/plain": [
|
|
@@ -376,81 +475,68 @@
|
|
| 376 |
"3 Afghanistan Afghanistan_First_NDC.pdf \n",
|
| 377 |
"4 Afghanistan Afghanistan_First_NDC.pdf \n",
|
| 378 |
"... ... ... \n",
|
| 379 |
-
"
|
| 380 |
-
"
|
| 381 |
-
"
|
| 382 |
-
"
|
| 383 |
-
"
|
| 384 |
"\n",
|
| 385 |
" filepath indicated_page \\\n",
|
| 386 |
"0 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
|
| 387 |
"1 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
|
| 388 |
-
"2 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...
|
| 389 |
-
"3 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...
|
| 390 |
-
"4 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First...
|
| 391 |
"... ... ... \n",
|
| 392 |
-
"
|
| 393 |
-
"
|
| 394 |
-
"
|
| 395 |
-
"
|
| 396 |
-
"
|
| 397 |
"\n",
|
| 398 |
" chunk_num text \\\n",
|
| 399 |
"0 1 1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat... \n",
|
| 400 |
-
"1 2
|
| 401 |
-
"2 3
|
| 402 |
-
"3 4
|
| 403 |
-
"4 5
|
| 404 |
"... ... ... \n",
|
| 405 |
-
"
|
| 406 |
-
"
|
| 407 |
-
"
|
| 408 |
-
"
|
| 409 |
-
"
|
| 410 |
-
"\n",
|
| 411 |
-
" contains_thematic_scope contains_coverage contains_Granularity \\\n",
|
| 412 |
-
"0 NaN NaN NaN \n",
|
| 413 |
-
"1 NaN NaN NaN \n",
|
| 414 |
-
"2 NaN NaN NaN \n",
|
| 415 |
-
"3 NaN NaN NaN \n",
|
| 416 |
-
"4 NaN NaN NaN \n",
|
| 417 |
-
"... ... ... ... \n",
|
| 418 |
-
"60123 NaN NaN NaN \n",
|
| 419 |
-
"60124 NaN NaN NaN \n",
|
| 420 |
-
"60125 NaN NaN NaN \n",
|
| 421 |
-
"60126 NaN NaN NaN \n",
|
| 422 |
-
"60127 NaN NaN NaN \n",
|
| 423 |
"\n",
|
| 424 |
" is_target_quote matched_quote \\\n",
|
| 425 |
-
"0
|
| 426 |
-
"1
|
| 427 |
-
"2
|
| 428 |
"3 0 None \n",
|
| 429 |
"4 0 None \n",
|
| 430 |
"... ... ... \n",
|
| 431 |
-
"
|
| 432 |
-
"
|
| 433 |
-
"
|
| 434 |
-
"
|
| 435 |
-
"
|
| 436 |
"\n",
|
| 437 |
" match_score \n",
|
| 438 |
-
"0
|
| 439 |
-
"1
|
| 440 |
-
"2
|
| 441 |
"3 0 \n",
|
| 442 |
"4 0 \n",
|
| 443 |
"... ... \n",
|
| 444 |
-
"
|
| 445 |
-
"
|
| 446 |
-
"
|
| 447 |
-
"
|
| 448 |
-
"
|
| 449 |
"\n",
|
| 450 |
-
"[
|
| 451 |
]
|
| 452 |
},
|
| 453 |
-
"execution_count":
|
| 454 |
"metadata": {},
|
| 455 |
"output_type": "execute_result"
|
| 456 |
}
|
|
@@ -461,17 +547,50 @@
|
|
| 461 |
},
|
| 462 |
{
|
| 463 |
"cell_type": "code",
|
| 464 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
"id": "9d7038e9",
|
| 466 |
"metadata": {},
|
| 467 |
"outputs": [],
|
| 468 |
"source": [
|
| 469 |
-
"chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv')"
|
| 470 |
]
|
| 471 |
},
|
| 472 |
{
|
| 473 |
"cell_type": "code",
|
| 474 |
-
"execution_count":
|
| 475 |
"id": "76b51ab6",
|
| 476 |
"metadata": {},
|
| 477 |
"outputs": [
|
|
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
+
"execution_count": 15,
|
| 14 |
"id": "9ced7f63",
|
| 15 |
"metadata": {},
|
| 16 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"source": [
|
| 18 |
"import pandas as pd\n",
|
| 19 |
"from fuzzywuzzy import process\n",
|
|
|
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"cell_type": "code",
|
| 27 |
+
"execution_count": 16,
|
| 28 |
"id": "e06e72c9",
|
| 29 |
"metadata": {},
|
| 30 |
"outputs": [],
|
|
|
|
| 35 |
},
|
| 36 |
{
|
| 37 |
"cell_type": "code",
|
| 38 |
+
"execution_count": 17,
|
| 39 |
"id": "e467b9cd",
|
| 40 |
"metadata": {},
|
| 41 |
"outputs": [],
|
| 42 |
"source": [
|
|
|
|
|
|
|
| 43 |
"try:\n",
|
| 44 |
+
" chunked_pdfs_df = pd.read_excel('../../etl/20250708_sentences_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n",
|
| 45 |
" extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
|
| 46 |
"except Exception as e:\n",
|
| 47 |
" raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
|
|
|
|
| 49 |
},
|
| 50 |
{
|
| 51 |
"cell_type": "code",
|
| 52 |
+
"execution_count": 18,
|
| 53 |
+
"id": "62866717",
|
| 54 |
+
"metadata": {},
|
| 55 |
+
"outputs": [
|
| 56 |
+
{
|
| 57 |
+
"data": {
|
| 58 |
+
"text/plain": [
|
| 59 |
+
"array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n",
|
| 60 |
+
" 'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Australia',\n",
|
| 61 |
+
" 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',\n",
|
| 62 |
+
" 'Belarus', 'Belize', 'Benin', 'Bhutan',\n",
|
| 63 |
+
" 'Bolivia_Plurinational_State_of', 'Bosnia_and_Herzegovina',\n",
|
| 64 |
+
" 'Botswana', 'Brazil', 'Brunei_Darussalam', 'Burkina_Faso',\n",
|
| 65 |
+
" 'Burundi', 'Cabo_Verde', 'Cambodia', 'Cameroon', 'Canada',\n",
|
| 66 |
+
" 'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',\n",
|
| 67 |
+
" 'Comoros', 'Congo', 'Cook_Islands', 'Costa_Rica', 'Cuba',\n",
|
| 68 |
+
" 'Côte_dIvoire', 'Democratic_Republic_of_the_Congo', 'Djibouti',\n",
|
| 69 |
+
" 'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',\n",
|
| 70 |
+
" 'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Eswatini',\n",
|
| 71 |
+
" 'Ethiopia', 'European_Union_EU', 'Fiji', 'Gabon', 'Gambia',\n",
|
| 72 |
+
" 'Georgia', 'Ghana', 'Grenada', 'Guatemala', 'Guinea',\n",
|
| 73 |
+
" 'Guinea-Bissau', 'Guyana', 'Haiti', 'Holy_See', 'Honduras',\n",
|
| 74 |
+
" 'Iceland', 'India', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan',\n",
|
| 75 |
+
" 'Kazakhstan', 'Kenya', 'Kuwait', 'Kyrgyzstan',\n",
|
| 76 |
+
" 'Lao_Peoples_Democratic_Republic', 'Lebanon', 'Liberia',\n",
|
| 77 |
+
" 'Liechtenstein', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',\n",
|
| 78 |
+
" 'Mali', 'Marshall_Islands', 'Mauritania', 'Mauritius', 'Mexico',\n",
|
| 79 |
+
" 'Micronesia_Federated_States_of', 'Monaco', 'Mongolia',\n",
|
| 80 |
+
" 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',\n",
|
| 81 |
+
" 'Nauru', 'Nepal', 'New_Zealand', 'Nicaragua', 'Niger', 'Nigeria',\n",
|
| 82 |
+
" 'Niue', 'North_Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau',\n",
|
| 83 |
+
" 'Panama', 'Papua_New_Guinea', 'Peru', 'Philippines', 'Qatar',\n",
|
| 84 |
+
" 'Republic_of_Korea', 'Republic_of_Moldova', 'Russian_Federation',\n",
|
| 85 |
+
" 'Rwanda', 'Saint_Kitts_and_Nevis', 'Saint_Lucia',\n",
|
| 86 |
+
" 'Saint_Vincent_and_the_Grenadines', 'Samoa', 'San_Marino',\n",
|
| 87 |
+
" 'Sao_Tome_and_Principe', 'Saudi_Arabia', 'Senegal', 'Serbia',\n",
|
| 88 |
+
" 'Seychelles', 'Sierra_Leone', 'Singapore', 'Solomon_Islands',\n",
|
| 89 |
+
" 'Somalia', 'South_Africa', 'South_Sudan', 'Sri_Lanka',\n",
|
| 90 |
+
" 'State_of_Palestine', 'Sudan', 'Suriname', 'Switzerland',\n",
|
| 91 |
+
" 'Syrian_Arab_Republic', 'Tajikistan', 'Thailand', 'Timor-Leste',\n",
|
| 92 |
+
" 'Togo', 'Tonga', 'Trinidad_and_Tobago', 'Tunisia', 'Turkmenistan',\n",
|
| 93 |
+
" 'Tuvalu', 'Türkiye', 'Uganda', 'Ukraine', 'United_Arab_Emirates',\n",
|
| 94 |
+
" 'United_Kingdom_of_Great_Britain_and_Northern_Ireland',\n",
|
| 95 |
+
" 'United_Republic_of_Tanzania', 'United_States_of_America',\n",
|
| 96 |
+
" 'Uruguay', 'Uzbekistan', 'Vanuatu',\n",
|
| 97 |
+
" 'Venezuela_Bolivarian_Republic_of', 'Viet_Nam', 'Zambia',\n",
|
| 98 |
+
" 'Zimbabwe'], dtype=object)"
|
| 99 |
+
]
|
| 100 |
+
},
|
| 101 |
+
"execution_count": 18,
|
| 102 |
+
"metadata": {},
|
| 103 |
+
"output_type": "execute_result"
|
| 104 |
+
}
|
| 105 |
+
],
|
| 106 |
+
"source": [
|
| 107 |
+
"chunked_pdfs_df['country'].unique()"
|
| 108 |
+
]
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"cell_type": "code",
|
| 112 |
+
"execution_count": 19,
|
| 113 |
+
"id": "281d0e3e",
|
| 114 |
+
"metadata": {},
|
| 115 |
+
"outputs": [],
|
| 116 |
+
"source": [
|
| 117 |
+
"extracted_quotes_df['Source'] = extracted_quotes_df['Source'].str.split(' ').str[0]"
|
| 118 |
+
]
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"cell_type": "code",
|
| 122 |
+
"execution_count": 20,
|
| 123 |
+
"id": "a4516eaa",
|
| 124 |
+
"metadata": {},
|
| 125 |
+
"outputs": [
|
| 126 |
+
{
|
| 127 |
+
"data": {
|
| 128 |
+
"text/plain": [
|
| 129 |
+
"array(['Afghanistan', 'Antigua', 'Bahamas', 'Barbados', 'Bhutan',\n",
|
| 130 |
+
" 'Bosnia', 'Egypt', 'Bangladesh', 'Eswatini', 'Fiji', 'Grenada',\n",
|
| 131 |
+
" 'Guyana', 'Indonesia', 'Kazakhstan', 'Kiribati', 'Liberia',\n",
|
| 132 |
+
" 'Micronesia', 'Belize', 'Malaysia', 'Mongolia', 'Nepal',\n",
|
| 133 |
+
" 'Pakistan', 'Solomon', 'Trinidad', nan], dtype=object)"
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
"execution_count": 20,
|
| 137 |
+
"metadata": {},
|
| 138 |
+
"output_type": "execute_result"
|
| 139 |
+
}
|
| 140 |
+
],
|
| 141 |
+
"source": [
|
| 142 |
+
"extracted_quotes_df['Source'].unique()"
|
| 143 |
+
]
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"cell_type": "markdown",
|
| 147 |
+
"id": "7e118852",
|
| 148 |
+
"metadata": {},
|
| 149 |
+
"source": [
|
| 150 |
+
"# Filter chunked pdfs to manually extracted data to reduce amount of fuzzy matches\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"Can be further optimized by iterating through country / file name"
|
| 153 |
+
]
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"cell_type": "code",
|
| 157 |
+
"execution_count": 21,
|
| 158 |
+
"id": "0f1beca6",
|
| 159 |
+
"metadata": {},
|
| 160 |
+
"outputs": [],
|
| 161 |
+
"source": [
|
| 162 |
+
"# chunked_pdfs_df = chunked_pdfs_df[chunked_pdfs_df['country'].isin(extracted_quotes_df['Source'].unique())]"
|
| 163 |
+
]
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"cell_type": "code",
|
| 167 |
+
"execution_count": 22,
|
| 168 |
"id": "48d2f1d5",
|
| 169 |
"metadata": {},
|
| 170 |
"outputs": [],
|
|
|
|
| 177 |
},
|
| 178 |
{
|
| 179 |
"cell_type": "code",
|
| 180 |
+
"execution_count": 23,
|
| 181 |
+
"id": "bdc824af",
|
| 182 |
"metadata": {},
|
| 183 |
"outputs": [],
|
| 184 |
"source": [
|
| 185 |
+
"# all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
|
| 186 |
+
"# chunked_pdfs_df['is_target_quote'] = 0\n",
|
| 187 |
+
"# chunked_pdfs_df['matched_quote'] = None\n",
|
| 188 |
+
"# chunked_pdfs_df['match_score'] = 0\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"# # Iterate through each chunk with a progress bar\n",
|
| 191 |
+
"# # tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n",
|
| 192 |
+
"# # For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
|
| 193 |
+
"# for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
|
| 194 |
+
"# chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
|
| 195 |
+
"\n",
|
| 196 |
+
"# # Find the best matching quote and its score\n",
|
| 197 |
+
"# best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n",
|
| 198 |
+
"\n",
|
| 199 |
+
"# if best_match_tuple:\n",
|
| 200 |
+
"# best_match_quote = best_match_tuple[0]\n",
|
| 201 |
+
"# match_score = best_match_tuple[1]\n",
|
| 202 |
+
"\n",
|
| 203 |
+
"# if match_score >= FUZZY_MATCH_THRESHOLD:\n",
|
| 204 |
+
"# chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
|
| 205 |
+
"# chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
|
| 206 |
+
"# chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
|
| 207 |
+
"# print(\"Fuzzy matching complete.\")"
|
| 208 |
+
]
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"cell_type": "markdown",
|
| 212 |
+
"id": "d509674b",
|
| 213 |
+
"metadata": {},
|
| 214 |
+
"source": [
|
| 215 |
+
"# Further optimized fuzzy match"
|
| 216 |
]
|
| 217 |
},
|
| 218 |
{
|
| 219 |
"cell_type": "code",
|
| 220 |
+
"execution_count": 29,
|
| 221 |
+
"id": "e3ea4dce",
|
| 222 |
"metadata": {},
|
| 223 |
"outputs": [
|
| 224 |
{
|
| 225 |
"data": {
|
| 226 |
"application/vnd.jupyter.widget-view+json": {
|
| 227 |
+
"model_id": "85a3712ec33542bb811b04c8ea1ffcde",
|
| 228 |
"version_major": 2,
|
| 229 |
"version_minor": 0
|
| 230 |
},
|
| 231 |
"text/plain": [
|
| 232 |
+
"Processing Countries: 0%| | 0/24 [00:00<?, ?it/s]"
|
| 233 |
]
|
| 234 |
},
|
| 235 |
"metadata": {},
|
|
|
|
| 239 |
"name": "stderr",
|
| 240 |
"output_type": "stream",
|
| 241 |
"text": [
|
| 242 |
+
"WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '---.']\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
]
|
| 244 |
},
|
| 245 |
{
|
|
|
|
| 251 |
}
|
| 252 |
],
|
| 253 |
"source": [
|
| 254 |
+
"FUZZY_MATCH_THRESHOLD = 70\n",
|
| 255 |
"chunked_pdfs_df['is_target_quote'] = 0\n",
|
| 256 |
"chunked_pdfs_df['matched_quote'] = None\n",
|
| 257 |
"chunked_pdfs_df['match_score'] = 0\n",
|
| 258 |
"\n",
|
| 259 |
+
"unique_sources = extracted_quotes_df['Source'].dropna().unique()\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
"\n",
|
| 261 |
+
"for source_country in tqdm(unique_sources, desc=\"Processing Countries\"):\n",
|
| 262 |
+
" # Filter quotes for the current source_country\n",
|
| 263 |
+
" country_specific_quotes_df = extracted_quotes_df[extracted_quotes_df['Source'] == source_country]\n",
|
| 264 |
+
" all_quotes_for_country = country_specific_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
|
| 265 |
"\n",
|
| 266 |
+
" if not all_quotes_for_country:\n",
|
| 267 |
+
" continue # Skip if no quotes for this country\n",
|
| 268 |
+
" normalized_chunk_countries = chunked_pdfs_df['country'].str.replace('_', ' ').str.lower()\n",
|
| 269 |
+
" normalized_source_country = source_country.replace('_', ' ').lower()\n",
|
| 270 |
+
" matching_chunk_indices = chunked_pdfs_df[normalized_chunk_countries.str.contains(normalized_source_country, na=False)].index\n",
|
| 271 |
+
"\n",
|
| 272 |
+
" if matching_chunk_indices.empty:\n",
|
| 273 |
+
" continue # Skip if no chunks for this country\n",
|
| 274 |
+
"\n",
|
| 275 |
+
" # Iterate through only the relevant chunks for the current country\n",
|
| 276 |
+
" for index in matching_chunk_indices:\n",
|
| 277 |
+
" chunk_text = str(chunked_pdfs_df.loc[index, CHUNK_TEXT_COLUMN])\n",
|
| 278 |
+
"\n",
|
| 279 |
+
" # Find the best matching quote and its score within the country-specific quotes\n",
|
| 280 |
+
" best_match_tuple = process.extractOne(chunk_text, all_quotes_for_country, scorer=fuzz.token_set_ratio)\n",
|
| 281 |
+
"\n",
|
| 282 |
+
" if best_match_tuple:\n",
|
| 283 |
+
" best_match_quote = best_match_tuple[0]\n",
|
| 284 |
+
" match_score = best_match_tuple[1]\n",
|
| 285 |
+
"\n",
|
| 286 |
+
" if match_score >= FUZZY_MATCH_THRESHOLD:\n",
|
| 287 |
+
" chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
|
| 288 |
+
" chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
|
| 289 |
+
" chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
|
| 290 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
"print(\"Fuzzy matching complete.\")"
|
| 292 |
]
|
| 293 |
},
|
| 294 |
{
|
| 295 |
"cell_type": "code",
|
| 296 |
+
"execution_count": 30,
|
| 297 |
"id": "9c0e0d8d",
|
| 298 |
"metadata": {},
|
| 299 |
"outputs": [
|
|
|
|
| 324 |
" <th>indicated_page</th>\n",
|
| 325 |
" <th>chunk_num</th>\n",
|
| 326 |
" <th>text</th>\n",
|
|
|
|
|
|
|
|
|
|
| 327 |
" <th>is_target_quote</th>\n",
|
| 328 |
" <th>matched_quote</th>\n",
|
| 329 |
" <th>match_score</th>\n",
|
|
|
|
| 338 |
" <td>1</td>\n",
|
| 339 |
" <td>1</td>\n",
|
| 340 |
" <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
|
| 341 |
+
" <td>1</td>\n",
|
| 342 |
+
" <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
|
| 343 |
+
" <td>94</td>\n",
|
|
|
|
|
|
|
|
|
|
| 344 |
" </tr>\n",
|
| 345 |
" <tr>\n",
|
| 346 |
" <th>1</th>\n",
|
|
|
|
| 349 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 350 |
" <td>1</td>\n",
|
| 351 |
" <td>2</td>\n",
|
| 352 |
+
" <td>Financial Needs: Total: USD 17.405 billion A...</td>\n",
|
| 353 |
+
" <td>1</td>\n",
|
| 354 |
+
" <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
|
| 355 |
+
" <td>71</td>\n",
|
|
|
|
|
|
|
|
|
|
| 356 |
" </tr>\n",
|
| 357 |
" <tr>\n",
|
| 358 |
" <th>2</th>\n",
|
| 359 |
" <td>Afghanistan</td>\n",
|
| 360 |
" <td>Afghanistan_First_NDC.pdf</td>\n",
|
| 361 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 362 |
+
" <td>2</td>\n",
|
| 363 |
" <td>3</td>\n",
|
| 364 |
+
" <td>2 1. Afghanistan’s National Circumstances and ...</td>\n",
|
| 365 |
+
" <td>0</td>\n",
|
| 366 |
+
" <td>None</td>\n",
|
| 367 |
+
" <td>0</td>\n",
|
|
|
|
|
|
|
|
|
|
| 368 |
" </tr>\n",
|
| 369 |
" <tr>\n",
|
| 370 |
" <th>3</th>\n",
|
| 371 |
" <td>Afghanistan</td>\n",
|
| 372 |
" <td>Afghanistan_First_NDC.pdf</td>\n",
|
| 373 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 374 |
+
" <td>2</td>\n",
|
| 375 |
" <td>4</td>\n",
|
| 376 |
+
" <td>Afghanistan remains one of the poorest countri...</td>\n",
|
|
|
|
|
|
|
|
|
|
| 377 |
" <td>0</td>\n",
|
| 378 |
" <td>None</td>\n",
|
| 379 |
" <td>0</td>\n",
|
|
|
|
| 383 |
" <td>Afghanistan</td>\n",
|
| 384 |
" <td>Afghanistan_First_NDC.pdf</td>\n",
|
| 385 |
" <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
|
| 386 |
+
" <td>2</td>\n",
|
| 387 |
" <td>5</td>\n",
|
| 388 |
+
" <td>Despite these challenges, Afghanistan can rema...</td>\n",
|
|
|
|
|
|
|
|
|
|
| 389 |
" <td>0</td>\n",
|
| 390 |
" <td>None</td>\n",
|
| 391 |
" <td>0</td>\n",
|
|
|
|
| 401 |
" <td>...</td>\n",
|
| 402 |
" <td>...</td>\n",
|
| 403 |
" <td>...</td>\n",
|
|
|
|
|
|
|
|
|
|
| 404 |
" </tr>\n",
|
| 405 |
" <tr>\n",
|
| 406 |
+
" <th>53027</th>\n",
|
| 407 |
" <td>Zimbabwe</td>\n",
|
| 408 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 409 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 410 |
" <td>40</td>\n",
|
| 411 |
+
" <td>494</td>\n",
|
| 412 |
+
" <td>5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th...</td>\n",
|
|
|
|
|
|
|
|
|
|
| 413 |
" <td>0</td>\n",
|
| 414 |
" <td>None</td>\n",
|
| 415 |
" <td>0</td>\n",
|
| 416 |
" </tr>\n",
|
| 417 |
" <tr>\n",
|
| 418 |
+
" <th>53028</th>\n",
|
| 419 |
" <td>Zimbabwe</td>\n",
|
| 420 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 421 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 422 |
" <td>40</td>\n",
|
| 423 |
+
" <td>495</td>\n",
|
| 424 |
+
" <td>Adaptation and applicable mitigation actions w...</td>\n",
|
|
|
|
|
|
|
|
|
|
| 425 |
" <td>0</td>\n",
|
| 426 |
" <td>None</td>\n",
|
| 427 |
" <td>0</td>\n",
|
| 428 |
" </tr>\n",
|
| 429 |
" <tr>\n",
|
| 430 |
+
" <th>53029</th>\n",
|
| 431 |
" <td>Zimbabwe</td>\n",
|
| 432 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 433 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 434 |
" <td>40</td>\n",
|
| 435 |
+
" <td>496</td>\n",
|
| 436 |
+
" <td>The enhanced integration of climate change int...</td>\n",
|
|
|
|
|
|
|
|
|
|
| 437 |
" <td>0</td>\n",
|
| 438 |
" <td>None</td>\n",
|
| 439 |
" <td>0</td>\n",
|
| 440 |
" </tr>\n",
|
| 441 |
" <tr>\n",
|
| 442 |
+
" <th>53030</th>\n",
|
| 443 |
" <td>Zimbabwe</td>\n",
|
| 444 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 445 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 446 |
" <td>41</td>\n",
|
| 447 |
+
" <td>497</td>\n",
|
| 448 |
+
" <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT.</td>\n",
|
|
|
|
|
|
|
|
|
|
| 449 |
" <td>0</td>\n",
|
| 450 |
" <td>None</td>\n",
|
| 451 |
" <td>0</td>\n",
|
| 452 |
" </tr>\n",
|
| 453 |
" <tr>\n",
|
| 454 |
+
" <th>53031</th>\n",
|
| 455 |
" <td>Zimbabwe</td>\n",
|
| 456 |
" <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
|
| 457 |
" <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
|
| 458 |
" <td>42</td>\n",
|
| 459 |
+
" <td>498</td>\n",
|
| 460 |
" <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
|
|
|
|
|
|
|
|
|
|
| 461 |
" <td>0</td>\n",
|
| 462 |
" <td>None</td>\n",
|
| 463 |
" <td>0</td>\n",
|
| 464 |
" </tr>\n",
|
| 465 |
" </tbody>\n",
|
| 466 |
"</table>\n",
|
| 467 |
+
"<p>53032 rows × 9 columns</p>\n",
|
| 468 |
"</div>"
|
| 469 |
],
|
| 470 |
"text/plain": [
|
|
|
|
| 475 |
"3 Afghanistan Afghanistan_First_NDC.pdf \n",
|
| 476 |
"4 Afghanistan Afghanistan_First_NDC.pdf \n",
|
| 477 |
"... ... ... \n",
|
| 478 |
+
"53027 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
|
| 479 |
+
"53028 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
|
| 480 |
+
"53029 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
|
| 481 |
+
"53030 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
|
| 482 |
+
"53031 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
|
| 483 |
"\n",
|
| 484 |
" filepath indicated_page \\\n",
|
| 485 |
"0 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
|
| 486 |
"1 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
|
| 487 |
+
"2 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n",
|
| 488 |
+
"3 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n",
|
| 489 |
+
"4 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n",
|
| 490 |
"... ... ... \n",
|
| 491 |
+
"53027 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
|
| 492 |
+
"53028 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
|
| 493 |
+
"53029 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
|
| 494 |
+
"53030 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 41 \n",
|
| 495 |
+
"53031 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 42 \n",
|
| 496 |
"\n",
|
| 497 |
" chunk_num text \\\n",
|
| 498 |
"0 1 1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat... \n",
|
| 499 |
+
"1 2 Financial Needs: Total: USD 17.405 billion A... \n",
|
| 500 |
+
"2 3 2 1. Afghanistan’s National Circumstances and ... \n",
|
| 501 |
+
"3 4 Afghanistan remains one of the poorest countri... \n",
|
| 502 |
+
"4 5 Despite these challenges, Afghanistan can rema... \n",
|
| 503 |
"... ... ... \n",
|
| 504 |
+
"53027 494 5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th... \n",
|
| 505 |
+
"53028 495 Adaptation and applicable mitigation actions w... \n",
|
| 506 |
+
"53029 496 The enhanced integration of climate change int... \n",
|
| 507 |
+
"53030 497 35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT. \n",
|
| 508 |
+
"53031 498 36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr... \n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
"\n",
|
| 510 |
" is_target_quote matched_quote \\\n",
|
| 511 |
+
"0 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n",
|
| 512 |
+
"1 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n",
|
| 513 |
+
"2 0 None \n",
|
| 514 |
"3 0 None \n",
|
| 515 |
"4 0 None \n",
|
| 516 |
"... ... ... \n",
|
| 517 |
+
"53027 0 None \n",
|
| 518 |
+
"53028 0 None \n",
|
| 519 |
+
"53029 0 None \n",
|
| 520 |
+
"53030 0 None \n",
|
| 521 |
+
"53031 0 None \n",
|
| 522 |
"\n",
|
| 523 |
" match_score \n",
|
| 524 |
+
"0 94 \n",
|
| 525 |
+
"1 71 \n",
|
| 526 |
+
"2 0 \n",
|
| 527 |
"3 0 \n",
|
| 528 |
"4 0 \n",
|
| 529 |
"... ... \n",
|
| 530 |
+
"53027 0 \n",
|
| 531 |
+
"53028 0 \n",
|
| 532 |
+
"53029 0 \n",
|
| 533 |
+
"53030 0 \n",
|
| 534 |
+
"53031 0 \n",
|
| 535 |
"\n",
|
| 536 |
+
"[53032 rows x 9 columns]"
|
| 537 |
]
|
| 538 |
},
|
| 539 |
+
"execution_count": 30,
|
| 540 |
"metadata": {},
|
| 541 |
"output_type": "execute_result"
|
| 542 |
}
|
|
|
|
| 547 |
},
|
| 548 |
{
|
| 549 |
"cell_type": "code",
|
| 550 |
+
"execution_count": 31,
|
| 551 |
+
"id": "45a6b88d",
|
| 552 |
+
"metadata": {},
|
| 553 |
+
"outputs": [
|
| 554 |
+
{
|
| 555 |
+
"name": "stdout",
|
| 556 |
+
"output_type": "stream",
|
| 557 |
+
"text": [
|
| 558 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
| 559 |
+
"RangeIndex: 53032 entries, 0 to 53031\n",
|
| 560 |
+
"Data columns (total 9 columns):\n",
|
| 561 |
+
" # Column Non-Null Count Dtype \n",
|
| 562 |
+
"--- ------ -------------- ----- \n",
|
| 563 |
+
" 0 country 53032 non-null object\n",
|
| 564 |
+
" 1 filename 53032 non-null object\n",
|
| 565 |
+
" 2 filepath 53032 non-null object\n",
|
| 566 |
+
" 3 indicated_page 53032 non-null int64 \n",
|
| 567 |
+
" 4 chunk_num 53032 non-null int64 \n",
|
| 568 |
+
" 5 text 53032 non-null object\n",
|
| 569 |
+
" 6 is_target_quote 53032 non-null int64 \n",
|
| 570 |
+
" 7 matched_quote 180 non-null object\n",
|
| 571 |
+
" 8 match_score 53032 non-null int64 \n",
|
| 572 |
+
"dtypes: int64(4), object(5)\n",
|
| 573 |
+
"memory usage: 3.6+ MB\n"
|
| 574 |
+
]
|
| 575 |
+
}
|
| 576 |
+
],
|
| 577 |
+
"source": [
|
| 578 |
+
"chunked_pdfs_df.info()"
|
| 579 |
+
]
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"cell_type": "code",
|
| 583 |
+
"execution_count": 33,
|
| 584 |
"id": "9d7038e9",
|
| 585 |
"metadata": {},
|
| 586 |
"outputs": [],
|
| 587 |
"source": [
|
| 588 |
+
"chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv', index=False)"
|
| 589 |
]
|
| 590 |
},
|
| 591 |
{
|
| 592 |
"cell_type": "code",
|
| 593 |
+
"execution_count": 28,
|
| 594 |
"id": "76b51ab6",
|
| 595 |
"metadata": {},
|
| 596 |
"outputs": [
|
fuzzy_matched_chunks.csv
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a60783cdeb77e1be04350f2fbcd2875aa963e4727d476e04f75b4193a7535b46
|
| 3 |
+
size 27197827
|
model_pipeline.joblib
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a5d00bca324c492ee18d6e52ee6e39a7ec62efbdb9d91cea61a46e29c73be06
|
| 3 |
+
size 184283856
|