Derikvo committed on
Commit
7e67676
·
verified ·
1 Parent(s): 7a74ff9

Upload TF-IDF Logistic Regression baseline model

Browse files
fuzzy_match_training_data.ipynb CHANGED
@@ -10,19 +10,10 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": 1,
14
  "id": "9ced7f63",
15
  "metadata": {},
16
- "outputs": [
17
- {
18
- "name": "stderr",
19
- "output_type": "stream",
20
- "text": [
21
- "c:\\Users\\Derik\\anaconda3\\envs\\NDC_extraction_ENV\\lib\\site-packages\\fuzzywuzzy\\fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
22
- " warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
23
- ]
24
- }
25
- ],
26
  "source": [
27
  "import pandas as pd\n",
28
  "from fuzzywuzzy import process\n",
@@ -33,7 +24,7 @@
33
  },
34
  {
35
  "cell_type": "code",
36
- "execution_count": 2,
37
  "id": "e06e72c9",
38
  "metadata": {},
39
  "outputs": [],
@@ -44,15 +35,13 @@
44
  },
45
  {
46
  "cell_type": "code",
47
- "execution_count": 3,
48
  "id": "e467b9cd",
49
  "metadata": {},
50
  "outputs": [],
51
  "source": [
52
- "FUZZY_MATCH_THRESHOLD = 85\n",
53
- "\n",
54
  "try:\n",
55
- " chunked_pdfs_df = pd.read_excel('../../etl/20250409_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n",
56
  " extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
57
  "except Exception as e:\n",
58
  " raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
@@ -60,7 +49,122 @@
60
  },
61
  {
62
  "cell_type": "code",
63
- "execution_count": 4,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  "id": "48d2f1d5",
65
  "metadata": {},
66
  "outputs": [],
@@ -73,29 +177,59 @@
73
  },
74
  {
75
  "cell_type": "code",
76
- "execution_count": 5,
77
- "id": "8063a230",
78
  "metadata": {},
79
  "outputs": [],
80
  "source": [
81
- "all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  ]
83
  },
84
  {
85
  "cell_type": "code",
86
- "execution_count": 6,
87
- "id": "bdc824af",
88
  "metadata": {},
89
  "outputs": [
90
  {
91
  "data": {
92
  "application/vnd.jupyter.widget-view+json": {
93
- "model_id": "703b233b5adf4465825b90883d1dcafe",
94
  "version_major": 2,
95
  "version_minor": 0
96
  },
97
  "text/plain": [
98
- "Fuzzy Matching Chunks: 0%| | 0/60128 [00:00<?, ?it/s]"
99
  ]
100
  },
101
  "metadata": {},
@@ -105,22 +239,7 @@
105
  "name": "stderr",
106
  "output_type": "stream",
107
  "text": [
108
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ')']\n",
109
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
110
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ';']\n",
111
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
112
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
113
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ',']\n",
114
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n",
115
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '/']\n",
116
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
117
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
118
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
119
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
120
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
121
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
122
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
123
- "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n"
124
  ]
125
  },
126
  {
@@ -132,33 +251,49 @@
132
  }
133
  ],
134
  "source": [
 
135
  "chunked_pdfs_df['is_target_quote'] = 0\n",
136
  "chunked_pdfs_df['matched_quote'] = None\n",
137
  "chunked_pdfs_df['match_score'] = 0\n",
138
  "\n",
139
- "# Iterate through each chunk with a progress bar\n",
140
- "# tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n",
141
- "# For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
142
- "for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
143
- " chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
144
  "\n",
145
- " # Find the best matching quote and its score\n",
146
- " best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n",
 
 
147
  "\n",
148
- " if best_match_tuple:\n",
149
- " best_match_quote = best_match_tuple[0]\n",
150
- " match_score = best_match_tuple[1]\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  "\n",
152
- " if match_score >= FUZZY_MATCH_THRESHOLD:\n",
153
- " chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
154
- " chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
155
- " chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
156
  "print(\"Fuzzy matching complete.\")"
157
  ]
158
  },
159
  {
160
  "cell_type": "code",
161
- "execution_count": 7,
162
  "id": "9c0e0d8d",
163
  "metadata": {},
164
  "outputs": [
@@ -189,9 +324,6 @@
189
  " <th>indicated_page</th>\n",
190
  " <th>chunk_num</th>\n",
191
  " <th>text</th>\n",
192
- " <th>contains_thematic_scope</th>\n",
193
- " <th>contains_coverage</th>\n",
194
- " <th>contains_Granularity</th>\n",
195
  " <th>is_target_quote</th>\n",
196
  " <th>matched_quote</th>\n",
197
  " <th>match_score</th>\n",
@@ -206,12 +338,9 @@
206
  " <td>1</td>\n",
207
  " <td>1</td>\n",
208
  " <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
209
- " <td>NaN</td>\n",
210
- " <td>NaN</td>\n",
211
- " <td>NaN</td>\n",
212
- " <td>0</td>\n",
213
- " <td>None</td>\n",
214
- " <td>0</td>\n",
215
  " </tr>\n",
216
  " <tr>\n",
217
  " <th>1</th>\n",
@@ -220,40 +349,31 @@
220
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
221
  " <td>1</td>\n",
222
  " <td>2</td>\n",
223
- " <td>its Intended Nationally Determined Contributio...</td>\n",
224
- " <td>NaN</td>\n",
225
- " <td>NaN</td>\n",
226
- " <td>NaN</td>\n",
227
- " <td>0</td>\n",
228
- " <td>None</td>\n",
229
- " <td>0</td>\n",
230
  " </tr>\n",
231
  " <tr>\n",
232
  " <th>2</th>\n",
233
  " <td>Afghanistan</td>\n",
234
  " <td>Afghanistan_First_NDC.pdf</td>\n",
235
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
236
- " <td>1</td>\n",
237
  " <td>3</td>\n",
238
- " <td>atural resource management, agriculture, waste...</td>\n",
239
- " <td>NaN</td>\n",
240
- " <td>NaN</td>\n",
241
- " <td>NaN</td>\n",
242
- " <td>1</td>\n",
243
- " <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
244
- " <td>98</td>\n",
245
  " </tr>\n",
246
  " <tr>\n",
247
  " <th>3</th>\n",
248
  " <td>Afghanistan</td>\n",
249
  " <td>Afghanistan_First_NDC.pdf</td>\n",
250
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
251
- " <td>1</td>\n",
252
  " <td>4</td>\n",
253
- " <td>ss as usual (BAU) 2030 scenario, conditional o...</td>\n",
254
- " <td>NaN</td>\n",
255
- " <td>NaN</td>\n",
256
- " <td>NaN</td>\n",
257
  " <td>0</td>\n",
258
  " <td>None</td>\n",
259
  " <td>0</td>\n",
@@ -263,12 +383,9 @@
263
  " <td>Afghanistan</td>\n",
264
  " <td>Afghanistan_First_NDC.pdf</td>\n",
265
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
266
- " <td>1</td>\n",
267
  " <td>5</td>\n",
268
- " <td>or Afghanistan showing 13.6% relative reductio...</td>\n",
269
- " <td>NaN</td>\n",
270
- " <td>NaN</td>\n",
271
- " <td>NaN</td>\n",
272
  " <td>0</td>\n",
273
  " <td>None</td>\n",
274
  " <td>0</td>\n",
@@ -284,88 +401,70 @@
284
  " <td>...</td>\n",
285
  " <td>...</td>\n",
286
  " <td>...</td>\n",
287
- " <td>...</td>\n",
288
- " <td>...</td>\n",
289
- " <td>...</td>\n",
290
  " </tr>\n",
291
  " <tr>\n",
292
- " <th>60123</th>\n",
293
  " <td>Zimbabwe</td>\n",
294
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
295
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
296
  " <td>40</td>\n",
297
- " <td>845</td>\n",
298
- " <td>ILDING, EDUCATION, TRAINING AND AWARENESS The ...</td>\n",
299
- " <td>NaN</td>\n",
300
- " <td>NaN</td>\n",
301
- " <td>NaN</td>\n",
302
  " <td>0</td>\n",
303
  " <td>None</td>\n",
304
  " <td>0</td>\n",
305
  " </tr>\n",
306
  " <tr>\n",
307
- " <th>60124</th>\n",
308
  " <td>Zimbabwe</td>\n",
309
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
310
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
311
  " <td>40</td>\n",
312
- " <td>846</td>\n",
313
- " <td>ious sectors. The enhanced integration of clim...</td>\n",
314
- " <td>NaN</td>\n",
315
- " <td>NaN</td>\n",
316
- " <td>NaN</td>\n",
317
  " <td>0</td>\n",
318
  " <td>None</td>\n",
319
  " <td>0</td>\n",
320
  " </tr>\n",
321
  " <tr>\n",
322
- " <th>60125</th>\n",
323
  " <td>Zimbabwe</td>\n",
324
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
325
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
326
  " <td>40</td>\n",
327
- " <td>847</td>\n",
328
- " <td>pacity building and innovation. In addition, t...</td>\n",
329
- " <td>NaN</td>\n",
330
- " <td>NaN</td>\n",
331
- " <td>NaN</td>\n",
332
  " <td>0</td>\n",
333
  " <td>None</td>\n",
334
  " <td>0</td>\n",
335
  " </tr>\n",
336
  " <tr>\n",
337
- " <th>60126</th>\n",
338
  " <td>Zimbabwe</td>\n",
339
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
340
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
341
  " <td>41</td>\n",
342
- " <td>848</td>\n",
343
- " <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT</td>\n",
344
- " <td>NaN</td>\n",
345
- " <td>NaN</td>\n",
346
- " <td>NaN</td>\n",
347
  " <td>0</td>\n",
348
  " <td>None</td>\n",
349
  " <td>0</td>\n",
350
  " </tr>\n",
351
  " <tr>\n",
352
- " <th>60127</th>\n",
353
  " <td>Zimbabwe</td>\n",
354
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
355
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
356
  " <td>42</td>\n",
357
- " <td>849</td>\n",
358
  " <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
359
- " <td>NaN</td>\n",
360
- " <td>NaN</td>\n",
361
- " <td>NaN</td>\n",
362
  " <td>0</td>\n",
363
  " <td>None</td>\n",
364
  " <td>0</td>\n",
365
  " </tr>\n",
366
  " </tbody>\n",
367
  "</table>\n",
368
- "<p>60128 rows × 12 columns</p>\n",
369
  "</div>"
370
  ],
371
  "text/plain": [
@@ -376,81 +475,68 @@
376
  "3 Afghanistan Afghanistan_First_NDC.pdf \n",
377
  "4 Afghanistan Afghanistan_First_NDC.pdf \n",
378
  "... ... ... \n",
379
- "60123 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
380
- "60124 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
381
- "60125 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
382
- "60126 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
383
- "60127 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
384
  "\n",
385
  " filepath indicated_page \\\n",
386
  "0 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
387
  "1 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
388
- "2 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
389
- "3 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
390
- "4 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
391
  "... ... ... \n",
392
- "60123 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
393
- "60124 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
394
- "60125 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
395
- "60126 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 41 \n",
396
- "60127 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 42 \n",
397
  "\n",
398
  " chunk_num text \\\n",
399
  "0 1 1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat... \n",
400
- "1 2 its Intended Nationally Determined Contributio... \n",
401
- "2 3 atural resource management, agriculture, waste... \n",
402
- "3 4 ss as usual (BAU) 2030 scenario, conditional o... \n",
403
- "4 5 or Afghanistan showing 13.6% relative reductio... \n",
404
  "... ... ... \n",
405
- "60123 845 ILDING, EDUCATION, TRAINING AND AWARENESS The ... \n",
406
- "60124 846 ious sectors. The enhanced integration of clim... \n",
407
- "60125 847 pacity building and innovation. In addition, t... \n",
408
- "60126 848 35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT \n",
409
- "60127 849 36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr... \n",
410
- "\n",
411
- " contains_thematic_scope contains_coverage contains_Granularity \\\n",
412
- "0 NaN NaN NaN \n",
413
- "1 NaN NaN NaN \n",
414
- "2 NaN NaN NaN \n",
415
- "3 NaN NaN NaN \n",
416
- "4 NaN NaN NaN \n",
417
- "... ... ... ... \n",
418
- "60123 NaN NaN NaN \n",
419
- "60124 NaN NaN NaN \n",
420
- "60125 NaN NaN NaN \n",
421
- "60126 NaN NaN NaN \n",
422
- "60127 NaN NaN NaN \n",
423
  "\n",
424
  " is_target_quote matched_quote \\\n",
425
- "0 0 None \n",
426
- "1 0 None \n",
427
- "2 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n",
428
  "3 0 None \n",
429
  "4 0 None \n",
430
  "... ... ... \n",
431
- "60123 0 None \n",
432
- "60124 0 None \n",
433
- "60125 0 None \n",
434
- "60126 0 None \n",
435
- "60127 0 None \n",
436
  "\n",
437
  " match_score \n",
438
- "0 0 \n",
439
- "1 0 \n",
440
- "2 98 \n",
441
  "3 0 \n",
442
  "4 0 \n",
443
  "... ... \n",
444
- "60123 0 \n",
445
- "60124 0 \n",
446
- "60125 0 \n",
447
- "60126 0 \n",
448
- "60127 0 \n",
449
  "\n",
450
- "[60128 rows x 12 columns]"
451
  ]
452
  },
453
- "execution_count": 7,
454
  "metadata": {},
455
  "output_type": "execute_result"
456
  }
@@ -461,17 +547,50 @@
461
  },
462
  {
463
  "cell_type": "code",
464
- "execution_count": 8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  "id": "9d7038e9",
466
  "metadata": {},
467
  "outputs": [],
468
  "source": [
469
- "chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv')"
470
  ]
471
  },
472
  {
473
  "cell_type": "code",
474
- "execution_count": 9,
475
  "id": "76b51ab6",
476
  "metadata": {},
477
  "outputs": [
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 15,
14
  "id": "9ced7f63",
15
  "metadata": {},
16
+ "outputs": [],
 
 
 
 
 
 
 
 
 
17
  "source": [
18
  "import pandas as pd\n",
19
  "from fuzzywuzzy import process\n",
 
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 16,
28
  "id": "e06e72c9",
29
  "metadata": {},
30
  "outputs": [],
 
35
  },
36
  {
37
  "cell_type": "code",
38
+ "execution_count": 17,
39
  "id": "e467b9cd",
40
  "metadata": {},
41
  "outputs": [],
42
  "source": [
 
 
43
  "try:\n",
44
+ " chunked_pdfs_df = pd.read_excel('../../etl/20250708_sentences_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n",
45
  " extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
46
  "except Exception as e:\n",
47
  " raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
 
49
  },
50
  {
51
  "cell_type": "code",
52
+ "execution_count": 18,
53
+ "id": "62866717",
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "data": {
58
+ "text/plain": [
59
+ "array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n",
60
+ " 'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Australia',\n",
61
+ " 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',\n",
62
+ " 'Belarus', 'Belize', 'Benin', 'Bhutan',\n",
63
+ " 'Bolivia_Plurinational_State_of', 'Bosnia_and_Herzegovina',\n",
64
+ " 'Botswana', 'Brazil', 'Brunei_Darussalam', 'Burkina_Faso',\n",
65
+ " 'Burundi', 'Cabo_Verde', 'Cambodia', 'Cameroon', 'Canada',\n",
66
+ " 'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',\n",
67
+ " 'Comoros', 'Congo', 'Cook_Islands', 'Costa_Rica', 'Cuba',\n",
68
+ " 'Côte_dIvoire', 'Democratic_Republic_of_the_Congo', 'Djibouti',\n",
69
+ " 'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',\n",
70
+ " 'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Eswatini',\n",
71
+ " 'Ethiopia', 'European_Union_EU', 'Fiji', 'Gabon', 'Gambia',\n",
72
+ " 'Georgia', 'Ghana', 'Grenada', 'Guatemala', 'Guinea',\n",
73
+ " 'Guinea-Bissau', 'Guyana', 'Haiti', 'Holy_See', 'Honduras',\n",
74
+ " 'Iceland', 'India', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan',\n",
75
+ " 'Kazakhstan', 'Kenya', 'Kuwait', 'Kyrgyzstan',\n",
76
+ " 'Lao_Peoples_Democratic_Republic', 'Lebanon', 'Liberia',\n",
77
+ " 'Liechtenstein', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',\n",
78
+ " 'Mali', 'Marshall_Islands', 'Mauritania', 'Mauritius', 'Mexico',\n",
79
+ " 'Micronesia_Federated_States_of', 'Monaco', 'Mongolia',\n",
80
+ " 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',\n",
81
+ " 'Nauru', 'Nepal', 'New_Zealand', 'Nicaragua', 'Niger', 'Nigeria',\n",
82
+ " 'Niue', 'North_Macedonia', 'Norway', 'Oman', 'Pakistan', 'Palau',\n",
83
+ " 'Panama', 'Papua_New_Guinea', 'Peru', 'Philippines', 'Qatar',\n",
84
+ " 'Republic_of_Korea', 'Republic_of_Moldova', 'Russian_Federation',\n",
85
+ " 'Rwanda', 'Saint_Kitts_and_Nevis', 'Saint_Lucia',\n",
86
+ " 'Saint_Vincent_and_the_Grenadines', 'Samoa', 'San_Marino',\n",
87
+ " 'Sao_Tome_and_Principe', 'Saudi_Arabia', 'Senegal', 'Serbia',\n",
88
+ " 'Seychelles', 'Sierra_Leone', 'Singapore', 'Solomon_Islands',\n",
89
+ " 'Somalia', 'South_Africa', 'South_Sudan', 'Sri_Lanka',\n",
90
+ " 'State_of_Palestine', 'Sudan', 'Suriname', 'Switzerland',\n",
91
+ " 'Syrian_Arab_Republic', 'Tajikistan', 'Thailand', 'Timor-Leste',\n",
92
+ " 'Togo', 'Tonga', 'Trinidad_and_Tobago', 'Tunisia', 'Turkmenistan',\n",
93
+ " 'Tuvalu', 'Türkiye', 'Uganda', 'Ukraine', 'United_Arab_Emirates',\n",
94
+ " 'United_Kingdom_of_Great_Britain_and_Northern_Ireland',\n",
95
+ " 'United_Republic_of_Tanzania', 'United_States_of_America',\n",
96
+ " 'Uruguay', 'Uzbekistan', 'Vanuatu',\n",
97
+ " 'Venezuela_Bolivarian_Republic_of', 'Viet_Nam', 'Zambia',\n",
98
+ " 'Zimbabwe'], dtype=object)"
99
+ ]
100
+ },
101
+ "execution_count": 18,
102
+ "metadata": {},
103
+ "output_type": "execute_result"
104
+ }
105
+ ],
106
+ "source": [
107
+ "chunked_pdfs_df['country'].unique()"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 19,
113
+ "id": "281d0e3e",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "extracted_quotes_df['Source'] = extracted_quotes_df['Source'].str.split(' ').str[0]"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 20,
123
+ "id": "a4516eaa",
124
+ "metadata": {},
125
+ "outputs": [
126
+ {
127
+ "data": {
128
+ "text/plain": [
129
+ "array(['Afghanistan', 'Antigua', 'Bahamas', 'Barbados', 'Bhutan',\n",
130
+ " 'Bosnia', 'Egypt', 'Bangladesh', 'Eswatini', 'Fiji', 'Grenada',\n",
131
+ " 'Guyana', 'Indonesia', 'Kazakhstan', 'Kiribati', 'Liberia',\n",
132
+ " 'Micronesia', 'Belize', 'Malaysia', 'Mongolia', 'Nepal',\n",
133
+ " 'Pakistan', 'Solomon', 'Trinidad', nan], dtype=object)"
134
+ ]
135
+ },
136
+ "execution_count": 20,
137
+ "metadata": {},
138
+ "output_type": "execute_result"
139
+ }
140
+ ],
141
+ "source": [
142
+ "extracted_quotes_df['Source'].unique()"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "markdown",
147
+ "id": "7e118852",
148
+ "metadata": {},
149
+ "source": [
150
+ "# Filter chunked PDFs to countries that have manually extracted quotes, to reduce the number of fuzzy matches\n",
151
+ "\n",
152
+ "This can be further optimized by iterating per country / file name."
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 21,
158
+ "id": "0f1beca6",
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "# chunked_pdfs_df = chunked_pdfs_df[chunked_pdfs_df['country'].isin(extracted_quotes_df['Source'].unique())]"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 22,
168
  "id": "48d2f1d5",
169
  "metadata": {},
170
  "outputs": [],
 
177
  },
178
  {
179
  "cell_type": "code",
180
+ "execution_count": 23,
181
+ "id": "bdc824af",
182
  "metadata": {},
183
  "outputs": [],
184
  "source": [
185
+ "# all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
186
+ "# chunked_pdfs_df['is_target_quote'] = 0\n",
187
+ "# chunked_pdfs_df['matched_quote'] = None\n",
188
+ "# chunked_pdfs_df['match_score'] = 0\n",
189
+ "\n",
190
+ "# # Iterate through each chunk with a progress bar\n",
191
+ "# # tqdm will automatically print the \"Starting...\" and \"Complete.\" messages through its bar.\n",
192
+ "# # For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
193
+ "# for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
194
+ "# chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
195
+ "\n",
196
+ "# # Find the best matching quote and its score\n",
197
+ "# best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n",
198
+ "\n",
199
+ "# if best_match_tuple:\n",
200
+ "# best_match_quote = best_match_tuple[0]\n",
201
+ "# match_score = best_match_tuple[1]\n",
202
+ "\n",
203
+ "# if match_score >= FUZZY_MATCH_THRESHOLD:\n",
204
+ "# chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
205
+ "# chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
206
+ "# chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
207
+ "# print(\"Fuzzy matching complete.\")"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "id": "d509674b",
213
+ "metadata": {},
214
+ "source": [
215
+ "# Further optimized fuzzy match"
216
  ]
217
  },
218
  {
219
  "cell_type": "code",
220
+ "execution_count": 29,
221
+ "id": "e3ea4dce",
222
  "metadata": {},
223
  "outputs": [
224
  {
225
  "data": {
226
  "application/vnd.jupyter.widget-view+json": {
227
+ "model_id": "85a3712ec33542bb811b04c8ea1ffcde",
228
  "version_major": 2,
229
  "version_minor": 0
230
  },
231
  "text/plain": [
232
+ "Processing Countries: 0%| | 0/24 [00:00<?, ?it/s]"
233
  ]
234
  },
235
  "metadata": {},
 
239
  "name": "stderr",
240
  "output_type": "stream",
241
  "text": [
242
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '---.']\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  ]
244
  },
245
  {
 
251
  }
252
  ],
253
  "source": [
254
+ "FUZZY_MATCH_THRESHOLD = 70\n",
255
  "chunked_pdfs_df['is_target_quote'] = 0\n",
256
  "chunked_pdfs_df['matched_quote'] = None\n",
257
  "chunked_pdfs_df['match_score'] = 0\n",
258
  "\n",
259
+ "unique_sources = extracted_quotes_df['Source'].dropna().unique()\n",
 
 
 
 
260
  "\n",
261
+ "for source_country in tqdm(unique_sources, desc=\"Processing Countries\"):\n",
262
+ " # Filter quotes for the current source_country\n",
263
+ " country_specific_quotes_df = extracted_quotes_df[extracted_quotes_df['Source'] == source_country]\n",
264
+ " all_quotes_for_country = country_specific_quotes_df[QUOTE_TEXT_COLUMN].tolist()\n",
265
  "\n",
266
+ " if not all_quotes_for_country:\n",
267
+ " continue # Skip if no quotes for this country\n",
268
+ " normalized_chunk_countries = chunked_pdfs_df['country'].str.replace('_', ' ').str.lower()\n",
269
+ " normalized_source_country = source_country.replace('_', ' ').lower()\n",
270
+ " matching_chunk_indices = chunked_pdfs_df[normalized_chunk_countries.str.contains(normalized_source_country, na=False)].index\n",
271
+ "\n",
272
+ " if matching_chunk_indices.empty:\n",
273
+ " continue # Skip if no chunks for this country\n",
274
+ "\n",
275
+ " # Iterate through only the relevant chunks for the current country\n",
276
+ " for index in matching_chunk_indices:\n",
277
+ " chunk_text = str(chunked_pdfs_df.loc[index, CHUNK_TEXT_COLUMN])\n",
278
+ "\n",
279
+ " # Find the best matching quote and its score within the country-specific quotes\n",
280
+ " best_match_tuple = process.extractOne(chunk_text, all_quotes_for_country, scorer=fuzz.token_set_ratio)\n",
281
+ "\n",
282
+ " if best_match_tuple:\n",
283
+ " best_match_quote = best_match_tuple[0]\n",
284
+ " match_score = best_match_tuple[1]\n",
285
+ "\n",
286
+ " if match_score >= FUZZY_MATCH_THRESHOLD:\n",
287
+ " chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
288
+ " chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
289
+ " chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
290
  "\n",
 
 
 
 
291
  "print(\"Fuzzy matching complete.\")"
292
  ]
293
  },
294
  {
295
  "cell_type": "code",
296
+ "execution_count": 30,
297
  "id": "9c0e0d8d",
298
  "metadata": {},
299
  "outputs": [
 
324
  " <th>indicated_page</th>\n",
325
  " <th>chunk_num</th>\n",
326
  " <th>text</th>\n",
 
 
 
327
  " <th>is_target_quote</th>\n",
328
  " <th>matched_quote</th>\n",
329
  " <th>match_score</th>\n",
 
338
  " <td>1</td>\n",
339
  " <td>1</td>\n",
340
  " <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
341
+ " <td>1</td>\n",
342
+ " <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
343
+ " <td>94</td>\n",
 
 
 
344
  " </tr>\n",
345
  " <tr>\n",
346
  " <th>1</th>\n",
 
349
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
350
  " <td>1</td>\n",
351
  " <td>2</td>\n",
352
+ " <td>Financial Needs: Total: USD 17.405 billion  A...</td>\n",
353
+ " <td>1</td>\n",
354
+ " <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
355
+ " <td>71</td>\n",
 
 
 
356
  " </tr>\n",
357
  " <tr>\n",
358
  " <th>2</th>\n",
359
  " <td>Afghanistan</td>\n",
360
  " <td>Afghanistan_First_NDC.pdf</td>\n",
361
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
362
+ " <td>2</td>\n",
363
  " <td>3</td>\n",
364
+ " <td>2 1. Afghanistan’s National Circumstances and ...</td>\n",
365
+ " <td>0</td>\n",
366
+ " <td>None</td>\n",
367
+ " <td>0</td>\n",
 
 
 
368
  " </tr>\n",
369
  " <tr>\n",
370
  " <th>3</th>\n",
371
  " <td>Afghanistan</td>\n",
372
  " <td>Afghanistan_First_NDC.pdf</td>\n",
373
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
374
+ " <td>2</td>\n",
375
  " <td>4</td>\n",
376
+ " <td>Afghanistan remains one of the poorest countri...</td>\n",
 
 
 
377
  " <td>0</td>\n",
378
  " <td>None</td>\n",
379
  " <td>0</td>\n",
 
383
  " <td>Afghanistan</td>\n",
384
  " <td>Afghanistan_First_NDC.pdf</td>\n",
385
  " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
386
+ " <td>2</td>\n",
387
  " <td>5</td>\n",
388
+ " <td>Despite these challenges, Afghanistan can rema...</td>\n",
 
 
 
389
  " <td>0</td>\n",
390
  " <td>None</td>\n",
391
  " <td>0</td>\n",
 
401
  " <td>...</td>\n",
402
  " <td>...</td>\n",
403
  " <td>...</td>\n",
 
 
 
404
  " </tr>\n",
405
  " <tr>\n",
406
+ " <th>53027</th>\n",
407
  " <td>Zimbabwe</td>\n",
408
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
409
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
410
  " <td>40</td>\n",
411
+ " <td>494</td>\n",
412
+ " <td>5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th...</td>\n",
 
 
 
413
  " <td>0</td>\n",
414
  " <td>None</td>\n",
415
  " <td>0</td>\n",
416
  " </tr>\n",
417
  " <tr>\n",
418
+ " <th>53028</th>\n",
419
  " <td>Zimbabwe</td>\n",
420
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
421
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
422
  " <td>40</td>\n",
423
+ " <td>495</td>\n",
424
+ " <td>Adaptation and applicable mitigation actions w...</td>\n",
 
 
 
425
  " <td>0</td>\n",
426
  " <td>None</td>\n",
427
  " <td>0</td>\n",
428
  " </tr>\n",
429
  " <tr>\n",
430
+ " <th>53029</th>\n",
431
  " <td>Zimbabwe</td>\n",
432
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
433
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
434
  " <td>40</td>\n",
435
+ " <td>496</td>\n",
436
+ " <td>The enhanced integration of climate change int...</td>\n",
 
 
 
437
  " <td>0</td>\n",
438
  " <td>None</td>\n",
439
  " <td>0</td>\n",
440
  " </tr>\n",
441
  " <tr>\n",
442
+ " <th>53030</th>\n",
443
  " <td>Zimbabwe</td>\n",
444
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
445
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
446
  " <td>41</td>\n",
447
+ " <td>497</td>\n",
448
+ " <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT.</td>\n",
 
 
 
449
  " <td>0</td>\n",
450
  " <td>None</td>\n",
451
  " <td>0</td>\n",
452
  " </tr>\n",
453
  " <tr>\n",
454
+ " <th>53031</th>\n",
455
  " <td>Zimbabwe</td>\n",
456
  " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
457
  " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
458
  " <td>42</td>\n",
459
+ " <td>498</td>\n",
460
  " <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
 
 
 
461
  " <td>0</td>\n",
462
  " <td>None</td>\n",
463
  " <td>0</td>\n",
464
  " </tr>\n",
465
  " </tbody>\n",
466
  "</table>\n",
467
+ "<p>53032 rows × 9 columns</p>\n",
468
  "</div>"
469
  ],
470
  "text/plain": [
 
475
  "3 Afghanistan Afghanistan_First_NDC.pdf \n",
476
  "4 Afghanistan Afghanistan_First_NDC.pdf \n",
477
  "... ... ... \n",
478
+ "53027 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
479
+ "53028 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
480
+ "53029 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
481
+ "53030 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
482
+ "53031 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
483
  "\n",
484
  " filepath indicated_page \\\n",
485
  "0 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
486
  "1 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
487
+ "2 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n",
488
+ "3 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n",
489
+ "4 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 2 \n",
490
  "... ... ... \n",
491
+ "53027 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
492
+ "53028 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
493
+ "53029 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
494
+ "53030 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 41 \n",
495
+ "53031 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 42 \n",
496
  "\n",
497
  " chunk_num text \\\n",
498
  "0 1 1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat... \n",
499
+ "1 2 Financial Needs: Total: USD 17.405 billion  A... \n",
500
+ "2 3 2 1. Afghanistan’s National Circumstances and ... \n",
501
+ "3 4 Afghanistan remains one of the poorest countri... \n",
502
+ "4 5 Despite these challenges, Afghanistan can rema... \n",
503
  "... ... ... \n",
504
+ "53027 494 5.1 GENDER, YOUTH, CHILDREN AND INCLUSIVITY Th... \n",
505
+ "53028 495 Adaptation and applicable mitigation actions w... \n",
506
+ "53029 496 The enhanced integration of climate change int... \n",
507
+ "53030 497 35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT. \n",
508
+ "53031 498 36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr... \n",
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  "\n",
510
  " is_target_quote matched_quote \\\n",
511
+ "0 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n",
512
+ "1 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n",
513
+ "2 0 None \n",
514
  "3 0 None \n",
515
  "4 0 None \n",
516
  "... ... ... \n",
517
+ "53027 0 None \n",
518
+ "53028 0 None \n",
519
+ "53029 0 None \n",
520
+ "53030 0 None \n",
521
+ "53031 0 None \n",
522
  "\n",
523
  " match_score \n",
524
+ "0 94 \n",
525
+ "1 71 \n",
526
+ "2 0 \n",
527
  "3 0 \n",
528
  "4 0 \n",
529
  "... ... \n",
530
+ "53027 0 \n",
531
+ "53028 0 \n",
532
+ "53029 0 \n",
533
+ "53030 0 \n",
534
+ "53031 0 \n",
535
  "\n",
536
+ "[53032 rows x 9 columns]"
537
  ]
538
  },
539
+ "execution_count": 30,
540
  "metadata": {},
541
  "output_type": "execute_result"
542
  }
 
547
  },
548
  {
549
  "cell_type": "code",
550
+ "execution_count": 31,
551
+ "id": "45a6b88d",
552
+ "metadata": {},
553
+ "outputs": [
554
+ {
555
+ "name": "stdout",
556
+ "output_type": "stream",
557
+ "text": [
558
+ "<class 'pandas.core.frame.DataFrame'>\n",
559
+ "RangeIndex: 53032 entries, 0 to 53031\n",
560
+ "Data columns (total 9 columns):\n",
561
+ " # Column Non-Null Count Dtype \n",
562
+ "--- ------ -------------- ----- \n",
563
+ " 0 country 53032 non-null object\n",
564
+ " 1 filename 53032 non-null object\n",
565
+ " 2 filepath 53032 non-null object\n",
566
+ " 3 indicated_page 53032 non-null int64 \n",
567
+ " 4 chunk_num 53032 non-null int64 \n",
568
+ " 5 text 53032 non-null object\n",
569
+ " 6 is_target_quote 53032 non-null int64 \n",
570
+ " 7 matched_quote 180 non-null object\n",
571
+ " 8 match_score 53032 non-null int64 \n",
572
+ "dtypes: int64(4), object(5)\n",
573
+ "memory usage: 3.6+ MB\n"
574
+ ]
575
+ }
576
+ ],
577
+ "source": [
578
+ "chunked_pdfs_df.info()"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "execution_count": 33,
584
  "id": "9d7038e9",
585
  "metadata": {},
586
  "outputs": [],
587
  "source": [
588
+ "chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv', index=False)"
589
  ]
590
  },
591
  {
592
  "cell_type": "code",
593
+ "execution_count": 28,
594
  "id": "76b51ab6",
595
  "metadata": {},
596
  "outputs": [
fuzzy_matched_chunks.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:783c1ee7c7b2ef9a44592e0f7e96e0b290ce88b2337eec0a42460e9ceb0c32fa
3
- size 23764705
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a60783cdeb77e1be04350f2fbcd2875aa963e4727d476e04f75b4193a7535b46
3
+ size 27197827
model_pipeline.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed71ae5894cef561e61385a960a73c3faa377f1a71f547c861acf915dd52c614
3
- size 141529984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a5d00bca324c492ee18d6e52ee6e39a7ec62efbdb9d91cea61a46e29c73be06
3
+ size 184283856