Derikvo committed on
Commit
a834f8a
·
verified ·
1 Parent(s): b065a9f

Upload TF-IDF Logistic Regression baseline model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ fuzzy_matched_chunks.csv filter=lfs diff=lfs merge=lfs -text
fuzzy_match_training_data.ipynb ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "8d1fae73",
6
+ "metadata": {},
7
+ "source": [
8
+ "This notebook aims to map the manually extracted quotes onto the chunked data (via fuzzy matching) so we can have a more varied negative class."
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "9ced7f63",
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "c:\\Users\\Derik\\anaconda3\\envs\\NDC_extraction_ENV\\lib\\site-packages\\fuzzywuzzy\\fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
22
+ " warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "import pandas as pd\n",
28
+ "from fuzzywuzzy import process\n",
29
+ "from fuzzywuzzy import fuzz\n",
30
+ "\n",
31
+ "from tqdm.notebook import tqdm, IProgress "
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 2,
37
+ "id": "e06e72c9",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "CHUNK_TEXT_COLUMN = 'text'\n",
42
+ "QUOTE_TEXT_COLUMN = 'Quote or table'"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 3,
48
+ "id": "e467b9cd",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "FUZZY_MATCH_THRESHOLD = 85\n",
53
+ "\n",
54
+ "try:\n",
55
+ " chunked_pdfs_df = pd.read_excel('../../etl/20250409_pdf_extraction_results.xlsx', sheet_name= 'Sheet1').drop_duplicates()\n",
56
+ " extracted_quotes_df = pd.read_excel('../NDC_scraping_stage_1.xlsx', sheet_name= 'Prev_Finance').drop_duplicates()\n",
57
+ "except Exception as e:\n",
58
+ " raise RuntimeError(f\"An unexpected error occurred while loading DataFrames: {e}\")\n"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 4,
64
+ "id": "48d2f1d5",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "if CHUNK_TEXT_COLUMN not in chunked_pdfs_df.columns:\n",
69
+ " raise ValueError(f\"Error: Chunk text column '{CHUNK_TEXT_COLUMN}' not found in 'chunked_pdfs_df'.\")\n",
70
+ "if QUOTE_TEXT_COLUMN not in extracted_quotes_df.columns:\n",
71
+ " raise ValueError(f\"Error: Quote text column '{QUOTE_TEXT_COLUMN}' not found in 'extracted_quotes_df'.\")"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 5,
77
+ "id": "8063a230",
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "all_quotes = extracted_quotes_df[QUOTE_TEXT_COLUMN].tolist()"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 6,
87
+ "id": "bdc824af",
88
+ "metadata": {},
89
+ "outputs": [
90
+ {
91
+ "data": {
92
+ "application/vnd.jupyter.widget-view+json": {
93
+ "model_id": "703b233b5adf4465825b90883d1dcafe",
94
+ "version_major": 2,
95
+ "version_minor": 0
96
+ },
97
+ "text/plain": [
98
+ "Fuzzy Matching Chunks: 0%| | 0/60128 [00:00<?, ?it/s]"
99
+ ]
100
+ },
101
+ "metadata": {},
102
+ "output_type": "display_data"
103
+ },
104
+ {
105
+ "name": "stderr",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ')']\n",
109
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
110
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ';']\n",
111
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
112
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
113
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ',']\n",
114
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n",
115
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '/']\n",
116
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
117
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
118
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
119
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
120
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
121
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
122
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '.']\n",
123
+ "WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: ').']\n"
124
+ ]
125
+ },
126
+ {
127
+ "name": "stdout",
128
+ "output_type": "stream",
129
+ "text": [
130
+ "Fuzzy matching complete.\n"
131
+ ]
132
+ }
133
+ ],
134
+ "source": [
135
+ "chunked_pdfs_df['is_target_quote'] = 0\n",
136
+ "chunked_pdfs_df['matched_quote'] = None\n",
137
+ "chunked_pdfs_df['match_score'] = 0\n",
138
+ "\n",
139
+ "# Iterate through each chunk with a progress bar\n",
140
+ "# tqdm only renders the progress bar (count, rate, ETA); the completion message is printed explicitly after the loop.\n",
141
+ "# For Jupyter/IPython notebooks, use tqdm.notebook.tqdm. For scripts, use tqdm.tqdm\n",
142
+ "for index, row in tqdm(chunked_pdfs_df.iterrows(), total=len(chunked_pdfs_df), desc=\"Fuzzy Matching Chunks\"):\n",
143
+ " chunk_text = str(row[CHUNK_TEXT_COLUMN]) # Convert to string to handle potential non-string types\n",
144
+ "\n",
145
+ " # Find the best matching quote and its score\n",
146
+ " best_match_tuple = process.extractOne(chunk_text, all_quotes, scorer=fuzz.token_set_ratio)\n",
147
+ "\n",
148
+ " if best_match_tuple:\n",
149
+ " best_match_quote = best_match_tuple[0]\n",
150
+ " match_score = best_match_tuple[1]\n",
151
+ "\n",
152
+ " if match_score >= FUZZY_MATCH_THRESHOLD:\n",
153
+ " chunked_pdfs_df.loc[index, 'is_target_quote'] = 1\n",
154
+ " chunked_pdfs_df.loc[index, 'matched_quote'] = best_match_quote\n",
155
+ " chunked_pdfs_df.loc[index, 'match_score'] = match_score\n",
156
+ "print(\"Fuzzy matching complete.\")"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 7,
162
+ "id": "9c0e0d8d",
163
+ "metadata": {},
164
+ "outputs": [
165
+ {
166
+ "data": {
167
+ "text/html": [
168
+ "<div>\n",
169
+ "<style scoped>\n",
170
+ " .dataframe tbody tr th:only-of-type {\n",
171
+ " vertical-align: middle;\n",
172
+ " }\n",
173
+ "\n",
174
+ " .dataframe tbody tr th {\n",
175
+ " vertical-align: top;\n",
176
+ " }\n",
177
+ "\n",
178
+ " .dataframe thead th {\n",
179
+ " text-align: right;\n",
180
+ " }\n",
181
+ "</style>\n",
182
+ "<table border=\"1\" class=\"dataframe\">\n",
183
+ " <thead>\n",
184
+ " <tr style=\"text-align: right;\">\n",
185
+ " <th></th>\n",
186
+ " <th>country</th>\n",
187
+ " <th>filename</th>\n",
188
+ " <th>filepath</th>\n",
189
+ " <th>indicated_page</th>\n",
190
+ " <th>chunk_num</th>\n",
191
+ " <th>text</th>\n",
192
+ " <th>contains_thematic_scope</th>\n",
193
+ " <th>contains_coverage</th>\n",
194
+ " <th>contains_Granularity</th>\n",
195
+ " <th>is_target_quote</th>\n",
196
+ " <th>matched_quote</th>\n",
197
+ " <th>match_score</th>\n",
198
+ " </tr>\n",
199
+ " </thead>\n",
200
+ " <tbody>\n",
201
+ " <tr>\n",
202
+ " <th>0</th>\n",
203
+ " <td>Afghanistan</td>\n",
204
+ " <td>Afghanistan_First_NDC.pdf</td>\n",
205
+ " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
206
+ " <td>1</td>\n",
207
+ " <td>1</td>\n",
208
+ " <td>1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat...</td>\n",
209
+ " <td>NaN</td>\n",
210
+ " <td>NaN</td>\n",
211
+ " <td>NaN</td>\n",
212
+ " <td>0</td>\n",
213
+ " <td>None</td>\n",
214
+ " <td>0</td>\n",
215
+ " </tr>\n",
216
+ " <tr>\n",
217
+ " <th>1</th>\n",
218
+ " <td>Afghanistan</td>\n",
219
+ " <td>Afghanistan_First_NDC.pdf</td>\n",
220
+ " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
221
+ " <td>1</td>\n",
222
+ " <td>2</td>\n",
223
+ " <td>its Intended Nationally Determined Contributio...</td>\n",
224
+ " <td>NaN</td>\n",
225
+ " <td>NaN</td>\n",
226
+ " <td>NaN</td>\n",
227
+ " <td>0</td>\n",
228
+ " <td>None</td>\n",
229
+ " <td>0</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>2</th>\n",
233
+ " <td>Afghanistan</td>\n",
234
+ " <td>Afghanistan_First_NDC.pdf</td>\n",
235
+ " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
236
+ " <td>1</td>\n",
237
+ " <td>3</td>\n",
238
+ " <td>atural resource management, agriculture, waste...</td>\n",
239
+ " <td>NaN</td>\n",
240
+ " <td>NaN</td>\n",
241
+ " <td>NaN</td>\n",
242
+ " <td>1</td>\n",
243
+ " <td>Target Years: \\n2020 to 2030 \\nContribution Ty...</td>\n",
244
+ " <td>98</td>\n",
245
+ " </tr>\n",
246
+ " <tr>\n",
247
+ " <th>3</th>\n",
248
+ " <td>Afghanistan</td>\n",
249
+ " <td>Afghanistan_First_NDC.pdf</td>\n",
250
+ " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
251
+ " <td>1</td>\n",
252
+ " <td>4</td>\n",
253
+ " <td>ss as usual (BAU) 2030 scenario, conditional o...</td>\n",
254
+ " <td>NaN</td>\n",
255
+ " <td>NaN</td>\n",
256
+ " <td>NaN</td>\n",
257
+ " <td>0</td>\n",
258
+ " <td>None</td>\n",
259
+ " <td>0</td>\n",
260
+ " </tr>\n",
261
+ " <tr>\n",
262
+ " <th>4</th>\n",
263
+ " <td>Afghanistan</td>\n",
264
+ " <td>Afghanistan_First_NDC.pdf</td>\n",
265
+ " <td>../data/raw/pdfs\\Afghanistan\\Afghanistan_First...</td>\n",
266
+ " <td>1</td>\n",
267
+ " <td>5</td>\n",
268
+ " <td>or Afghanistan showing 13.6% relative reductio...</td>\n",
269
+ " <td>NaN</td>\n",
270
+ " <td>NaN</td>\n",
271
+ " <td>NaN</td>\n",
272
+ " <td>0</td>\n",
273
+ " <td>None</td>\n",
274
+ " <td>0</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>...</th>\n",
278
+ " <td>...</td>\n",
279
+ " <td>...</td>\n",
280
+ " <td>...</td>\n",
281
+ " <td>...</td>\n",
282
+ " <td>...</td>\n",
283
+ " <td>...</td>\n",
284
+ " <td>...</td>\n",
285
+ " <td>...</td>\n",
286
+ " <td>...</td>\n",
287
+ " <td>...</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>...</td>\n",
290
+ " </tr>\n",
291
+ " <tr>\n",
292
+ " <th>60123</th>\n",
293
+ " <td>Zimbabwe</td>\n",
294
+ " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
295
+ " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
296
+ " <td>40</td>\n",
297
+ " <td>845</td>\n",
298
+ " <td>ILDING, EDUCATION, TRAINING AND AWARENESS The ...</td>\n",
299
+ " <td>NaN</td>\n",
300
+ " <td>NaN</td>\n",
301
+ " <td>NaN</td>\n",
302
+ " <td>0</td>\n",
303
+ " <td>None</td>\n",
304
+ " <td>0</td>\n",
305
+ " </tr>\n",
306
+ " <tr>\n",
307
+ " <th>60124</th>\n",
308
+ " <td>Zimbabwe</td>\n",
309
+ " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
310
+ " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
311
+ " <td>40</td>\n",
312
+ " <td>846</td>\n",
313
+ " <td>ious sectors. The enhanced integration of clim...</td>\n",
314
+ " <td>NaN</td>\n",
315
+ " <td>NaN</td>\n",
316
+ " <td>NaN</td>\n",
317
+ " <td>0</td>\n",
318
+ " <td>None</td>\n",
319
+ " <td>0</td>\n",
320
+ " </tr>\n",
321
+ " <tr>\n",
322
+ " <th>60125</th>\n",
323
+ " <td>Zimbabwe</td>\n",
324
+ " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
325
+ " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
326
+ " <td>40</td>\n",
327
+ " <td>847</td>\n",
328
+ " <td>pacity building and innovation. In addition, t...</td>\n",
329
+ " <td>NaN</td>\n",
330
+ " <td>NaN</td>\n",
331
+ " <td>NaN</td>\n",
332
+ " <td>0</td>\n",
333
+ " <td>None</td>\n",
334
+ " <td>0</td>\n",
335
+ " </tr>\n",
336
+ " <tr>\n",
337
+ " <th>60126</th>\n",
338
+ " <td>Zimbabwe</td>\n",
339
+ " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
340
+ " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
341
+ " <td>41</td>\n",
342
+ " <td>848</td>\n",
343
+ " <td>35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT</td>\n",
344
+ " <td>NaN</td>\n",
345
+ " <td>NaN</td>\n",
346
+ " <td>NaN</td>\n",
347
+ " <td>0</td>\n",
348
+ " <td>None</td>\n",
349
+ " <td>0</td>\n",
350
+ " </tr>\n",
351
+ " <tr>\n",
352
+ " <th>60127</th>\n",
353
+ " <td>Zimbabwe</td>\n",
354
+ " <td>Zimbabwe_NDC30_Country_Statement.pdf</td>\n",
355
+ " <td>../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count...</td>\n",
356
+ " <td>42</td>\n",
357
+ " <td>849</td>\n",
358
+ " <td>36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr...</td>\n",
359
+ " <td>NaN</td>\n",
360
+ " <td>NaN</td>\n",
361
+ " <td>NaN</td>\n",
362
+ " <td>0</td>\n",
363
+ " <td>None</td>\n",
364
+ " <td>0</td>\n",
365
+ " </tr>\n",
366
+ " </tbody>\n",
367
+ "</table>\n",
368
+ "<p>60128 rows × 12 columns</p>\n",
369
+ "</div>"
370
+ ],
371
+ "text/plain": [
372
+ " country filename \\\n",
373
+ "0 Afghanistan Afghanistan_First_NDC.pdf \n",
374
+ "1 Afghanistan Afghanistan_First_NDC.pdf \n",
375
+ "2 Afghanistan Afghanistan_First_NDC.pdf \n",
376
+ "3 Afghanistan Afghanistan_First_NDC.pdf \n",
377
+ "4 Afghanistan Afghanistan_First_NDC.pdf \n",
378
+ "... ... ... \n",
379
+ "60123 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
380
+ "60124 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
381
+ "60125 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
382
+ "60126 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
383
+ "60127 Zimbabwe Zimbabwe_NDC30_Country_Statement.pdf \n",
384
+ "\n",
385
+ " filepath indicated_page \\\n",
386
+ "0 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
387
+ "1 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
388
+ "2 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
389
+ "3 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
390
+ "4 ../data/raw/pdfs\\Afghanistan\\Afghanistan_First... 1 \n",
391
+ "... ... ... \n",
392
+ "60123 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
393
+ "60124 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
394
+ "60125 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 40 \n",
395
+ "60126 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 41 \n",
396
+ "60127 ../data/raw/pdfs\\Zimbabwe\\Zimbabwe_NDC30_Count... 42 \n",
397
+ "\n",
398
+ " chunk_num text \\\n",
399
+ "0 1 1 ISLAMIC REPUBLIC OF AFGHANISTAN Intended Nat... \n",
400
+ "1 2 its Intended Nationally Determined Contributio... \n",
401
+ "2 3 atural resource management, agriculture, waste... \n",
402
+ "3 4 ss as usual (BAU) 2030 scenario, conditional o... \n",
403
+ "4 5 or Afghanistan showing 13.6% relative reductio... \n",
404
+ "... ... ... \n",
405
+ "60123 845 ILDING, EDUCATION, TRAINING AND AWARENESS The ... \n",
406
+ "60124 846 ious sectors. The enhanced integration of clim... \n",
407
+ "60125 847 pacity building and innovation. In addition, t... \n",
408
+ "60126 848 35 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT \n",
409
+ "60127 849 36 ZIMBABWE’S NDC3.0 COUNTRY STATEMENT Ministr... \n",
410
+ "\n",
411
+ " contains_thematic_scope contains_coverage contains_Granularity \\\n",
412
+ "0 NaN NaN NaN \n",
413
+ "1 NaN NaN NaN \n",
414
+ "2 NaN NaN NaN \n",
415
+ "3 NaN NaN NaN \n",
416
+ "4 NaN NaN NaN \n",
417
+ "... ... ... ... \n",
418
+ "60123 NaN NaN NaN \n",
419
+ "60124 NaN NaN NaN \n",
420
+ "60125 NaN NaN NaN \n",
421
+ "60126 NaN NaN NaN \n",
422
+ "60127 NaN NaN NaN \n",
423
+ "\n",
424
+ " is_target_quote matched_quote \\\n",
425
+ "0 0 None \n",
426
+ "1 0 None \n",
427
+ "2 1 Target Years: \\n2020 to 2030 \\nContribution Ty... \n",
428
+ "3 0 None \n",
429
+ "4 0 None \n",
430
+ "... ... ... \n",
431
+ "60123 0 None \n",
432
+ "60124 0 None \n",
433
+ "60125 0 None \n",
434
+ "60126 0 None \n",
435
+ "60127 0 None \n",
436
+ "\n",
437
+ " match_score \n",
438
+ "0 0 \n",
439
+ "1 0 \n",
440
+ "2 98 \n",
441
+ "3 0 \n",
442
+ "4 0 \n",
443
+ "... ... \n",
444
+ "60123 0 \n",
445
+ "60124 0 \n",
446
+ "60125 0 \n",
447
+ "60126 0 \n",
448
+ "60127 0 \n",
449
+ "\n",
450
+ "[60128 rows x 12 columns]"
451
+ ]
452
+ },
453
+ "execution_count": 7,
454
+ "metadata": {},
455
+ "output_type": "execute_result"
456
+ }
457
+ ],
458
+ "source": [
459
+ "chunked_pdfs_df"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 8,
465
+ "id": "9d7038e9",
466
+ "metadata": {},
467
+ "outputs": [],
468
+ "source": [
469
+ "chunked_pdfs_df.to_csv('./fuzzy_matched_chunks.csv')"
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "code",
474
+ "execution_count": 9,
475
+ "id": "76b51ab6",
476
+ "metadata": {},
477
+ "outputs": [
478
+ {
479
+ "name": "stdout",
480
+ "output_type": "stream",
481
+ "text": [
482
+ "/c/Users/Derik/Desktop/NDC_Scraper/Classification Model/tf_idf_lr_model\n"
483
+ ]
484
+ }
485
+ ],
486
+ "source": [
487
+ "!pwd"
488
+ ]
489
+ }
490
+ ],
491
+ "metadata": {
492
+ "kernelspec": {
493
+ "display_name": "NDC_extraction_ENV",
494
+ "language": "python",
495
+ "name": "python3"
496
+ },
497
+ "language_info": {
498
+ "codemirror_mode": {
499
+ "name": "ipython",
500
+ "version": 3
501
+ },
502
+ "file_extension": ".py",
503
+ "mimetype": "text/x-python",
504
+ "name": "python",
505
+ "nbconvert_exporter": "python",
506
+ "pygments_lexer": "ipython3",
507
+ "version": "3.9.21"
508
+ }
509
+ },
510
+ "nbformat": 4,
511
+ "nbformat_minor": 5
512
+ }
fuzzy_matched_chunks.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:783c1ee7c7b2ef9a44592e0f7e96e0b290ce88b2337eec0a42460e9ceb0c32fa
3
+ size 23764705
label_encoder.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e37ba6e0f6dee4380507f091d596429ef0fda3c46a997f1798b207785b5247e
3
- size 335
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d2f1f6c74d9339461a40453974dcfcf407a5a78522cef40080d323128fd8f9b
3
+ size 343
model_pipeline.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91e4edd8a43620a74decfe6e96b07d2a8f2934170e3e5c02c9a4a51291bd2e12
3
- size 1181168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cbe13d3943b9379411289561324e244a33fb2208e31912b8380a9896e557af2
3
+ size 111587616