RalphThings committed on
Commit
2949e29
·
verified ·
1 Parent(s): 64d58ed

Upload 2 files

Browse files
Files changed (2) hide show
  1. cookies.py +715 -0
  2. mdconvert.py +1002 -0
cookies.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from requests.cookies import RequestsCookieJar
2
+
3
+
4
+ COOKIES_LIST = [
5
+ {
6
+ "domain": ".youtube.com",
7
+ "expirationDate": 1718884961,
8
+ "hostOnly": False,
9
+ "httpOnly": False,
10
+ "name": "ST-xuwub9",
11
+ "path": "/",
12
+ "sameSite": None,
13
+ "secure": False,
14
+ "session": False,
15
+ "storeId": None,
16
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
17
+ },
18
+ {
19
+ "domain": ".youtube.com",
20
+ "expirationDate": 1753004444.745411,
21
+ "hostOnly": False,
22
+ "httpOnly": True,
23
+ "name": "__Secure-YEC",
24
+ "path": "/",
25
+ "sameSite": "lax",
26
+ "secure": True,
27
+ "session": False,
28
+ "storeId": None,
29
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
30
+ },
31
+ {
32
+ "domain": ".youtube.com",
33
+ "expirationDate": 1753434620.050824,
34
+ "hostOnly": False,
35
+ "httpOnly": True,
36
+ "name": "__Secure-3PSID",
37
+ "path": "/",
38
+ "sameSite": "no_restriction",
39
+ "secure": True,
40
+ "session": False,
41
+ "storeId": None,
42
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
43
+ },
44
+ {
45
+ "domain": ".youtube.com",
46
+ "expirationDate": 1750420959.974642,
47
+ "hostOnly": False,
48
+ "httpOnly": False,
49
+ "name": "SIDCC",
50
+ "path": "/",
51
+ "sameSite": None,
52
+ "secure": False,
53
+ "session": False,
54
+ "storeId": None,
55
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
56
+ },
57
+ {
58
+ "domain": ".youtube.com",
59
+ "expirationDate": 1753434620.050652,
60
+ "hostOnly": False,
61
+ "httpOnly": False,
62
+ "name": "SID",
63
+ "path": "/",
64
+ "sameSite": None,
65
+ "secure": False,
66
+ "session": False,
67
+ "storeId": None,
68
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
69
+ },
70
+ {
71
+ "domain": ".youtube.com",
72
+ "expirationDate": 1750420958.397534,
73
+ "hostOnly": False,
74
+ "httpOnly": True,
75
+ "name": "__Secure-1PSIDTS",
76
+ "path": "/",
77
+ "sameSite": None,
78
+ "secure": True,
79
+ "session": False,
80
+ "storeId": None,
81
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
82
+ },
83
+ {
84
+ "domain": ".youtube.com",
85
+ "expirationDate": 1753433494.44729,
86
+ "hostOnly": False,
87
+ "httpOnly": False,
88
+ "name": "_ga_M0180HEFCY",
89
+ "path": "/",
90
+ "sameSite": None,
91
+ "secure": False,
92
+ "session": False,
93
+ "storeId": None,
94
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
95
+ },
96
+ {
97
+ "domain": ".youtube.com",
98
+ "expirationDate": 1753434620.050933,
99
+ "hostOnly": False,
100
+ "httpOnly": False,
101
+ "name": "SAPISID",
102
+ "path": "/",
103
+ "sameSite": None,
104
+ "secure": True,
105
+ "session": False,
106
+ "storeId": None,
107
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
108
+ },
109
+ {
110
+ "domain": ".youtube.com",
111
+ "expirationDate": 1750420959.974764,
112
+ "hostOnly": False,
113
+ "httpOnly": True,
114
+ "name": "__Secure-1PSIDCC",
115
+ "path": "/",
116
+ "sameSite": None,
117
+ "secure": True,
118
+ "session": False,
119
+ "storeId": None,
120
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
121
+ },
122
+ {
123
+ "domain": ".youtube.com",
124
+ "expirationDate": 1753434620.050881,
125
+ "hostOnly": False,
126
+ "httpOnly": True,
127
+ "name": "SSID",
128
+ "path": "/",
129
+ "sameSite": None,
130
+ "secure": True,
131
+ "session": False,
132
+ "storeId": None,
133
+ "value": "AmlwXHnQvOQ10LVd-",
134
+ },
135
+ {
136
+ "domain": ".youtube.com",
137
+ "expirationDate": 1753434620.050959,
138
+ "hostOnly": False,
139
+ "httpOnly": False,
140
+ "name": "__Secure-1PAPISID",
141
+ "path": "/",
142
+ "sameSite": None,
143
+ "secure": True,
144
+ "session": False,
145
+ "storeId": None,
146
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
147
+ },
148
+ {
149
+ "domain": ".youtube.com",
150
+ "expirationDate": 1753434620.050795,
151
+ "hostOnly": False,
152
+ "httpOnly": True,
153
+ "name": "__Secure-1PSID",
154
+ "path": "/",
155
+ "sameSite": None,
156
+ "secure": True,
157
+ "session": False,
158
+ "storeId": None,
159
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
160
+ },
161
+ {
162
+ "domain": ".youtube.com",
163
+ "expirationDate": 1753434620.050993,
164
+ "hostOnly": False,
165
+ "httpOnly": False,
166
+ "name": "__Secure-3PAPISID",
167
+ "path": "/",
168
+ "sameSite": "no_restriction",
169
+ "secure": True,
170
+ "session": False,
171
+ "storeId": None,
172
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
173
+ },
174
+ {
175
+ "domain": ".youtube.com",
176
+ "expirationDate": 1750420959.974815,
177
+ "hostOnly": False,
178
+ "httpOnly": True,
179
+ "name": "__Secure-3PSIDCC",
180
+ "path": "/",
181
+ "sameSite": "no_restriction",
182
+ "secure": True,
183
+ "session": False,
184
+ "storeId": None,
185
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
186
+ },
187
+ {
188
+ "domain": ".youtube.com",
189
+ "expirationDate": 1750420958.397647,
190
+ "hostOnly": False,
191
+ "httpOnly": True,
192
+ "name": "__Secure-3PSIDTS",
193
+ "path": "/",
194
+ "sameSite": "no_restriction",
195
+ "secure": True,
196
+ "session": False,
197
+ "storeId": None,
198
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
199
+ },
200
+ {
201
+ "domain": ".youtube.com",
202
+ "expirationDate": 1753434620.050908,
203
+ "hostOnly": False,
204
+ "httpOnly": False,
205
+ "name": "APISID",
206
+ "path": "/",
207
+ "sameSite": None,
208
+ "secure": False,
209
+ "session": False,
210
+ "storeId": None,
211
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
212
+ },
213
+ {
214
+ "domain": ".youtube.com",
215
+ "expirationDate": 1753434620.050855,
216
+ "hostOnly": False,
217
+ "httpOnly": True,
218
+ "name": "HSID",
219
+ "path": "/",
220
+ "sameSite": None,
221
+ "secure": False,
222
+ "session": False,
223
+ "storeId": None,
224
+ "value": "AasA7hmRuTFv7vjoq",
225
+ },
226
+ {
227
+ "domain": ".youtube.com",
228
+ "expirationDate": 1753435873.577793,
229
+ "hostOnly": False,
230
+ "httpOnly": True,
231
+ "name": "LOGIN_INFO",
232
+ "path": "/",
233
+ "sameSite": "no_restriction",
234
+ "secure": True,
235
+ "session": False,
236
+ "storeId": None,
237
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
238
+ },
239
+ {
240
+ "domain": ".youtube.com",
241
+ "expirationDate": 1753444956.555608,
242
+ "hostOnly": False,
243
+ "httpOnly": False,
244
+ "name": "PREF",
245
+ "path": "/",
246
+ "sameSite": None,
247
+ "secure": True,
248
+ "session": False,
249
+ "storeId": None,
250
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
251
+ },
252
+ ]
253
+
254
+ COOKIES_LIST += [
255
+ {
256
+ "domain": ".www.researchgate.net",
257
+ "hostOnly": False,
258
+ "httpOnly": True,
259
+ "name": "isInstIp",
260
+ "path": "/",
261
+ "sameSite": None,
262
+ "secure": True,
263
+ "session": True,
264
+ "storeId": None,
265
+ "value": "False",
266
+ },
267
+ {
268
+ "domain": ".researchgate.net",
269
+ "expirationDate": 1734423981,
270
+ "hostOnly": False,
271
+ "httpOnly": False,
272
+ "name": "__eoi",
273
+ "path": "/",
274
+ "sameSite": None,
275
+ "secure": False,
276
+ "session": False,
277
+ "storeId": None,
278
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
279
+ },
280
+ {
281
+ "domain": ".www.researchgate.net",
282
+ "expirationDate": 1753444909.646103,
283
+ "hostOnly": False,
284
+ "httpOnly": True,
285
+ "name": "ptc",
286
+ "path": "/",
287
+ "sameSite": None,
288
+ "secure": True,
289
+ "session": False,
290
+ "storeId": None,
291
+ "value": "RG1.8947708639250500550.1718872043",
292
+ },
293
+ {
294
+ "domain": ".researchgate.net",
295
+ "expirationDate": 1750507578,
296
+ "hostOnly": False,
297
+ "httpOnly": False,
298
+ "name": "euconsent-v2-didomi",
299
+ "path": "/",
300
+ "sameSite": "lax",
301
+ "secure": True,
302
+ "session": False,
303
+ "storeId": None,
304
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
305
+ },
306
+ {
307
+ "domain": ".researchgate.net",
308
+ "expirationDate": 1718885236,
309
+ "hostOnly": False,
310
+ "httpOnly": False,
311
+ "name": "_gat",
312
+ "path": "/",
313
+ "sameSite": None,
314
+ "secure": False,
315
+ "session": False,
316
+ "storeId": None,
317
+ "value": "1",
318
+ },
319
+ {
320
+ "domain": "www.researchgate.net",
321
+ "expirationDate": 1721477183,
322
+ "hostOnly": True,
323
+ "httpOnly": False,
324
+ "name": "_pbjs_userid_consent_data",
325
+ "path": "/",
326
+ "sameSite": "lax",
327
+ "secure": False,
328
+ "session": False,
329
+ "storeId": None,
330
+ "value": "3524755945110770",
331
+ },
332
+ {
333
+ "domain": ".researchgate.net",
334
+ "expirationDate": 1752567981,
335
+ "hostOnly": False,
336
+ "httpOnly": False,
337
+ "name": "__gads",
338
+ "path": "/",
339
+ "sameSite": None,
340
+ "secure": False,
341
+ "session": False,
342
+ "storeId": None,
343
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
344
+ },
345
+ {
346
+ "domain": ".researchgate.net",
347
+ "expirationDate": 1718886709.646173,
348
+ "hostOnly": False,
349
+ "httpOnly": True,
350
+ "name": "__cf_bm",
351
+ "path": "/",
352
+ "sameSite": "no_restriction",
353
+ "secure": True,
354
+ "session": False,
355
+ "storeId": None,
356
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
357
+ },
358
+ {
359
+ "domain": ".researchgate.net",
360
+ "expirationDate": 1752567981,
361
+ "hostOnly": False,
362
+ "httpOnly": False,
363
+ "name": "__gpi",
364
+ "path": "/",
365
+ "sameSite": None,
366
+ "secure": False,
367
+ "session": False,
368
+ "storeId": None,
369
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
370
+ },
371
+ {
372
+ "domain": ".researchgate.net",
373
+ "hostOnly": False,
374
+ "httpOnly": True,
375
+ "name": "_cfuvid",
376
+ "path": "/",
377
+ "sameSite": "no_restriction",
378
+ "secure": True,
379
+ "session": True,
380
+ "storeId": None,
381
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
382
+ },
383
+ {
384
+ "domain": ".researchgate.net",
385
+ "expirationDate": 1753445177.271667,
386
+ "hostOnly": False,
387
+ "httpOnly": False,
388
+ "name": "_ga",
389
+ "path": "/",
390
+ "sameSite": None,
391
+ "secure": False,
392
+ "session": False,
393
+ "storeId": None,
394
+ "value": "GA1.1.1525244793.1718885177",
395
+ },
396
+ {
397
+ "domain": ".researchgate.net",
398
+ "expirationDate": 1753445177.271482,
399
+ "hostOnly": False,
400
+ "httpOnly": False,
401
+ "name": "_ga_4P31SJ70EJ",
402
+ "path": "/",
403
+ "sameSite": None,
404
+ "secure": False,
405
+ "session": False,
406
+ "storeId": None,
407
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
408
+ },
409
+ {
410
+ "domain": ".researchgate.net",
411
+ "expirationDate": 1718971576,
412
+ "hostOnly": False,
413
+ "httpOnly": False,
414
+ "name": "_gid",
415
+ "path": "/",
416
+ "sameSite": None,
417
+ "secure": False,
418
+ "session": False,
419
+ "storeId": None,
420
+ "value": "GA1.2.854907463.1718885177",
421
+ },
422
+ {
423
+ "domain": ".www.researchgate.net",
424
+ "expirationDate": 1750407982.506505,
425
+ "hostOnly": False,
426
+ "httpOnly": True,
427
+ "name": "did",
428
+ "path": "/",
429
+ "sameSite": None,
430
+ "secure": True,
431
+ "session": False,
432
+ "storeId": None,
433
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
434
+ },
435
+ {
436
+ "domain": ".researchgate.net",
437
+ "expirationDate": 1750507578,
438
+ "hostOnly": False,
439
+ "httpOnly": False,
440
+ "name": "didomi_token",
441
+ "path": "/",
442
+ "sameSite": "lax",
443
+ "secure": True,
444
+ "session": False,
445
+ "storeId": None,
446
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
447
+ },
448
+ {
449
+ "domain": ".www.researchgate.net",
450
+ "hostOnly": False,
451
+ "httpOnly": True,
452
+ "name": "hasPdpNext",
453
+ "path": "/",
454
+ "sameSite": None,
455
+ "secure": True,
456
+ "session": True,
457
+ "storeId": None,
458
+ "value": "False",
459
+ },
460
+ {
461
+ "domain": ".researchgate.net",
462
+ "expirationDate": 1750421183,
463
+ "hostOnly": False,
464
+ "httpOnly": False,
465
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
466
+ "path": "/",
467
+ "sameSite": "lax",
468
+ "secure": True,
469
+ "session": False,
470
+ "storeId": None,
471
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
472
+ },
473
+ {
474
+ "domain": ".www.researchgate.net",
475
+ "hostOnly": False,
476
+ "httpOnly": True,
477
+ "name": "sid",
478
+ "path": "/",
479
+ "sameSite": None,
480
+ "secure": True,
481
+ "session": True,
482
+ "storeId": None,
483
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
484
+ },
485
+ ]
486
+
487
+ COOKIES_LIST += [
488
+ {
489
+ "domain": "github.com",
490
+ "hostOnly": True,
491
+ "httpOnly": True,
492
+ "name": "_gh_sess",
493
+ "path": "/",
494
+ "sameSite": "lax",
495
+ "secure": True,
496
+ "session": True,
497
+ "storeId": None,
498
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
499
+ },
500
+ {
501
+ "domain": ".github.com",
502
+ "expirationDate": 1750408875.763785,
503
+ "hostOnly": False,
504
+ "httpOnly": False,
505
+ "name": "_octo",
506
+ "path": "/",
507
+ "sameSite": "lax",
508
+ "secure": True,
509
+ "session": False,
510
+ "storeId": None,
511
+ "value": "GH1.1.728652011.1718872875",
512
+ },
513
+ {
514
+ "domain": ".github.com",
515
+ "expirationDate": 1750408875.763926,
516
+ "hostOnly": False,
517
+ "httpOnly": True,
518
+ "name": "logged_in",
519
+ "path": "/",
520
+ "sameSite": "lax",
521
+ "secure": True,
522
+ "session": False,
523
+ "storeId": None,
524
+ "value": "no",
525
+ },
526
+ {
527
+ "domain": ".github.com",
528
+ "hostOnly": False,
529
+ "httpOnly": False,
530
+ "name": "preferred_color_mode",
531
+ "path": "/",
532
+ "sameSite": "lax",
533
+ "secure": True,
534
+ "session": True,
535
+ "storeId": None,
536
+ "value": "dark",
537
+ },
538
+ {
539
+ "domain": ".github.com",
540
+ "hostOnly": False,
541
+ "httpOnly": False,
542
+ "name": "tz",
543
+ "path": "/",
544
+ "sameSite": "lax",
545
+ "secure": True,
546
+ "session": True,
547
+ "storeId": None,
548
+ "value": "Europe%2FParis",
549
+ },
550
+ ]
551
+
552
+ COOKIES_LIST += [
553
+ {
554
+ "domain": ".web.archive.org",
555
+ "expirationDate": 1718886430,
556
+ "hostOnly": False,
557
+ "httpOnly": False,
558
+ "name": "_gat",
559
+ "path": "/web/20201123221659/http://orcid.org/",
560
+ "sameSite": None,
561
+ "secure": False,
562
+ "session": False,
563
+ "storeId": None,
564
+ "value": "1",
565
+ },
566
+ {
567
+ "domain": ".web.archive.org",
568
+ "expirationDate": 1718972770,
569
+ "hostOnly": False,
570
+ "httpOnly": False,
571
+ "name": "_gid",
572
+ "path": "/web/20201123221659/http://orcid.org/",
573
+ "sameSite": None,
574
+ "secure": False,
575
+ "session": False,
576
+ "storeId": None,
577
+ "value": "GA1.2.402246368.1606169825",
578
+ },
579
+ {
580
+ "domain": ".web.archive.org",
581
+ "expirationDate": 1753446370.315621,
582
+ "hostOnly": False,
583
+ "httpOnly": False,
584
+ "name": "_ga",
585
+ "path": "/web/20201123221659/http://orcid.org/",
586
+ "sameSite": None,
587
+ "secure": False,
588
+ "session": False,
589
+ "storeId": None,
590
+ "value": "GA1.2.1301409987.1606169825",
591
+ },
592
+ {
593
+ "domain": ".web.archive.org",
594
+ "expirationDate": 1750422367,
595
+ "hostOnly": False,
596
+ "httpOnly": False,
597
+ "name": "_hjid",
598
+ "path": "/web/20201123221659/http://orcid.org/",
599
+ "sameSite": "lax",
600
+ "secure": False,
601
+ "session": False,
602
+ "storeId": None,
603
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
604
+ },
605
+ {
606
+ "domain": ".web.archive.org",
607
+ "expirationDate": 1718888167,
608
+ "hostOnly": False,
609
+ "httpOnly": False,
610
+ "name": "_hjFirstSeen",
611
+ "path": "/web/20201123221659/http://orcid.org/",
612
+ "sameSite": "lax",
613
+ "secure": False,
614
+ "session": False,
615
+ "storeId": None,
616
+ "value": "1",
617
+ },
618
+ ]
619
+ COOKIES_LIST += [
620
+ {
621
+ "domain": "orcid.org",
622
+ "hostOnly": True,
623
+ "httpOnly": False,
624
+ "name": "AWSELBCORS",
625
+ "path": "/",
626
+ "sameSite": "no_restriction",
627
+ "secure": True,
628
+ "session": True,
629
+ "storeId": None,
630
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
631
+ },
632
+ {
633
+ "domain": ".orcid.org",
634
+ "expirationDate": 1753452454.637671,
635
+ "hostOnly": False,
636
+ "httpOnly": False,
637
+ "name": "_ga_9R61FWK9H5",
638
+ "path": "/",
639
+ "sameSite": None,
640
+ "secure": False,
641
+ "session": False,
642
+ "storeId": None,
643
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
644
+ },
645
+ {
646
+ "domain": ".orcid.org",
647
+ "expirationDate": 1753452454.63421,
648
+ "hostOnly": False,
649
+ "httpOnly": False,
650
+ "name": "_ga",
651
+ "path": "/",
652
+ "sameSite": None,
653
+ "secure": False,
654
+ "session": False,
655
+ "storeId": None,
656
+ "value": "GA1.1.2021310691.1718892455",
657
+ },
658
+ {
659
+ "domain": "orcid.org",
660
+ "hostOnly": True,
661
+ "httpOnly": False,
662
+ "name": "AWSELB",
663
+ "path": "/",
664
+ "sameSite": None,
665
+ "secure": False,
666
+ "session": True,
667
+ "storeId": None,
668
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
669
+ },
670
+ {
671
+ "domain": ".orcid.org",
672
+ "expirationDate": 1750428454,
673
+ "hostOnly": False,
674
+ "httpOnly": False,
675
+ "name": "OptanonAlertBoxClosed",
676
+ "path": "/",
677
+ "sameSite": "lax",
678
+ "secure": False,
679
+ "session": False,
680
+ "storeId": None,
681
+ "value": "2024-06-20T14:07:34.583Z",
682
+ },
683
+ {
684
+ "domain": ".orcid.org",
685
+ "expirationDate": 1750428454,
686
+ "hostOnly": False,
687
+ "httpOnly": False,
688
+ "name": "OptanonConsent",
689
+ "path": "/",
690
+ "sameSite": "lax",
691
+ "secure": False,
692
+ "session": False,
693
+ "storeId": None,
694
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
695
+ },
696
+ {
697
+ "domain": "orcid.org",
698
+ "hostOnly": True,
699
+ "httpOnly": False,
700
+ "name": "XSRF-TOKEN",
701
+ "path": "/",
702
+ "sameSite": None,
703
+ "secure": True,
704
+ "session": True,
705
+ "storeId": None,
706
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
707
+ },
708
+ ]
709
+
710
+ # Create a RequestsCookieJar instance
711
+ COOKIES = RequestsCookieJar()
712
+
713
+ # Add cookies to the jar
714
+ for cookie in COOKIES_LIST:
715
+ COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
mdconvert.py ADDED
@@ -0,0 +1,1002 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
2
+ # Thanks to Microsoft researchers for open-sourcing this!
3
+ # type: ignore
4
+ import base64
5
+ import copy
6
+ import html
7
+ import json
8
+ import mimetypes
9
+ import os
10
+ import re
11
+ import shutil
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+ import traceback
16
+ import zipfile
17
+ from typing import Any
18
+ from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
19
+
20
+ import mammoth
21
+ import markdownify
22
+ import pandas as pd
23
+ import pdfminer
24
+ import pdfminer.high_level
25
+ import pptx
26
+
27
+ # File-format detection
28
+ import puremagic
29
+ import pydub
30
+ import requests
31
+ import speech_recognition as sr
32
+ from bs4 import BeautifulSoup
33
+ from youtube_transcript_api import YouTubeTranscriptApi
34
+ from youtube_transcript_api.formatters import SRTFormatter
35
+
36
+
37
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    MarkdownConverter subclass with a few adjusted conversions:

    - Headings default to the ATX style ('#', '##', ...) unless overridden.
    - Headings converted as block elements always start on a new line.
    - Links with a scheme other than http, https or file (e.g. javascript:)
      are reduced to their text.
    - Link paths are unquoted then re-quoted so they cannot clash with
      Markdown syntax.
    - Images with a data: URI source keep only the part before the first comma.
    """

    def __init__(self, **options: Any):
        # Respect a caller-supplied heading style; fall back to ATX otherwise.
        options.setdefault("heading_style", markdownify.ATX)
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render a heading, prefixing a newline for block headings that lack one."""
        rendered = super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
        if convert_as_inline or text.startswith("\n"):
            return rendered
        return "\n" + rendered

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Render a link, dropping non-http(s)/file targets and escaping the URL path."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""

        href = el.get("href")
        title = el.get("title")

        if href:
            try:
                parts = urlparse(href)  # type: ignore
                scheme = parts.scheme
                # Anything with an explicit scheme outside the allow-list
                # (javascript:, mailto:, ...) is emitted as bare text.
                if scheme and scheme.lower() not in ("http", "https", "file"):
                    return f"{prefix}{text}{suffix}"
                # Normalise the path: decode first so already-escaped paths
                # are not double-escaped.
                href = urlunparse(parts._replace(path=quote(unquote(parts.path))))  # type: ignore
            except ValueError:  # Unparseable URL: fall back to bare text.
                return f"{prefix}{text}{suffix}"

        opts = self.options
        # Text-node underscores are escaped by markdownify, so undo that
        # before comparing the link text with the target.
        is_autolink = (
            opts["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not opts["default_title"]
        )
        if is_autolink:
            # Shortcut syntax: <url>
            return f"<{href}>"

        if opts["default_title"] and not title:
            title = href

        if title:
            escaped_title = title.replace('"', r"\"")
            title_part = f' "{escaped_title}"'
        else:
            title_part = ""

        if not href:
            return text
        return f"{prefix}[{text}]({href}{title_part}){suffix}"

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render an image, truncating data: URI sources to their header."""
        attrs = el.attrs
        alt = attrs.get("alt", None) or ""
        src = attrs.get("src", None) or ""
        title = attrs.get("title", None) or ""

        if title:
            escaped_title = title.replace('"', r"\"")
            title_part = f' "{escaped_title}"'
        else:
            title_part = ""

        # Inline images collapse to their alt text unless the parent tag is
        # explicitly allowed to keep them.
        if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
            return alt

        # Keep only the "data:<mime>;base64" header of a data URI.
        if src.startswith("data:"):
            src = src.partition(",")[0] + "..."

        return f"![{alt}]({src}{title_part})"

    def convert_soup(self, soup: Any) -> str:
        """Delegate to the parent implementation unchanged."""
        return super().convert_soup(soup)  # type: ignore
110
+
111
+
112
+ class DocumentConverterResult:
113
+ """The result of converting a document to text."""
114
+
115
+ def __init__(self, title: str | None = None, text_content: str = ""):
116
+ self.title: str | None = title
117
+ self.text_content: str = text_content
118
+
119
+
120
class DocumentConverter:
    """Base class for document converters; subclasses override convert()."""

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        """Convert the file at local_path.

        The base implementation always raises NotImplementedError. The
        subclasses visible in this file return None when they do not handle
        the input and a DocumentConverterResult otherwise.
        """
        raise NotImplementedError
125
+
126
+
127
class PlainTextConverter(DocumentConverter):
    """Reads a file as UTF-8 text when its extension maps to a known MIME type."""

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        """Return the file's text, or None if the extension has no known MIME type.

        The MIME type is guessed from the ``file_extension`` keyword argument
        (default ""), not from the file contents.
        """
        extension = kwargs.get("file_extension", "")
        guessed_type, _ = mimetypes.guess_type("__placeholder" + extension)

        # Unknown extension: decline so another converter can try.
        if guessed_type is None:
            return None
        # NOTE: the stricter check rejecting MIME types that do not contain
        # "text/" is currently disabled, so any recognised type is accepted.

        with open(local_path, "rt", encoding="utf-8") as handle:
            body = handle.read()

        return DocumentConverterResult(title=None, text_content=body)
147
+
148
+
149
class HtmlConverter(DocumentConverter):
    """Converts .html / .htm files to Markdown."""

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        """Convert the file at local_path, or return None if it is not HTML.

        The decision is based solely on the ``file_extension`` keyword
        argument (compared case-insensitively).
        """
        suffix = kwargs.get("file_extension", "").lower()
        if suffix not in (".html", ".htm"):
            return None

        with open(local_path, "rt", encoding="utf-8") as handle:
            return self._convert(handle.read())

    def _convert(self, html_content: str) -> None | DocumentConverterResult:
        """Helper function that converts an HTML string."""
        soup = BeautifulSoup(html_content, "html.parser")

        # Strip javascript and style blocks before conversion.
        for unwanted in soup(["script", "style"]):
            unwanted.extract()

        # Convert only the <body> when there is one, else the whole document.
        root = soup.find("body") or soup
        webpage_text = _CustomMarkdownify().convert_soup(root)

        assert isinstance(webpage_text, str)

        title_tag = soup.title
        page_title = title_tag.string if title_tag is not None else None
        return DocumentConverterResult(title=page_title, text_content=webpage_text)
187
+
188
+
189
class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        """Convert a saved Wikipedia article to Markdown.

        Args:
            local_path: Path of the downloaded HTML file.
            **kwargs: ``file_extension`` must be ".html" or ".htm" and ``url``
                must be a ``<lang>.wikipedia.org`` address for this converter
                to apply.

        Returns:
            None when the file is not a Wikipedia page, otherwise a result
            whose text is the article body and whose title is the article
            heading (falling back to the ``<title>`` element).
        """
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        # Both dots are escaped so that look-alike hosts such as
        # "en.wikipediaXorg" are not treated as Wikipedia.
        if not re.search(r"^https?://[a-zA-Z]{2,3}\.wikipedia\.org/", url):
            return None

        # Parse the file
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Keep only the main content
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        main_title = None if soup.title is None else soup.title.string

        if body_elm:
            # Prefer the on-page article heading over the <title> element.
            if title_elm and len(title_elm) > 0:
                heading = title_elm.string
                if heading is None:
                    # `.string` is None when the span contains nested markup;
                    # fall back to the concatenated text instead of failing.
                    heading = title_elm.get_text()
                main_title = str(heading)

            # Only emit a heading when a title was actually found.
            header = f"# {main_title}\n\n" if main_title else ""
            webpage_text = header + _CustomMarkdownify().convert_soup(body_elm)
        else:
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        return DocumentConverterResult(
            title=main_title,
            text_content=webpage_text,
        )
232
+
233
+
234
class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        """Convert a saved YouTube watch page to Markdown.

        Args:
            local_path: Path of the downloaded HTML file.
            **kwargs: ``file_extension`` must be ".html" or ".htm" and ``url``
                must start with "https://www.youtube.com/watch?" for this
                converter to apply.

        Returns:
            None when the page is not a YouTube watch page, otherwise a result
            containing the title, selected metadata, the description and,
            when it can be fetched, the transcript.
        """
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        if not url.startswith("https://www.youtube.com/watch?"):
            return None

        # Parse the file
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Seed the metadata with the <title> text when there is one. A missing
        # <title> is tolerated (instead of asserted on) because the meta tags
        # read below can still supply a title.
        page_title = None if soup.title is None else soup.title.string
        metadata: dict[str, str] = {}
        if page_title is not None:
            metadata["title"] = str(page_title)

        # Read the meta tags: the first itemprop/property/name attribute of
        # each <meta> element becomes the key, its "content" the value.
        for meta in soup(["meta"]):
            for a in meta.attrs:
                if a in ["itemprop", "property", "name"]:
                    metadata[meta[a]] = meta.get("content", "")
                    break

        # We can also try to read the full description. This is more prone to
        # breaking, since it reaches into the page implementation, so any
        # failure here is deliberately ignored.
        try:
            for script in soup(["script"]):
                content = script.text
                if "ytInitialData" in content:
                    # Only the first line of the script is searched for the JSON object.
                    lines = re.split(r"\r?\n", content)
                    obj_start = lines[0].find("{")
                    obj_end = lines[0].rfind("}")
                    if obj_start >= 0 and obj_end >= 0:
                        data = json.loads(lines[0][obj_start : obj_end + 1])
                        attrdesc = self._findKey(data, "attributedDescriptionBodyText")
                        if attrdesc:
                            metadata["description"] = str(attrdesc["content"])
                    break
        except Exception:
            pass

        # Start preparing the page
        webpage_text = "# YouTube\n"

        title = self._get(metadata, ["title", "og:title", "name"])
        if title:
            webpage_text += f"\n## {title}\n"

        stats = ""
        views = self._get(metadata, ["interactionCount"])
        if views:
            stats += f"- **Views:** {views}\n"

        keywords = self._get(metadata, ["keywords"])
        if keywords:
            stats += f"- **Keywords:** {keywords}\n"

        runtime = self._get(metadata, ["duration"])
        if runtime:
            stats += f"- **Runtime:** {runtime}\n"

        if len(stats) > 0:
            webpage_text += f"\n### Video Metadata\n{stats}\n"

        description = self._get(metadata, ["description", "og:description"])
        if description:
            webpage_text += f"\n### Description\n{description}\n"

        transcript_text = ""
        params = parse_qs(urlparse(url).query)
        if "v" in params:
            video_id = str(params["v"][0])
            try:
                # Must be a single transcript.
                transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
                transcript_text = SRTFormatter().format_transcript(transcript)
            except Exception:
                # Best effort: the page is still returned without a transcript.
                pass
        if transcript_text:
            webpage_text += f"\n### Transcript\n{transcript_text}\n"

        # Fall back to the <title> text when the metadata title is empty.
        if not title:
            title = None if page_title is None else str(page_title)

        return DocumentConverterResult(
            title=title,
            text_content=webpage_text,
        )

    def _get(self, metadata: dict[str, str], keys: list[str], default: str | None = None) -> str | None:
        """Return the value of the first key in ``keys`` present in ``metadata``, else ``default``."""
        for k in keys:
            if k in metadata:
                return metadata[k]
        return default

    def _findKey(self, json: Any, key: str) -> Any:
        """Depth-first search of nested lists/dicts for ``key``.

        Returns the value stored under the first match, or None when the key
        does not occur. The ``json`` parameter shadows the ``json`` module
        inside this method only; the module is not used here.
        """
        if isinstance(json, list):
            for elm in json:
                ret = self._findKey(elm, key)
                if ret is not None:
                    return ret
        elif isinstance(json, dict):
            for k in json:
                if k == key:
                    return json[k]
                ret = self._findKey(json[k], key)
                if ret is not None:
                    return ret
        return None
352
+
353
+
354
class PdfConverter(DocumentConverter):
    """
    Extracts the text of PDF files. Style information is mostly lost, so the output is essentially plain text.
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Return the extracted text of a ``.pdf`` file, or None for any other extension."""
        if kwargs.get("file_extension", "").lower() != ".pdf":
            return None

        extracted = pdfminer.high_level.extract_text(local_path)
        return DocumentConverterResult(title=None, text_content=extracted)
369
+
370
+
371
class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown by first rendering them to HTML. Style information (e.g., headings) and tables are preserved where possible.
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Convert a ``.docx`` file to Markdown; return None for any other extension."""
        if kwargs.get("file_extension", "").lower() != ".docx":
            return None

        with open(local_path, "rb") as docx_file:
            # mammoth renders the document to HTML; the inherited HTML helper
            # then turns that HTML into Markdown.
            html_content = mammoth.convert_to_html(docx_file).value
            return self._convert(html_content)
389
+
390
+
391
class XlsxConverter(HtmlConverter):
    """
    Converts Excel workbooks (.xlsx / .xls) to Markdown, emitting one section with a table per sheet.
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Convert every sheet of the workbook; return None for non-Excel extensions."""
        if kwargs.get("file_extension", "").lower() not in (".xlsx", ".xls"):
            return None

        # sheet_name=None makes pandas load all sheets into a name -> DataFrame mapping.
        workbook = pd.read_excel(local_path, sheet_name=None)

        sections = []
        for sheet_name, frame in workbook.items():
            table_md = self._convert(frame.to_html(index=False)).text_content.strip()
            sections.append(f"## {sheet_name}\n{table_md}\n\n")

        return DocumentConverterResult(
            title=None,
            text_content="".join(sections).strip(),
        )
413
+
414
+
415
class PptxConverter(HtmlConverter):
    """
    Converts PPTX presentations to Markdown, covering slide titles, text, tables, pictures (with alt text) and notes.
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Convert a ``.pptx`` file slide by slide; return None for any other extension."""
        if kwargs.get("file_extension", "").lower() != ".pptx":
            return None

        deck = pptx.Presentation(local_path)
        md_content = ""
        for slide_num, slide in enumerate(deck.slides, start=1):
            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

            title_shape = slide.shapes.title
            for shape in slide.shapes:
                # Pictures: emit an image reference with a placeholder file name.
                if self._is_picture(shape):
                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
                    try:
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                    except Exception:
                        alt_text = ""

                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
                    label = alt_text if alt_text else shape.name
                    md_content += "\n![" + label + "](" + filename + ")\n"

                # Tables: build an HTML table (first row as header) and reuse the HTML helper.
                if self._is_table(shape):
                    row_markup = []
                    for row_index, row in enumerate(shape.table.rows):
                        cell_tag = "th" if row_index == 0 else "td"
                        cells = "".join(
                            "<" + cell_tag + ">" + html.escape(cell.text) + "</" + cell_tag + ">"
                            for cell in row.cells
                        )
                        row_markup.append("<tr>" + cells + "</tr>")
                    html_table = "<html><body><table>" + "".join(row_markup) + "</table></body></html>"
                    md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"

                # Text areas: the slide title becomes a top-level heading.
                elif shape.has_text_frame:
                    if shape == title_shape:
                        md_content += "# " + shape.text.lstrip() + "\n"
                    else:
                        md_content += shape.text + "\n"

            md_content = md_content.strip()

            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    md_content += notes_frame.text
                md_content = md_content.strip()

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _is_picture(self, shape):
        """Return True for picture shapes and for placeholders that expose an ``image`` attribute."""
        kind = shape.shape_type
        if kind == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        return kind == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")

    def _is_table(self, shape):
        """Return True when the shape is a table."""
        return shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE
499
+
500
+
501
class MediaConverter(DocumentConverter):
    """
    Shared base for media converters (e.g., images and audio); provides metadata extraction.
    """

    def _get_metadata(self, local_path):
        """Return the first record of ``exiftool -json`` for the file.

        Returns None when ``exiftool`` is not on the PATH or when running it
        or parsing its output fails for any reason.
        """
        exiftool = shutil.which("exiftool")
        if not exiftool:
            return None

        try:
            completed = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True)
            return json.loads(completed.stdout)[0]
        except Exception:
            return None
516
+
517
+
518
class WavConverter(MediaConverter):
    """
    Converts WAV files to markdown: selected metadata fields (when `exiftool` is available) followed by a speech transcript produced with `speech_recognition`.
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Convert a ``.wav`` file; return None for any other extension."""
        # Bail if not a WAV
        if kwargs.get("file_extension", "").lower() != ".wav":
            return None

        md_content = ""

        # Metadata: one "Field: value" line per field that is present.
        wanted_fields = (
            "Title",
            "Artist",
            "Author",
            "Band",
            "Album",
            "Genre",
            "Track",
            "DateTimeOriginal",
            "CreateDate",
            "Duration",
        )
        metadata = self._get_metadata(local_path)
        if metadata:
            md_content += "".join(f"{name}: {metadata[name]}\n" for name in wanted_fields if name in metadata)

        # Transcript: any failure is reported in the output rather than raised.
        try:
            spoken = self._transcribe_audio(local_path)
            section = "[No speech detected]" if spoken == "" else spoken
            md_content += "\n\n### Audio Transcript:\n" + section
        except Exception:
            md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _transcribe_audio(self, local_path) -> str:
        """Transcribe the audio file with ``recognize_google`` and return the stripped text."""
        recognizer = sr.Recognizer()
        with sr.AudioFile(local_path) as source:
            recording = recognizer.record(source)
            recognized = recognizer.recognize_google(recording)
            return recognized.strip()
566
+
567
+
568
class Mp3Converter(WavConverter):
    """
    Converts MP3 and M4A files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Convert an ``.mp3`` or ``.m4a`` file; return None for any other extension.

        The audio is decoded with pydub into a temporary WAV file, which is
        transcribed with the inherited ``_transcribe_audio`` and then deleted.
        Decoding errors propagate to the caller; transcription errors are
        reported inside the returned text instead.
        """
        # Bail if not an MP3 or M4A
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".mp3", ".m4a"]:
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                "Duration",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Transcribe via a temporary WAV file. mkstemp returns an open
        # descriptor, which is closed right away because pydub writes by path.
        handle, temp_path = tempfile.mkstemp(suffix=".wav")
        os.close(handle)
        try:
            if extension.lower() == ".mp3":
                sound = pydub.AudioSegment.from_mp3(local_path)
            else:
                sound = pydub.AudioSegment.from_file(local_path, format="m4a")
            sound.export(temp_path, format="wav")

            try:
                transcript = super()._transcribe_audio(temp_path).strip()
                md_content += "\n\n### Audio Transcript:\n" + (
                    "[No speech detected]" if transcript == "" else transcript
                )
            except Exception:
                md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."

        finally:
            # Always remove the temporary file, even when decoding fails.
            os.unlink(temp_path)

        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )
629
+
630
+
631
class ZipConverter(DocumentConverter):
    """
    Unpacks ZIP archives into a persistent local directory and reports which files were extracted.
    """

    def __init__(self, extract_dir: str = "downloads"):
        """
        Remember the extraction directory and make sure it exists.

        Args:
            extract_dir: Directory that archives are unpacked into. Defaults to "downloads"
        """
        self.extract_dir = extract_dir
        # Created eagerly at construction time; an existing directory is fine.
        os.makedirs(self.extract_dir, exist_ok=True)

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        """Extract the archive and return a Markdown bullet list of its files.

        Returns None when the extension is not ".zip" or when the file is not
        actually a ZIP archive.
        """
        if kwargs.get("file_extension", "").lower() != ".zip":
            return None
        if not zipfile.is_zipfile(local_path):
            return None

        with zipfile.ZipFile(local_path, "r") as archive:
            archive.extractall(self.extract_dir)
            member_names = archive.namelist()

        # Directory entries end with "/" and are left out of the listing;
        # sorting keeps the output stable.
        extracted_files = sorted(
            self.extract_dir + "/" + name for name in member_names if not name.endswith("/")
        )

        listing = "".join(f"* {path}\n" for path in extracted_files)
        md_content = "Downloaded the following files:\n" + listing

        return DocumentConverterResult(title="Extracted Files", text_content=md_content.strip())
677
+
678
+
679
class ImageConverter(MediaConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed) and description via a multimodal LLM (if an mlm_client is configured).
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Convert a ``.jpg``/``.jpeg``/``.png`` file; return None for any other extension.

        Args:
            local_path: Path of the image file.
            **kwargs: ``file_extension`` selects the converter. When both
                ``mlm_client`` and ``mlm_model`` are given, a model-written
                description is appended; ``mlm_prompt`` optionally overrides
                the default prompt.
        """
        # Bail if not an image
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            for f in [
                "ImageSize",
                "Title",
                "Caption",
                "Description",
                "Keywords",
                "Artist",
                "Author",
                "DateTimeOriginal",
                "CreateDate",
                "GPSPosition",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Try describing the image with the multimodal model
        mlm_client = kwargs.get("mlm_client")
        mlm_model = kwargs.get("mlm_model")
        if mlm_client is not None and mlm_model is not None:
            description = self._get_mlm_description(
                local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
            )
            # The reply content may be None; treat that as an empty
            # description instead of failing on None.strip().
            md_content += "\n# Description:\n" + (description or "").strip() + "\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
        """Ask the model to describe the image and return the reply content.

        The image is sent inline as a base64 ``data:`` URI. A missing or blank
        ``prompt`` is replaced by a default captioning prompt. The prompt is
        echoed to stderr.
        """
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")

        with open(local_path, "rb") as image_file:
            # Only the extension matters to mimetypes; fall back to JPEG when unknown.
            content_type, _ = mimetypes.guess_type("_dummy" + extension)
            if content_type is None:
                content_type = "image/jpeg"
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
            data_uri = f"data:{content_type};base64,{image_base64}"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": data_uri,
                        },
                    },
                ],
            }
        ]

        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
758
+
759
+
760
class FileConversionException(Exception):
    """Raised when no converter produced a result and at least one converter raised an error."""
762
+
763
+
764
class UnsupportedFormatException(Exception):
    """Raised when no registered converter handles the file and none of them raised an error."""
766
+
767
+
768
class MarkdownConverter:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""

    def __init__(
        self,
        requests_session: requests.Session | None = None,
        mlm_client: Any | None = None,
        mlm_model: Any | None = None,
    ):
        """Create a converter with the built-in page converters registered.

        Args:
            requests_session: Session used for URL downloads; a new one is
                created when omitted.
            mlm_client: Optional client passed to converters as ``mlm_client``.
            mlm_model: Optional model passed to converters as ``mlm_model``.
        """
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session

        self._mlm_client = mlm_client
        self._mlm_model = mlm_model

        self._page_converters: list[DocumentConverter] = []

        # Register converters for successful browsing operations
        # Later registrations are tried first / take higher priority than earlier registrations
        # To this end, the most specific converters should appear below the most generic converters
        self.register_page_converter(PlainTextConverter())
        self.register_page_converter(HtmlConverter())
        self.register_page_converter(WikipediaConverter())
        self.register_page_converter(YouTubeConverter())
        self.register_page_converter(DocxConverter())
        self.register_page_converter(XlsxConverter())
        self.register_page_converter(PptxConverter())
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(ZipConverter())
        self.register_page_converter(PdfConverter())

    def convert(
        self, source: str | requests.Response, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """Convert a path, URL or HTTP response to Markdown.

        Args:
            source: A local path, a URL starting with http://, https:// or
                file://, or a ``requests.Response`` object.
            **kwargs: Forwarded to the specific ``convert_*`` method. The
                ``file_extension`` keyword gives the extension to try first;
                further candidates are inferred from the source.

        Returns:
            The conversion result. Any other ``source`` type falls through
            and yields None.
        """
        # Local path or url
        if isinstance(source, str):
            if source.startswith(("http://", "https://", "file://")):
                return self.convert_url(source, **kwargs)
            return self.convert_local(source, **kwargs)
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)

    def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
        """Convert a local file, trying the explicit, path-derived and magic-guessed extensions in that order."""
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Get extension alternatives from the path and puremagic
        _, path_ext = os.path.splitext(path)
        self._append_ext(extensions, path_ext)
        self._append_ext(extensions, self._guess_ext_magic(path))

        # Convert
        return self._convert(path, extensions, **kwargs)

    # TODO what should stream's type be?
    def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
        """Convert the content of a readable stream (``str`` or ``bytes``) via a temporary file."""
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Save the file locally to a temporary file. It will be deleted before this method exits
        handle, temp_path = tempfile.mkstemp()
        fh = os.fdopen(handle, "wb")
        result = None
        try:
            # Write to the temporary file; text content is stored as UTF-8
            content = stream.read()
            if isinstance(content, str):
                fh.write(content.encode("utf-8"))
            else:
                fh.write(content)
            fh.close()

            # Use puremagic to check for more extension options
            self._append_ext(extensions, self._guess_ext_magic(temp_path))

            # Convert
            result = self._convert(temp_path, extensions, **kwargs)
        # Clean up
        finally:
            try:
                fh.close()
            except Exception:
                pass
            os.unlink(temp_path)

        return result

    def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: fix kwargs type
        """Download ``url`` with the configured session and convert the response.

        Raises:
            requests.HTTPError: when the server answers with an error status.
        """
        # Send a HTTP request to the URL
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
        response.raise_for_status()
        return self.convert_response(response, **kwargs)

    def convert_response(
        self, response: requests.Response, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO fix kwargs type
        """Convert the body of an HTTP response.

        Candidate extensions come, in order, from ``file_extension``, the
        Content-Type header, the Content-Disposition file name, the URL path
        and the downloaded bytes. Errors during download or conversion are
        printed and None is returned in that case.
        """
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Guess from the mimetype
        content_type = response.headers.get("content-type", "").split(";")[0]
        self._append_ext(extensions, mimetypes.guess_extension(content_type))

        # Read the content disposition if there is one
        content_disposition = response.headers.get("content-disposition", "")
        m = re.search(r"filename=([^;]+)", content_disposition)
        if m:
            _, disposition_ext = os.path.splitext(m.group(1).strip("\"'"))
            self._append_ext(extensions, disposition_ext)

        # Read the extension from the URL path
        _, url_ext = os.path.splitext(urlparse(response.url).path)
        self._append_ext(extensions, url_ext)

        # Forward the caller's options like the other convert_* methods do;
        # the final response URL always wins over a caller-supplied "url".
        convert_kwargs = {**kwargs, "url": response.url}

        # Save the file locally to a temporary file. It will be deleted before this method exits
        handle, temp_path = tempfile.mkstemp()
        fh = os.fdopen(handle, "wb")
        result = None
        try:
            # Download the file
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)
            fh.close()

            # Use puremagic to check for more extension options
            self._append_ext(extensions, self._guess_ext_magic(temp_path))

            # Convert
            result = self._convert(temp_path, extensions, **convert_kwargs)
        except Exception as e:
            print(f"Error in converting: {e}")

        # Clean up
        finally:
            try:
                fh.close()
            except Exception:
                pass
            os.unlink(temp_path)

        return result

    def _convert(self, local_path: str, extensions: list[str | None], **kwargs) -> DocumentConverterResult:
        """Try every registered converter with every candidate extension.

        Extensions are tried in order, followed by one pass without any
        extension. The first non-None result is normalized (trailing
        whitespace removed, runs of three or more newlines collapsed to two)
        and returned.

        Raises:
            FileConversionException: nothing succeeded and a converter raised.
            UnsupportedFormatException: nothing succeeded and nothing raised.
        """
        error_trace = ""
        for ext in extensions + [None]:  # Try last with no extension
            for converter in self._page_converters:
                _kwargs = copy.deepcopy(kwargs)

                # Overwrite file_extension appropriately
                if ext is None:
                    _kwargs.pop("file_extension", None)
                else:
                    _kwargs["file_extension"] = ext

                # Copy any additional global options
                if "mlm_client" not in _kwargs and self._mlm_client is not None:
                    _kwargs["mlm_client"] = self._mlm_client

                if "mlm_model" not in _kwargs and self._mlm_model is not None:
                    _kwargs["mlm_model"] = self._mlm_model

                # If we hit an error, record it and move on to the next
                # converter. Skipping is essential: without it the check
                # below would read an unbound or stale `res`.
                try:
                    res = converter.convert(local_path, **_kwargs)
                except Exception:
                    error_trace = ("\n\n" + traceback.format_exc()).strip()
                    continue

                if res is not None:
                    # Normalize the content
                    res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
                    return res

        # If we got this far without success, report any exceptions
        if len(error_trace) > 0:
            raise FileConversionException(
                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
            )

        # Nothing can handle it!
        raise UnsupportedFormatException(
            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
        )

    def _append_ext(self, extensions, ext):
        """Append a non-None, non-empty extension (stripped of whitespace) to ``extensions``.

        Duplicates are appended as well; the uniqueness check is intentionally
        left disabled, as in the original code.
        """
        if ext is None:
            return
        ext = ext.strip()
        if ext == "":
            return
        extensions.append(ext)

    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.

        Returns the extension of the first guess, or None when there is no
        usable guess or the file cannot be read.
        """
        try:
            guesses = puremagic.magic_file(path)
            if len(guesses) > 0:
                ext = guesses[0].extension.strip()
                if len(ext) > 0:
                    return ext
        except (FileNotFoundError, IsADirectoryError, PermissionError):
            pass
        return None

    def register_page_converter(self, converter: DocumentConverter) -> None:
        """Register a page text converter; it is tried before all previously registered ones."""
        self._page_converters.insert(0, converter)