Final_Assignment_Template

Sleeping

App Files Files Community

RalphThings commited on May 9, 2025

Commit

2949e29

verified ·

1 Parent(s): 64d58ed

Upload 2 files

Browse files

Files changed (2) hide show

cookies.py +715 -0
mdconvert.py +1002 -0

cookies.py ADDED Viewed

	@@ -0,0 +1,715 @@

+from requests.cookies import RequestsCookieJar
+COOKIES_LIST = [
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1718884961,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "ST-xuwub9",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753004444.745411,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__Secure-YEC",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050824,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__Secure-3PSID",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1750420959.974642,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "SIDCC",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050652,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "SID",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1750420958.397534,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__Secure-1PSIDTS",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753433494.44729,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_ga_M0180HEFCY",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050933,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "SAPISID",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1750420959.974764,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__Secure-1PSIDCC",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050881,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "SSID",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "AmlwXHnQvOQ10LVd-",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050959,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "__Secure-1PAPISID",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050795,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__Secure-1PSID",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050993,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "__Secure-3PAPISID",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1750420959.974815,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__Secure-3PSIDCC",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1750420958.397647,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__Secure-3PSIDTS",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050908,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "APISID",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753434620.050855,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "HSID",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "AasA7hmRuTFv7vjoq",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753435873.577793,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "LOGIN_INFO",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
+    },
+    {
+        "domain": ".youtube.com",
+        "expirationDate": 1753444956.555608,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "PREF",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
+    },
+]
+COOKIES_LIST += [
+    {
+        "domain": ".www.researchgate.net",
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "isInstIp",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "False",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1734423981,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "__eoi",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
+    },
+    {
+        "domain": ".www.researchgate.net",
+        "expirationDate": 1753444909.646103,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "ptc",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "RG1.8947708639250500550.1718872043",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1750507578,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "euconsent-v2-didomi",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1718885236,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_gat",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "1",
+    },
+    {
+        "domain": "www.researchgate.net",
+        "expirationDate": 1721477183,
+        "hostOnly": True,
+        "httpOnly": False,
+        "name": "_pbjs_userid_consent_data",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "3524755945110770",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1752567981,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "__gads",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1718886709.646173,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "__cf_bm",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1752567981,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "__gpi",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
+    },
+    {
+        "domain": ".researchgate.net",
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "_cfuvid",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1753445177.271667,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_ga",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GA1.1.1525244793.1718885177",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1753445177.271482,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_ga_4P31SJ70EJ",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1718971576,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_gid",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GA1.2.854907463.1718885177",
+    },
+    {
+        "domain": ".www.researchgate.net",
+        "expirationDate": 1750407982.506505,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "did",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1750507578,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "didomi_token",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
+    },
+    {
+        "domain": ".www.researchgate.net",
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "hasPdpNext",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "False",
+    },
+    {
+        "domain": ".researchgate.net",
+        "expirationDate": 1750421183,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
+    },
+    {
+        "domain": ".www.researchgate.net",
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "sid",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
+    },
+]
+COOKIES_LIST += [
+    {
+        "domain": "github.com",
+        "hostOnly": True,
+        "httpOnly": True,
+        "name": "_gh_sess",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
+    },
+    {
+        "domain": ".github.com",
+        "expirationDate": 1750408875.763785,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_octo",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "GH1.1.728652011.1718872875",
+    },
+    {
+        "domain": ".github.com",
+        "expirationDate": 1750408875.763926,
+        "hostOnly": False,
+        "httpOnly": True,
+        "name": "logged_in",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": False,
+        "storeId": None,
+        "value": "no",
+    },
+    {
+        "domain": ".github.com",
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "preferred_color_mode",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "dark",
+    },
+    {
+        "domain": ".github.com",
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "tz",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "Europe%2FParis",
+    },
+]
+COOKIES_LIST += [
+    {
+        "domain": ".web.archive.org",
+        "expirationDate": 1718886430,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_gat",
+        "path": "/web/20201123221659/http://orcid.org/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "1",
+    },
+    {
+        "domain": ".web.archive.org",
+        "expirationDate": 1718972770,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_gid",
+        "path": "/web/20201123221659/http://orcid.org/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GA1.2.402246368.1606169825",
+    },
+    {
+        "domain": ".web.archive.org",
+        "expirationDate": 1753446370.315621,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_ga",
+        "path": "/web/20201123221659/http://orcid.org/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GA1.2.1301409987.1606169825",
+    },
+    {
+        "domain": ".web.archive.org",
+        "expirationDate": 1750422367,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_hjid",
+        "path": "/web/20201123221659/http://orcid.org/",
+        "sameSite": "lax",
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
+    },
+    {
+        "domain": ".web.archive.org",
+        "expirationDate": 1718888167,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_hjFirstSeen",
+        "path": "/web/20201123221659/http://orcid.org/",
+        "sameSite": "lax",
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "1",
+    },
+]
+COOKIES_LIST += [
+    {
+        "domain": "orcid.org",
+        "hostOnly": True,
+        "httpOnly": False,
+        "name": "AWSELBCORS",
+        "path": "/",
+        "sameSite": "no_restriction",
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
+    },
+    {
+        "domain": ".orcid.org",
+        "expirationDate": 1753452454.637671,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_ga_9R61FWK9H5",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
+    },
+    {
+        "domain": ".orcid.org",
+        "expirationDate": 1753452454.63421,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "_ga",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "GA1.1.2021310691.1718892455",
+    },
+    {
+        "domain": "orcid.org",
+        "hostOnly": True,
+        "httpOnly": False,
+        "name": "AWSELB",
+        "path": "/",
+        "sameSite": None,
+        "secure": False,
+        "session": True,
+        "storeId": None,
+        "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
+    },
+    {
+        "domain": ".orcid.org",
+        "expirationDate": 1750428454,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "OptanonAlertBoxClosed",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "2024-06-20T14:07:34.583Z",
+    },
+    {
+        "domain": ".orcid.org",
+        "expirationDate": 1750428454,
+        "hostOnly": False,
+        "httpOnly": False,
+        "name": "OptanonConsent",
+        "path": "/",
+        "sameSite": "lax",
+        "secure": False,
+        "session": False,
+        "storeId": None,
+        "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
+    },
+    {
+        "domain": "orcid.org",
+        "hostOnly": True,
+        "httpOnly": False,
+        "name": "XSRF-TOKEN",
+        "path": "/",
+        "sameSite": None,
+        "secure": True,
+        "session": True,
+        "storeId": None,
+        "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
+    },
+]
+# Create a RequestsCookieJar instance
+COOKIES = RequestsCookieJar()
+# Add cookies to the jar
+for cookie in COOKIES_LIST:
+    COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])

mdconvert.py ADDED Viewed

	@@ -0,0 +1,1002 @@

+# This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
+# Thanks to Microsoft researchers for open-sourcing this!
+# type: ignore
+import base64
+import copy
+import html
+import json
+import mimetypes
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import traceback
+import zipfile
+from typing import Any
+from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+import mammoth
+import markdownify
+import pandas as pd
+import pdfminer
+import pdfminer.high_level
+import pptx
+# File-format detection
+import puremagic
+import pydub
+import requests
+import speech_recognition as sr
+from bs4 import BeautifulSoup
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api.formatters import SRTFormatter
+class _CustomMarkdownify(markdownify.MarkdownConverter):
+    """
+    A custom version of markdownify's MarkdownConverter. Changes include:
+    - Altering the default heading style to use '#', '##', etc.
+    - Removing javascript hyperlinks.
+    - Truncating images with large data:uri sources.
+    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
+    """
+    def __init__(self, **options: Any):
+        options["heading_style"] = options.get("heading_style", markdownify.ATX)
+        # Explicitly cast options to the expected type if necessary
+        super().__init__(**options)
+    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+        """Same as usual, but be sure to start with a new line"""
+        if not convert_as_inline:
+            if not re.search(r"^\n", text):
+                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+        """Same as usual converter, but removes Javascript links and escapes URIs."""
+        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
+        if not text:
+            return ""
+        href = el.get("href")
+        title = el.get("title")
+        # Escape URIs and skip non-http or file schemes
+        if href:
+            try:
+                parsed_url = urlparse(href)  # type: ignore
+                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
+                    return "%s%s%s" % (prefix, text, suffix)
+                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
+            except ValueError:  # It's not clear if this ever gets thrown
+                return "%s%s%s" % (prefix, text, suffix)
+        # For the replacement see #29: text nodes underscores are escaped
+        if (
+            self.options["autolinks"]
+            and text.replace(r"\_", "_") == href
+            and not title
+            and not self.options["default_title"]
+        ):
+            # Shortcut syntax
+            return "<%s>" % href
+        if self.options["default_title"] and not title:
+            title = href
+        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+        return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
+    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+        """Same as usual converter, but removes data URIs"""
+        alt = el.attrs.get("alt", None) or ""
+        src = el.attrs.get("src", None) or ""
+        title = el.attrs.get("title", None) or ""
+        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+        if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
+            return alt
+        # Remove dataURIs
+        if src.startswith("data:"):
+            src = src.split(",")[0] + "..."
+        return "![%s](%s%s)" % (alt, src, title_part)
+    def convert_soup(self, soup: Any) -> str:
+        return super().convert_soup(soup)  # type: ignore
+class DocumentConverterResult:
+    """The result of converting a document to text."""
+    def __init__(self, title: str | None = None, text_content: str = ""):
+        self.title: str | None = title
+        self.text_content: str = text_content
+class DocumentConverter:
+    """Abstract superclass of all DocumentConverters."""
+    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
+        raise NotImplementedError()
+class PlainTextConverter(DocumentConverter):
+    """Anything with content type text/plain"""
+    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
+        # Guess the content type from any file extension that might be around
+        content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
+        # Only accept text files
+        if content_type is None:
+            return None
+        # elif "text/" not in content_type.lower():
+        #     return None
+        text_content = ""
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            text_content = fh.read()
+        return DocumentConverterResult(
+            title=None,
+            text_content=text_content,
+        )
+class HtmlConverter(DocumentConverter):
+    """Anything with content type text/html"""
+    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
+        # Bail if not html
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        result = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            result = self._convert(fh.read())
+        return result
+    def _convert(self, html_content: str) -> None | DocumentConverterResult:
+        """Helper function that converts and HTML string."""
+        # Parse the string
+        soup = BeautifulSoup(html_content, "html.parser")
+        # Remove javascript and style blocks
+        for script in soup(["script", "style"]):
+            script.extract()
+        # Print only the main content
+        body_elm = soup.find("body")
+        webpage_text = ""
+        if body_elm:
+            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+        else:
+            webpage_text = _CustomMarkdownify().convert_soup(soup)
+        assert isinstance(webpage_text, str)
+        return DocumentConverterResult(
+            title=None if soup.title is None else soup.title.string, text_content=webpage_text
+        )
+class WikipediaConverter(DocumentConverter):
+    """Handle Wikipedia pages separately, focusing only on the main document content."""
+    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
+        # Bail if not Wikipedia
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
+            return None
+        # Parse the file
+        soup = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+        # Remove javascript and style blocks
+        for script in soup(["script", "style"]):
+            script.extract()
+        # Print only the main content
+        body_elm = soup.find("div", {"id": "mw-content-text"})
+        title_elm = soup.find("span", {"class": "mw-page-title-main"})
+        webpage_text = ""
+        main_title = None if soup.title is None else soup.title.string
+        if body_elm:
+            # What's the title
+            if title_elm and len(title_elm) > 0:
+                main_title = title_elm.string  # type: ignore
+                assert isinstance(main_title, str)
+            # Convert the page
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
+        else:
+            webpage_text = _CustomMarkdownify().convert_soup(soup)
+        return DocumentConverterResult(
+            title=main_title,
+            text_content=webpage_text,
+        )
+class YouTubeConverter(DocumentConverter):
+    """Handle YouTube specially, focusing on the video title, description, and transcript."""
+    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
+        # Bail if not YouTube
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not url.startswith("https://www.youtube.com/watch?"):
+            return None
+        # Parse the file
+        soup = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+        # Read the meta tags
+        assert soup.title is not None and soup.title.string is not None
+        metadata: dict[str, str] = {"title": soup.title.string}
+        for meta in soup(["meta"]):
+            for a in meta.attrs:
+                if a in ["itemprop", "property", "name"]:
+                    metadata[meta[a]] = meta.get("content", "")
+                    break
+        # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
+        try:
+            for script in soup(["script"]):
+                content = script.text
+                if "ytInitialData" in content:
+                    lines = re.split(r"\r?\n", content)
+                    obj_start = lines[0].find("{")
+                    obj_end = lines[0].rfind("}")
+                    if obj_start >= 0 and obj_end >= 0:
+                        data = json.loads(lines[0][obj_start : obj_end + 1])
+                        attrdesc = self._findKey(data, "attributedDescriptionBodyText")  # type: ignore
+                        if attrdesc:
+                            metadata["description"] = str(attrdesc["content"])
+                    break
+        except Exception:
+            pass
+        # Start preparing the page
+        webpage_text = "# YouTube\n"
+        title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
+        assert isinstance(title, str)
+        if title:
+            webpage_text += f"\n## {title}\n"
+        stats = ""
+        views = self._get(metadata, ["interactionCount"])  # type: ignore
+        if views:
+            stats += f"- **Views:** {views}\n"
+        keywords = self._get(metadata, ["keywords"])  # type: ignore
+        if keywords:
+            stats += f"- **Keywords:** {keywords}\n"
+        runtime = self._get(metadata, ["duration"])  # type: ignore
+        if runtime:
+            stats += f"- **Runtime:** {runtime}\n"
+        if len(stats) > 0:
+            webpage_text += f"\n### Video Metadata\n{stats}\n"
+        description = self._get(metadata, ["description", "og:description"])  # type: ignore
+        if description:
+            webpage_text += f"\n### Description\n{description}\n"
+        transcript_text = ""
+        parsed_url = urlparse(url)  # type: ignore
+        params = parse_qs(parsed_url.query)  # type: ignore
+        if "v" in params:
+            assert isinstance(params["v"][0], str)
+            video_id = str(params["v"][0])
+            try:
+                # Must be a single transcript.
+                transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+                # transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
+                # Alternative formatting:
+                transcript_text = SRTFormatter().format_transcript(transcript)
+            except Exception:
+                pass
+        if transcript_text:
+            webpage_text += f"\n### Transcript\n{transcript_text}\n"
+        title = title if title else soup.title.string
+        assert isinstance(title, str)
+        return DocumentConverterResult(
+            title=title,
+            text_content=webpage_text,
+        )
+    def _get(self, metadata: dict[str, str], keys: list[str], default: str | None = None) -> str | None:
+        for k in keys:
+            if k in metadata:
+                return metadata[k]
+        return default
+    def _findKey(self, json: Any, key: str) -> str | None:  # TODO: Fix json type
+        if isinstance(json, list):
+            for elm in json:
+                ret = self._findKey(elm, key)
+                if ret is not None:
+                    return ret
+        elif isinstance(json, dict):
+            for k in json:
+                if k == key:
+                    return json[k]
+                else:
+                    ret = self._findKey(json[k], key)
+                    if ret is not None:
+                        return ret
+        return None
+class PdfConverter(DocumentConverter):
+    """
+    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    """
+    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
+        # Bail if not a PDF
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pdf":
+            return None
+        return DocumentConverterResult(
+            title=None,
+            text_content=pdfminer.high_level.extract_text(local_path),
+        )
+class DocxConverter(HtmlConverter):
+    """
+    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
+    """
+    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
+        # Bail if not a DOCX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".docx":
+            return None
+        result = None
+        with open(local_path, "rb") as docx_file:
+            result = mammoth.convert_to_html(docx_file)
+            html_content = result.value
+            result = self._convert(html_content)
+        return result
+class XlsxConverter(HtmlConverter):
+    """
+    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
+        # Bail if not a XLSX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".xlsx", ".xls"]:
+            return None
+        sheets = pd.read_excel(local_path, sheet_name=None)
+        md_content = ""
+        for s in sheets:
+            md_content += f"## {s}\n"
+            html_content = sheets[s].to_html(index=False)
+            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+class PptxConverter(HtmlConverter):
+    """
+    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
+    """
+    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
+        # Bail if not a PPTX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pptx":
+            return None
+        md_content = ""
+        presentation = pptx.Presentation(local_path)
+        slide_num = 0
+        for slide in presentation.slides:
+            slide_num += 1
+            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
+            title = slide.shapes.title
+            for shape in slide.shapes:
+                # Pictures
+                if self._is_picture(shape):
+                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
+                    alt_text = ""
+                    try:
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                    except Exception:
+                        pass
+                    # A placeholder name
+                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                    md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
+                # Tables
+                if self._is_table(shape):
+                    html_table = "<html><body><table>"
+                    first_row = True
+                    for row in shape.table.rows:
+                        html_table += "<tr>"
+                        for cell in row.cells:
+                            if first_row:
+                                html_table += "<th>" + html.escape(cell.text) + "</th>"
+                            else:
+                                html_table += "<td>" + html.escape(cell.text) + "</td>"
+                        html_table += "</tr>"
+                        first_row = False
+                    html_table += "</table></body></html>"
+                    md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
+                # Text areas
+                elif shape.has_text_frame:
+                    if shape == title:
+                        md_content += "# " + shape.text.lstrip() + "\n"
+                    else:
+                        md_content += shape.text + "\n"
+            md_content = md_content.strip()
+            if slide.has_notes_slide:
+                md_content += "\n\n### Notes:\n"
+                notes_frame = slide.notes_slide.notes_text_frame
+                if notes_frame is not None:
+                    md_content += notes_frame.text
+                md_content = md_content.strip()
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+    def _is_picture(self, shape):
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
+            return True
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
+            if hasattr(shape, "image"):
+                return True
+        return False
+    def _is_table(self, shape):
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
+            return True
+        return False
+class MediaConverter(DocumentConverter):
+    """
+    Abstract class for multi-modal media (e.g., images and audio)
+    """
+    def _get_metadata(self, local_path):
+        exiftool = shutil.which("exiftool")
+        if not exiftool:
+            return None
+        else:
+            try:
+                result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
+                return json.loads(result)[0]
+            except Exception:
+                return None
+class WavConverter(MediaConverter):
+    """
+    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+    """
+    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
+        # Bail if not a XLSX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".wav":
+            return None
+        md_content = ""
+        # Add metadata
+        metadata = self._get_metadata(local_path)
+        if metadata:
+            for f in [
+                "Title",
+                "Artist",
+                "Author",
+                "Band",
+                "Album",
+                "Genre",
+                "Track",
+                "DateTimeOriginal",
+                "CreateDate",
+                "Duration",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+        # Transcribe
+        try:
+            transcript = self._transcribe_audio(local_path)
+            md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
+        except Exception:
+            md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+    def _transcribe_audio(self, local_path) -> str:
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(local_path) as source:
+            audio = recognizer.record(source)
+            return recognizer.recognize_google(audio).strip()
+class Mp3Converter(WavConverter):
+    """
+    Converts MP3 and M4A files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
+    """
+    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
+        # Bail if not a MP3
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".mp3", ".m4a"]:
+            return None
+        md_content = ""
+        # Add metadata
+        metadata = self._get_metadata(local_path)
+        if metadata:
+            for f in [
+                "Title",
+                "Artist",
+                "Author",
+                "Band",
+                "Album",
+                "Genre",
+                "Track",
+                "DateTimeOriginal",
+                "CreateDate",
+                "Duration",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+        # Transcribe
+        handle, temp_path = tempfile.mkstemp(suffix=".wav")
+        os.close(handle)
+        try:
+            if extension.lower() == ".mp3":
+                sound = pydub.AudioSegment.from_mp3(local_path)
+            else:
+                sound = pydub.AudioSegment.from_file(local_path, format="m4a")
+            sound.export(temp_path, format="wav")
+            _args = dict()
+            _args.update(kwargs)
+            _args["file_extension"] = ".wav"
+            try:
+                transcript = super()._transcribe_audio(temp_path).strip()
+                md_content += "\n\n### Audio Transcript:\n" + (
+                    "[No speech detected]" if transcript == "" else transcript
+                )
+            except Exception:
+                md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+        finally:
+            os.unlink(temp_path)
+        # Return the result
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+class ZipConverter(DocumentConverter):
+    """
+    Extracts ZIP files to a permanent local directory and returns a listing of extracted files.
+    """
+    def __init__(self, extract_dir: str = "downloads"):
+        """
+        Initialize with path to extraction directory.
+        Args:
+            extract_dir: The directory where files will be extracted. Defaults to "downloads"
+        """
+        self.extract_dir = extract_dir
+        # Create the extraction directory if it doesn't exist
+        os.makedirs(self.extract_dir, exist_ok=True)
+    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
+        # Bail if not a ZIP file
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".zip":
+            return None
+        # Verify it's actually a ZIP file
+        if not zipfile.is_zipfile(local_path):
+            return None
+        # Extract all files and build list
+        extracted_files = []
+        with zipfile.ZipFile(local_path, "r") as zip_ref:
+            # Extract all files
+            zip_ref.extractall(self.extract_dir)
+            # Get list of all files
+            for file_path in zip_ref.namelist():
+                # Skip directories
+                if not file_path.endswith("/"):
+                    extracted_files.append(self.extract_dir + "/" + file_path)
+        # Sort files for consistent output
+        extracted_files.sort()
+        # Build the markdown content
+        md_content = "Downloaded the following files:\n"
+        for file in extracted_files:
+            md_content += f"* {file}\n"
+        return DocumentConverterResult(title="Extracted Files", text_content=md_content.strip())
+class ImageConverter(MediaConverter):
+    """
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    """
+    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
+        # Bail if not a XLSX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+            return None
+        md_content = ""
+        # Add metadata
+        metadata = self._get_metadata(local_path)
+        if metadata:
+            for f in [
+                "ImageSize",
+                "Title",
+                "Caption",
+                "Description",
+                "Keywords",
+                "Artist",
+                "Author",
+                "DateTimeOriginal",
+                "CreateDate",
+                "GPSPosition",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+        # Try describing the image with GPTV
+        mlm_client = kwargs.get("mlm_client")
+        mlm_model = kwargs.get("mlm_model")
+        if mlm_client is not None and mlm_model is not None:
+            md_content += (
+                "\n# Description:\n"
+                + self._get_mlm_description(
+                    local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
+                ).strip()
+                + "\n"
+            )
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content,
+        )
+    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed caption for this image."
+        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
+        data_uri = ""
+        with open(local_path, "rb") as image_file:
+            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+            if content_type is None:
+                content_type = "image/jpeg"
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+            data_uri = f"data:{content_type};base64,{image_base64}"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                ],
+            }
+        ]
+        response = client.chat.completions.create(model=model, messages=messages)
+        return response.choices[0].message.content
+class FileConversionException(Exception):
+    pass
+class UnsupportedFormatException(Exception):
+    pass
+class MarkdownConverter:
+    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
+    This reader will convert common file-types or webpages to Markdown."""
+    def __init__(
+        self,
+        requests_session: requests.Session | None = None,
+        mlm_client: Any | None = None,
+        mlm_model: Any | None = None,
+    ):
+        if requests_session is None:
+            self._requests_session = requests.Session()
+        else:
+            self._requests_session = requests_session
+        self._mlm_client = mlm_client
+        self._mlm_model = mlm_model
+        self._page_converters: list[DocumentConverter] = []
+        # Register converters for successful browsing operations
+        # Later registrations are tried first / take higher priority than earlier registrations
+        # To this end, the most specific converters should appear below the most generic converters
+        self.register_page_converter(PlainTextConverter())
+        self.register_page_converter(HtmlConverter())
+        self.register_page_converter(WikipediaConverter())
+        self.register_page_converter(YouTubeConverter())
+        self.register_page_converter(DocxConverter())
+        self.register_page_converter(XlsxConverter())
+        self.register_page_converter(PptxConverter())
+        self.register_page_converter(WavConverter())
+        self.register_page_converter(Mp3Converter())
+        self.register_page_converter(ImageConverter())
+        self.register_page_converter(ZipConverter())
+        self.register_page_converter(PdfConverter())
+    def convert(
+        self, source: str | requests.Response, **kwargs: Any
+    ) -> DocumentConverterResult:  # TODO: deal with kwargs
+        """
+        Args:
+            - source: can be a string representing a path or url, or a requests.response object
+            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+        """
+        # Local path or url
+        if isinstance(source, str):
+            if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
+                return self.convert_url(source, **kwargs)
+            else:
+                return self.convert_local(source, **kwargs)
+        # Request response
+        elif isinstance(source, requests.Response):
+            return self.convert_response(source, **kwargs)
+    def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+        # Get extension alternatives from the path and puremagic
+        base, ext = os.path.splitext(path)
+        self._append_ext(extensions, ext)
+        self._append_ext(extensions, self._guess_ext_magic(path))
+        # Convert
+        return self._convert(path, extensions, **kwargs)
+    # TODO what should stream's type be?
+    def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+        # Save the file locally to a temporary file. It will be deleted before this method exits
+        handle, temp_path = tempfile.mkstemp()
+        fh = os.fdopen(handle, "wb")
+        result = None
+        try:
+            # Write to the temporary file
+            content = stream.read()
+            if isinstance(content, str):
+                fh.write(content.encode("utf-8"))
+            else:
+                fh.write(content)
+            fh.close()
+            # Use puremagic to check for more extension options
+            self._append_ext(extensions, self._guess_ext_magic(temp_path))
+            # Convert
+            result = self._convert(temp_path, extensions, **kwargs)
+        # Clean up
+        finally:
+            try:
+                fh.close()
+            except Exception:
+                pass
+            os.unlink(temp_path)
+        return result
+    def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: fix kwargs type
+        # Send a HTTP request to the URL
+        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+        response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
+        response.raise_for_status()
+        return self.convert_response(response, **kwargs)
+    def convert_response(
+        self, response: requests.Response, **kwargs: Any
+    ) -> DocumentConverterResult:  # TODO fix kwargs type
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+        # Guess from the mimetype
+        content_type = response.headers.get("content-type", "").split(";")[0]
+        self._append_ext(extensions, mimetypes.guess_extension(content_type))
+        # Read the content disposition if there is one
+        content_disposition = response.headers.get("content-disposition", "")
+        m = re.search(r"filename=([^;]+)", content_disposition)
+        if m:
+            base, ext = os.path.splitext(m.group(1).strip("\"'"))
+            self._append_ext(extensions, ext)
+        # Read from the extension from the path
+        base, ext = os.path.splitext(urlparse(response.url).path)
+        self._append_ext(extensions, ext)
+        # Save the file locally to a temporary file. It will be deleted before this method exits
+        handle, temp_path = tempfile.mkstemp()
+        fh = os.fdopen(handle, "wb")
+        result = None
+        try:
+            # Download the file
+            for chunk in response.iter_content(chunk_size=512):
+                fh.write(chunk)
+            fh.close()
+            # Use puremagic to check for more extension options
+            self._append_ext(extensions, self._guess_ext_magic(temp_path))
+            # Convert
+            result = self._convert(temp_path, extensions, url=response.url)
+        except Exception as e:
+            print(f"Error in converting: {e}")
+        # Clean up
+        finally:
+            try:
+                fh.close()
+            except Exception:
+                pass
+            os.unlink(temp_path)
+        return result
+    def _convert(self, local_path: str, extensions: list[str | None], **kwargs) -> DocumentConverterResult:
+        error_trace = ""
+        for ext in extensions + [None]:  # Try last with no extension
+            for converter in self._page_converters:
+                _kwargs = copy.deepcopy(kwargs)
+                # Overwrite file_extension appropriately
+                if ext is None:
+                    if "file_extension" in _kwargs:
+                        del _kwargs["file_extension"]
+                else:
+                    _kwargs.update({"file_extension": ext})
+                # Copy any additional global options
+                if "mlm_client" not in _kwargs and self._mlm_client is not None:
+                    _kwargs["mlm_client"] = self._mlm_client
+                if "mlm_model" not in _kwargs and self._mlm_model is not None:
+                    _kwargs["mlm_model"] = self._mlm_model
+                # If we hit an error log it and keep trying
+                try:
+                    res = converter.convert(local_path, **_kwargs)
+                except Exception:
+                    error_trace = ("\n\n" + traceback.format_exc()).strip()
+                if res is not None:
+                    # Normalize the content
+                    res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
+                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
+                    # Todo
+                    return res
+        # If we got this far without success, report any exceptions
+        if len(error_trace) > 0:
+            raise FileConversionException(
+                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+            )
+        # Nothing can handle it!
+        raise UnsupportedFormatException(
+            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+        )
+    def _append_ext(self, extensions, ext):
+        """Append a unique non-None, non-empty extension to a list of extensions."""
+        if ext is None:
+            return
+        ext = ext.strip()
+        if ext == "":
+            return
+        # if ext not in extensions:
+        if True:
+            extensions.append(ext)
+    def _guess_ext_magic(self, path):
+        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
+        # Use puremagic to guess
+        try:
+            guesses = puremagic.magic_file(path)
+            if len(guesses) > 0:
+                ext = guesses[0].extension.strip()
+                if len(ext) > 0:
+                    return ext
+        except FileNotFoundError:
+            pass
+        except IsADirectoryError:
+            pass
+        except PermissionError:
+            pass
+        return None
+    def register_page_converter(self, converter: DocumentConverter) -> None:
+        """Register a page text converter."""
+        self._page_converters.insert(0, converter)