HarishMaths committed on
Commit
5989690
·
verified ·
1 Parent(s): 84df929

Update downloads.py

Browse files
Files changed (1) hide show
  1. downloads.py +146 -144
downloads.py CHANGED
@@ -1,144 +1,146 @@
1
- import asyncio
2
- from pydrive2.auth import GoogleAuth
3
- from pydrive2.drive import GoogleDrive
4
- from pathlib import Path
5
- import logging
6
- from tqdm.asyncio import tqdm_asyncio
7
- import sys
8
-
9
- logging.basicConfig(
10
- level=logging.INFO,
11
- format='[%(asctime)s] %(levelname)s: %(message)s',
12
- handlers=[
13
- logging.StreamHandler(sys.stdout),
14
- logging.FileHandler("download_log.txt", encoding="utf-8")
15
- ]
16
- )
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- gauth = GoogleAuth()
21
- gauth.LoadCredentialsFile('credentials.json')
22
- drive = GoogleDrive(gauth)
23
-
24
-
25
- matching_folders = []
26
- MAX_CONCURRENT_REQUESTS = 10
27
- semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
28
-
29
-
30
- def list_folder(folder_id):
31
- return drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()
32
-
33
- async def crawl_drive_bfs(root_id):
34
- queue = [(root_id, "ROOT")]
35
- while queue:
36
- next_queue = []
37
- tasks = []
38
-
39
- for folder_id, path in queue:
40
- tasks.append(scan_folder(folder_id, path, next_queue))
41
-
42
- await asyncio.gather(*tasks)
43
- queue = next_queue
44
-
45
- async def scan_folder(folder_id, path_trace, next_queue):
46
- async with semaphore:
47
- try:
48
- file_list = await asyncio.to_thread(list_folder, folder_id)
49
- except Exception as e:
50
- logger.error(f"Failed to list folder {folder_id}: {e}")
51
- return
52
-
53
- # Infer folder title
54
- folder_title = None
55
- if file_list:
56
- parent_info = await asyncio.to_thread(lambda: drive.CreateFile({'id': folder_id}))
57
- try:
58
- parent_info.FetchMetadata(fields="title")
59
- folder_title = parent_info['title']
60
- except:
61
- folder_title = "Unknown"
62
-
63
- current_path = f"{path_trace}/{folder_title}" if path_trace else folder_title
64
-
65
- if folder_title and folder_title.startswith("Notebook"):
66
- ipynb_files = [f for f in file_list if f['title'].endswith('.ipynb')]
67
- if ipynb_files:
68
- matching_folders.append((folder_id, current_path, ipynb_files))
69
-
70
- for f in file_list:
71
- if f['mimeType'] == 'application/vnd.google-apps.folder':
72
- next_queue.append((f['id'], current_path))
73
-
74
- download_semaphore = asyncio.Semaphore(10)
75
-
76
- async def download_folder(folder_id, local_path, hierarchy_path):
77
- async with download_semaphore:
78
- base_path = Path("tmp") / local_path
79
- base_path.mkdir(parents=True, exist_ok=True)
80
-
81
- try:
82
- items = await asyncio.to_thread(list_folder, folder_id)
83
- except Exception as e:
84
- logger.error(f"Failed to list folder for download {folder_id}: {e}")
85
- return
86
-
87
- ipynb_entries = []
88
- subfolder_tasks = []
89
-
90
- for item in items:
91
- if item['mimeType'] == 'application/vnd.google-apps.folder':
92
- task = asyncio.create_task(
93
- download_folder(item['id'], base_path / item['title'], hierarchy_path + "/" + item['title'])
94
- )
95
- subfolder_tasks.append(task)
96
- else:
97
- try:
98
- if item['title'].endswith('.ipynb'):
99
- logger.info(f"Downloading notebook: {item['title']} into {base_path}")
100
- await asyncio.to_thread(item.GetContentFile, str(base_path / item['title']))
101
-
102
- file_path = f"{hierarchy_path}/{item['title']}"
103
- folder_link = f"https://drive.google.com/drive/folders/{folder_id}"
104
- ipynb_entries.append(f"{file_path},{folder_link}")
105
- else:
106
- await asyncio.to_thread(item.GetContentFile, str(base_path / item['title']))
107
- except Exception as e:
108
- logger.error(f"Error downloading {item['title']}: {e}")
109
- continue
110
-
111
- if ipynb_entries:
112
- (base_path / "file_links.txt").write_text("\n".join(ipynb_entries), encoding="utf-8")
113
-
114
- if subfolder_tasks:
115
- await asyncio.gather(*subfolder_tasks)
116
-
117
-
118
- async def parallel_download():
119
- tasks = []
120
- for idx, (folder_id, path, _) in enumerate(matching_folders):
121
- folder_name = Path(f"notebook_folder_{idx+1}")
122
- tasks.append(download_folder(folder_id, folder_name, path))
123
-
124
- for f in tqdm_asyncio.as_completed(tasks, desc="Downloading Notebooks", unit="folder", total=len(tasks)):
125
- await f
126
-
127
-
128
- async def downloads(root_folder_id):
129
- try:
130
- matching_folders.clear()
131
- logger.info(f"Scanning Drive folder: {root_folder_id}")
132
- await crawl_drive_bfs(root_folder_id)
133
-
134
- if matching_folders:
135
- logger.info(f"Found {len(matching_folders)} folders with notebooks.")
136
- await parallel_download()
137
- logger.info("Download completed.")
138
- else:
139
- logger.warning("No folders with prefix 'Notebook' and .ipynb files found.")
140
- except Exception as e:
141
- logger.exception(f"Unexpected error during execution: {e}")
142
-
143
-
144
- asyncio.run(downloads("1KWHJC3yqXXVK2Cz-qZxWeeic8ifCmgej"))
 
 
 
1
+ import asyncio
2
+ from pydrive2.auth import GoogleAuth
3
+ from pydrive2.drive import GoogleDrive
4
+ from pathlib import Path
5
+ import logging
6
+ from tqdm.asyncio import tqdm_asyncio
7
+ import sys
8
+
9
# --- Logging: mirror every record to the console and to a UTF-8 log file. ---
console_handler = logging.StreamHandler(sys.stdout)
file_handler = logging.FileHandler("download_log.txt", encoding="utf-8")
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    handlers=[console_handler, file_handler],
)
logger = logging.getLogger(__name__)

# --- Google Drive client, authenticated from the saved credentials file. ---
gauth = GoogleAuth()
gauth.LoadCredentialsFile('credentials.json')
drive = GoogleDrive(gauth)

# --- Shared crawl state. ---
# Filled by scan_folder with (folder_id, path, ipynb_files) tuples and
# consumed by parallel_download.
matching_folders = []
# Upper bound used when sizing the folder-scan semaphore.
MAX_CONCURRENT_REQUESTS = 10
27
+
28
+
29
+
30
def list_folder(folder_id):
    """Return the non-trashed Drive entries whose parent is *folder_id*.

    This is a blocking call into the Drive client; async callers run it via
    ``asyncio.to_thread``.
    """
    query = f"'{folder_id}' in parents and trashed=false"
    listing = drive.ListFile({'q': query})
    return listing.GetList()
32
+
33
async def crawl_drive_bfs(root_id):
    """Walk the Drive tree under *root_id* breadth-first, one level at a time.

    Every folder of the current level is scanned concurrently by
    ``scan_folder``, which appends the sub-folders it finds to the list that
    becomes the next level. The walk ends when a level yields no sub-folders.
    """
    frontier = [(root_id, "ROOT")]
    while frontier:
        discovered = []
        await asyncio.gather(
            *(scan_folder(fid, trace, discovered) for fid, trace in frontier)
        )
        frontier = discovered
44
+
45
# (event loop, semaphore) pair shared by every scan_folder call on that loop.
_scan_limiter = None


def _scan_semaphore():
    """Return the scan semaphore for the running event loop.

    The semaphore is created lazily, inside the running loop, and is reused by
    all later calls on that loop. A fresh one is made if a different loop is
    running (e.g. a second ``asyncio.run``), so it is never shared across loops.
    """
    global _scan_limiter
    loop = asyncio.get_running_loop()
    if _scan_limiter is None or _scan_limiter[0] is not loop:
        _scan_limiter = (loop, asyncio.Semaphore(MAX_CONCURRENT_REQUESTS))
    return _scan_limiter[1]


def _fetch_folder_title(folder_id):
    """Blocking helper: look up and return the title of the Drive folder."""
    folder = drive.CreateFile({'id': folder_id})
    folder.FetchMetadata(fields="title")
    return folder['title']


async def scan_folder(folder_id, path_trace, next_queue):
    """Scan one Drive folder during the breadth-first crawl.

    Lists the folder, records it in ``matching_folders`` when its title starts
    with "Notebook" and it directly contains ``.ipynb`` files, and appends each
    child folder to *next_queue* as ``(child_id, current_path)``.

    Args:
        folder_id: Drive id of the folder to scan.
        path_trace: Slash-joined titles of the ancestors visited so far.
        next_queue: List shared by the current BFS level; mutated in place.

    Listing failures are logged and the folder is skipped. A failed title
    lookup falls back to the title "Unknown".
    """
    # One semaphore shared by all scans: a semaphore created per call would
    # never block anything and so would not limit concurrency at all.
    async with _scan_semaphore():
        try:
            file_list = await asyncio.to_thread(list_folder, folder_id)
        except Exception as e:
            logger.error(f"Failed to list folder {folder_id}: {e}")
            return

        # An empty folder can neither match nor contribute sub-folders.
        if not file_list:
            return

        # The metadata fetch is a network round trip, so keep it off the
        # event loop just like the listing above.
        try:
            folder_title = await asyncio.to_thread(_fetch_folder_title, folder_id)
        except Exception as e:
            logger.warning(f"Could not fetch title for folder {folder_id}: {e}")
            folder_title = "Unknown"

    current_path = f"{path_trace}/{folder_title}" if path_trace else folder_title

    if folder_title and folder_title.startswith("Notebook"):
        ipynb_files = [f for f in file_list if f['title'].endswith('.ipynb')]
        if ipynb_files:
            matching_folders.append((folder_id, current_path, ipynb_files))

    next_queue.extend(
        (f['id'], current_path)
        for f in file_list
        if f['mimeType'] == 'application/vnd.google-apps.folder'
    )
74
+
75
+
76
+
77
# Maximum number of folders whose files are being fetched at the same time.
_MAX_CONCURRENT_DOWNLOADS = 10
# (event loop, semaphore) pair shared by every download_folder call on that loop.
_download_limiter = None


def _download_semaphore():
    """Return the download semaphore for the running event loop.

    Created lazily inside the running loop and reused by later calls on that
    loop; replaced if a different loop is running.
    """
    global _download_limiter
    loop = asyncio.get_running_loop()
    if _download_limiter is None or _download_limiter[0] is not loop:
        _download_limiter = (loop, asyncio.Semaphore(_MAX_CONCURRENT_DOWNLOADS))
    return _download_limiter[1]


async def download_folder(folder_id, local_path, hierarchy_path):
    """Recursively download a Drive folder into ``tmp/<local_path>``.

    Every non-folder entry is saved under its Drive title. For each ``.ipynb``
    file that downloads successfully, a ``"<hierarchy path>,<folder link>"``
    line is collected and written to ``file_links.txt`` in the same directory.
    Sub-folders are downloaded recursively into matching sub-directories.

    Args:
        folder_id: Drive id of the folder to download.
        local_path: Destination path relative to ``tmp`` (str or Path).
        hierarchy_path: Slash-joined Drive path of this folder, used only for
            the entries written to ``file_links.txt``.

    Listing failures are logged and end the download of this folder;
    per-file failures are logged and the remaining files are still attempted.
    """
    local_path = Path(local_path)
    base_path = Path("tmp") / local_path
    subfolders = []
    ipynb_entries = []

    # The permit covers only this folder's own listing and file transfers.
    # It is released before waiting on children, so a deep tree cannot
    # exhaust the shared semaphore and deadlock.
    async with _download_semaphore():
        base_path.mkdir(parents=True, exist_ok=True)

        try:
            items = await asyncio.to_thread(list_folder, folder_id)
        except Exception as e:
            logger.error(f"Failed to list folder for download {folder_id}: {e}")
            return

        for item in items:
            if item['mimeType'] == 'application/vnd.google-apps.folder':
                subfolders.append(item)
                continue

            try:
                title = item['title']
                is_notebook = title.endswith('.ipynb')
                if is_notebook:
                    logger.info(f"Downloading notebook: {title} into {base_path}")
                await asyncio.to_thread(item.GetContentFile, str(base_path / title))
            except Exception as e:
                logger.error(f"Error downloading {item['title']}: {e}")
                continue

            if is_notebook:
                file_path = f"{hierarchy_path}/{title}"
                folder_link = f"https://drive.google.com/drive/folders/{folder_id}"
                ipynb_entries.append(f"{file_path},{folder_link}")

        if ipynb_entries:
            (base_path / "file_links.txt").write_text("\n".join(ipynb_entries), encoding="utf-8")

    if subfolders:
        # Children get a path relative to "tmp" (local_path, not base_path);
        # passing base_path would prepend "tmp" again at every nesting level.
        await asyncio.gather(*(
            download_folder(
                sub['id'],
                local_path / sub['title'],
                hierarchy_path + "/" + sub['title'],
            )
            for sub in subfolders
        ))
118
+
119
+
120
async def parallel_download():
    """Download every folder recorded in ``matching_folders`` concurrently.

    The N-th matched folder (1-based) is saved under ``notebook_folder_N``;
    a tqdm progress bar advances as each folder download finishes.
    """
    jobs = [
        download_folder(folder_id, Path(f"notebook_folder_{idx+1}"), trace)
        for idx, (folder_id, trace, _) in enumerate(matching_folders)
    ]

    progress = tqdm_asyncio.as_completed(
        jobs, desc="Downloading Notebooks", unit="folder", total=len(jobs)
    )
    for finished in progress:
        await finished
128
+
129
+
130
async def downloads(root_folder_id):
    """Crawl the Drive tree under *root_folder_id* and download matching folders.

    Clears the results of any previous run, scans the tree, then downloads
    every folder that was recorded as a match. Any exception raised along the
    way is logged with its traceback rather than propagated.
    """
    try:
        matching_folders.clear()
        logger.info(f"Scanning Drive folder: {root_folder_id}")
        await crawl_drive_bfs(root_folder_id)

        # Guard clause: nothing matched, so there is nothing to download.
        if not matching_folders:
            logger.warning("No folders with prefix 'Notebook' and .ipynb files found.")
            return

        logger.info(f"Found {len(matching_folders)} folders with notebooks.")
        await parallel_download()
        logger.info("Download completed.")
    except Exception as e:
        logger.exception(f"Unexpected error during execution: {e}")
144
+
145
+
146
# Entry point: crawl and download starting from this fixed Drive root folder.
ROOT_FOLDER_ID = "1KWHJC3yqXXVK2Cz-qZxWeeic8ifCmgej"
asyncio.run(downloads(ROOT_FOLDER_ID))