HarishMaths committed on
Commit
84df929
·
verified ·
1 Parent(s): 94e46de

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +20 -0
  2. client_secrets.json +1 -0
  3. credentials.json +1 -0
  4. downloads.py +144 -0
  5. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

# Run as an unprivileged user (uid 1000) rather than root.
RUN useradd -m -u 1000 user

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy the build context once, owned by the runtime user. A second plain
# `COPY . .` is unnecessary and would overwrite these files with root-owned
# copies.
# NOTE(review): the build context appears to include client_secrets.json and
# credentials.json; consider injecting them at runtime (secrets/env) instead
# of baking them into the image.
COPY --chown=user . $HOME/app

# Let the build fail if dependencies cannot be installed, instead of masking
# the failure with `|| true` and hitting an ImportError at container start.
RUN pip install --no-cache-dir -r requirements.txt

CMD ["python", "downloads.py"]
19
+
20
+
client_secrets.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"installed":{"client_id":"892683253156-lm16fsr92io325ilamlh4js6dvc0370b.apps.googleusercontent.com","project_id":"silver-tape-454317-m5","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"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","redirect_uris":["http://localhost"]}}
credentials.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"access_token": "ya29.a0AW4XtxjRFkv7_yiUtq_J7yf41tj09XdTtZqX0rWRFWqH3qeQrTlxzG8FhJPmmrjqkod_OZblRQTsve0mkWHrZmSUEhAuJJjanrYhz3w91cIxtlx-9WB0B0TpgTcoOp_8oqPFdmMfFqYBXAAknf1Phne1LWkSiBnpeVb1VM9x9NIaCgYKAQESARASFQHGX2Mi7Yj7q0YUgwCqyAXy9ZgW3A0178", "client_id": "892683253156-lm16fsr92io325ilamlh4js6dvc0370b.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0geRMEWb6fTY3CgYIARAAGBASNwF-L9IrNYMz5Z6pQachJpt4P7DcGxDFRf0uOGLQ4DolW_6NTg0H8qTsqY-liXP88uRPRxjj2ss", "token_expiry": "2025-06-17T00:47:52Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AW4XtxjRFkv7_yiUtq_J7yf41tj09XdTtZqX0rWRFWqH3qeQrTlxzG8FhJPmmrjqkod_OZblRQTsve0mkWHrZmSUEhAuJJjanrYhz3w91cIxtlx-9WB0B0TpgTcoOp_8oqPFdmMfFqYBXAAknf1Phne1LWkSiBnpeVb1VM9x9NIaCgYKAQESARASFQHGX2Mi7Yj7q0YUgwCqyAXy9ZgW3A0178", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}
downloads.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from pydrive2.auth import GoogleAuth
3
+ from pydrive2.drive import GoogleDrive
4
+ from pathlib import Path
5
+ import logging
6
+ from tqdm.asyncio import tqdm_asyncio
7
+ import sys
8
+
9
# Log records go both to stdout and to a UTF-8 encoded file named
# download_log.txt in the current working directory.
_log_handlers = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler("download_log.txt", encoding="utf-8"),
]
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s: %(message)s',
    level=logging.INFO,
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)
18
+
19
+
20
# Upper bound on simultaneous Drive listing requests during the crawl.
MAX_CONCURRENT_REQUESTS = 10
# NOTE(review): on Python < 3.10 asyncio primitives bind to the event loop
# that is current when they are constructed; this one is built at import
# time, before asyncio.run() starts its own loop -- confirm this is intended.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

# Filled by scan_folder() with (folder_id, drive_path, ipynb_files) tuples.
matching_folders = []

# Drive client backed by the OAuth credentials stored in credentials.json.
gauth = GoogleAuth()
gauth.LoadCredentialsFile('credentials.json')
drive = GoogleDrive(gauth)
28
+
29
+
30
def list_folder(folder_id):
    """Return the non-trashed Drive entries whose parent is *folder_id*.

    This is a blocking call; async callers run it via ``asyncio.to_thread``.
    """
    query = {'q': f"'{folder_id}' in parents and trashed=false"}
    return drive.ListFile(query).GetList()
32
+
33
async def crawl_drive_bfs(root_id):
    """Walk the Drive tree below *root_id* one depth level at a time.

    Every folder of the current level is scanned concurrently with
    ``scan_folder``; the subfolders those scans append form the next level.
    The walk stops when a level yields no further folders.
    """
    frontier = [(root_id, "ROOT")]
    while frontier:
        discovered = []
        # scan_folder appends child (folder_id, path) pairs to `discovered`.
        await asyncio.gather(
            *(scan_folder(fid, trace, discovered) for fid, trace in frontier)
        )
        frontier = discovered
44
+
45
async def scan_folder(folder_id, path_trace, next_queue):
    """List one Drive folder, record it if it matches, and queue its subfolders.

    Args:
        folder_id: Drive ID of the folder to scan.
        path_trace: Path string of the parent folder; the folder's own title
            is appended to it to form the path passed on to children.
        next_queue: List that receives ``(subfolder_id, current_path)`` tuples
            for every subfolder found.

    A folder whose title starts with ``"Notebook"`` and that directly contains
    at least one ``.ipynb`` file is appended to the module-level
    ``matching_folders`` list. Listing failures are logged and the folder is
    skipped; a failed title lookup falls back to ``"Unknown"``.
    """

    def _fetch_title():
        # Both calls stay in the worker thread: FetchMetadata performs the
        # blocking network request, so it must not run on the event loop.
        folder = drive.CreateFile({'id': folder_id})
        folder.FetchMetadata(fields="title")
        return folder['title']

    async with semaphore:
        try:
            file_list = await asyncio.to_thread(list_folder, folder_id)
        except Exception as e:
            logger.error(f"Failed to list folder {folder_id}: {e}")
            return

        # An empty folder can neither match nor contribute subfolders.
        if not file_list:
            return

        try:
            folder_title = await asyncio.to_thread(_fetch_title)
        except Exception as e:
            # `except Exception` (not a bare except) lets task cancellation
            # propagate instead of being swallowed.
            logger.warning(f"Could not fetch title for folder {folder_id}: {e}")
            folder_title = "Unknown"

    current_path = f"{path_trace}/{folder_title}" if path_trace else folder_title

    if folder_title.startswith("Notebook"):
        ipynb_files = [f for f in file_list if f['title'].endswith('.ipynb')]
        if ipynb_files:
            matching_folders.append((folder_id, current_path, ipynb_files))

    for f in file_list:
        if f['mimeType'] == 'application/vnd.google-apps.folder':
            next_queue.append((f['id'], current_path))
73
+
74
# Caps how many folders are being listed/downloaded at the same time.
download_semaphore = asyncio.Semaphore(10)


async def download_folder(folder_id, local_path, hierarchy_path):
    """Recursively download a Drive folder into ``tmp/<local_path>``.

    Args:
        folder_id: Drive ID of the folder to download.
        local_path: Destination path relative to the ``tmp`` directory.
        hierarchy_path: Drive-side path of this folder; used as the prefix of
            each line written to ``file_links.txt``.

    Every non-folder item is downloaded. For each ``.ipynb`` file that was
    downloaded successfully, a ``<drive path>,<folder link>`` line is written
    to ``file_links.txt`` in the destination directory. Listing and download
    errors are logged and skipped rather than raised.
    """
    base_path = Path("tmp") / local_path
    subfolder_tasks = []

    async with download_semaphore:
        base_path.mkdir(parents=True, exist_ok=True)

        try:
            items = await asyncio.to_thread(list_folder, folder_id)
        except Exception as e:
            logger.error(f"Failed to list folder for download {folder_id}: {e}")
            return

        ipynb_entries = []
        for item in items:
            title = item['title']

            if item['mimeType'] == 'application/vnd.google-apps.folder':
                # Pass a path relative to tmp/ (not base_path, which already
                # starts with tmp/) so children land inside this folder
                # instead of under an extra tmp/ per recursion level.
                subfolder_tasks.append(
                    asyncio.create_task(
                        download_folder(
                            item['id'],
                            Path(local_path) / title,
                            f"{hierarchy_path}/{title}",
                        )
                    )
                )
                continue

            is_notebook = title.endswith('.ipynb')
            try:
                if is_notebook:
                    logger.info(f"Downloading notebook: {title} into {base_path}")
                await asyncio.to_thread(item.GetContentFile, str(base_path / title))
            except Exception as e:
                logger.error(f"Error downloading {title}: {e}")
                continue

            if is_notebook:
                folder_link = f"https://drive.google.com/drive/folders/{folder_id}"
                ipynb_entries.append(f"{hierarchy_path}/{title},{folder_link}")

        if ipynb_entries:
            (base_path / "file_links.txt").write_text("\n".join(ipynb_entries), encoding="utf-8")

    # Wait for children only after releasing the semaphore: a parent that
    # keeps its slot while waiting can starve the children of slots and
    # deadlock once enough parents are waiting.
    if subfolder_tasks:
        await asyncio.gather(*subfolder_tasks)
116
+
117
+
118
async def parallel_download():
    """Download every folder recorded in ``matching_folders`` concurrently.

    Folder number *k* (1-based, in discovery order) is downloaded under the
    local name ``notebook_folder_<k>``; a tqdm progress bar advances as each
    folder finishes.
    """
    jobs = [
        download_folder(folder_id, Path(f"notebook_folder_{position}"), trace)
        for position, (folder_id, trace, _) in enumerate(matching_folders, start=1)
    ]

    progress = tqdm_asyncio.as_completed(
        jobs, desc="Downloading Notebooks", unit="folder", total=len(jobs)
    )
    for finished in progress:
        await finished
126
+
127
+
128
async def downloads(root_folder_id):
    """Scan the Drive tree under *root_folder_id* and download matching folders.

    Args:
        root_folder_id: Drive ID of the folder where the crawl starts.

    Any exception raised while scanning or downloading is logged with its
    traceback and not re-raised.
    """
    global semaphore, download_semaphore
    try:
        # Rebuild the semaphores inside the running event loop. On
        # Python < 3.10 an asyncio.Semaphore binds to the loop current at
        # construction; the module-level ones are created before
        # asyncio.run() starts its own loop and raise "attached to a
        # different loop" as soon as a task has to wait on them.
        semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
        download_semaphore = asyncio.Semaphore(10)

        matching_folders.clear()
        logger.info(f"Scanning Drive folder: {root_folder_id}")
        await crawl_drive_bfs(root_folder_id)

        if matching_folders:
            logger.info(f"Found {len(matching_folders)} folders with notebooks.")
            await parallel_download()
            logger.info("Download completed.")
        else:
            logger.warning("No folders with prefix 'Notebook' and .ipynb files found.")
    except Exception as e:
        logger.exception(f"Unexpected error during execution: {e}")
142
+
143
+
144
# Script entry point: crawl and download starting from this Drive folder ID.
_root_folder_id = "1KWHJC3yqXXVK2Cz-qZxWeeic8ifCmgej"
asyncio.run(downloads(_root_folder_id))
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pydrive2==1.21.3
2
+ aiofiles==24.1.0
3
+ tqdm==4.67.1