samfred2 committed on
Commit
ac655f5
·
verified ·
1 Parent(s): 603a71c

Update download_channel.py

Browse files
Files changed (1) hide show
  1. download_channel.py +226 -226
download_channel.py CHANGED
@@ -1,227 +1,227 @@
1
- #!/usr/bin/env python3
2
- from __future__ import annotations
3
-
4
- import asyncio
5
- import json
6
- import os
7
- from telethon import TelegramClient, errors
8
-
9
- # def load_state():
10
- # if os.path.exists(STATE_FILE):
11
- # try:
12
- # with open(STATE_FILE, 'r') as f:
13
- # return json.load(f)
14
- # except:
15
- # return {"downloaded_ids": []}
16
- # return {"downloaded_ids": []}
17
-
18
- # def save_state(state):
19
- # with open(STATE_FILE, 'w') as f:
20
- # json.dump(state, f, indent=2)
21
-
22
- # async def download_channel():
23
- # os.makedirs(OUTPUT_DIR, exist_ok=True)
24
-
25
- # # Load previous download state
26
- # state = load_state()
27
- # downloaded_ids = set(state["downloaded_ids"])
28
-
29
- # # Initialize client with your session
30
- # client = TelegramClient(SESSION_FILE, API_ID, API_HASH)
31
- import asyncio
32
- import json
33
- import os
34
- from typing import Dict, Any
35
-
36
- from telethon import TelegramClient, errors
37
- from huggingface_hub import HfApi, hf_hub_download
38
-
39
- # Configuration - Edit these variables
40
- CHANNEL = "cgsvalka" # Channel username or ID to download from
41
- SESSION_FILE = "my_session.session" # Your existing session file
42
- OUTPUT_DIR = "downloads" # Where to save downloaded files
43
- API_ID = 28708692 # Your Telegram API ID
44
- API_HASH = "72fa6a22c65d7a58e00f2ccb8d60841d" # Your Telegram API Hash
45
- MESSAGE_LIMIT = 0 # 0 = download all messages, or set a number for testing
46
- STATE_FILE = "download_state.json" # Where to save download progress
47
-
48
- # Hugging Face dataset repo where files and state will be uploaded
49
- # Read the HF token from environment for safety. Set HF_TOKEN env var before running.
50
- HF_TOKEN = os.environ.get("HF_TOKEN", "") # Hugging Face token with write permission (empty = disabled)
51
- HF_REPO_ID = "Fred808/TGFiles"
52
- STATE_FILE = "download_state.json" # Local filename for state
53
-
54
-
55
- def load_local_state() -> Dict[str, Any]:
56
- if os.path.exists(STATE_FILE):
57
- try:
58
- with open(STATE_FILE, "r", encoding="utf-8") as f:
59
- return json.load(f)
60
- except Exception:
61
- return {"downloaded_files": []}
62
- return {"downloaded_files": []}
63
-
64
-
65
- def save_local_state(state: Dict[str, Any]) -> None:
66
- with open(STATE_FILE, "w", encoding="utf-8") as f:
67
- json.dump(state, f, indent=2, ensure_ascii=False)
68
-
69
-
70
- def download_state_from_hf(token: str) -> Dict[str, Any]:
71
- """Try to download the state file from the HF dataset. Returns state dict or empty state."""
72
- if not token:
73
- return {"downloaded_files": []}
74
- try:
75
- # hf_hub_download will raise if file doesn't exist
76
- local_path = hf_hub_download(repo_id=HF_REPO_ID, filename=STATE_FILE, repo_type="dataset", token=token)
77
- with open(local_path, "r", encoding="utf-8") as f:
78
- return json.load(f)
79
- except Exception:
80
- return {"downloaded_files": []}
81
-
82
-
83
- def upload_file_to_hf(local_path: str, path_in_repo: str, token: str) -> bool:
84
- """Upload a single file to the HF dataset repo. Returns True on success."""
85
- if not token:
86
- return False
87
- try:
88
- api = HfApi()
89
- api.upload_file(path_or_fileobj=local_path, path_in_repo=path_in_repo, repo_id=HF_REPO_ID, repo_type="dataset", token=token)
90
- return True
91
- except Exception as e:
92
- print(f"Failed to upload {local_path} to HF: {e}")
93
- return False
94
-
95
-
96
- def upload_state_to_hf(state: Dict[str, Any], token: str) -> bool:
97
- # write temp state file and upload
98
- save_local_state(state)
99
- return upload_file_to_hf(STATE_FILE, STATE_FILE, token)
100
-
101
-
102
- async def download_channel():
103
- os.makedirs(OUTPUT_DIR, exist_ok=True)
104
-
105
- # Try to download remote state first (if token provided), then merge with local state
106
- remote_state = download_state_from_hf(HF_TOKEN) if HF_TOKEN else {"downloaded_files": []}
107
- local_state = load_local_state()
108
-
109
- # Merge: prefer remote entries, then local missing ones
110
- downloaded_files = { (e.get("message_id"), e.get("filename")) for e in remote_state.get("downloaded_files", []) }
111
- for e in local_state.get("downloaded_files", []):
112
- downloaded_files.add((e.get("message_id"), e.get("filename")))
113
-
114
- # Recreate ordered list
115
- downloaded_list = [ {"message_id": mid, "filename": fname} for (mid, fname) in downloaded_files if mid is not None ]
116
-
117
- state: Dict[str, Any] = {"downloaded_files": downloaded_list}
118
-
119
- # Build quick lookup set of message ids to skip
120
- downloaded_ids = {entry["message_id"] for entry in state["downloaded_files"]}
121
-
122
- # Initialize client with your session
123
- client = TelegramClient(SESSION_FILE, API_ID, API_HASH)
124
-
125
- async with client:
126
- try:
127
- entity = await client.get_entity(CHANNEL)
128
- except Exception as e:
129
- print(f"Failed to resolve channel '{CHANNEL}': {e}")
130
- return 1
131
-
132
- print(f"Starting download from: {entity.title if hasattr(entity, 'title') else CHANNEL}")
133
-
134
- count = 0
135
- downloaded = 0
136
- skipped = 0
137
- not_rar = 0
138
-
139
- try:
140
- async for message in client.iter_messages(entity, limit=MESSAGE_LIMIT or None):
141
- count += 1
142
-
143
- # Skip if already downloaded according to state
144
- if message.id in downloaded_ids:
145
- skipped += 1
146
- continue
147
-
148
- if not message.media:
149
- continue
150
-
151
- # Check if it's a RAR file
152
- is_rar = False
153
- filename = ""
154
- if message.file:
155
- filename = getattr(message.file, 'name', '') or ''
156
- if filename:
157
- is_rar = filename.lower().endswith('.rar')
158
- else:
159
- mime_type = getattr(message.file, 'mime_type', '') or ''
160
- is_rar = 'rar' in mime_type.lower() if mime_type else False
161
-
162
- if not is_rar:
163
- not_rar += 1
164
- continue
165
-
166
- # Use message ID and original filename for saved file
167
- if filename:
168
- suggested = f"{message.id}_{filename}"
169
- else:
170
- suggested = f"{message.id}.rar"
171
-
172
- out_path = os.path.join(OUTPUT_DIR, suggested)
173
-
174
- # Download the RAR file
175
- try:
176
- print(f"[{count}] downloading -> {os.path.basename(out_path)}")
177
- await client.download_media(message, file=out_path)
178
- downloaded += 1
179
-
180
- # Upload the RAR file to the HF dataset (path files/<basename>)
181
- if HF_TOKEN:
182
- path_in_repo = f"files/{os.path.basename(out_path)}"
183
- ok = upload_file_to_hf(out_path, path_in_repo, HF_TOKEN)
184
- if not ok:
185
- print(f"Warning: failed to upload {out_path} to HF repo {HF_REPO_ID}")
186
-
187
- # Update state after successful download (and attempted upload)
188
- state["downloaded_files"].append({"message_id": message.id, "filename": os.path.basename(out_path)})
189
- downloaded_ids.add(message.id)
190
- save_local_state(state)
191
-
192
- # Upload updated state to HF
193
- if HF_TOKEN:
194
- upload_state_to_hf(state, HF_TOKEN)
195
-
196
- # Be polite to the server
197
- await asyncio.sleep(0.2)
198
- except errors.FloodWaitError as fw:
199
- wait = int(fw.seconds) if fw.seconds else 60
200
- print(f"Hit FloodWait: sleeping {wait}s")
201
- await asyncio.sleep(wait + 1)
202
- except KeyboardInterrupt:
203
- print("Interrupted by user; saving state and exiting.")
204
- save_local_state(state)
205
- if HF_TOKEN:
206
- upload_state_to_hf(state, HF_TOKEN)
207
- break
208
- except Exception as e:
209
- print(f"Error while downloading message {message.id}: {e}")
210
-
211
- except KeyboardInterrupt:
212
- print("Interrupted by user; saving final state.")
213
- save_local_state(state)
214
- if HF_TOKEN:
215
- upload_state_to_hf(state, HF_TOKEN)
216
-
217
- print(f"\nFinal Statistics:")
218
- print(f"Messages scanned: {count}")
219
- print(f"RAR files downloaded: {downloaded}")
220
- print(f"Already downloaded (skipped): {skipped}")
221
- print(f"Non-RAR files skipped: {not_rar}")
222
- print(f"\nDownload state saved to: {STATE_FILE}")
223
- return 0
224
-
225
-
226
- if __name__ == "__main__":
227
  asyncio.run(download_channel())
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import json
6
+ import os
7
+ from telethon import TelegramClient, errors
8
+
9
+ # def load_state():
10
+ # if os.path.exists(STATE_FILE):
11
+ # try:
12
+ # with open(STATE_FILE, 'r') as f:
13
+ # return json.load(f)
14
+ # except:
15
+ # return {"downloaded_ids": []}
16
+ # return {"downloaded_ids": []}
17
+
18
+ # def save_state(state):
19
+ # with open(STATE_FILE, 'w') as f:
20
+ # json.dump(state, f, indent=2)
21
+
22
+ # async def download_channel():
23
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
24
+
25
+ # # Load previous download state
26
+ # state = load_state()
27
+ # downloaded_ids = set(state["downloaded_ids"])
28
+
29
+ # # Initialize client with your session
30
+ # client = TelegramClient(SESSION_FILE, API_ID, API_HASH)
31
+ import asyncio
32
+ import json
33
+ import os
34
+ from typing import Dict, Any
35
+
36
+ from telethon import TelegramClient, errors
37
+ from huggingface_hub import HfApi, hf_hub_download
38
+
39
# Configuration - edit these variables. The Telegram credentials can also be
# overridden with the TG_API_ID / TG_API_HASH environment variables, in the
# same way HF_TOKEN is read from the environment below.
CHANNEL = "cgsvalka"  # Channel username or ID to download from
SESSION_FILE = "my_session.session"  # Your existing session file
OUTPUT_DIR = "downloads"  # Where to save downloaded files
# NOTE(review): the default credentials below are committed in plain text;
# consider rotating them and supplying them only through the environment.
# `or` (rather than a .get() default) also covers variables set to "".
API_ID = int(os.environ.get("TG_API_ID") or "28708692")  # Your Telegram API ID
API_HASH = os.environ.get("TG_API_HASH") or "72fa6a22c65d7a58e00f2ccb8d60841d"  # Your Telegram API Hash
MESSAGE_LIMIT = 0  # 0 = download all messages, or set a number for testing
STATE_FILE = "download_state.json"  # Local filename where download progress is saved

# Hugging Face dataset repo where files and state will be uploaded
# Read the HF token from environment for safety. Set HF_TOKEN env var before running.
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # Hugging Face token with write permission (empty = disabled)
HF_REPO_ID = "samfred2/TGFiles"
53
+
54
+
55
def load_local_state() -> Dict[str, Any]:
    """Load the download state from ``STATE_FILE``.

    Returns:
        The parsed state dict. An empty state (``{"downloaded_files": []}``)
        is returned when the file is missing, cannot be read, is not valid
        JSON, or does not contain a JSON object.
    """
    try:
        with open(STATE_FILE, "r", encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {"downloaded_files": []}
    # Callers use dict methods on the result, so any other JSON type
    # (list, string, number, null) is treated like a corrupt file.
    if not isinstance(state, dict):
        return {"downloaded_files": []}
    return state
63
+
64
+
65
def save_local_state(state: Dict[str, Any]) -> None:
    """Write *state* to ``STATE_FILE`` as indented UTF-8 JSON.

    The JSON is first written to a sibling ``<STATE_FILE>.tmp`` file and then
    moved over ``STATE_FILE`` with ``os.replace``, so an interrupted or failed
    write does not leave a truncated state file behind.

    Raises:
        OSError: if the file cannot be written or moved into place.
        TypeError, ValueError: if *state* is not JSON-serializable.
    """
    tmp_path = f"{STATE_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, STATE_FILE)
    except BaseException:
        # Do not leave a half-written temp file around; the previous
        # STATE_FILE (if any) is still intact at this point.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
68
+
69
+
70
def download_state_from_hf(token: str) -> Dict[str, Any]:
    """Fetch the state file from the HF dataset repo ``HF_REPO_ID``.

    Args:
        token: Hugging Face access token. An empty token disables the lookup.

    Returns:
        The parsed remote state dict, or an empty state
        (``{"downloaded_files": []}``) when no token is given, the download
        or parsing fails, or the file does not contain a JSON object.
    """
    if not token:
        return {"downloaded_files": []}
    try:
        # hf_hub_download will raise if file doesn't exist
        local_path = hf_hub_download(repo_id=HF_REPO_ID, filename=STATE_FILE, repo_type="dataset", token=token)
        with open(local_path, "r", encoding="utf-8") as f:
            state = json.load(f)
    except Exception as e:
        # Still best effort (a first run has no remote state yet), but report
        # the reason so auth or network failures are not mistaken for
        # "nothing downloaded so far".
        print(f"Could not load remote state from HF, starting from empty remote state: {e}")
        return {"downloaded_files": []}
    # Callers use dict methods on the result; ignore any other JSON type.
    if not isinstance(state, dict):
        return {"downloaded_files": []}
    return state
81
+
82
+
83
def upload_file_to_hf(local_path: str, path_in_repo: str, token: str) -> bool:
    """Push one local file into the HF dataset repo ``HF_REPO_ID``.

    Returns True when the upload call completes; False when *token* is empty
    or the upload raises (the failure is printed, not propagated).
    """
    if token:
        try:
            HfApi().upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=HF_REPO_ID,
                repo_type="dataset",
                token=token,
            )
        except Exception as err:
            print(f"Failed to upload {local_path} to HF: {err}")
        else:
            return True
    return False
94
+
95
+
96
def upload_state_to_hf(state: Dict[str, Any], token: str) -> bool:
    """Save *state* to the local state file, then upload that file to HF.

    The file is stored in the repo under the same name as the local file.
    Returns the result of the upload (False when it fails or *token* is empty).
    """
    # The local copy is refreshed first so the uploaded file matches *state*.
    save_local_state(state)
    uploaded = upload_file_to_hf(STATE_FILE, STATE_FILE, token)
    return uploaded
100
+
101
+
102
def _merge_downloaded_entries(*states: Any) -> list:
    """Merge ``downloaded_files`` entries of several states; the first entry seen for a message id wins."""
    merged: Dict[Any, Dict[str, Any]] = {}
    for source in states:
        entries = source.get("downloaded_files") if isinstance(source, dict) else None
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            message_id = entry.get("message_id")
            if message_id is None:
                continue
            # A dict keeps insertion order, so the merged list is stable
            # between runs and holds each message id once.
            merged.setdefault(message_id, {"message_id": message_id, "filename": entry.get("filename")})
    return list(merged.values())


def _rar_target_name(message: Any) -> str:
    """Return the local file name for the RAR attachment of *message*, or "" when it has none."""
    file = message.file
    if not file:
        return ""
    original_name = getattr(file, "name", "") or ""
    if original_name:
        if not original_name.lower().endswith(".rar"):
            return ""
        # The name is chosen by the sender (presumably passed through
        # unchanged by Telethon - verify); keep only its last path component
        # so the download cannot be written outside OUTPUT_DIR.
        safe_name = os.path.basename(original_name.replace("\\", "/"))
        return f"{message.id}_{safe_name}"
    # No file name available: fall back to the MIME type.
    mime_type = getattr(file, "mime_type", "") or ""
    return f"{message.id}.rar" if "rar" in mime_type.lower() else ""


async def _download_media_with_retry(client: Any, message: Any, out_path: str, attempts: int = 2) -> Any:
    """Download the media of *message*, waiting out FloodWait errors; returns the saved path or None."""
    for _ in range(attempts):
        try:
            return await client.download_media(message, file=out_path)
        except errors.FloodWaitError as fw:
            wait = int(fw.seconds) if fw.seconds else 60
            print(f"Hit FloodWait: sleeping {wait}s")
            await asyncio.sleep(wait + 1)
    # Still rate limited after all attempts; the message is not recorded in
    # the state, so a later run picks it up again.
    return None


async def download_channel():
    """Download every RAR attachment of ``CHANNEL`` and mirror it to Hugging Face.

    Remote (HF) and local state are merged to find the message ids that were
    already handled; those messages are skipped. Each new RAR file is saved
    under ``OUTPUT_DIR``, uploaded to ``files/<name>`` in ``HF_REPO_ID`` when
    ``HF_TOKEN`` is set, and then recorded in the state, which is saved
    locally (and uploaded) after every file.

    Returns:
        0 after the scan finishes (or is interrupted), 1 when the channel
        cannot be resolved.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Try to download remote state first (if token provided), then merge with local state
    remote_state = download_state_from_hf(HF_TOKEN) if HF_TOKEN else {"downloaded_files": []}
    local_state = load_local_state()

    # Merge: prefer remote entries, then local missing ones
    state: Dict[str, Any] = {"downloaded_files": _merge_downloaded_entries(remote_state, local_state)}

    # Build quick lookup set of message ids to skip
    downloaded_ids = {entry["message_id"] for entry in state["downloaded_files"]}

    # Initialize client with your session
    client = TelegramClient(SESSION_FILE, API_ID, API_HASH)

    async with client:
        try:
            entity = await client.get_entity(CHANNEL)
        except Exception as e:
            print(f"Failed to resolve channel '{CHANNEL}': {e}")
            return 1

        print(f"Starting download from: {entity.title if hasattr(entity, 'title') else CHANNEL}")

        count = 0
        downloaded = 0
        skipped = 0
        not_rar = 0

        try:
            async for message in client.iter_messages(entity, limit=MESSAGE_LIMIT or None):
                count += 1

                # Skip if already downloaded according to state
                if message.id in downloaded_ids:
                    skipped += 1
                    continue

                if not message.media:
                    continue

                # Only RAR attachments are of interest
                target_name = _rar_target_name(message)
                if not target_name:
                    not_rar += 1
                    continue

                out_path = os.path.join(OUTPUT_DIR, target_name)

                try:
                    print(f"[{count}] downloading -> {os.path.basename(out_path)}")
                    saved_path = await _download_media_with_retry(client, message, out_path)
                    if not saved_path:
                        # Nothing was written (rate limited or no media);
                        # do not record the message so it is retried later.
                        print(f"Error while downloading message {message.id}: download did not complete")
                        continue
                    downloaded += 1
                    saved_name = os.path.basename(saved_path)

                    # Upload the RAR file to the HF dataset (path files/<basename>)
                    if HF_TOKEN:
                        path_in_repo = f"files/{saved_name}"
                        ok = upload_file_to_hf(saved_path, path_in_repo, HF_TOKEN)
                        if not ok:
                            print(f"Warning: failed to upload {saved_path} to HF repo {HF_REPO_ID}")

                    # Update state after successful download (and attempted upload)
                    state["downloaded_files"].append({"message_id": message.id, "filename": saved_name})
                    downloaded_ids.add(message.id)
                    save_local_state(state)

                    # Upload updated state to HF
                    if HF_TOKEN:
                        upload_state_to_hf(state, HF_TOKEN)

                    # Be polite to the server
                    await asyncio.sleep(0.2)
                except KeyboardInterrupt:
                    print("Interrupted by user; saving state and exiting.")
                    save_local_state(state)
                    if HF_TOKEN:
                        upload_state_to_hf(state, HF_TOKEN)
                    break
                except Exception as e:
                    # One bad message must not abort the whole scan.
                    print(f"Error while downloading message {message.id}: {e}")

        except KeyboardInterrupt:
            print("Interrupted by user; saving final state.")
            save_local_state(state)
            if HF_TOKEN:
                upload_state_to_hf(state, HF_TOKEN)

        print("\nFinal Statistics:")
        print(f"Messages scanned: {count}")
        print(f"RAR files downloaded: {downloaded}")
        print(f"Already downloaded (skipped): {skipped}")
        print(f"Non-RAR files skipped: {not_rar}")
        print(f"\nDownload state saved to: {STATE_FILE}")
        return 0
224
+
225
+
226
if __name__ == "__main__":
    # download_channel() returns 0 on success and 1 when the channel cannot
    # be resolved; pass that on as the process exit status.
    raise SystemExit(asyncio.run(download_channel()))