Fred808 commited on
Commit
baa75e2
·
verified ·
1 Parent(s): 948c53e

Upload 4 files

Browse files
Files changed (4) hide show
  1. api_server.py +181 -0
  2. download_channel.py +227 -0
  3. my_session.session +0 -0
  4. requirements.txt +5 -0
api_server.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
2
+ from fastapi.responses import JSONResponse
3
+ import asyncio
4
+ import os
5
+ from typing import Optional, Dict, Any
6
+ from pydantic import BaseModel
7
+ import download_channel # Import the existing downloader
8
+
9
+ app = FastAPI(title="Telegram Channel Downloader API")
10
+
11
+ # Track active downloads and their status
12
+ active_downloads: Dict[str, Dict[str, Any]] = {}
13
+
14
+ class DownloadRequest(BaseModel):
15
+ channel: Optional[str] = None # If None, uses default from download_channel.py
16
+ message_limit: Optional[int] = None
17
+
18
+ class DownloadStatus(BaseModel):
19
+ channel: str
20
+ status: str # "running", "completed", "failed"
21
+ message_count: int = 0
22
+ downloaded: int = 0
23
+ skipped: int = 0
24
+ not_rar: int = 0
25
+ error: Optional[str] = None
26
+
27
+ async def run_download(channel: Optional[str], message_limit: Optional[int], task_id: str):
28
+ """Background task to run the download"""
29
+ try:
30
+ # Override channel and message limit if provided
31
+ if channel:
32
+ download_channel.CHANNEL = channel
33
+ if message_limit is not None:
34
+ download_channel.MESSAGE_LIMIT = message_limit
35
+
36
+ # Create a status tracker
37
+ status = {
38
+ "channel": download_channel.CHANNEL,
39
+ "status": "running",
40
+ "message_count": 0,
41
+ "downloaded": 0,
42
+ "skipped": 0,
43
+ "not_rar": 0,
44
+ "error": None
45
+ }
46
+ active_downloads[task_id] = status
47
+
48
+ # Patch the download function to update our status
49
+ original_download = download_channel.download_channel
50
+
51
+ async def wrapped_download():
52
+ nonlocal status
53
+ try:
54
+ # Initialize client and get entity
55
+ client = download_channel.TelegramClient(
56
+ download_channel.SESSION_FILE,
57
+ download_channel.API_ID,
58
+ download_channel.API_HASH
59
+ )
60
+
61
+ async with client:
62
+ try:
63
+ entity = await client.get_entity(download_channel.CHANNEL)
64
+ except Exception as e:
65
+ status["error"] = f"Failed to resolve channel: {str(e)}"
66
+ status["status"] = "failed"
67
+ return 1
68
+
69
+ try:
70
+ async for message in client.iter_messages(entity, limit=download_channel.MESSAGE_LIMIT or None):
71
+ status["message_count"] += 1
72
+
73
+ if not message.media:
74
+ continue
75
+
76
+ # Check if it's a RAR file
77
+ is_rar = False
78
+ if message.file:
79
+ filename = getattr(message.file, 'name', '') or ''
80
+ if filename:
81
+ is_rar = filename.lower().endswith('.rar')
82
+ else:
83
+ mime_type = getattr(message.file, 'mime_type', '') or ''
84
+ is_rar = 'rar' in mime_type.lower() if mime_type else False
85
+
86
+ if not is_rar:
87
+ status["not_rar"] += 1
88
+ continue
89
+
90
+ # Use the same filename logic
91
+ if filename:
92
+ suggested = f"{message.id}_{filename}"
93
+ else:
94
+ suggested = f"{message.id}.rar"
95
+
96
+ out_path = os.path.join(download_channel.OUTPUT_DIR, suggested)
97
+
98
+ if os.path.exists(out_path):
99
+ status["skipped"] += 1
100
+ continue
101
+
102
+ try:
103
+ await client.download_media(message, file=out_path)
104
+ status["downloaded"] += 1
105
+
106
+ # Upload to HF if token available
107
+ if download_channel.HF_TOKEN:
108
+ path_in_repo = f"files/{os.path.basename(out_path)}"
109
+ ok = download_channel.upload_file_to_hf(
110
+ out_path, path_in_repo, download_channel.HF_TOKEN
111
+ )
112
+ if not ok:
113
+ print(f"Warning: failed to upload {out_path}")
114
+
115
+ await asyncio.sleep(0.2) # Be polite
116
+
117
+ except download_channel.errors.FloodWaitError as fw:
118
+ wait = int(fw.seconds) if fw.seconds else 60
119
+ print(f"Hit FloodWait: sleeping {wait}s")
120
+ await asyncio.sleep(wait + 1)
121
+ except Exception as e:
122
+ print(f"Error downloading {message.id}: {e}")
123
+
124
+ except Exception as e:
125
+ status["error"] = str(e)
126
+ status["status"] = "failed"
127
+ return 1
128
+
129
+ status["status"] = "completed"
130
+ return 0
131
+
132
+ except Exception as e:
133
+ status["error"] = str(e)
134
+ status["status"] = "failed"
135
+ return 1
136
+
137
+ await wrapped_download()
138
+
139
+ except Exception as e:
140
+ active_downloads[task_id] = {
141
+ "channel": download_channel.CHANNEL,
142
+ "status": "failed",
143
+ "message_count": 0,
144
+ "downloaded": 0,
145
+ "skipped": 0,
146
+ "not_rar": 0,
147
+ "error": str(e)
148
+ }
149
+
150
+ @app.post("/download", response_model=Dict[str, str])
151
+ async def start_download(request: DownloadRequest, background_tasks: BackgroundTasks):
152
+ """Start a new download task"""
153
+ task_id = f"download_{len(active_downloads) + 1}"
154
+
155
+ # Schedule the download
156
+ background_tasks.add_task(
157
+ run_download,
158
+ channel=request.channel,
159
+ message_limit=request.message_limit,
160
+ task_id=task_id
161
+ )
162
+
163
+ return {"task_id": task_id}
164
+
165
+ @app.get("/status/{task_id}", response_model=DownloadStatus)
166
+ async def get_status(task_id: str):
167
+ """Get the status of a download task"""
168
+ if task_id not in active_downloads:
169
+ raise HTTPException(status_code=404, detail="Task not found")
170
+ return active_downloads[task_id]
171
+
172
+ @app.get("/active", response_model=Dict[str, DownloadStatus])
173
+ async def list_active():
174
+ """List all active or completed downloads"""
175
+ return active_downloads
176
+
177
+ if __name__ == "__main__":
178
+ import uvicorn
179
+ # Note: When running directly, this runs on 8000
180
+ # For production, use: uvicorn api_server:app --host 0.0.0.0 --port 8000
181
+ uvicorn.run(app, host="127.0.0.1", port=8000)
download_channel.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import json
6
+ import os
7
+ from telethon import TelegramClient, errors
8
+
9
+ # def load_state():
10
+ # if os.path.exists(STATE_FILE):
11
+ # try:
12
+ # with open(STATE_FILE, 'r') as f:
13
+ # return json.load(f)
14
+ # except:
15
+ # return {"downloaded_ids": []}
16
+ # return {"downloaded_ids": []}
17
+
18
+ # def save_state(state):
19
+ # with open(STATE_FILE, 'w') as f:
20
+ # json.dump(state, f, indent=2)
21
+
22
+ # async def download_channel():
23
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
24
+
25
+ # # Load previous download state
26
+ # state = load_state()
27
+ # downloaded_ids = set(state["downloaded_ids"])
28
+
29
+ # # Initialize client with your session
30
+ # client = TelegramClient(SESSION_FILE, API_ID, API_HASH)
31
+ import asyncio
32
+ import json
33
+ import os
34
+ from typing import Dict, Any
35
+
36
+ from telethon import TelegramClient, errors
37
+ from huggingface_hub import HfApi, hf_hub_download
38
+
39
+ # Configuration - Edit these variables
40
+ CHANNEL = "cgsvalka" # Channel username or ID to download from
41
+ SESSION_FILE = "my_session.session" # Your existing session file
42
+ OUTPUT_DIR = "downloads" # Where to save downloaded files
43
+ API_ID = 28708692 # Your Telegram API ID
44
+ API_HASH = "72fa6a22c65d7a58e00f2ccb8d60841d" # Your Telegram API Hash
45
+ MESSAGE_LIMIT = 0 # 0 = download all messages, or set a number for testing
46
+ STATE_FILE = "download_state.json" # Where to save download progress
47
+
48
+ # Hugging Face dataset repo where files and state will be uploaded
49
+ # Read the HF token from environment for safety. Set HF_TOKEN env var before running.
50
+ HF_TOKEN = os.environ.get("HF_TOKEN", "") # Hugging Face token with write permission (empty = disabled)
51
+ HF_REPO_ID = "Fred808/TGFiles"
52
+ STATE_FILE = "download_state.json" # Local filename for state
53
+
54
+
55
+ def load_local_state() -> Dict[str, Any]:
56
+ if os.path.exists(STATE_FILE):
57
+ try:
58
+ with open(STATE_FILE, "r", encoding="utf-8") as f:
59
+ return json.load(f)
60
+ except Exception:
61
+ return {"downloaded_files": []}
62
+ return {"downloaded_files": []}
63
+
64
+
65
+ def save_local_state(state: Dict[str, Any]) -> None:
66
+ with open(STATE_FILE, "w", encoding="utf-8") as f:
67
+ json.dump(state, f, indent=2, ensure_ascii=False)
68
+
69
+
70
+ def download_state_from_hf(token: str) -> Dict[str, Any]:
71
+ """Try to download the state file from the HF dataset. Returns state dict or empty state."""
72
+ if not token:
73
+ return {"downloaded_files": []}
74
+ try:
75
+ # hf_hub_download will raise if file doesn't exist
76
+ local_path = hf_hub_download(repo_id=HF_REPO_ID, filename=STATE_FILE, repo_type="dataset", token=token)
77
+ with open(local_path, "r", encoding="utf-8") as f:
78
+ return json.load(f)
79
+ except Exception:
80
+ return {"downloaded_files": []}
81
+
82
+
83
+ def upload_file_to_hf(local_path: str, path_in_repo: str, token: str) -> bool:
84
+ """Upload a single file to the HF dataset repo. Returns True on success."""
85
+ if not token:
86
+ return False
87
+ try:
88
+ api = HfApi()
89
+ api.upload_file(path_or_fileobj=local_path, path_in_repo=path_in_repo, repo_id=HF_REPO_ID, repo_type="dataset", token=token)
90
+ return True
91
+ except Exception as e:
92
+ print(f"Failed to upload {local_path} to HF: {e}")
93
+ return False
94
+
95
+
96
+ def upload_state_to_hf(state: Dict[str, Any], token: str) -> bool:
97
+ # write temp state file and upload
98
+ save_local_state(state)
99
+ return upload_file_to_hf(STATE_FILE, STATE_FILE, token)
100
+
101
+
102
+ async def download_channel():
103
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
104
+
105
+ # Try to download remote state first (if token provided), then merge with local state
106
+ remote_state = download_state_from_hf(HF_TOKEN) if HF_TOKEN else {"downloaded_files": []}
107
+ local_state = load_local_state()
108
+
109
+ # Merge: prefer remote entries, then local missing ones
110
+ downloaded_files = { (e.get("message_id"), e.get("filename")) for e in remote_state.get("downloaded_files", []) }
111
+ for e in local_state.get("downloaded_files", []):
112
+ downloaded_files.add((e.get("message_id"), e.get("filename")))
113
+
114
+ # Recreate ordered list
115
+ downloaded_list = [ {"message_id": mid, "filename": fname} for (mid, fname) in downloaded_files if mid is not None ]
116
+
117
+ state: Dict[str, Any] = {"downloaded_files": downloaded_list}
118
+
119
+ # Build quick lookup set of message ids to skip
120
+ downloaded_ids = {entry["message_id"] for entry in state["downloaded_files"]}
121
+
122
+ # Initialize client with your session
123
+ client = TelegramClient(SESSION_FILE, API_ID, API_HASH)
124
+
125
+ async with client:
126
+ try:
127
+ entity = await client.get_entity(CHANNEL)
128
+ except Exception as e:
129
+ print(f"Failed to resolve channel '{CHANNEL}': {e}")
130
+ return 1
131
+
132
+ print(f"Starting download from: {entity.title if hasattr(entity, 'title') else CHANNEL}")
133
+
134
+ count = 0
135
+ downloaded = 0
136
+ skipped = 0
137
+ not_rar = 0
138
+
139
+ try:
140
+ async for message in client.iter_messages(entity, limit=MESSAGE_LIMIT or None):
141
+ count += 1
142
+
143
+ # Skip if already downloaded according to state
144
+ if message.id in downloaded_ids:
145
+ skipped += 1
146
+ continue
147
+
148
+ if not message.media:
149
+ continue
150
+
151
+ # Check if it's a RAR file
152
+ is_rar = False
153
+ filename = ""
154
+ if message.file:
155
+ filename = getattr(message.file, 'name', '') or ''
156
+ if filename:
157
+ is_rar = filename.lower().endswith('.rar')
158
+ else:
159
+ mime_type = getattr(message.file, 'mime_type', '') or ''
160
+ is_rar = 'rar' in mime_type.lower() if mime_type else False
161
+
162
+ if not is_rar:
163
+ not_rar += 1
164
+ continue
165
+
166
+ # Use message ID and original filename for saved file
167
+ if filename:
168
+ suggested = f"{message.id}_{filename}"
169
+ else:
170
+ suggested = f"{message.id}.rar"
171
+
172
+ out_path = os.path.join(OUTPUT_DIR, suggested)
173
+
174
+ # Download the RAR file
175
+ try:
176
+ print(f"[{count}] downloading -> {os.path.basename(out_path)}")
177
+ await client.download_media(message, file=out_path)
178
+ downloaded += 1
179
+
180
+ # Upload the RAR file to the HF dataset (path files/<basename>)
181
+ if HF_TOKEN:
182
+ path_in_repo = f"files/{os.path.basename(out_path)}"
183
+ ok = upload_file_to_hf(out_path, path_in_repo, HF_TOKEN)
184
+ if not ok:
185
+ print(f"Warning: failed to upload {out_path} to HF repo {HF_REPO_ID}")
186
+
187
+ # Update state after successful download (and attempted upload)
188
+ state["downloaded_files"].append({"message_id": message.id, "filename": os.path.basename(out_path)})
189
+ downloaded_ids.add(message.id)
190
+ save_local_state(state)
191
+
192
+ # Upload updated state to HF
193
+ if HF_TOKEN:
194
+ upload_state_to_hf(state, HF_TOKEN)
195
+
196
+ # Be polite to the server
197
+ await asyncio.sleep(0.2)
198
+ except errors.FloodWaitError as fw:
199
+ wait = int(fw.seconds) if fw.seconds else 60
200
+ print(f"Hit FloodWait: sleeping {wait}s")
201
+ await asyncio.sleep(wait + 1)
202
+ except KeyboardInterrupt:
203
+ print("Interrupted by user; saving state and exiting.")
204
+ save_local_state(state)
205
+ if HF_TOKEN:
206
+ upload_state_to_hf(state, HF_TOKEN)
207
+ break
208
+ except Exception as e:
209
+ print(f"Error while downloading message {message.id}: {e}")
210
+
211
+ except KeyboardInterrupt:
212
+ print("Interrupted by user; saving final state.")
213
+ save_local_state(state)
214
+ if HF_TOKEN:
215
+ upload_state_to_hf(state, HF_TOKEN)
216
+
217
+ print(f"\nFinal Statistics:")
218
+ print(f"Messages scanned: {count}")
219
+ print(f"RAR files downloaded: {downloaded}")
220
+ print(f"Already downloaded (skipped): {skipped}")
221
+ print(f"Non-RAR files skipped: {not_rar}")
222
+ print(f"\nDownload state saved to: {STATE_FILE}")
223
+ return 0
224
+
225
+
226
+ if __name__ == "__main__":
227
+ asyncio.run(download_channel())
my_session.session ADDED
Binary file (28.7 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ telethon>=1.36.0
2
+ huggingface_hub>=0.17.0
3
+ fastapi>=0.104.0
4
+ uvicorn>=0.24.0
5
+ pydantic>=2.4.2