eliason1 commited on
Commit
dfec9f4
·
verified ·
1 Parent(s): 3d7c739

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +45 -0
  2. api_server.py +458 -0
  3. download_channel.py +227 -0
  4. my_session2.session +0 -0
  5. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim-bullseye
2
+
3
+ # Install system dependencies
4
+ RUN sed -i 's/main/main contrib non-free/' /etc/apt/sources.list && \
5
+ apt-get update && \
6
+ apt-get install -y --no-install-recommends \
7
+ unrar \
8
+ libgl1 \
9
+ libglib2.0-0 \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ WORKDIR /app
13
+
14
+ # Upgrade pip and install core dependencies first
15
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel packaging
16
+
17
+ # Install CPU-only PyTorch first
18
+
19
+ # Copy requirements and install with special handling for flash_attn
20
+ COPY requirements.txt .
21
+ RUN pip install --no-cache-dir \
22
+ -r requirements.txt \
23
+ --find-links https://download.pytorch.org/whl/cpu \
24
+ --extra-index-url https://pypi.org/simple && \
25
+ # Install remaining packages that might have been skipped
26
+ pip install --no-cache-dir \
27
+ accelerate \
28
+ transformers==4.36.2 \
29
+ timm==0.9.12 \
30
+ einops==0.7.0
31
+
32
+ # Copy application code
33
+ COPY . .
34
+
35
+ # Create non-root user
36
+ RUN useradd -m -u 1000 user && \
37
+ chown -R user:user /app
38
+
39
+ USER user
40
+
41
+ # Environment variables to suppress warnings
42
+ ENV HF_HUB_DISABLE_PROGRESS=1
43
+ ENV TF_CPP_MIN_LOG_LEVEL=3
44
+
45
+ CMD ["uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]
api_server.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
2
+ from fastapi.responses import JSONResponse
3
+ import asyncio
4
+ import os
5
+ import time
6
+ import json
7
+ from typing import Optional, Dict, Any, List
8
+ from enum import Enum
9
+ from pydantic import BaseModel
10
+ from rich.progress import (
11
+ Progress,
12
+ SpinnerColumn,
13
+ TimeElapsedColumn,
14
+ DownloadColumn,
15
+ TransferSpeedColumn,
16
+ BarColumn,
17
+ TextColumn,
18
+ )
19
+ from rich.console import Console
20
+ from rich.live import Live
21
+ from rich.table import Table
22
+ import download_channel
23
+
24
+ # Initialize rich console for pretty logging
25
+ console = Console()
26
+
27
+ app = FastAPI(title="Telegram Channel Downloader API")
28
+
29
+ # Track active downloads and their status
30
+ active_downloads: Dict[str, Dict[str, Any]] = {}
31
+
32
+ class FileStatus(str, Enum):
33
+ PENDING = "pending"
34
+ DOWNLOADING = "downloading"
35
+ DOWNLOADED = "downloaded"
36
+ FAILED = "failed"
37
+
38
+ class ChannelFile(BaseModel):
39
+ message_id: int
40
+ filename: str
41
+ status: FileStatus
42
+ size: Optional[int] = None
43
+ download_time: Optional[float] = None
44
+ error: Optional[str] = None
45
+ upload_path: Optional[str] = None
46
+
47
+ class DownloadState(BaseModel):
48
+ channel: str
49
+ last_scanned_id: Optional[int] = None
50
+ files: List[ChannelFile] = []
51
+ current_download: Optional[int] = None # message_id of current download
52
+ last_updated: float = time.time()
53
+
54
+ class DownloadRequest(BaseModel):
55
+ channel: Optional[str] = None
56
+ message_limit: Optional[int] = None
57
+
58
+ class DownloadStatus(BaseModel):
59
+ channel: str
60
+ status: str
61
+ message_count: int = 0
62
+ downloaded: int = 0
63
+ downloading: Optional[str] = None
64
+ error: Optional[str] = None
65
+
66
+ def create_hf_dataset(token: str) -> bool:
67
+ """Create the Hugging Face dataset if it doesn't exist."""
68
+ try:
69
+ from huggingface_hub import create_repo, RepoNotFoundError
70
+ try:
71
+ # Try to create the dataset repository
72
+ create_repo(
73
+ repo_id=download_channel.HF_REPO_ID,
74
+ token=token,
75
+ repo_type="dataset",
76
+ exist_ok=True
77
+ )
78
+ console.print(f"[green]Created or verified dataset:[/green] {download_channel.HF_REPO_ID}")
79
+
80
+ # Create initial state file
81
+ initial_state = DownloadState(channel=download_channel.CHANNEL)
82
+ with open(download_channel.STATE_FILE, "w", encoding="utf-8") as f:
83
+ json.dump(initial_state.dict(), f, indent=2, ensure_ascii=False)
84
+
85
+ # Upload initial state
86
+ if download_channel.upload_file_to_hf(
87
+ download_channel.STATE_FILE,
88
+ download_channel.STATE_FILE,
89
+ token
90
+ ):
91
+ console.print("[green]Initialized dataset with empty state file[/green]")
92
+ return True
93
+ except Exception as e:
94
+ console.print(f"[red]Failed to create dataset:[/red] {str(e)}")
95
+ return False
96
+ except ImportError:
97
+ console.print("[red]huggingface_hub not properly installed[/red]")
98
+ return False
99
+ return True
100
+
101
+ def download_state_from_hf(token: str) -> DownloadState:
102
+ """Try to download the state file from the HF dataset. Returns state dict or creates new."""
103
+ if not token:
104
+ return DownloadState(channel=download_channel.CHANNEL)
105
+ try:
106
+ # Try to download existing state
107
+ local_path = download_channel.hf_hub_download(
108
+ repo_id=download_channel.HF_REPO_ID,
109
+ filename=download_channel.STATE_FILE,
110
+ repo_type="dataset",
111
+ token=token
112
+ )
113
+ with open(local_path, "r", encoding="utf-8") as f:
114
+ data = json.load(f)
115
+ return DownloadState(**data)
116
+ except Exception as e:
117
+ console.print(f"[yellow]No existing state found, creating new dataset:[/yellow] {str(e)}")
118
+ if create_hf_dataset(token):
119
+ console.print("[green]Dataset created successfully![/green]")
120
+ return DownloadState(channel=download_channel.CHANNEL)
121
+ else:
122
+ console.print("[red]Failed to create dataset, using local state only[/red]")
123
+ return DownloadState(channel=download_channel.CHANNEL)
124
+
125
+ async def clean_downloaded_file(file_path: str):
126
+ """Remove local file after successful upload"""
127
+ try:
128
+ os.remove(file_path)
129
+ console.print(f"[blue]Cleaned up:[/blue] {os.path.basename(file_path)}")
130
+ except Exception as e:
131
+ console.print(f"[yellow]Warning:[/yellow] Could not clean up {file_path}: {e}")
132
+
133
+ async def update_and_upload_state(state: DownloadState, token: str) -> bool:
134
+ """Update state timestamp and upload to dataset"""
135
+ state.last_updated = time.time()
136
+ try:
137
+ # Save state locally first
138
+ with open(download_channel.STATE_FILE, "w", encoding="utf-8") as f:
139
+ json.dump(state.dict(), f, indent=2, ensure_ascii=False)
140
+ # Upload to dataset
141
+ return download_channel.upload_file_to_hf(
142
+ download_channel.STATE_FILE,
143
+ download_channel.STATE_FILE,
144
+ token
145
+ )
146
+ except Exception as e:
147
+ console.print(f"[red]Failed to update state:[/red] {e}")
148
+ return False
149
+
150
+ async def process_message(message, state: DownloadState, client) -> Optional[str]:
151
+ """Process a single message, return output path if file downloaded or None"""
152
+ if not message.media:
153
+ return None
154
+
155
+ # Check if it's a RAR file
156
+ is_rar = False
157
+ filename = ""
158
+ if message.file:
159
+ filename = getattr(message.file, 'name', '') or ''
160
+ if filename:
161
+ is_rar = filename.lower().endswith('.rar')
162
+ else:
163
+ mime_type = getattr(message.file, 'mime_type', '') or ''
164
+ is_rar = 'rar' in mime_type.lower() if mime_type else False
165
+
166
+ if not is_rar:
167
+ return None
168
+
169
+ # Use message ID and original filename for saved file
170
+ if filename:
171
+ suggested = f"{message.id}_{filename}"
172
+ else:
173
+ suggested = f"{message.id}.rar"
174
+
175
+ return os.path.join(download_channel.OUTPUT_DIR, suggested)
176
+
177
+ async def run_download(channel: Optional[str], message_limit: Optional[int], task_id: str):
178
+ """Background task to run the download with state management"""
179
+ try:
180
+ # Override channel if provided
181
+ if channel:
182
+ download_channel.CHANNEL = channel
183
+ if message_limit is not None:
184
+ download_channel.MESSAGE_LIMIT = message_limit
185
+
186
+ # Get or create download state
187
+ state = download_state_from_hf(download_channel.HF_TOKEN)
188
+
189
+ # Initialize status for API
190
+ status = {
191
+ "channel": state.channel,
192
+ "status": "running",
193
+ "message_count": len(state.files),
194
+ "downloaded": len([f for f in state.files if f.status == FileStatus.DOWNLOADED]),
195
+ "downloading": None,
196
+ "error": None
197
+ }
198
+ active_downloads[task_id] = status
199
+
200
+ # Create progress displays
201
+ progress = Progress(
202
+ SpinnerColumn(),
203
+ TextColumn("[bold blue]{task.fields[filename]}", justify="right"),
204
+ BarColumn(bar_width=40),
205
+ "[progress.percentage]{task.percentage:>3.1f}%",
206
+ "•",
207
+ DownloadColumn(),
208
+ "•",
209
+ TransferSpeedColumn(),
210
+ "•",
211
+ TimeElapsedColumn(),
212
+ )
213
+
214
+ overall_progress = Progress(
215
+ TextColumn("[bold yellow]{task.description}", justify="right"),
216
+ BarColumn(bar_width=40),
217
+ "[progress.percentage]{task.percentage:>3.1f}%",
218
+ "•",
219
+ TextColumn("[bold green]{task.fields[stats]}")
220
+ )
221
+
222
+ # Initialize client
223
+ client = download_channel.TelegramClient(
224
+ download_channel.SESSION_FILE,
225
+ download_channel.API_ID,
226
+ download_channel.API_HASH
227
+ )
228
+
229
+ async with client:
230
+ try:
231
+ entity = await client.get_entity(download_channel.CHANNEL)
232
+ except Exception as e:
233
+ console.print(f"[red]Failed to resolve channel:[/red] {e}")
234
+ return 1
235
+
236
+ console.print(f"[green]Starting download from:[/green] {entity.title if hasattr(entity, 'title') else download_channel.CHANNEL}")
237
+
238
+ # First, scan for new messages and update state
239
+ scan_count = 0
240
+ last_message_id = state.last_scanned_id
241
+
242
+ try:
243
+ async for message in client.iter_messages(entity, limit=download_channel.MESSAGE_LIMIT or None):
244
+ scan_count += 1
245
+
246
+ # Update last scanned ID
247
+ if last_message_id is None or message.id > last_message_id:
248
+ last_message_id = message.id
249
+
250
+ # Skip if we already know about this message
251
+ if any(f.message_id == message.id for f in state.files):
252
+ continue
253
+
254
+ # Check if it's a downloadable file
255
+ out_path = await process_message(message, state, client)
256
+ if out_path:
257
+ # Add to state as pending
258
+ file_info = ChannelFile(
259
+ message_id=message.id,
260
+ filename=os.path.basename(out_path),
261
+ status=FileStatus.PENDING,
262
+ size=getattr(message.media, 'size', 0) or 0
263
+ )
264
+ state.files.append(file_info)
265
+
266
+ # Update state with scan results
267
+ state.last_scanned_id = last_message_id
268
+ if download_channel.HF_TOKEN:
269
+ await update_and_upload_state(state, download_channel.HF_TOKEN)
270
+
271
+ console.print(f"[green]Channel scan complete:[/green] Found {scan_count} messages")
272
+
273
+ except Exception as e:
274
+ console.print(f"[red]Error during channel scan:[/red] {e}")
275
+
276
+ # Now process pending downloads
277
+ pending_files = [f for f in state.files if f.status == FileStatus.PENDING]
278
+ total_pending = len(pending_files)
279
+
280
+ if total_pending == 0:
281
+ console.print("[green]No new files to download![/green]")
282
+ return 0
283
+
284
+ console.print(f"[green]Starting downloads:[/green] {total_pending} files pending")
285
+
286
+ # Process pending files
287
+ with Live(progress) as live_progress, Live(overall_progress) as live_overall:
288
+ overall_task = overall_progress.add_task(
289
+ f"Channel: {download_channel.CHANNEL}",
290
+ total=total_pending,
291
+ stats=f"Pending: {total_pending}"
292
+ )
293
+
294
+ for file_info in pending_files:
295
+ try:
296
+ # Mark as downloading in state
297
+ file_info.status = FileStatus.DOWNLOADING
298
+ state.current_download = file_info.message_id
299
+ if download_channel.HF_TOKEN:
300
+ await update_and_upload_state(state, download_channel.HF_TOKEN)
301
+
302
+ # Update status
303
+ status["downloading"] = file_info.filename
304
+
305
+ # Get message and prepare download
306
+ message = await client.get_messages(entity, ids=file_info.message_id)
307
+ if not message or not message.media:
308
+ file_info.status = FileStatus.FAILED
309
+ file_info.error = "Message not found or no media"
310
+ continue
311
+
312
+ out_path = os.path.join(download_channel.OUTPUT_DIR, file_info.filename)
313
+ file_task = progress.add_task(
314
+ "download",
315
+ total=file_info.size or 100,
316
+ filename=file_info.filename
317
+ )
318
+
319
+ # Download with progress
320
+ start_time = time.time()
321
+ try:
322
+ async def progress_callback(current, total):
323
+ progress.update(file_task, completed=current)
324
+ overall_stats = f"Downloaded: {len([f for f in state.files if f.status == FileStatus.DOWNLOADED])}"
325
+ overall_progress.update(overall_task, completed=current/total*100, stats=overall_stats)
326
+
327
+ await client.download_media(
328
+ message,
329
+ file=out_path,
330
+ progress_callback=progress_callback
331
+ )
332
+
333
+ # Upload to HF
334
+ if download_channel.HF_TOKEN:
335
+ console.print(f"[yellow]Uploading to HF:[/yellow] {file_info.filename}")
336
+ path_in_repo = f"files/{file_info.filename}"
337
+ ok = download_channel.upload_file_to_hf(
338
+ out_path,
339
+ path_in_repo,
340
+ download_channel.HF_TOKEN
341
+ )
342
+ if ok:
343
+ console.print(f"[green]Uploaded:[/green] {file_info.filename}")
344
+ # Clean up local file
345
+ await clean_downloaded_file(out_path)
346
+ file_info.upload_path = path_in_repo
347
+ else:
348
+ console.print(f"[red]Upload failed:[/red] {file_info.filename}")
349
+ file_info.error = "Upload to dataset failed"
350
+ file_info.status = FileStatus.FAILED
351
+ continue
352
+
353
+ # Mark as completed in state
354
+ file_info.status = FileStatus.DOWNLOADED
355
+ file_info.download_time = time.time() - start_time
356
+
357
+ # Update state
358
+ if download_channel.HF_TOKEN:
359
+ await update_and_upload_state(state, download_channel.HF_TOKEN)
360
+
361
+ # Update status
362
+ status["downloaded"] += 1
363
+ await asyncio.sleep(0.2) # Be polite
364
+
365
+ except download_channel.errors.FloodWaitError as fw:
366
+ wait = int(fw.seconds) if fw.seconds else 60
367
+ console.print(f"[yellow]FloodWait:[/yellow] Sleeping {wait}s")
368
+ await asyncio.sleep(wait + 1)
369
+ # Retry this file
370
+ continue
371
+
372
+ except Exception as e:
373
+ console.print(f"[red]Error:[/red] {str(e)}")
374
+ file_info.status = FileStatus.FAILED
375
+ file_info.error = str(e)
376
+ if download_channel.HF_TOKEN:
377
+ await update_and_upload_state(state, download_channel.HF_TOKEN)
378
+
379
+ except Exception as e:
380
+ console.print(f"[red]Fatal error processing {file_info.filename}:[/red] {str(e)}")
381
+ continue
382
+
383
+ # Clear current download
384
+ state.current_download = None
385
+ if download_channel.HF_TOKEN:
386
+ await update_and_upload_state(state, download_channel.HF_TOKEN)
387
+
388
+ console.print("[green]Download session completed![/green]")
389
+ status["status"] = "completed"
390
+ status["downloading"] = None
391
+
392
+ except Exception as e:
393
+ console.print(f"[red]Fatal error:[/red] {str(e)}")
394
+ if "status" in locals():
395
+ status["status"] = "failed"
396
+ status["error"] = str(e)
397
+
398
+ return 0
399
+
400
+ @app.on_event("startup")
401
+ async def start_initial_download():
402
+ """Start the download process automatically when the server starts"""
403
+ task_id = "initial_download"
404
+
405
+ # Verify HF token is set
406
+ if not download_channel.HF_TOKEN:
407
+ console.print("[red]ERROR: HF_TOKEN not set. Please set your Hugging Face token.[/red]")
408
+ return
409
+
410
+ # Create dataset structure if needed
411
+ console.print("[yellow]Checking Hugging Face dataset...[/yellow]")
412
+ try:
413
+ state = download_state_from_hf(download_channel.HF_TOKEN)
414
+ console.print(f"[green]Using channel:[/green] {state.channel}")
415
+
416
+ # Create files directory in dataset if it doesn't exist
417
+ os.makedirs(download_channel.OUTPUT_DIR, exist_ok=True)
418
+
419
+ # Start the download process with default settings
420
+ asyncio.create_task(run_download(
421
+ channel=None, # Use default from download_channel.py
422
+ message_limit=None, # Use default
423
+ task_id=task_id
424
+ ))
425
+ console.print(f"[green]Started initial download task:[/green] {task_id}")
426
+
427
+ except Exception as e:
428
+ console.print(f"[red]Failed to initialize:[/red] {str(e)}")
429
+
430
+ @app.post("/download", response_model=Dict[str, str])
431
+ async def start_download(request: DownloadRequest, background_tasks: BackgroundTasks):
432
+ """Start a new download task"""
433
+ task_id = f"download_{len(active_downloads) + 1}"
434
+
435
+ background_tasks.add_task(
436
+ run_download,
437
+ channel=request.channel,
438
+ message_limit=request.message_limit,
439
+ task_id=task_id
440
+ )
441
+
442
+ return {"task_id": task_id}
443
+
444
+ @app.get("/status/{task_id}", response_model=DownloadStatus)
445
+ async def get_status(task_id: str):
446
+ """Get the status of a download task"""
447
+ if task_id not in active_downloads:
448
+ raise HTTPException(status_code=404, detail="Task not found")
449
+ return active_downloads[task_id]
450
+
451
+ @app.get("/active", response_model=Dict[str, DownloadStatus])
452
+ async def list_active():
453
+ """List all active or completed downloads"""
454
+ return active_downloads
455
+
456
+ if __name__ == "__main__":
457
+ import uvicorn
458
+ uvicorn.run(app, host="127.0.0.1", port=8000)
download_channel.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import json
6
+ import os
7
+ from telethon import TelegramClient, errors
8
+
9
+ # def load_state():
10
+ # if os.path.exists(STATE_FILE):
11
+ # try:
12
+ # with open(STATE_FILE, 'r') as f:
13
+ # return json.load(f)
14
+ # except:
15
+ # return {"downloaded_ids": []}
16
+ # return {"downloaded_ids": []}
17
+
18
+ # def save_state(state):
19
+ # with open(STATE_FILE, 'w') as f:
20
+ # json.dump(state, f, indent=2)
21
+
22
+ # async def download_channel():
23
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
24
+
25
+ # # Load previous download state
26
+ # state = load_state()
27
+ # downloaded_ids = set(state["downloaded_ids"])
28
+
29
+ # # Initialize client with your session
30
+ # client = TelegramClient(SESSION_FILE, API_ID, API_HASH)
31
+ import asyncio
32
+ import json
33
+ import os
34
+ from typing import Dict, Any
35
+
36
+ from telethon import TelegramClient, errors
37
+ from huggingface_hub import HfApi, hf_hub_download
38
+
39
+ # Configuration - Edit these variables
40
+ CHANNEL = "cgsvalka" # Channel username or ID to download from
41
+ SESSION_FILE = "my_session.session" # Your existing session file
42
+ OUTPUT_DIR = "downloads" # Where to save downloaded files
43
+ API_ID = 28708692 # Your Telegram API ID
44
+ API_HASH = "72fa6a22c65d7a58e00f2ccb8d60841d" # Your Telegram API Hash
45
+ MESSAGE_LIMIT = 0 # 0 = download all messages, or set a number for testing
46
+ STATE_FILE = "download_state.json" # Where to save download progress
47
+
48
+ # Hugging Face dataset repo where files and state will be uploaded
49
+ # Read the HF token from environment for safety. Set HF_TOKEN env var before running.
50
+ HF_TOKEN = os.environ.get("HF_TOKEN", "") # Hugging Face token with write permission (empty = disabled)
51
+ HF_REPO_ID = "samfred2/TGFiles"
52
+ STATE_FILE = "download_state.json" # Local filename for state
53
+
54
+
55
+ def load_local_state() -> Dict[str, Any]:
56
+ if os.path.exists(STATE_FILE):
57
+ try:
58
+ with open(STATE_FILE, "r", encoding="utf-8") as f:
59
+ return json.load(f)
60
+ except Exception:
61
+ return {"downloaded_files": []}
62
+ return {"downloaded_files": []}
63
+
64
+
65
+ def save_local_state(state: Dict[str, Any]) -> None:
66
+ with open(STATE_FILE, "w", encoding="utf-8") as f:
67
+ json.dump(state, f, indent=2, ensure_ascii=False)
68
+
69
+
70
+ def download_state_from_hf(token: str) -> Dict[str, Any]:
71
+ """Try to download the state file from the HF dataset. Returns state dict or empty state."""
72
+ if not token:
73
+ return {"downloaded_files": []}
74
+ try:
75
+ # hf_hub_download will raise if file doesn't exist
76
+ local_path = hf_hub_download(repo_id=HF_REPO_ID, filename=STATE_FILE, repo_type="dataset", token=token)
77
+ with open(local_path, "r", encoding="utf-8") as f:
78
+ return json.load(f)
79
+ except Exception:
80
+ return {"downloaded_files": []}
81
+
82
+
83
+ def upload_file_to_hf(local_path: str, path_in_repo: str, token: str) -> bool:
84
+ """Upload a single file to the HF dataset repo. Returns True on success."""
85
+ if not token:
86
+ return False
87
+ try:
88
+ api = HfApi()
89
+ api.upload_file(path_or_fileobj=local_path, path_in_repo=path_in_repo, repo_id=HF_REPO_ID, repo_type="dataset", token=token)
90
+ return True
91
+ except Exception as e:
92
+ print(f"Failed to upload {local_path} to HF: {e}")
93
+ return False
94
+
95
+
96
+ def upload_state_to_hf(state: Dict[str, Any], token: str) -> bool:
97
+ # write temp state file and upload
98
+ save_local_state(state)
99
+ return upload_file_to_hf(STATE_FILE, STATE_FILE, token)
100
+
101
+
102
+ async def download_channel():
103
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
104
+
105
+ # Try to download remote state first (if token provided), then merge with local state
106
+ remote_state = download_state_from_hf(HF_TOKEN) if HF_TOKEN else {"downloaded_files": []}
107
+ local_state = load_local_state()
108
+
109
+ # Merge: prefer remote entries, then local missing ones
110
+ downloaded_files = { (e.get("message_id"), e.get("filename")) for e in remote_state.get("downloaded_files", []) }
111
+ for e in local_state.get("downloaded_files", []):
112
+ downloaded_files.add((e.get("message_id"), e.get("filename")))
113
+
114
+ # Recreate ordered list
115
+ downloaded_list = [ {"message_id": mid, "filename": fname} for (mid, fname) in downloaded_files if mid is not None ]
116
+
117
+ state: Dict[str, Any] = {"downloaded_files": downloaded_list}
118
+
119
+ # Build quick lookup set of message ids to skip
120
+ downloaded_ids = {entry["message_id"] for entry in state["downloaded_files"]}
121
+
122
+ # Initialize client with your session
123
+ client = TelegramClient(SESSION_FILE, API_ID, API_HASH)
124
+
125
+ async with client:
126
+ try:
127
+ entity = await client.get_entity(CHANNEL)
128
+ except Exception as e:
129
+ print(f"Failed to resolve channel '{CHANNEL}': {e}")
130
+ return 1
131
+
132
+ print(f"Starting download from: {entity.title if hasattr(entity, 'title') else CHANNEL}")
133
+
134
+ count = 0
135
+ downloaded = 0
136
+ skipped = 0
137
+ not_rar = 0
138
+
139
+ try:
140
+ async for message in client.iter_messages(entity, limit=MESSAGE_LIMIT or None):
141
+ count += 1
142
+
143
+ # Skip if already downloaded according to state
144
+ if message.id in downloaded_ids:
145
+ skipped += 1
146
+ continue
147
+
148
+ if not message.media:
149
+ continue
150
+
151
+ # Check if it's a RAR file
152
+ is_rar = False
153
+ filename = ""
154
+ if message.file:
155
+ filename = getattr(message.file, 'name', '') or ''
156
+ if filename:
157
+ is_rar = filename.lower().endswith('.rar')
158
+ else:
159
+ mime_type = getattr(message.file, 'mime_type', '') or ''
160
+ is_rar = 'rar' in mime_type.lower() if mime_type else False
161
+
162
+ if not is_rar:
163
+ not_rar += 1
164
+ continue
165
+
166
+ # Use message ID and original filename for saved file
167
+ if filename:
168
+ suggested = f"{message.id}_{filename}"
169
+ else:
170
+ suggested = f"{message.id}.rar"
171
+
172
+ out_path = os.path.join(OUTPUT_DIR, suggested)
173
+
174
+ # Download the RAR file
175
+ try:
176
+ print(f"[{count}] downloading -> {os.path.basename(out_path)}")
177
+ await client.download_media(message, file=out_path)
178
+ downloaded += 1
179
+
180
+ # Upload the RAR file to the HF dataset (path files/<basename>)
181
+ if HF_TOKEN:
182
+ path_in_repo = f"files/{os.path.basename(out_path)}"
183
+ ok = upload_file_to_hf(out_path, path_in_repo, HF_TOKEN)
184
+ if not ok:
185
+ print(f"Warning: failed to upload {out_path} to HF repo {HF_REPO_ID}")
186
+
187
+ # Update state after successful download (and attempted upload)
188
+ state["downloaded_files"].append({"message_id": message.id, "filename": os.path.basename(out_path)})
189
+ downloaded_ids.add(message.id)
190
+ save_local_state(state)
191
+
192
+ # Upload updated state to HF
193
+ if HF_TOKEN:
194
+ upload_state_to_hf(state, HF_TOKEN)
195
+
196
+ # Be polite to the server
197
+ await asyncio.sleep(0.2)
198
+ except errors.FloodWaitError as fw:
199
+ wait = int(fw.seconds) if fw.seconds else 60
200
+ print(f"Hit FloodWait: sleeping {wait}s")
201
+ await asyncio.sleep(wait + 1)
202
+ except KeyboardInterrupt:
203
+ print("Interrupted by user; saving state and exiting.")
204
+ save_local_state(state)
205
+ if HF_TOKEN:
206
+ upload_state_to_hf(state, HF_TOKEN)
207
+ break
208
+ except Exception as e:
209
+ print(f"Error while downloading message {message.id}: {e}")
210
+
211
+ except KeyboardInterrupt:
212
+ print("Interrupted by user; saving final state.")
213
+ save_local_state(state)
214
+ if HF_TOKEN:
215
+ upload_state_to_hf(state, HF_TOKEN)
216
+
217
+ print(f"\nFinal Statistics:")
218
+ print(f"Messages scanned: {count}")
219
+ print(f"RAR files downloaded: {downloaded}")
220
+ print(f"Already downloaded (skipped): {skipped}")
221
+ print(f"Non-RAR files skipped: {not_rar}")
222
+ print(f"\nDownload state saved to: {STATE_FILE}")
223
+ return 0
224
+
225
+
226
+ if __name__ == "__main__":
227
+ asyncio.run(download_channel())
my_session2.session ADDED
Binary file (28.7 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ telethon>=1.36.0
2
+ huggingface_hub>=0.17.0
3
+ fastapi>=0.104.0
4
+ uvicorn>=0.24.0
5
+ pydantic>=2.4.2
6
+ rich>=13.6.0