wuhp committed on
Commit
29f8cf0
·
verified ·
1 Parent(s): e1ab87a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -64
app.py CHANGED
@@ -1,23 +1,21 @@
1
  import os
2
  import json
3
- import time
4
  import hashlib
5
  import pathlib
6
  from typing import List, Tuple, Optional, Dict
7
- from urllib.parse import urlparse
8
 
9
  import requests
10
  import gradio as gr
11
  import bencodepy
12
  import py7zr
13
 
14
- # -------------------------
15
- # Small helpers
16
- # -------------------------
17
 
18
  def human_bytes(n: int) -> str:
19
  f = float(n)
20
- for unit in ["B","KiB","MiB","GiB","TiB","PiB"]:
21
  if f < 1024.0:
22
  return f"{f:.2f} {unit}"
23
  f /= 1024.0
@@ -32,26 +30,32 @@ def fetch_bytes(url: str, timeout: int = 45) -> bytes:
32
  return r.content
33
 
34
  def parse_torrent(raw: bytes) -> Dict:
 
 
 
 
 
 
 
 
 
35
  data = bencodepy.decode(raw)
36
  if not isinstance(data, dict) or b"info" not in data:
37
  raise ValueError("Invalid .torrent (missing 'info').")
38
  info = data[b"info"]
39
- info_bencoded = bencodepy.encode(info)
40
- infohash_v1 = hashlib.sha1(info_bencoded).hexdigest()
41
 
42
- # name
43
  name = info.get(b"name")
44
  if isinstance(name, (bytes, bytearray)):
45
  name = name.decode("utf-8", errors="replace")
46
 
47
- # files
48
  files = []
49
  if b"files" in info:
50
  for f in info[b"files"]:
51
  length = int(f.get(b"length", 0))
52
  parts = []
53
  for pe in f.get(b"path", []):
54
- parts.append((pe.decode("utf-8", "replace")) if isinstance(pe,(bytes,bytearray)) else str(pe))
55
  rel = "/".join(parts) if parts else "(unknown)"
56
  files.append({"path": rel, "length": length})
57
  else:
@@ -59,7 +63,6 @@ def parse_torrent(raw: bytes) -> Dict:
59
  rel = name or "(unnamed)"
60
  files.append({"path": rel, "length": length})
61
 
62
- # BEP-19 web seeds
63
  web_seeds = []
64
  if b"url-list" in data:
65
  v = data[b"url-list"]
@@ -74,7 +77,7 @@ def parse_torrent(raw: bytes) -> Dict:
74
  "infohash": infohash_v1,
75
  "name": name or "(unknown)",
76
  "files": files,
77
- "web_seeds": [s.rstrip("/") for s in web_seeds if isinstance(s,str) and s.strip()],
78
  }
79
 
80
  def join_url(base: str, *segs: str) -> str:
@@ -85,7 +88,7 @@ def join_url(base: str, *segs: str) -> str:
85
  return "/".join(parts)
86
 
87
  def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
88
- # Try HEAD
89
  try:
90
  r = requests.head(url, timeout=timeout, allow_redirects=True)
91
  if r.status_code < 400:
@@ -93,7 +96,7 @@ def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
93
  return True, (int(size) if size and size.isdigit() else None)
94
  except Exception:
95
  pass
96
- # Fallback tiny GET (first chunk)
97
  try:
98
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
99
  if r.status_code < 400:
@@ -102,7 +105,10 @@ def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
102
  next(r.iter_content(chunk_size=1024))
103
  except Exception:
104
  pass
105
- r.close()
 
 
 
106
  return True, (int(size) if size and size.isdigit() else None)
107
  except Exception:
108
  pass
@@ -112,39 +118,74 @@ def supports_range_and_size(url: str, timeout: int = 30) -> Tuple[bool, Optional
112
  try:
113
  r = requests.head(url, timeout=timeout, allow_redirects=True)
114
  if r.status_code < 400:
115
- size = int(r.headers.get("Content-Length","0") or 0)
116
- return (("bytes" in r.headers.get("Accept-Ranges","").lower()) or size>0, size if size>0 else None)
117
  except Exception:
118
  pass
119
  try:
120
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
121
  r.raise_for_status()
122
- size = int(r.headers.get("Content-Length","0") or 0)
123
- try: r.close()
124
- except: pass
125
- return ("bytes" in r.headers.get("Accept-Ranges","").lower() or size>0, size if size>0 else None)
 
 
126
  except Exception:
127
  return False, None
128
 
129
- def download_with_resume(url: str, dest_path: pathlib.Path, timeout: int = 120):
 
 
 
 
 
130
  dest_path.parent.mkdir(parents=True, exist_ok=True)
131
- tmp = dest_path.with_suffix(dest_path.suffix + ".part")
132
- existing = tmp.stat().st_size if tmp.exists() else 0
133
- can_range, total = supports_range_and_size(url)
134
- headers = {"Range": f"bytes={existing}-"} if (can_range and existing>0) else {}
135
- mode = "ab" if headers else "wb"
136
 
137
- with requests.get(url, stream=True, timeout=timeout, headers=headers) as r:
138
- r.raise_for_status()
139
- with open(tmp, mode) as f:
140
- for chunk in r.iter_content(chunk_size=1024*1024):
141
- if chunk:
142
- f.write(chunk)
 
 
 
 
 
 
 
143
 
144
- final_size = tmp.stat().st_size
145
- if (total is None) or (final_size >= (total or 0)):
 
 
 
 
 
 
 
 
 
 
146
  tmp.rename(dest_path)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  def list_files_recursive(root: pathlib.Path) -> List[str]:
149
  out = []
150
  for p in root.rglob("*"):
@@ -168,8 +209,6 @@ def preview_path(path_str: str, max_bytes: int = 250_000) -> Tuple[str, Optional
168
  except Exception as e:
169
  return f"Error previewing file: {type(e).__name__}: {e}", None
170
 
171
- # ---------- NEW: base inference when no web seeds ----------
172
-
173
  def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
174
  """
175
  For URLs like:
@@ -188,48 +227,52 @@ def resolve_download_url(bases: List[str], root_name: str, rel_path: str) -> Opt
188
  Try both:
189
  base/root_name/rel_path
190
  base/rel_path
191
- Return the first that exists.
192
  """
193
  candidates = []
194
  for b in bases:
195
  candidates.append(join_url(b, root_name, rel_path))
196
  candidates.append(join_url(b, rel_path))
197
- tried = []
198
  for c in candidates:
199
  ok, _ = _head_or_peek(c)
200
- tried.append((c, ok))
201
  if ok:
202
  return c
203
  return None
204
 
205
- # -------------------------
206
- # The single action
207
- # -------------------------
 
 
 
 
 
 
 
 
208
 
209
  def run_pipeline(torrent_url: str):
210
  if not torrent_url.strip().lower().endswith(".torrent"):
211
  raise gr.Error("Please provide a direct .torrent URL.")
212
 
213
- # Parse torrent
214
  raw = fetch_bytes(torrent_url.strip())
215
  meta = parse_torrent(raw)
216
 
217
- # seed list: web seeds if present, else infer from torrent URL folder (DDoSecrets-friendly)
218
- seeds = list(meta["web_seeds"])
219
- if not seeds:
220
- seeds = infer_bases_from_torrent_url(torrent_url)
221
 
222
  infohash = meta["infohash"]
223
  root_name = meta["name"]
224
 
225
- # We expect .7z payloads
226
  sevenz_files = [f for f in meta["files"] if f["path"].lower().endswith(".7z")]
227
  if not sevenz_files:
228
  raise gr.Error("No .7z files listed in the torrent.")
229
 
230
  if not seeds:
231
- raise gr.Error("No HTTP source found. Tried to infer base from the .torrent URL but failed. "
232
- "If this is DDoSecrets, host likely at the same folder as the torrent.")
233
 
234
  # Work dirs
235
  base_dir = pathlib.Path("/mnt/data/work") / infohash
@@ -241,10 +284,12 @@ def run_pipeline(torrent_url: str):
241
  logs = []
242
  saved_archives = []
243
 
244
- # Download each .7z over HTTP
 
 
 
245
  for f in sevenz_files:
246
  rel = f["path"]
247
- # resolve against any seed/base
248
  final_url = None
249
  for seed in seeds:
250
  final_url = resolve_download_url([seed], root_name, rel)
@@ -252,15 +297,26 @@ def run_pipeline(torrent_url: str):
252
  break
253
  if not final_url:
254
  raise gr.Error(f"Could not resolve an HTTP URL for {rel} from bases {seeds}.")
 
255
  dest = dl_dir / rel
 
 
256
  logs.append(f"Downloading: {final_url}")
257
- download_with_resume(final_url, dest)
258
  if not dest.exists():
259
  raise gr.Error(f"Download failed: {final_url}")
260
  logs.append(f"Saved: {dest} ({human_bytes(dest.stat().st_size)})")
 
 
 
 
 
 
 
 
261
  saved_archives.append(str(dest))
262
 
263
- # Extract all .7z archives
264
  for apath in saved_archives:
265
  logs.append(f"Extracting: {apath}")
266
  with py7zr.SevenZipFile(apath, mode="r") as z:
@@ -281,20 +337,18 @@ def do_preview(path: str):
281
  md, _ = preview_path(path)
282
  return md
283
 
284
- # -------------------------
285
  # UI
286
- # -------------------------
287
 
288
  with gr.Blocks(title="Torrent → 7z → View (HTTP only)") as demo:
289
  gr.Markdown(
290
  """
291
  # Torrent → 7z → View (HTTP only)
292
- Paste a **.torrent URL**.
293
- If it has web seeds, great. If not, we'll auto-guess the HTTPS folder from the URL (works for DDoSecrets layouts).
294
- The app downloads `.7z` file(s), extracts them, and lets you preview text/csv/json.
295
  """
296
  )
297
-
298
  url_in = gr.Textbox(label=".torrent URL", placeholder="https://data.ddosecrets.com/Collection/Collection.torrent")
299
  go_btn = gr.Button("Download, Extract & List")
300
  log_out = gr.Markdown()
@@ -311,12 +365,11 @@ The app downloads `.7z` file(s), extracts them, and lets you preview text/csv/js
311
  )
312
 
313
  go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, files_dd])
314
-
315
  preview_btn.click(fn=do_preview, inputs=[files_dd], outputs=[preview_md])
316
 
317
  if __name__ == "__main__":
318
  demo.launch(
319
  server_name="0.0.0.0",
320
  server_port=int(os.environ.get("PORT", 7860)),
321
- allowed_paths=["/mnt/data"]
322
  )
 
1
  import os
2
  import json
 
3
  import hashlib
4
  import pathlib
5
  from typing import List, Tuple, Optional, Dict
 
6
 
7
  import requests
8
  import gradio as gr
9
  import bencodepy
10
  import py7zr
11
 
12
+ # =========================
13
+ # Helpers
14
+ # =========================
15
 
16
  def human_bytes(n: int) -> str:
17
  f = float(n)
18
+ for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
19
  if f < 1024.0:
20
  return f"{f:.2f} {unit}"
21
  f /= 1024.0
 
30
  return r.content
31
 
32
  def parse_torrent(raw: bytes) -> Dict:
33
+ """
34
+ Return:
35
+ {
36
+ "infohash": str,
37
+ "name": str,
38
+ "files": [{"path": str, "length": int}, ...],
39
+ "web_seeds": [str, ...]
40
+ }
41
+ """
42
  data = bencodepy.decode(raw)
43
  if not isinstance(data, dict) or b"info" not in data:
44
  raise ValueError("Invalid .torrent (missing 'info').")
45
  info = data[b"info"]
46
+ infohash_v1 = hashlib.sha1(bencodepy.encode(info)).hexdigest()
 
47
 
 
48
  name = info.get(b"name")
49
  if isinstance(name, (bytes, bytearray)):
50
  name = name.decode("utf-8", errors="replace")
51
 
 
52
  files = []
53
  if b"files" in info:
54
  for f in info[b"files"]:
55
  length = int(f.get(b"length", 0))
56
  parts = []
57
  for pe in f.get(b"path", []):
58
+ parts.append(pe.decode("utf-8", "replace") if isinstance(pe, (bytes, bytearray)) else str(pe))
59
  rel = "/".join(parts) if parts else "(unknown)"
60
  files.append({"path": rel, "length": length})
61
  else:
 
63
  rel = name or "(unnamed)"
64
  files.append({"path": rel, "length": length})
65
 
 
66
  web_seeds = []
67
  if b"url-list" in data:
68
  v = data[b"url-list"]
 
77
  "infohash": infohash_v1,
78
  "name": name or "(unknown)",
79
  "files": files,
80
+ "web_seeds": [s.rstrip("/") for s in web_seeds if isinstance(s, str) and s.strip()],
81
  }
82
 
83
  def join_url(base: str, *segs: str) -> str:
 
88
  return "/".join(parts)
89
 
90
  def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
91
+ # HEAD
92
  try:
93
  r = requests.head(url, timeout=timeout, allow_redirects=True)
94
  if r.status_code < 400:
 
96
  return True, (int(size) if size and size.isdigit() else None)
97
  except Exception:
98
  pass
99
+ # Tiny GET
100
  try:
101
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
102
  if r.status_code < 400:
 
105
  next(r.iter_content(chunk_size=1024))
106
  except Exception:
107
  pass
108
+ try:
109
+ r.close()
110
+ except Exception:
111
+ pass
112
  return True, (int(size) if size and size.isdigit() else None)
113
  except Exception:
114
  pass
 
118
  try:
119
  r = requests.head(url, timeout=timeout, allow_redirects=True)
120
  if r.status_code < 400:
121
+ size = int(r.headers.get("Content-Length", "0") or 0)
122
+ return (("bytes" in r.headers.get("Accept-Ranges", "").lower()) or size > 0, size if size > 0 else None)
123
  except Exception:
124
  pass
125
  try:
126
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
127
  r.raise_for_status()
128
+ size = int(r.headers.get("Content-Length", "0") or 0)
129
+ try:
130
+ r.close()
131
+ except Exception:
132
+ pass
133
+ return ("bytes" in r.headers.get("Accept-Ranges", "").lower() or size > 0, size if size > 0 else None)
134
  except Exception:
135
  return False, None
136
 
137
def download_file_exact(url: str, dest_path: pathlib.Path, expected_size: Optional[int],
                        timeout: int = 120, max_attempts: int = 2):
    """Download *url* to *dest_path*, enforcing *expected_size* when known.

    Strategy:
      * attempt 1: resume into a ``.part`` file when the server supports
        byte ranges and partial data already exists;
      * later attempts: discard partial data and re-download from scratch.

    Raises:
        gr.Error: if every attempt yields a file whose size differs from
            ``expected_size`` (only checked when ``expected_size`` is known).
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest_path.with_suffix(dest_path.suffix + ".part")

    def _stream_to(fileobj, response):
        # 1 MiB chunks keep memory flat for multi-GB archives.
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                fileobj.write(chunk)

    def _resume_once():
        existing = tmp.stat().st_size if tmp.exists() else 0
        can_range, _ = supports_range_and_size(url)
        headers = {"Range": f"bytes={existing}-"} if (can_range and existing > 0) else {}
        with requests.get(url, stream=True, timeout=timeout, headers=headers) as r:
            r.raise_for_status()
            # BUG FIX: a server may ignore the Range header and answer 200
            # with the FULL body.  Appending that onto the partial file would
            # corrupt it, so only append when the server actually honoured
            # the range request (206 Partial Content); otherwise rewrite.
            resumed = bool(headers) and r.status_code == 206
            with open(tmp, "ab" if resumed else "wb") as f:
                _stream_to(f, r)
        tmp.rename(dest_path)

    def _fresh_once():
        # Drop any partial/previous data and re-download from byte 0.
        if tmp.exists():
            tmp.unlink()
        if dest_path.exists():
            dest_path.unlink()
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                _stream_to(f, r)
        tmp.rename(dest_path)

    for attempt in range(1, max_attempts + 1):
        if attempt == 1:
            _resume_once()
        else:
            _fresh_once()

        if expected_size is None:
            return  # no size in the torrent metadata -> nothing to verify
        if dest_path.exists() and dest_path.stat().st_size == expected_size:
            return

    got = dest_path.stat().st_size if dest_path.exists() else 0
    raise gr.Error(f"Downloaded size mismatch for {dest_path.name}: got {got} bytes, expected {expected_size}.")
188
+
189
  def list_files_recursive(root: pathlib.Path) -> List[str]:
190
  out = []
191
  for p in root.rglob("*"):
 
209
  except Exception as e:
210
  return f"Error previewing file: {type(e).__name__}: {e}", None
211
 
 
 
212
  def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
213
  """
214
  For URLs like:
 
227
  Try both:
228
  base/root_name/rel_path
229
  base/rel_path
230
+ Return the first that responds.
231
  """
232
  candidates = []
233
  for b in bases:
234
  candidates.append(join_url(b, root_name, rel_path))
235
  candidates.append(join_url(b, rel_path))
 
236
  for c in candidates:
237
  ok, _ = _head_or_peek(c)
 
238
  if ok:
239
  return c
240
  return None
241
 
242
def test_7z_integrity(archive_path: str) -> bool:
    """Return True iff the 7z archive passes py7zr's integrity test.

    ``SevenZipFile.test()`` raises on structural errors but can also report
    a CRC mismatch through a falsy return value, so check both failure
    channels instead of discarding the return value.
    """
    try:
        with py7zr.SevenZipFile(archive_path, mode="r") as z:
            # BUG FIX: the old code ignored test()'s result and returned
            # True for any archive that merely opened without raising.
            return z.test() is not False
    except Exception:
        return False
249
+
250
+ # =========================
251
+ # Pipeline
252
+ # =========================
253
 
254
  def run_pipeline(torrent_url: str):
255
  if not torrent_url.strip().lower().endswith(".torrent"):
256
  raise gr.Error("Please provide a direct .torrent URL.")
257
 
258
+ # Parse torrent metadata
259
  raw = fetch_bytes(torrent_url.strip())
260
  meta = parse_torrent(raw)
261
 
262
+ # Seeds: prefer BEP-19 web seeds, else infer from torrent URL folder (DDoSecrets-friendly)
263
+ seeds = list(meta["web_seeds"]) or infer_bases_from_torrent_url(torrent_url)
 
 
264
 
265
  infohash = meta["infohash"]
266
  root_name = meta["name"]
267
 
268
+ # Expect .7z payloads
269
  sevenz_files = [f for f in meta["files"] if f["path"].lower().endswith(".7z")]
270
  if not sevenz_files:
271
  raise gr.Error("No .7z files listed in the torrent.")
272
 
273
  if not seeds:
274
+ raise gr.Error("No HTTP source found to fetch files. "
275
+ "If this is DDoSecrets, ensure the .torrent sits with the files over HTTPS.")
276
 
277
  # Work dirs
278
  base_dir = pathlib.Path("/mnt/data/work") / infohash
 
284
  logs = []
285
  saved_archives = []
286
 
287
+ # Expected sizes from torrent metadata
288
+ expected_map = {f["path"]: int(f.get("length", 0)) for f in meta["files"]}
289
+
290
+ # Download each .7z over HTTP with verification and retry
291
  for f in sevenz_files:
292
  rel = f["path"]
 
293
  final_url = None
294
  for seed in seeds:
295
  final_url = resolve_download_url([seed], root_name, rel)
 
297
  break
298
  if not final_url:
299
  raise gr.Error(f"Could not resolve an HTTP URL for {rel} from bases {seeds}.")
300
+
301
  dest = dl_dir / rel
302
+ expected_size = expected_map.get(rel) or None
303
+
304
  logs.append(f"Downloading: {final_url}")
305
+ download_file_exact(final_url, dest, expected_size)
306
  if not dest.exists():
307
  raise gr.Error(f"Download failed: {final_url}")
308
  logs.append(f"Saved: {dest} ({human_bytes(dest.stat().st_size)})")
309
+
310
+ # Integrity test; if fails, re-fetch once fresh (handled inside download_file_exact via attempts)
311
+ if not test_7z_integrity(str(dest)):
312
+ logs.append(f"CRC test failed for {dest.name}, retrying download fresh…")
313
+ download_file_exact(final_url, dest, expected_size, max_attempts=2)
314
+ if not test_7z_integrity(str(dest)):
315
+ raise gr.Error(f"Archive still fails CRC after re-download: {dest.name}")
316
+
317
  saved_archives.append(str(dest))
318
 
319
+ # Extract all .7z archives (after passing CRC test)
320
  for apath in saved_archives:
321
  logs.append(f"Extracting: {apath}")
322
  with py7zr.SevenZipFile(apath, mode="r") as z:
 
337
  md, _ = preview_path(path)
338
  return md
339
 
340
+ # =========================
341
  # UI
342
+ # =========================
343
 
344
  with gr.Blocks(title="Torrent → 7z → View (HTTP only)") as demo:
345
  gr.Markdown(
346
  """
347
  # Torrent → 7z → View (HTTP only)
348
+ Paste a **.torrent URL** (with web seeds or DDoSecrets-style layout).
349
+ The app downloads `.7z` file(s), verifies size & CRC, extracts them, and lets you preview text/csv/json.
 
350
  """
351
  )
 
352
  url_in = gr.Textbox(label=".torrent URL", placeholder="https://data.ddosecrets.com/Collection/Collection.torrent")
353
  go_btn = gr.Button("Download, Extract & List")
354
  log_out = gr.Markdown()
 
365
  )
366
 
367
  go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, files_dd])
 
368
  preview_btn.click(fn=do_preview, inputs=[files_dd], outputs=[preview_md])
369
 
370
  if __name__ == "__main__":
371
  demo.launch(
372
  server_name="0.0.0.0",
373
  server_port=int(os.environ.get("PORT", 7860)),
374
+ allowed_paths=["/mnt/data"] # allow returning files from /mnt/data if needed
375
  )