wuhp committed on
Commit
29f8cf0
·
verified ·
1 Parent(s): e1ab87a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -64
app.py CHANGED
@@ -1,23 +1,21 @@
1
  import os
2
  import json
3
- import time
4
  import hashlib
5
  import pathlib
6
  from typing import List, Tuple, Optional, Dict
7
- from urllib.parse import urlparse
8
 
9
  import requests
10
  import gradio as gr
11
  import bencodepy
12
  import py7zr
13
 
14
- # -------------------------
15
- # Small helpers
16
- # -------------------------
17
 
18
  def human_bytes(n: int) -> str:
19
  f = float(n)
20
- for unit in ["B","KiB","MiB","GiB","TiB","PiB"]:
21
  if f < 1024.0:
22
  return f"{f:.2f} {unit}"
23
  f /= 1024.0
@@ -32,26 +30,32 @@ def fetch_bytes(url: str, timeout: int = 45) -> bytes:
32
  return r.content
33
 
34
  def parse_torrent(raw: bytes) -> Dict:
 
 
 
 
 
 
 
 
 
35
  data = bencodepy.decode(raw)
36
  if not isinstance(data, dict) or b"info" not in data:
37
  raise ValueError("Invalid .torrent (missing 'info').")
38
  info = data[b"info"]
39
- info_bencoded = bencodepy.encode(info)
40
- infohash_v1 = hashlib.sha1(info_bencoded).hexdigest()
41
 
42
- # name
43
  name = info.get(b"name")
44
  if isinstance(name, (bytes, bytearray)):
45
  name = name.decode("utf-8", errors="replace")
46
 
47
- # files
48
  files = []
49
  if b"files" in info:
50
  for f in info[b"files"]:
51
  length = int(f.get(b"length", 0))
52
  parts = []
53
  for pe in f.get(b"path", []):
54
- parts.append((pe.decode("utf-8", "replace")) if isinstance(pe,(bytes,bytearray)) else str(pe))
55
  rel = "/".join(parts) if parts else "(unknown)"
56
  files.append({"path": rel, "length": length})
57
  else:
@@ -59,7 +63,6 @@ def parse_torrent(raw: bytes) -> Dict:
59
  rel = name or "(unnamed)"
60
  files.append({"path": rel, "length": length})
61
 
62
- # BEP-19 web seeds
63
  web_seeds = []
64
  if b"url-list" in data:
65
  v = data[b"url-list"]
@@ -74,7 +77,7 @@ def parse_torrent(raw: bytes) -> Dict:
74
  "infohash": infohash_v1,
75
  "name": name or "(unknown)",
76
  "files": files,
77
- "web_seeds": [s.rstrip("/") for s in web_seeds if isinstance(s,str) and s.strip()],
78
  }
79
 
80
  def join_url(base: str, *segs: str) -> str:
@@ -85,7 +88,7 @@ def join_url(base: str, *segs: str) -> str:
85
  return "/".join(parts)
86
 
87
  def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
88
- # Try HEAD
89
  try:
90
  r = requests.head(url, timeout=timeout, allow_redirects=True)
91
  if r.status_code < 400:
@@ -93,7 +96,7 @@ def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
93
  return True, (int(size) if size and size.isdigit() else None)
94
  except Exception:
95
  pass
96
- # Fallback tiny GET (first chunk)
97
  try:
98
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
99
  if r.status_code < 400:
@@ -102,7 +105,10 @@ def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
102
  next(r.iter_content(chunk_size=1024))
103
  except Exception:
104
  pass
105
- r.close()
 
 
 
106
  return True, (int(size) if size and size.isdigit() else None)
107
  except Exception:
108
  pass
@@ -112,39 +118,74 @@ def supports_range_and_size(url: str, timeout: int = 30) -> Tuple[bool, Optional
112
  try:
113
  r = requests.head(url, timeout=timeout, allow_redirects=True)
114
  if r.status_code < 400:
115
- size = int(r.headers.get("Content-Length","0") or 0)
116
- return (("bytes" in r.headers.get("Accept-Ranges","").lower()) or size>0, size if size>0 else None)
117
  except Exception:
118
  pass
119
  try:
120
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
121
  r.raise_for_status()
122
- size = int(r.headers.get("Content-Length","0") or 0)
123
- try: r.close()
124
- except: pass
125
- return ("bytes" in r.headers.get("Accept-Ranges","").lower() or size>0, size if size>0 else None)
 
 
126
  except Exception:
127
  return False, None
128
 
129
- def download_with_resume(url: str, dest_path: pathlib.Path, timeout: int = 120):
 
 
 
 
 
130
  dest_path.parent.mkdir(parents=True, exist_ok=True)
131
- tmp = dest_path.with_suffix(dest_path.suffix + ".part")
132
- existing = tmp.stat().st_size if tmp.exists() else 0
133
- can_range, total = supports_range_and_size(url)
134
- headers = {"Range": f"bytes={existing}-"} if (can_range and existing>0) else {}
135
- mode = "ab" if headers else "wb"
136
 
137
- with requests.get(url, stream=True, timeout=timeout, headers=headers) as r:
138
- r.raise_for_status()
139
- with open(tmp, mode) as f:
140
- for chunk in r.iter_content(chunk_size=1024*1024):
141
- if chunk:
142
- f.write(chunk)
 
 
 
 
 
 
 
143
 
144
- final_size = tmp.stat().st_size
145
- if (total is None) or (final_size >= (total or 0)):
 
 
 
 
 
 
 
 
 
 
146
  tmp.rename(dest_path)
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  def list_files_recursive(root: pathlib.Path) -> List[str]:
149
  out = []
150
  for p in root.rglob("*"):
@@ -168,8 +209,6 @@ def preview_path(path_str: str, max_bytes: int = 250_000) -> Tuple[str, Optional
168
  except Exception as e:
169
  return f"Error previewing file: {type(e).__name__}: {e}", None
170
 
171
- # ---------- NEW: base inference when no web seeds ----------
172
-
173
  def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
174
  """
175
  For URLs like:
@@ -188,48 +227,52 @@ def resolve_download_url(bases: List[str], root_name: str, rel_path: str) -> Opt
188
  Try both:
189
  base/root_name/rel_path
190
  base/rel_path
191
- Return the first that exists.
192
  """
193
  candidates = []
194
  for b in bases:
195
  candidates.append(join_url(b, root_name, rel_path))
196
  candidates.append(join_url(b, rel_path))
197
- tried = []
198
  for c in candidates:
199
  ok, _ = _head_or_peek(c)
200
- tried.append((c, ok))
201
  if ok:
202
  return c
203
  return None
204
 
205
- # -------------------------
206
- # The single action
207
- # -------------------------
 
 
 
 
 
 
 
 
208
 
209
  def run_pipeline(torrent_url: str):
210
  if not torrent_url.strip().lower().endswith(".torrent"):
211
  raise gr.Error("Please provide a direct .torrent URL.")
212
 
213
- # Parse torrent
214
  raw = fetch_bytes(torrent_url.strip())
215
  meta = parse_torrent(raw)
216
 
217
- # seed list: web seeds if present, else infer from torrent URL folder (DDoSecrets-friendly)
218
- seeds = list(meta["web_seeds"])
219
- if not seeds:
220
- seeds = infer_bases_from_torrent_url(torrent_url)
221
 
222
  infohash = meta["infohash"]
223
  root_name = meta["name"]
224
 
225
- # We expect .7z payloads
226
  sevenz_files = [f for f in meta["files"] if f["path"].lower().endswith(".7z")]
227
  if not sevenz_files:
228
  raise gr.Error("No .7z files listed in the torrent.")
229
 
230
  if not seeds:
231
- raise gr.Error("No HTTP source found. Tried to infer base from the .torrent URL but failed. "
232
- "If this is DDoSecrets, host likely at the same folder as the torrent.")
233
 
234
  # Work dirs
235
  base_dir = pathlib.Path("/mnt/data/work") / infohash
@@ -241,10 +284,12 @@ def run_pipeline(torrent_url: str):
241
  logs = []
242
  saved_archives = []
243
 
244
- # Download each .7z over HTTP
 
 
 
245
  for f in sevenz_files:
246
  rel = f["path"]
247
- # resolve against any seed/base
248
  final_url = None
249
  for seed in seeds:
250
  final_url = resolve_download_url([seed], root_name, rel)
@@ -252,15 +297,26 @@ def run_pipeline(torrent_url: str):
252
  break
253
  if not final_url:
254
  raise gr.Error(f"Could not resolve an HTTP URL for {rel} from bases {seeds}.")
 
255
  dest = dl_dir / rel
 
 
256
  logs.append(f"Downloading: {final_url}")
257
- download_with_resume(final_url, dest)
258
  if not dest.exists():
259
  raise gr.Error(f"Download failed: {final_url}")
260
  logs.append(f"Saved: {dest} ({human_bytes(dest.stat().st_size)})")
 
 
 
 
 
 
 
 
261
  saved_archives.append(str(dest))
262
 
263
- # Extract all .7z archives
264
  for apath in saved_archives:
265
  logs.append(f"Extracting: {apath}")
266
  with py7zr.SevenZipFile(apath, mode="r") as z:
@@ -281,20 +337,18 @@ def do_preview(path: str):
281
  md, _ = preview_path(path)
282
  return md
283
 
284
- # -------------------------
285
  # UI
286
- # -------------------------
287
 
288
  with gr.Blocks(title="Torrent → 7z → View (HTTP only)") as demo:
289
  gr.Markdown(
290
  """
291
  # Torrent → 7z → View (HTTP only)
292
- Paste a **.torrent URL**.
293
- If it has web seeds, great. If not, we'll auto-guess the HTTPS folder from the URL (works for DDoSecrets layouts).
294
- The app downloads `.7z` file(s), extracts them, and lets you preview text/csv/json.
295
  """
296
  )
297
-
298
  url_in = gr.Textbox(label=".torrent URL", placeholder="https://data.ddosecrets.com/Collection/Collection.torrent")
299
  go_btn = gr.Button("Download, Extract & List")
300
  log_out = gr.Markdown()
@@ -311,12 +365,11 @@ The app downloads `.7z` file(s), extracts them, and lets you preview text/csv/js
311
  )
312
 
313
  go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, files_dd])
314
-
315
  preview_btn.click(fn=do_preview, inputs=[files_dd], outputs=[preview_md])
316
 
317
  if __name__ == "__main__":
318
  demo.launch(
319
  server_name="0.0.0.0",
320
  server_port=int(os.environ.get("PORT", 7860)),
321
- allowed_paths=["/mnt/data"]
322
  )
 
1
  import os
2
  import json
 
3
  import hashlib
4
  import pathlib
5
  from typing import List, Tuple, Optional, Dict
 
6
 
7
  import requests
8
  import gradio as gr
9
  import bencodepy
10
  import py7zr
11
 
12
+ # =========================
13
+ # Helpers
14
+ # =========================
15
 
16
  def human_bytes(n: int) -> str:
17
  f = float(n)
18
+ for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
19
  if f < 1024.0:
20
  return f"{f:.2f} {unit}"
21
  f /= 1024.0
 
30
  return r.content
31
 
32
  def parse_torrent(raw: bytes) -> Dict:
33
+ """
34
+ Return:
35
+ {
36
+ "infohash": str,
37
+ "name": str,
38
+ "files": [{"path": str, "length": int}, ...],
39
+ "web_seeds": [str, ...]
40
+ }
41
+ """
42
  data = bencodepy.decode(raw)
43
  if not isinstance(data, dict) or b"info" not in data:
44
  raise ValueError("Invalid .torrent (missing 'info').")
45
  info = data[b"info"]
46
+ infohash_v1 = hashlib.sha1(bencodepy.encode(info)).hexdigest()
 
47
 
 
48
  name = info.get(b"name")
49
  if isinstance(name, (bytes, bytearray)):
50
  name = name.decode("utf-8", errors="replace")
51
 
 
52
  files = []
53
  if b"files" in info:
54
  for f in info[b"files"]:
55
  length = int(f.get(b"length", 0))
56
  parts = []
57
  for pe in f.get(b"path", []):
58
+ parts.append(pe.decode("utf-8", "replace") if isinstance(pe, (bytes, bytearray)) else str(pe))
59
  rel = "/".join(parts) if parts else "(unknown)"
60
  files.append({"path": rel, "length": length})
61
  else:
 
63
  rel = name or "(unnamed)"
64
  files.append({"path": rel, "length": length})
65
 
 
66
  web_seeds = []
67
  if b"url-list" in data:
68
  v = data[b"url-list"]
 
77
  "infohash": infohash_v1,
78
  "name": name or "(unknown)",
79
  "files": files,
80
+ "web_seeds": [s.rstrip("/") for s in web_seeds if isinstance(s, str) and s.strip()],
81
  }
82
 
83
  def join_url(base: str, *segs: str) -> str:
 
88
  return "/".join(parts)
89
 
90
  def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
91
+ # HEAD
92
  try:
93
  r = requests.head(url, timeout=timeout, allow_redirects=True)
94
  if r.status_code < 400:
 
96
  return True, (int(size) if size and size.isdigit() else None)
97
  except Exception:
98
  pass
99
+ # Tiny GET
100
  try:
101
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
102
  if r.status_code < 400:
 
105
  next(r.iter_content(chunk_size=1024))
106
  except Exception:
107
  pass
108
+ try:
109
+ r.close()
110
+ except Exception:
111
+ pass
112
  return True, (int(size) if size and size.isdigit() else None)
113
  except Exception:
114
  pass
 
118
  try:
119
  r = requests.head(url, timeout=timeout, allow_redirects=True)
120
  if r.status_code < 400:
121
+ size = int(r.headers.get("Content-Length", "0") or 0)
122
+ return (("bytes" in r.headers.get("Accept-Ranges", "").lower()) or size > 0, size if size > 0 else None)
123
  except Exception:
124
  pass
125
  try:
126
  r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
127
  r.raise_for_status()
128
+ size = int(r.headers.get("Content-Length", "0") or 0)
129
+ try:
130
+ r.close()
131
+ except Exception:
132
+ pass
133
+ return ("bytes" in r.headers.get("Accept-Ranges", "").lower() or size > 0, size if size > 0 else None)
134
  except Exception:
135
  return False, None
136
 
137
def download_file_exact(url: str, dest_path: pathlib.Path, expected_size: Optional[int],
                        timeout: int = 120, max_attempts: int = 2):
    """Download *url* to *dest_path*, enforcing *expected_size* when known.

    Strategy:
      * attempt 1: resume into a ``.part`` file when the server supports
        byte ranges and partial data already exists;
      * later attempts: discard partial data and re-download from scratch.

    Raises:
        gr.Error: if every attempt yields a file whose size differs from
            ``expected_size`` (only checked when ``expected_size`` is known).
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest_path.with_suffix(dest_path.suffix + ".part")

    def _stream_to(fileobj, response):
        # 1 MiB chunks keep memory flat for multi-GB archives.
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                fileobj.write(chunk)

    def _resume_once():
        existing = tmp.stat().st_size if tmp.exists() else 0
        can_range, _ = supports_range_and_size(url)
        headers = {"Range": f"bytes={existing}-"} if (can_range and existing > 0) else {}
        with requests.get(url, stream=True, timeout=timeout, headers=headers) as r:
            r.raise_for_status()
            # BUG FIX: a server may ignore the Range header and answer 200
            # with the FULL body.  Appending that onto the partial file would
            # corrupt it, so only append when the server actually honoured
            # the range request (206 Partial Content); otherwise rewrite.
            resumed = bool(headers) and r.status_code == 206
            with open(tmp, "ab" if resumed else "wb") as f:
                _stream_to(f, r)
        tmp.rename(dest_path)

    def _fresh_once():
        # Drop any partial/previous data and re-download from byte 0.
        if tmp.exists():
            tmp.unlink()
        if dest_path.exists():
            dest_path.unlink()
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                _stream_to(f, r)
        tmp.rename(dest_path)

    for attempt in range(1, max_attempts + 1):
        if attempt == 1:
            _resume_once()
        else:
            _fresh_once()

        if expected_size is None:
            return  # no size in the torrent metadata -> nothing to verify
        if dest_path.exists() and dest_path.stat().st_size == expected_size:
            return

    got = dest_path.stat().st_size if dest_path.exists() else 0
    raise gr.Error(f"Downloaded size mismatch for {dest_path.name}: got {got} bytes, expected {expected_size}.")
188
+
189
  def list_files_recursive(root: pathlib.Path) -> List[str]:
190
  out = []
191
  for p in root.rglob("*"):
 
209
  except Exception as e:
210
  return f"Error previewing file: {type(e).__name__}: {e}", None
211
 
 
 
212
  def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
213
  """
214
  For URLs like:
 
227
  Try both:
228
  base/root_name/rel_path
229
  base/rel_path
230
+ Return the first that responds.
231
  """
232
  candidates = []
233
  for b in bases:
234
  candidates.append(join_url(b, root_name, rel_path))
235
  candidates.append(join_url(b, rel_path))
 
236
  for c in candidates:
237
  ok, _ = _head_or_peek(c)
 
238
  if ok:
239
  return c
240
  return None
241
 
242
def test_7z_integrity(archive_path: str) -> bool:
    """Return True iff the 7z archive passes py7zr's integrity test.

    ``SevenZipFile.test()`` raises on structural errors but can also report
    a CRC mismatch through a falsy return value, so check both failure
    channels instead of discarding the return value.
    """
    try:
        with py7zr.SevenZipFile(archive_path, mode="r") as z:
            # BUG FIX: the old code ignored test()'s result and returned
            # True for any archive that merely opened without raising.
            return z.test() is not False
    except Exception:
        return False
249
+
250
+ # =========================
251
+ # Pipeline
252
+ # =========================
253
 
254
  def run_pipeline(torrent_url: str):
255
  if not torrent_url.strip().lower().endswith(".torrent"):
256
  raise gr.Error("Please provide a direct .torrent URL.")
257
 
258
+ # Parse torrent metadata
259
  raw = fetch_bytes(torrent_url.strip())
260
  meta = parse_torrent(raw)
261
 
262
+ # Seeds: prefer BEP-19 web seeds, else infer from torrent URL folder (DDoSecrets-friendly)
263
+ seeds = list(meta["web_seeds"]) or infer_bases_from_torrent_url(torrent_url)
 
 
264
 
265
  infohash = meta["infohash"]
266
  root_name = meta["name"]
267
 
268
+ # Expect .7z payloads
269
  sevenz_files = [f for f in meta["files"] if f["path"].lower().endswith(".7z")]
270
  if not sevenz_files:
271
  raise gr.Error("No .7z files listed in the torrent.")
272
 
273
  if not seeds:
274
+ raise gr.Error("No HTTP source found to fetch files. "
275
+ "If this is DDoSecrets, ensure the .torrent sits with the files over HTTPS.")
276
 
277
  # Work dirs
278
  base_dir = pathlib.Path("/mnt/data/work") / infohash
 
284
  logs = []
285
  saved_archives = []
286
 
287
+ # Expected sizes from torrent metadata
288
+ expected_map = {f["path"]: int(f.get("length", 0)) for f in meta["files"]}
289
+
290
+ # Download each .7z over HTTP with verification and retry
291
  for f in sevenz_files:
292
  rel = f["path"]
 
293
  final_url = None
294
  for seed in seeds:
295
  final_url = resolve_download_url([seed], root_name, rel)
 
297
  break
298
  if not final_url:
299
  raise gr.Error(f"Could not resolve an HTTP URL for {rel} from bases {seeds}.")
300
+
301
  dest = dl_dir / rel
302
+ expected_size = expected_map.get(rel) or None
303
+
304
  logs.append(f"Downloading: {final_url}")
305
+ download_file_exact(final_url, dest, expected_size)
306
  if not dest.exists():
307
  raise gr.Error(f"Download failed: {final_url}")
308
  logs.append(f"Saved: {dest} ({human_bytes(dest.stat().st_size)})")
309
+
310
+ # Integrity test; if fails, re-fetch once fresh (handled inside download_file_exact via attempts)
311
+ if not test_7z_integrity(str(dest)):
312
+ logs.append(f"CRC test failed for {dest.name}, retrying download fresh…")
313
+ download_file_exact(final_url, dest, expected_size, max_attempts=2)
314
+ if not test_7z_integrity(str(dest)):
315
+ raise gr.Error(f"Archive still fails CRC after re-download: {dest.name}")
316
+
317
  saved_archives.append(str(dest))
318
 
319
+ # Extract all .7z archives (after passing CRC test)
320
  for apath in saved_archives:
321
  logs.append(f"Extracting: {apath}")
322
  with py7zr.SevenZipFile(apath, mode="r") as z:
 
337
  md, _ = preview_path(path)
338
  return md
339
 
340
+ # =========================
341
  # UI
342
+ # =========================
343
 
344
  with gr.Blocks(title="Torrent → 7z → View (HTTP only)") as demo:
345
  gr.Markdown(
346
  """
347
  # Torrent → 7z → View (HTTP only)
348
+ Paste a **.torrent URL** (with web seeds or DDoSecrets-style layout).
349
+ The app downloads `.7z` file(s), verifies size & CRC, extracts them, and lets you preview text/csv/json.
 
350
  """
351
  )
 
352
  url_in = gr.Textbox(label=".torrent URL", placeholder="https://data.ddosecrets.com/Collection/Collection.torrent")
353
  go_btn = gr.Button("Download, Extract & List")
354
  log_out = gr.Markdown()
 
365
  )
366
 
367
  go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, files_dd])
 
368
  preview_btn.click(fn=do_preview, inputs=[files_dd], outputs=[preview_md])
369
 
370
  if __name__ == "__main__":
371
  demo.launch(
372
  server_name="0.0.0.0",
373
  server_port=int(os.environ.get("PORT", 7860)),
374
+ allowed_paths=["/mnt/data"] # allow returning files from /mnt/data if needed
375
  )