Create app.py

app.py (ADDED)
@@ -0,0 +1,521 @@
import os
import re
import io
import sys
import json
import time
import math
import hashlib
import pathlib
from typing import Dict, Any, List, Tuple, Optional
from urllib.parse import urlparse, parse_qs, unquote

import requests
import pandas as pd
import gradio as gr
import bencodepy

# =========================
# Utilities
# =========================

def human_bytes(n: int) -> str:
    if n is None:
        return "—"
    f = float(n)
    for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
        if f < 1024.0:
            return f"{f:.2f} {unit}"
        f /= 1024.0
    return f"{f:.2f} EiB"

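# Illustrative values (binary, 1024-based units):
#   human_bytes(1536)          -> "1.50 KiB"
#   human_bytes(7_000_000_000) -> "6.52 GiB"
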
def is_magnet(s: str) -> bool:
    return s.strip().lower().startswith("magnet:")

def is_http_url(s: str) -> bool:
    s = s.strip().lower()
    return s.startswith("http://") or s.startswith("https://")

def parse_magnet(magnet: str) -> Dict[str, Any]:
    out: Dict[str, Any] = {"ok": False, "error": None}
    try:
        magnet = magnet.strip()
        if not is_magnet(magnet):
            raise ValueError("Not a magnet URI.")
        parsed = urlparse(magnet)
        qs = parse_qs(parsed.query)

        xt_vals = qs.get("xt", [])
        ih_hex = None
        if xt_vals:
            for xt in xt_vals:
                if xt.lower().startswith("urn:btih:"):
                    ih = xt.split(":", 2)[-1]
                    if re.fullmatch(r"[0-9a-fA-F]{40}", ih):
                        ih_hex = ih.lower()
                    else:
                        # base32 → hex
                        try:
                            import base64
                            raw = base64.b32decode(ih.upper())
                            ih_hex = raw.hex()
                        except Exception:
                            pass
                    break

        dn = None
        if "dn" in qs and qs["dn"]:
            dn = unquote(qs["dn"][0])

        trackers = [unquote(t) for t in qs.get("tr", [])]
        out.update({"ok": True, "infohash_hex": ih_hex, "display_name": dn or "—", "trackers": trackers})
        return out
    except Exception as e:
        out["error"] = f"{type(e).__name__}: {e}"
        return out

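# Example (hypothetical magnet URI, for illustration only):
#   magnet:?xt=urn:btih:0123456789abcdef0123456789abcdef01234567&dn=Example%20Set&tr=udp%3A%2F%2Ftracker.example%3A6969
# parse_magnet(...) yields ok=True, infohash_hex="0123456789abcdef0123456789abcdef01234567",
# display_name="Example Set", trackers=["udp://tracker.example:6969"].
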
def fetch_bytes(url: str, timeout: int = 45) -> bytes:
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.content

def parse_torrent(raw: bytes) -> Dict[str, Any]:
    dec = bencodepy.Bencode(encoding=None)
    data = dec.decode(raw)
    if not isinstance(data, dict) or b"info" not in data:
        raise ValueError("Invalid .torrent: missing 'info' dictionary.")
    info = data[b"info"]
    info_bencoded = bencodepy.encode(info)
    infohash_v1 = hashlib.sha1(info_bencoded).hexdigest()

    name = info.get(b"name")
    if isinstance(name, (bytes, bytearray)):
        name = name.decode("utf-8", errors="replace")

    piece_length = info.get(b"piece length")
    pieces = info.get(b"pieces")
    num_pieces = (len(pieces) // 20) if isinstance(pieces, (bytes, bytearray)) else "—"

    announce = data.get(b"announce")
    if isinstance(announce, (bytes, bytearray)):
        announce = announce.decode("utf-8", errors="replace")

    announce_list = []
    if b"announce-list" in data and isinstance(data[b"announce-list"], list):
        for tier in data[b"announce-list"]:
            if isinstance(tier, list):
                for tr in tier:
                    if isinstance(tr, (bytes, bytearray)):
                        announce_list.append(tr.decode("utf-8", errors="replace"))

    # Web seeds (BEP-19)
    web_seeds: List[str] = []
    if b"url-list" in data:
        url_list = data[b"url-list"]
        if isinstance(url_list, (bytes, bytearray)):
            web_seeds = [url_list.decode("utf-8", errors="replace")]
        elif isinstance(url_list, list):
            for u in url_list:
                if isinstance(u, (bytes, bytearray)):
                    web_seeds.append(u.decode("utf-8", errors="replace"))

    # Files
    rows: List[Dict[str, Any]] = []
    total_len = 0
    if b"files" in info and isinstance(info[b"files"], list):
        for f in info[b"files"]:
            if not isinstance(f, dict):
                continue
            length = int(f.get(b"length", 0))
            total_len += length
            parts = []
            for pe in f.get(b"path", []):
                if isinstance(pe, (bytes, bytearray)):
                    parts.append(pe.decode("utf-8", errors="replace"))
                else:
                    parts.append(str(pe))
            rel_path = "/".join(parts) if parts else "(unknown)"
            rows.append({"Path": rel_path, "Length (bytes)": length, "Length (HR)": human_bytes(length)})
    else:
        length = int(info.get(b"length", 0))
        total_len = length
        rows.append({"Path": name or "(unnamed)", "Length (bytes)": length, "Length (HR)": human_bytes(length)})

    df = pd.DataFrame(rows)
    if not df.empty:
        df.sort_values(by=["Path"], inplace=True, ignore_index=True)

    created_by = data.get(b"created by")
    if isinstance(created_by, (bytes, bytearray)):
        created_by = created_by.decode("utf-8", errors="replace")
    creation_date = data.get(b"creation date")
    creation_date = int(creation_date) if isinstance(creation_date, int) else "—"
    comment = data.get(b"comment")
    if isinstance(comment, (bytes, bytearray)):
        comment = comment.decode("utf-8", errors="replace")

    summary = {
        "Name": name or "(unknown)",
        "Infohash (v1)": infohash_v1,
        "Total size (bytes)": total_len,
        "Total size": human_bytes(total_len),
        "Files count": int(df.shape[0]),
        "Piece length": piece_length if isinstance(piece_length, int) else "—",
        "Pieces (count)": num_pieces,
        "Primary announce": announce or "—",
        "Trackers": announce_list,
        "Web seeds": web_seeds,
        "Created by": created_by or "—",
        "Creation date (unix)": creation_date,
        "Comment": comment or "—",
        "Private": bool(info.get(b"private", 0)),
    }
    return {"summary": summary, "files_df": df, "raw": data, "info": info}

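# The v1 infohash above is simply SHA-1 over the bencoded "info" dict. A quick
# stand-alone check (sketch, hypothetical file name) that should match the
# "Infohash (v1)" reported by parse_torrent:
#   raw = open("example.torrent", "rb").read()
#   info = bencodepy.Bencode(encoding=None).decode(raw)[b"info"]
#   hashlib.sha1(bencodepy.encode(info)).hexdigest()
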
def summary_md(summary: Dict[str, Any]) -> str:
    md = []
    md.append(f"### {summary.get('Name','(unknown)')}")
    md.append(f"- **Infohash (v1):** `{summary.get('Infohash (v1)')}`")
    md.append(f"- **Total size:** {summary.get('Total size')} ({summary.get('Total size (bytes)')} bytes)")
    md.append(f"- **Files:** {summary.get('Files count')}")
    md.append(f"- **Piece length:** {summary.get('Piece length')}")
    md.append(f"- **Pieces (count):** {summary.get('Pieces (count)')}")
    md.append(f"- **Primary announce:** {summary.get('Primary announce')}")
    trs = summary.get("Trackers") or []
    if trs:
        md.append(f"- **Additional trackers ({len(trs)}):**")
        for t in trs:
            md.append(f"  - {t}")
    seeds = summary.get("Web seeds") or []
    if seeds:
        md.append("- **Web seeds (BEP-19):**")
        for s in seeds:
            md.append(f"  - {s}")
    md.append(f"- **Created by:** {summary.get('Created by')}")
    md.append(f"- **Creation date (unix):** {summary.get('Creation date (unix)')}")
    md.append(f"- **Comment:** {summary.get('Comment')}")
    md.append(f"- **Private:** {summary.get('Private')}")
    return "\n".join(md)

# =========================
# Inspect handlers
# =========================

def handle_input(input_text: str, uploaded_file) -> Tuple[str, pd.DataFrame, str, str]:
    """
    Accept: magnet, .torrent URL, or uploaded .torrent.
    Returns: (markdown_summary, files_df, csv_path, json_state_for_download_tab)
    """
    if uploaded_file is not None:
        # gr.File may hand back a tempfile-like object or a plain path string,
        # depending on the Gradio version; handle both.
        if hasattr(uploaded_file, "read"):
            raw = uploaded_file.read()
        else:
            raw = pathlib.Path(getattr(uploaded_file, "name", uploaded_file)).read_bytes()
        parsed = parse_torrent(raw)
    else:
        if not input_text or not input_text.strip():
            raise gr.Error("Provide a magnet link, a direct .torrent URL, or upload a .torrent.")
        text = input_text.strip()
        if is_magnet(text):
            mag = parse_magnet(text)
            if not mag.get("ok"):
                raise gr.Error(f"Could not parse magnet: {mag.get('error')}")
            # Magnet on Spaces → no file list (no DHT)
            empty = pd.DataFrame(columns=["Path", "Length (bytes)", "Length (HR)"])
            csvp = f"/mnt/data/files_{int(time.time())}.csv"
            empty.to_csv(csvp, index=False)
            state = {"summary": {"Name": mag.get("display_name") or "(magnet)",
                                 "Infohash (v1)": mag.get("infohash_hex") or "—",
                                 "Web seeds": []},
                     "files_df": empty.to_dict(orient="list"),
                     "magnet": mag}
            md = "\n".join([
                "### Magnet Metadata (no DHT on Spaces)",
                f"- **Display name:** {mag.get('display_name')}",
                f"- **Infohash (hex):** `{mag.get('infohash_hex') or '—'}`",
                f"- **Trackers:** {len(mag.get('trackers') or [])}",
                "",
                "ℹ️ Use a corresponding **.torrent URL** (or upload it) for the full file list."
            ])
            return md, empty, csvp, json.dumps(state)
        elif is_http_url(text):
            raw = fetch_bytes(text)
            parsed = parse_torrent(raw)
        else:
            raise gr.Error("Unrecognized input. Paste a **magnet:** link or a direct **.torrent** URL, or upload a .torrent.")

    # For the .torrent path:
    csv_path = f"/mnt/data/files_{int(time.time())}.csv"
    parsed["files_df"].to_csv(csv_path, index=False)
    # pack state (DataFrame → column lists) to reuse in the Download tab
    state = {"summary": parsed["summary"], "files_df": parsed["files_df"].to_dict(orient="list")}
    return summary_md(parsed["summary"]), parsed["files_df"], csv_path, json.dumps(state)

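# Shape of the JSON stashed in the gr.State component (sketch of handle_input's last
# return value):
#   {"summary": {...},
#    "files_df": {"Path": [...], "Length (bytes)": [...], "Length (HR)": [...]}}
# plus a "magnet" key when the input was a magnet link. prepare_download and
# download_selected below only rely on "summary" and "files_df".
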
# =========================
# Download over HTTP
# =========================

def _join_url(base: str, *segs: str) -> str:
    parts = [base.rstrip("/")]
    for s in segs:
        enc = "/".join([requests.utils.quote(p) for p in s.split("/")])
        parts.append(enc)
    return "/".join(parts)

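# Example (hypothetical host and paths):
#   _join_url("https://mirror.example.org/base", "My Collection", "docs/report 1.pdf")
#   -> "https://mirror.example.org/base/My%20Collection/docs/report%201.pdf"
# Each path segment is percent-encoded while "/" separators are preserved.
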
def _sha256_file(path: pathlib.Path, bufsize: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            b = f.read(bufsize)
            if not b:
                break
            h.update(b)
    return h.hexdigest()

def _supports_range(url: str, timeout: int = 30) -> Tuple[bool, Optional[int]]:
    # HEAD to learn size and Accept-Ranges
    r = requests.head(url, timeout=timeout, allow_redirects=True)
    if r.status_code >= 400:
        # Some servers don't allow HEAD; fall back to a streamed GET and read only the headers
        r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
        r.raise_for_status()
        size = int(r.headers.get("Content-Length", "0") or 0)
        accept_ranges = r.headers.get("Accept-Ranges", "")
        try:
            r.close()
        except Exception:
            pass
        return ("bytes" in accept_ranges.lower() or size > 0, size if size > 0 else None)
    size = int(r.headers.get("Content-Length", "0") or 0)
    accept_ranges = r.headers.get("Accept-Ranges", "")
    return ("bytes" in accept_ranges.lower() or size > 0, size if size > 0 else None)

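# Resume sketch (plain HTTP semantics, not specific to any host): if 3 GiB of a
# 7 GiB file already sit in "<name>.part", the next request carries
#   Range: bytes=3221225472-
# and a server that honors ranges answers "206 Partial Content" with only the
# remaining bytes, which _download_with_resume appends to the .part file.
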
def _download_with_resume(url: str, dest_path: pathlib.Path, timeout: int = 120) -> Tuple[int, Optional[int]]:
    """
    Download URL to dest_path with simple resume (HTTP Range).
    Returns (bytes_written, total_expected or None).
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = dest_path.with_suffix(dest_path.suffix + ".part")

    existing = tmp_path.stat().st_size if tmp_path.exists() else 0
    supports, total_size = _supports_range(url)
    headers = {}
    if supports and existing > 0:
        headers["Range"] = f"bytes={existing}-"

    mode = "ab" if headers.get("Range") else "wb"
    bytes_written = 0

    with requests.get(url, stream=True, timeout=timeout, headers=headers) as r:
        r.raise_for_status()
        # If we asked for a Range but the server ignored it (200 instead of 206),
        # start over rather than appending a full payload to the partial file.
        if headers.get("Range") and r.status_code != 206:
            mode = "wb"
            existing = 0
        with open(tmp_path, mode) as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
                    bytes_written += len(chunk)

    # Finalize: rename the .part file if the size is unknown or looks complete;
    # otherwise keep it so a later run can resume.
    final_size = existing + bytes_written
    if total_size is None or final_size >= (total_size or 0):
        tmp_path.rename(dest_path)
    return bytes_written, total_size

def prepare_download(parsed_json: str, base_url_override: str) -> Tuple[str, List[str], List[str], str]:
    """
    Build seed list and file choices.
    - If the torrent has web seeds → use them.
    - Else if base_url_override is provided → use that as a 'seed'.
    """
    if not parsed_json:
        return "Parse something in Inspect first.", [], [], ""
    parsed = json.loads(parsed_json)
    summary = parsed.get("summary") or {}
    seeds = list(summary.get("Web seeds") or [])
    root = summary.get("Name") or ""
    if (not seeds) and base_url_override.strip():
        seeds = [base_url_override.strip().rstrip("/")]
    files_df = pd.DataFrame(parsed.get("files_df", {}))
    file_list = files_df["Path"].astype(str).tolist() if not files_df.empty else []
    if not seeds:
        return ("No web seeds in torrent and no Base URL override supplied. "
                "For Spaces downloads you need HTTP access to the files."), [], [], ""
    msg = f"Ready. Root folder assumed: `{root}`. Select a seed and file(s) to download."
    return msg, seeds, file_list, root

def download_selected(parsed_json: str, seed_url: str, root_dir: str, selected_files: List[str]) -> Tuple[str, List[str]]:
    """
    Download selected files into /mnt/data/downloads/<infohash>/...
    Create a .sha256 sidecar after successful completion.
    """
    if not parsed_json:
        raise gr.Error("Parse a torrent first.")
    if not seed_url:
        raise gr.Error("Choose a web seed or set a Base URL override.")
    if not selected_files:
        raise gr.Error("Select at least one file.")

    parsed = json.loads(parsed_json)
    summary = parsed.get("summary") or {}
    infohash = summary.get("Infohash (v1)") or "unknown"
    out_root = pathlib.Path("/mnt/data/downloads") / infohash
    out_root.mkdir(parents=True, exist_ok=True)

    logs = []
    saved = []

    for rel in selected_files:
        try:
            url = _join_url(seed_url, root_dir, rel)
            dest_path = out_root / rel
            bytes_written, total_expected = _download_with_resume(url, dest_path)
            # Verify size if the server told us the expected length
            if total_expected is not None and dest_path.exists() and dest_path.stat().st_size != total_expected:
                logs.append(f"⚠️ Size mismatch for {rel} (got {dest_path.stat().st_size}, expected {total_expected}). Kept .part if incomplete.")
            # SHA-256 sidecar
            if dest_path.exists():
                sha = _sha256_file(dest_path)
                with open(str(dest_path) + ".sha256", "w") as f:
                    f.write(f"{sha}  {dest_path.name}\n")  # two spaces: sha256sum-style layout
                logs.append(f"✅ {rel} — saved {human_bytes(dest_path.stat().st_size)} → {dest_path}")
                saved.append(str(dest_path))
            else:
                logs.append(f"❌ {rel} — download incomplete.")
        except Exception as e:
            logs.append(f"❌ {rel} — {type(e).__name__}: {e}")

    return "\n".join(logs), saved

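# The sidecar written above follows the common "<hex digest>  <filename>" layout, so
# (assuming GNU coreutils is available) it should be checkable from the file's own
# directory with e.g.:  sha256sum -c file.bin.sha256   (hypothetical file name)
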
def preview_file(path_str: str, max_bytes: int = 300_000) -> Tuple[str, Optional[pd.DataFrame], Optional[str]]:
    if not path_str:
        raise gr.Error("Provide a path in /mnt/data.")
    p = pathlib.Path(path_str)
    if not p.exists():
        raise gr.Error("File not found.")
    suffix = p.suffix.lower()
    try:
        if suffix in [".csv", ".tsv"]:
            sep = "," if suffix == ".csv" else "\t"
            df = pd.read_csv(p, sep=sep, nrows=2000, on_bad_lines="skip")
            return f"Previewing {p.name} (first rows):", df, None
        elif suffix in [".json", ".ndjson"]:
            with open(p, "rb") as fh:
                raw = fh.read(max_bytes)
            try:
                obj = json.loads(raw.decode("utf-8", errors="replace"))
                pretty = json.dumps(obj, indent=2)[:max_bytes]
            except Exception:
                pretty = raw.decode("utf-8", errors="replace")
            return f"Previewing {p.name}:", None, f"```\n{pretty}\n```"
        elif suffix in [".txt", ".log", ".md"]:
            with open(p, "rb") as fh:
                raw = fh.read(max_bytes)
            text = raw.decode("utf-8", errors="replace")
            return f"Previewing {p.name}:", None, f"```\n{text}\n```"
        else:
            st = p.stat()
            return (f"File: **{p.name}**\nSize: {human_bytes(st.st_size)}\nPath: `{p}`",
                    None, None)
    except Exception as e:
        return f"Error previewing file: {type(e).__name__}: {e}", None, None

# =========================
# UI
# =========================

CSS = """
#top-note {font-size: 0.95rem; opacity: 0.9;}
"""

with gr.Blocks(css=CSS, title="Torrent Inspector + Full HTTP Downloader") as demo:
    gr.Markdown(
        """
# 🧭 Torrent Inspector + Full HTTP Downloader (no P2P)

- Paste a **magnet** or a **.torrent URL**, or upload a `.torrent`.
- Full file lists come from **.torrent** metadata.
- **Downloads use HTTPS only**: either BEP-19 **web seeds** from the torrent or a **Base URL override** you supply (e.g., `https://data.ddosecrets.com/<collection>`).
- Supports **large files (e.g., 7 GB)** with **resume (HTTP Range)** and **SHA-256** sidecar files.

> BitTorrent/DHT is not used here (Spaces networking limits). If your torrent doesn’t include web seeds, set the base URL to a mirror that serves the same paths.
        """,
        elem_id="top-note"
    )

    with gr.Tab("Inspect"):
        with gr.Row():
            input_text = gr.Textbox(
                label="Magnet or .torrent URL",
                placeholder="magnet:?xt=urn:btih:... OR https://.../something.torrent",
            )
            upload_torrent = gr.File(label="Or upload .torrent", file_types=[".torrent"])
        inspect_btn = gr.Button("Parse")
        out_summary = gr.Markdown(label="Summary")
        out_df = gr.Dataframe(label="Files in Torrent", interactive=False)
        out_csv = gr.File(label="Export file list (CSV)")
        parsed_state = gr.State()

        def _wrap_handle(inp, upl):
            md, df, csvp, state_json = handle_input(inp, upl)
            return md, df, csvp, state_json

        inspect_btn.click(
            fn=_wrap_handle,
            inputs=[input_text, upload_torrent],
            outputs=[out_summary, out_df, out_csv, parsed_state]
        )

    with gr.Tab("Download"):
        gr.Markdown(
            "Choose HTTP source: torrent **web seed** (if present) or specify a **Base URL override** (e.g., `https://data.ddosecrets.com/Gabi%20Ashkenazi%20emails`)."
        )
        base_url_override = gr.Textbox(
            label="Base URL override (optional)",
            placeholder="https://data.ddosecrets.com/<collection>",
        )
        prep_btn = gr.Button("Prepare")
        status_md = gr.Markdown()
        seed_dropdown = gr.Dropdown(label="Choose HTTP source", choices=[])
        root_dir_box = gr.Textbox(label="Assumed root folder name", interactive=False)
        files_checkbox = gr.CheckboxGroup(label="Select files to download", choices=[])
        dl_btn = gr.Button("Download selected")
        logs_md = gr.Markdown()
        saved_files = gr.Gallery(label="Saved paths", show_label=True, columns=1, height=220)

        def _prep(state_json, base_override):
            msg, seeds, files, root = prepare_download(state_json, base_override or "")
            return msg, gr.update(choices=seeds, value=(seeds[0] if seeds else None)), root, gr.update(choices=files)

        prep_btn.click(fn=_prep, inputs=[parsed_state, base_url_override], outputs=[status_md, seed_dropdown, root_dir_box, files_checkbox])

        def _dl(state_json, seed, root, files):
            logs, paths = download_selected(state_json, seed, root, files)
            return logs, paths

        dl_btn.click(fn=_dl, inputs=[parsed_state, seed_dropdown, root_dir_box, files_checkbox], outputs=[logs_md, saved_files])

        gr.Markdown("**Preview a saved file (optional):**")
        with gr.Row():
            path_in = gr.Textbox(label="Path under /mnt/data", placeholder="/mnt/data/downloads/<infohash>/subdir/file.txt")
            preview_btn = gr.Button("Preview")
        prev_status = gr.Markdown()
        prev_df = gr.Dataframe(visible=False)
        prev_md = gr.Markdown(visible=True)

        def _preview(p):
            status, df, md = preview_file(p)
            # Toggle visibility so the hidden Dataframe actually appears when there is tabular data.
            df_update = gr.update(value=df, visible=True) if df is not None else gr.update(visible=False)
            md_update = gr.update(value=md, visible=True) if md is not None else gr.update(visible=False)
            return status, df_update, md_update

        preview_btn.click(fn=_preview, inputs=[path_in], outputs=[prev_status, prev_df, prev_md])

        gr.Markdown(
            """
**Tips / Limits**

- If you know the dataset lives at `https://data.ddosecrets.com/<CollectionName>/`, put that in **Base URL override**.
- The downloader writes to `/mnt/data/downloads/<infohash>/...` with resuming and `.sha256` checksums.
- Some hosts may not allow `HEAD` or `Range`; the app falls back to a plain GET when possible.
            """
        )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
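
# Assumed runtime dependencies (not pinned here): gradio, requests, pandas, and a
# bencode library importable as `bencodepy`, typically listed in the Space's requirements.txt.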