import os
import json
import hashlib
import pathlib
from typing import List, Tuple, Optional, Dict

import requests
import gradio as gr
import bencodepy
import py7zr
from py7zr.exceptions import CrcError

# NEW: HTML parsing
from bs4 import BeautifulSoup
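# bs4 is provided by the beautifulsoup4 package. lxml is optional here, since
# _parse_html_file falls back to the stdlib "html.parser" when it is missing.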

# =========================
# Helpers
# =========================

def human_bytes(n: int) -> str:
    f = float(n)
    for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
        if f < 1024.0:
            return f"{f:.2f} {unit}"
        f /= 1024.0
    return f"{f:.2f} EiB"

def ensure_dir(p: str):
    os.makedirs(p, exist_ok=True)

def fetch_bytes(url: str, timeout: int = 45) -> bytes:
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.content

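# parse_torrent decodes the .torrent, computes the v1 infohash (SHA-1 over the
# bencoded "info" dict), flattens the file list, and collects any "url-list"
# web seeds (BEP 19) so the payload can be fetched over plain HTTP(S).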
def parse_torrent(raw: bytes) -> Dict:
    data = bencodepy.decode(raw)
    if not isinstance(data, dict) or b"info" not in data:
        raise ValueError("Invalid .torrent (missing 'info').")
    info = data[b"info"]
    infohash_v1 = hashlib.sha1(bencodepy.encode(info)).hexdigest()

    name = info.get(b"name")
    if isinstance(name, (bytes, bytearray)):
        name = name.decode("utf-8", errors="replace")

    files = []
    if b"files" in info:
        for f in info[b"files"]:
            length = int(f.get(b"length", 0))
            parts = []
            for pe in f.get(b"path", []):
                parts.append(pe.decode("utf-8", "replace") if isinstance(pe, (bytes, bytearray)) else str(pe))
            rel = "/".join(parts) if parts else "(unknown)"
            files.append({"path": rel, "length": length})
    else:
        length = int(info.get(b"length", 0))
        rel = name or "(unnamed)"
        files.append({"path": rel, "length": length})

    web_seeds = []
    if b"url-list" in data:
        v = data[b"url-list"]
        if isinstance(v, (bytes, bytearray)):
            web_seeds = [v.decode("utf-8", "replace")]
        elif isinstance(v, list):
            for u in v:
                if isinstance(u, (bytes, bytearray)):
                    web_seeds.append(u.decode("utf-8", "replace"))

    return {
        "infohash": infohash_v1,
        "name": name or "(unknown)",
        "files": files,
        "web_seeds": [s.rstrip("/") for s in web_seeds if isinstance(s, str) and s.strip()],
    }

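# join_url percent-encodes each path segment individually so names containing
# spaces or non-ASCII characters survive, while keeping "/" separators intact.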
def join_url(base: str, *segs: str) -> str:
    parts = [base.rstrip("/")]
    for s in segs:
        enc = "/".join([requests.utils.quote(p) for p in s.split("/")])
        parts.append(enc)
    return "/".join(parts)

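# _head_or_peek probes a candidate URL: try HEAD first, and if that fails
# (some hosts reject HEAD), fall back to a streaming GET and read one chunk
# to confirm the body is actually served before giving up on the URL.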
def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
    try:
        r = requests.head(url, timeout=timeout, allow_redirects=True)
        if r.status_code < 400:
            size = r.headers.get("Content-Length")
            return True, (int(size) if size and size.isdigit() else None)
    except Exception:
        pass
    try:
        r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
        if r.status_code < 400:
            size = r.headers.get("Content-Length")
            try:
                next(r.iter_content(chunk_size=1024))
            except Exception:
                pass
            try:
                r.close()
            except Exception:
                pass
            return True, (int(size) if size and size.isdigit() else None)
    except Exception:
        pass
    return False, None

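# supports_range_and_size reports whether resumable (Range) downloads look
# possible and, when advertised, the Content-Length of the resource.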
def supports_range_and_size(url: str, timeout: int = 30) -> Tuple[bool, Optional[int]]:
    try:
        r = requests.head(url, timeout=timeout, allow_redirects=True)
        if r.status_code < 400:
            size = int(r.headers.get("Content-Length", "0") or 0)
            return (("bytes" in r.headers.get("Accept-Ranges", "").lower()) or size > 0, size if size > 0 else None)
    except Exception:
        pass
    try:
        r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
        r.raise_for_status()
        size = int(r.headers.get("Content-Length", "0") or 0)
        try:
            r.close()
        except Exception:
            pass
        return ("bytes" in r.headers.get("Accept-Ranges", "").lower() or size > 0, size if size > 0 else None)
    except Exception:
        return False, None

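# download_file_exact makes up to max_attempts attempts: the first resumes any
# leftover .part file via a Range request when supported; later attempts start
# from scratch. Success is judged by matching the torrent's declared length.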
def download_file_exact(url: str, dest_path: pathlib.Path, expected_size: Optional[int],
                        timeout: int = 120, max_attempts: int = 2):
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    def _resume_once():
        tmp = dest_path.with_suffix(dest_path.suffix + ".part")
        existing = tmp.stat().st_size if tmp.exists() else 0
        can_range, _ = supports_range_and_size(url)
        headers = {"Range": f"bytes={existing}-"} if (can_range and existing > 0) else {}
        mode = "ab" if headers else "wb"
        with requests.get(url, stream=True, timeout=timeout, headers=headers) as r:
            r.raise_for_status()
            # A server that ignores Range answers 200 with the full body;
            # appending that would duplicate data, so rewrite from scratch.
            if headers and r.status_code != 206:
                mode = "wb"
            with open(tmp, mode) as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        tmp.rename(dest_path)

    def _fresh_once():
        tmp = dest_path.with_suffix(dest_path.suffix + ".part")
        if tmp.exists():
            tmp.unlink()
        if dest_path.exists():
            dest_path.unlink()
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        tmp.rename(dest_path)

    attempts = 0
    while attempts < max_attempts:
        attempts += 1
        if attempts == 1:
            _resume_once()
        else:
            _fresh_once()

        if expected_size is None:
            return
        if dest_path.exists() and dest_path.stat().st_size == expected_size:
            return

    got = dest_path.stat().st_size if dest_path.exists() else 0
    raise gr.Error(f"Downloaded size mismatch for {dest_path.name}: got {got} bytes, expected {expected_size}.")

def list_files_recursive(root: pathlib.Path) -> List[str]:
    out = []
    for p in root.rglob("*"):
        if p.is_file():
            out.append(str(p))
    return out

def preview_path(path_str: str, max_bytes: int = 250_000) -> Tuple[str, Optional[str]]:
    if not path_str or not os.path.exists(path_str):
        return "File not found.", None
    p = pathlib.Path(path_str)
    suffix = p.suffix.lower()
    try:
        if suffix in [".csv", ".tsv", ".json", ".ndjson", ".txt", ".log", ".md", ".eml", ".html", ".htm", ".meta"]:
            with open(p, "rb") as fh:
                raw = fh.read(max_bytes)
            text = raw.decode("utf-8", errors="replace")
            return f"Previewing {p.name} (truncated):\n\n```\n{text}\n```", None
        else:
            st = p.stat()
            return (f"File: **{p.name}**\nSize: {human_bytes(st.st_size)}\nPath: `{p}`", None)
    except Exception as e:
        return f"Error previewing file: {type(e).__name__}: {e}", None

def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
    u = torrent_url.strip()
    if "/" not in u:
        return []
    base = u.rsplit("/", 1)[0]
    return [base]

def resolve_download_url(bases: List[str], root_name: str, rel_path: str) -> Optional[str]:
    candidates = []
    for b in bases:
        candidates.append(join_url(b, root_name, rel_path))
        candidates.append(join_url(b, rel_path))
    for c in candidates:
        ok, _ = _head_or_peek(c)
        if ok:
            return c
    return None

def test_7z_integrity(archive_path: str) -> bool:
    try:
        with py7zr.SevenZipFile(archive_path, mode="r") as z:
            z.test()
        return True
    except Exception:
        return False

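# safe_extract_7z first tries a whole-archive extraction (fast path, returns
# -1 on success). If that raises CrcError, it falls back to extracting members
# one by one so a single corrupt entry does not sink the rest of the archive.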
def safe_extract_7z(archive_path: str, dest_dir: str) -> Tuple[int, List[str]]:
    extracted_count = 0
    skipped: List[str] = []
    dest = pathlib.Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    try:
        with py7zr.SevenZipFile(archive_path, mode="r") as z:
            z.extract(path=str(dest))
        return -1, skipped
    except CrcError:
        pass

    with py7zr.SevenZipFile(archive_path, mode="r") as z:
        members = [info.filename for info in z.list() if not info.is_directory]

    for name in members:
        try:
            # py7zr reads the archive sequentially, so open a fresh handle for
            # each member rather than reusing one across extract() calls.
            with py7zr.SevenZipFile(archive_path, mode="r") as z:
                z.extract(targets=[name], path=str(dest))
            extracted_count += 1
        except CrcError:
            skipped.append(name)
        except Exception:
            skipped.append(name)

    return extracted_count, skipped

# =========================
# NEW: HTML/.meta → JSONL exporter
# =========================

def _parse_meta_file(path: pathlib.Path) -> Dict:
    """
    Try JSON parse; else parse simple 'key: value' lines; else return raw text.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    # try JSON
    try:
        obj = json.loads(raw)
        return {"type": "meta", "path": str(path), "content": obj}
    except Exception:
        pass
    # key: value lines
    data: Dict[str, str] = {}
    lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
    for ln in lines:
        if ":" in ln:
            k, v = ln.split(":", 1)
            data[k.strip()] = v.strip()
    if data:
        return {"type": "meta", "path": str(path), "content": data}
    # fallback raw
    return {"type": "meta", "path": str(path), "content_raw": raw}

def _parse_html_file(path: pathlib.Path) -> Dict:
    """
    Extract title, meta[name/content], and plain text.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    # Prefer lxml if present; fallback to built-in parser
    try:
        soup = BeautifulSoup(raw, "lxml")
    except Exception:
        soup = BeautifulSoup(raw, "html.parser")
    title = (soup.title.string.strip() if soup.title and soup.title.string else "")
    meta = {}
    for tag in soup.find_all("meta"):
        name = tag.get("name") or tag.get("property")
        content = tag.get("content")
        if name and content:
            meta[str(name)] = str(content)
    text = soup.get_text(separator="\n", strip=True)
    return {"type": "html", "path": str(path), "title": title, "meta": meta, "text": text}

def build_jsonl_from_extracted(ex_dir: str, out_dir: str, max_records: Optional[int] = None) -> Tuple[str, int, int]:
    """
    Walk extracted dir, convert all .html/.htm and .meta files to JSONL.
    Returns (output_path, html_count, meta_count).
    """
    ex_root = pathlib.Path(ex_dir)
    out_root = pathlib.Path(out_dir)
    out_root.mkdir(parents=True, exist_ok=True)
    out_path = out_root / "converted.jsonl"

    html_count = 0
    meta_count = 0
    written = 0

    with open(out_path, "w", encoding="utf-8") as fout:
        for p in ex_root.rglob("*"):
            if not p.is_file():
                continue
            suf = p.suffix.lower()
            try:
                if suf in (".html", ".htm"):
                    rec = _parse_html_file(p)
                    html_count += 1
                elif suf == ".meta":
                    rec = _parse_meta_file(p)
                    meta_count += 1
                else:
                    continue
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
                written += 1
                if max_records and written >= max_records:
                    break
            except Exception as e:
                # Skip unreadable files but carry on
                err = {"type": "error", "path": str(p), "error": f"{type(e).__name__}: {e}"}
                fout.write(json.dumps(err, ensure_ascii=False) + "\n")

    return str(out_path), html_count, meta_count
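
# Illustrative record shapes written to converted.jsonl (field values depend
# entirely on the input files):
#   {"type": "html", "path": "...", "title": "...", "meta": {...}, "text": "..."}
#   {"type": "meta", "path": "...", "content": {...}}
#   {"type": "error", "path": "...", "error": "<ExceptionName>: <message>"}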

# =========================
# Pipeline
# =========================

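# run_pipeline: fetch the .torrent, parse it, pick out the .7z entries,
# resolve each to an HTTP URL (web seeds first, then the .torrent's own
# directory), download with a size check, CRC-test, extract, and list files.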
def run_pipeline(torrent_url: str):
    if not torrent_url.strip().lower().endswith(".torrent"):
        raise gr.Error("Please provide a direct .torrent URL.")

    raw = fetch_bytes(torrent_url.strip())
    meta = parse_torrent(raw)

    seeds = list(meta["web_seeds"]) or infer_bases_from_torrent_url(torrent_url)

    infohash = meta["infohash"]
    root_name = meta["name"]

    sevenz_files = [f for f in meta["files"] if f["path"].lower().endswith(".7z")]
    if not sevenz_files:
        raise gr.Error("No .7z files listed in the torrent.")

    if not seeds:
        raise gr.Error("No HTTP source found to fetch files. "
                       "For DDoSecrets-style layouts, the .torrent should sit "
                       "in the same HTTPS directory as the data files.")

    base_dir = pathlib.Path("/mnt/data/work") / infohash
    dl_dir = base_dir / "downloads"
    ex_dir = base_dir / "extracted"
    ensure_dir(str(dl_dir))
    ensure_dir(str(ex_dir))

    logs = []
    saved_archives = []

    expected_map = {f["path"]: int(f.get("length", 0)) for f in meta["files"]}

    for f in sevenz_files:
        rel = f["path"]
        final_url = None
        for seed in seeds:
            final_url = resolve_download_url([seed], root_name, rel)
            if final_url:
                break
        if not final_url:
            raise gr.Error(f"Could not resolve an HTTP URL for {rel} from bases {seeds}.")

        dest = dl_dir / rel
        expected_size = expected_map.get(rel) or None

        logs.append(f"Downloading: {final_url}")
        download_file_exact(final_url, dest, expected_size)
        if not dest.exists():
            raise gr.Error(f"Download failed: {final_url}")
        logs.append(f"Saved: {dest} ({human_bytes(dest.stat().st_size)})")

        if not test_7z_integrity(str(dest)):
            logs.append(f"CRC test failed for {dest.name}, retrying download fresh…")
            download_file_exact(final_url, dest, expected_size, max_attempts=2)
            if not test_7z_integrity(str(dest)):
                logs.append(f"Archive still reports CRC problems: {dest.name}. Will try per-file extraction and skip corrupt members.")
        saved_archives.append(str(dest))

    for apath in saved_archives:
        logs.append(f"Extracting: {apath}")
        count, skipped = safe_extract_7z(apath, str(ex_dir))
        if count == -1:
            logs.append(f"Extracted OK → {ex_dir}")
        else:
            logs.append(f"Extracted {count} members to {ex_dir}")
            if skipped:
                logs.append(f"Skipped {len(skipped)} corrupted member(s):")
                show = skipped[:10]
                logs += [f"  - {s}" for s in show]
                if len(skipped) > 10:
                    logs.append(f"  … and {len(skipped) - 10} more")

    extracted = list_files_recursive(ex_dir)
    if not extracted:
        logs.append("No files extracted (archive may be empty).")
    else:
        logs.append(f"Extracted files: {len(extracted)}")

    log_md = "### Run log\n" + "\n".join(f"- {l}" for l in logs)
    # Return the extracted and base dirs so the JSONL export can use them later.
    return log_md, extracted, (extracted[0] if extracted else ""), str(ex_dir), str(base_dir)

def do_preview(path: str):
    md, _ = preview_path(path)
    return md

# NEW: hook to build JSONL and return a downloadable file
def do_build_jsonl(ex_dir: str, base_dir: str):
    if not ex_dir or not os.path.isdir(ex_dir):
        raise gr.Error("Extraction folder not found. Run the download/extract step first.")
    out_dir = str(pathlib.Path(base_dir) / "exports")
    out_path, html_count, meta_count = build_jsonl_from_extracted(ex_dir, out_dir)
    summary = f"Built JSONL at: `{out_path}`\n- HTML files: {html_count}\n- META files: {meta_count}\n"
    return summary, out_path

# =========================
# UI
# =========================

with gr.Blocks(title="Torrent → 7z → View (HTTP only)") as demo:
    gr.Markdown(
        """
# Torrent → 7z → View (HTTP only)
Paste a **.torrent URL** (with web seeds or DDoSecrets-style layout).
The app downloads `.7z` file(s), verifies size & CRC, extracts them, lets you preview text/csv/json, **and exports all `.html` + `.meta` to a single JSONL**.
        """
    )
    url_in = gr.Textbox(label=".torrent URL", placeholder="https://data.ddosecrets.com/Collection/Collection.torrent")
    go_btn = gr.Button("Download, Extract & List")
    log_out = gr.Markdown()
    files_dd = gr.Dropdown(label="Extracted files", choices=[], interactive=True)
    preview_btn = gr.Button("Preview selected")
    preview_md = gr.Markdown()

    # NEW: export controls
    gr.Markdown("### Export `.html` and `.meta` → combined JSONL")
    build_btn = gr.Button("Build JSONL from extracted")
    build_log = gr.Markdown()
    dl_file = gr.File(label="Download combined JSONL", interactive=False)

    # internal state: extracted dir & base dir for exports
    ex_dir_state = gr.State()
    base_dir_state = gr.State()

    def _go(url):
        log, files, first, ex_dir, base_dir = run_pipeline(url)
        # The dropdown gets both its choices and its selected value in one
        # gr.update, so the component appears only once in the outputs list.
        return (
            log,
            gr.update(choices=files, value=(first if first else None)),
            ex_dir,
            base_dir,
        )

    go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, ex_dir_state, base_dir_state])
    preview_btn.click(fn=do_preview, inputs=[files_dd], outputs=[preview_md])

    build_btn.click(fn=do_build_jsonl, inputs=[ex_dir_state, base_dir_state], outputs=[build_log, dl_file])

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
        allowed_paths=["/mnt/data"]  # allow returning files from /mnt/data
    )