import os
import json
import hashlib
import pathlib
from typing import List, Tuple, Optional, Dict

import requests
import gradio as gr
import bencodepy
import py7zr
from py7zr.exceptions import CrcError

# NEW: HTML parsing
from bs4 import BeautifulSoup
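# bs4 is provided by the beautifulsoup4 package. lxml is optional here, since
# _parse_html_file falls back to the stdlib "html.parser" when it is missing.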

# =========================
# Helpers
# =========================

def human_bytes(n: int) -> str:
    f = float(n)
    for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
        if f < 1024.0:
            return f"{f:.2f} {unit}"
        f /= 1024.0
    return f"{f:.2f} EiB"

def ensure_dir(p: str):
    os.makedirs(p, exist_ok=True)

def fetch_bytes(url: str, timeout: int = 45) -> bytes:
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.content

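# parse_torrent decodes the .torrent, computes the v1 infohash (SHA-1 over the
# bencoded "info" dict), flattens the file list, and collects any "url-list"
# web seeds (BEP 19) so the payload can be fetched over plain HTTP(S).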
def parse_torrent(raw: bytes) -> Dict:
    data = bencodepy.decode(raw)
    if not isinstance(data, dict) or b"info" not in data:
        raise ValueError("Invalid .torrent (missing 'info').")
    info = data[b"info"]
    infohash_v1 = hashlib.sha1(bencodepy.encode(info)).hexdigest()

    name = info.get(b"name")
    if isinstance(name, (bytes, bytearray)):
        name = name.decode("utf-8", errors="replace")

    files = []
    if b"files" in info:
        for f in info[b"files"]:
            length = int(f.get(b"length", 0))
            parts = []
            for pe in f.get(b"path", []):
                parts.append(pe.decode("utf-8", "replace") if isinstance(pe, (bytes, bytearray)) else str(pe))
            rel = "/".join(parts) if parts else "(unknown)"
            files.append({"path": rel, "length": length})
    else:
        length = int(info.get(b"length", 0))
        rel = name or "(unnamed)"
        files.append({"path": rel, "length": length})

    web_seeds = []
    if b"url-list" in data:
        v = data[b"url-list"]
        if isinstance(v, (bytes, bytearray)):
            web_seeds = [v.decode("utf-8", "replace")]
        elif isinstance(v, list):
            for u in v:
                if isinstance(u, (bytes, bytearray)):
                    web_seeds.append(u.decode("utf-8", "replace"))

    return {
        "infohash": infohash_v1,
        "name": name or "(unknown)",
        "files": files,
        "web_seeds": [s.rstrip("/") for s in web_seeds if isinstance(s, str) and s.strip()],
    }

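# join_url percent-encodes each path segment individually so names containing
# spaces or non-ASCII characters survive, while keeping "/" separators intact.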
def join_url(base: str, *segs: str) -> str:
    parts = [base.rstrip("/")]
    for s in segs:
        enc = "/".join([requests.utils.quote(p) for p in s.split("/")])
        parts.append(enc)
    return "/".join(parts)

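# _head_or_peek probes a candidate URL: try HEAD first, and if that fails
# (some hosts reject HEAD), fall back to a streaming GET and read one chunk
# to confirm the body is actually served before giving up on the URL.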
def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
    try:
        r = requests.head(url, timeout=timeout, allow_redirects=True)
        if r.status_code < 400:
            size = r.headers.get("Content-Length")
            return True, (int(size) if size and size.isdigit() else None)
    except Exception:
        pass
    try:
        r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
        if r.status_code < 400:
            size = r.headers.get("Content-Length")
            try:
                next(r.iter_content(chunk_size=1024))
            except Exception:
                pass
            try:
                r.close()
            except Exception:
                pass
            return True, (int(size) if size and size.isdigit() else None)
    except Exception:
        pass
    return False, None

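# supports_range_and_size reports whether resumable (Range) downloads look
# possible and, when advertised, the Content-Length of the resource.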
def supports_range_and_size(url: str, timeout: int = 30) -> Tuple[bool, Optional[int]]:
    try:
        r = requests.head(url, timeout=timeout, allow_redirects=True)
        if r.status_code < 400:
            size = int(r.headers.get("Content-Length", "0") or 0)
            return (("bytes" in r.headers.get("Accept-Ranges", "").lower()) or size > 0, size if size > 0 else None)
    except Exception:
        pass
    try:
        r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
        r.raise_for_status()
        size = int(r.headers.get("Content-Length", "0") or 0)
        try:
            r.close()
        except Exception:
            pass
        return ("bytes" in r.headers.get("Accept-Ranges", "").lower() or size > 0, size if size > 0 else None)
    except Exception:
        return False, None

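# download_file_exact makes up to max_attempts attempts: the first resumes any
# leftover .part file via a Range request when supported; later attempts start
# from scratch. Success is judged by matching the torrent's declared length.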
def download_file_exact(url: str, dest_path: pathlib.Path, expected_size: Optional[int],
                        timeout: int = 120, max_attempts: int = 2):
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    def _resume_once():
        tmp = dest_path.with_suffix(dest_path.suffix + ".part")
        existing = tmp.stat().st_size if tmp.exists() else 0
        can_range, _ = supports_range_and_size(url)
        headers = {"Range": f"bytes={existing}-"} if (can_range and existing > 0) else {}
        mode = "ab" if headers else "wb"
        with requests.get(url, stream=True, timeout=timeout, headers=headers) as r:
            r.raise_for_status()
            # A server that ignores Range answers 200 with the full body;
            # appending that would duplicate data, so rewrite from scratch.
            if headers and r.status_code != 206:
                mode = "wb"
            with open(tmp, mode) as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        tmp.rename(dest_path)

    def _fresh_once():
        tmp = dest_path.with_suffix(dest_path.suffix + ".part")
        if tmp.exists():
            tmp.unlink()
        if dest_path.exists():
            dest_path.unlink()
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        tmp.rename(dest_path)

    attempts = 0
    while attempts < max_attempts:
        attempts += 1
        if attempts == 1:
            _resume_once()
        else:
            _fresh_once()

        if expected_size is None:
            return
        if dest_path.exists() and dest_path.stat().st_size == expected_size:
            return

    got = dest_path.stat().st_size if dest_path.exists() else 0
    raise gr.Error(f"Downloaded size mismatch for {dest_path.name}: got {got} bytes, expected {expected_size}.")

def list_files_recursive(root: pathlib.Path) -> List[str]:
    out = []
    for p in root.rglob("*"):
        if p.is_file():
            out.append(str(p))
    return out

def preview_path(path_str: str, max_bytes: int = 250_000) -> Tuple[str, Optional[str]]:
    if not path_str or not os.path.exists(path_str):
        return "File not found.", None
    p = pathlib.Path(path_str)
    suffix = p.suffix.lower()
    try:
        if suffix in [".csv", ".tsv", ".json", ".ndjson", ".txt", ".log", ".md", ".eml", ".html", ".htm", ".meta"]:
            with open(p, "rb") as fh:
                raw = fh.read(max_bytes)
            text = raw.decode("utf-8", errors="replace")
            return f"Previewing {p.name} (truncated):\n\n```\n{text}\n```", None
        else:
            st = p.stat()
            return (f"File: **{p.name}**\nSize: {human_bytes(st.st_size)}\nPath: `{p}`", None)
    except Exception as e:
        return f"Error previewing file: {type(e).__name__}: {e}", None

def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
    u = torrent_url.strip()
    if "/" not in u:
        return []
    base = u.rsplit("/", 1)[0]
    return [base]

def resolve_download_url(bases: List[str], root_name: str, rel_path: str) -> Optional[str]:
    candidates = []
    for b in bases:
        candidates.append(join_url(b, root_name, rel_path))
        candidates.append(join_url(b, rel_path))
    for c in candidates:
        ok, _ = _head_or_peek(c)
        if ok:
            return c
    return None

def test_7z_integrity(archive_path: str) -> bool:
    try:
        with py7zr.SevenZipFile(archive_path, mode="r") as z:
            z.test()
        return True
    except Exception:
        return False

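# safe_extract_7z first tries a whole-archive extraction (fast path, returns
# -1 on success). If that raises CrcError, it falls back to extracting members
# one by one so a single corrupt entry does not sink the rest of the archive.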
def safe_extract_7z(archive_path: str, dest_dir: str) -> Tuple[int, List[str]]:
    extracted_count = 0
    skipped: List[str] = []
    dest = pathlib.Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    try:
        with py7zr.SevenZipFile(archive_path, mode="r") as z:
            z.extract(path=str(dest))
        return -1, skipped
    except CrcError:
        pass

    with py7zr.SevenZipFile(archive_path, mode="r") as z:
        members = [info.filename for info in z.list() if not info.is_directory]

    for name in members:
        try:
            # py7zr reads the archive sequentially, so open a fresh handle for
            # each member rather than reusing one across extract() calls.
            with py7zr.SevenZipFile(archive_path, mode="r") as z:
                z.extract(targets=[name], path=str(dest))
            extracted_count += 1
        except CrcError:
            skipped.append(name)
        except Exception:
            skipped.append(name)

    return extracted_count, skipped

# =========================
# NEW: HTML/.meta → JSONL exporter
# =========================

def _parse_meta_file(path: pathlib.Path) -> Dict:
    """
    Try JSON parse; else parse simple 'key: value' lines; else return raw text.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    # try JSON
    try:
        obj = json.loads(raw)
        return {"type": "meta", "path": str(path), "content": obj}
    except Exception:
        pass
    # key: value lines
    data: Dict[str, str] = {}
    lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
    for ln in lines:
        if ":" in ln:
            k, v = ln.split(":", 1)
            data[k.strip()] = v.strip()
    if data:
        return {"type": "meta", "path": str(path), "content": data}
    # fallback raw
    return {"type": "meta", "path": str(path), "content_raw": raw}

def _parse_html_file(path: pathlib.Path) -> Dict:
    """
    Extract title, meta[name/content], and plain text.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    # Prefer lxml if present; fallback to built-in parser
    try:
        soup = BeautifulSoup(raw, "lxml")
    except Exception:
        soup = BeautifulSoup(raw, "html.parser")
    title = (soup.title.string.strip() if soup.title and soup.title.string else "")
    meta = {}
    for tag in soup.find_all("meta"):
        name = tag.get("name") or tag.get("property")
        content = tag.get("content")
        if name and content:
            meta[str(name)] = str(content)
    text = soup.get_text(separator="\n", strip=True)
    return {"type": "html", "path": str(path), "title": title, "meta": meta, "text": text}

def build_jsonl_from_extracted(ex_dir: str, out_dir: str, max_records: Optional[int] = None) -> Tuple[str, int, int]:
    """
    Walk extracted dir, convert all .html/.htm and .meta files to JSONL.
    Returns (output_path, html_count, meta_count).
    """
    ex_root = pathlib.Path(ex_dir)
    out_root = pathlib.Path(out_dir)
    out_root.mkdir(parents=True, exist_ok=True)
    out_path = out_root / "converted.jsonl"

    html_count = 0
    meta_count = 0
    written = 0

    with open(out_path, "w", encoding="utf-8") as fout:
        for p in ex_root.rglob("*"):
            if not p.is_file():
                continue
            suf = p.suffix.lower()
            try:
                if suf in (".html", ".htm"):
                    rec = _parse_html_file(p)
                    html_count += 1
                elif suf == ".meta":
                    rec = _parse_meta_file(p)
                    meta_count += 1
                else:
                    continue
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
                written += 1
                if max_records and written >= max_records:
                    break
            except Exception as e:
                # Skip unreadable files but carry on
                err = {"type": "error", "path": str(p), "error": f"{type(e).__name__}: {e}"}
                fout.write(json.dumps(err, ensure_ascii=False) + "\n")

    return str(out_path), html_count, meta_count
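
# Illustrative record shapes written to converted.jsonl (field values depend
# entirely on the input files):
#   {"type": "html", "path": "...", "title": "...", "meta": {...}, "text": "..."}
#   {"type": "meta", "path": "...", "content": {...}}
#   {"type": "error", "path": "...", "error": "<ExceptionName>: <message>"}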

# =========================
# Pipeline
# =========================

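# run_pipeline: fetch the .torrent, parse it, pick out the .7z entries,
# resolve each to an HTTP URL (web seeds first, then the .torrent's own
# directory), download with a size check, CRC-test, extract, and list files.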
def run_pipeline(torrent_url: str):
    if not torrent_url.strip().lower().endswith(".torrent"):
        raise gr.Error("Please provide a direct .torrent URL.")

    raw = fetch_bytes(torrent_url.strip())
    meta = parse_torrent(raw)

    seeds = list(meta["web_seeds"]) or infer_bases_from_torrent_url(torrent_url)

    infohash = meta["infohash"]
    root_name = meta["name"]

    sevenz_files = [f for f in meta["files"] if f["path"].lower().endswith(".7z")]
    if not sevenz_files:
        raise gr.Error("No .7z files listed in the torrent.")

    if not seeds:
        raise gr.Error("No HTTP source found to fetch files. "
                       "For DDoSecrets-style layouts, the .torrent should sit "
                       "in the same HTTPS directory as the data files.")

    base_dir = pathlib.Path("/mnt/data/work") / infohash
    dl_dir = base_dir / "downloads"
    ex_dir = base_dir / "extracted"
    ensure_dir(str(dl_dir))
    ensure_dir(str(ex_dir))

    logs = []
    saved_archives = []

    expected_map = {f["path"]: int(f.get("length", 0)) for f in meta["files"]}

    for f in sevenz_files:
        rel = f["path"]
        final_url = None
        for seed in seeds:
            final_url = resolve_download_url([seed], root_name, rel)
            if final_url:
                break
        if not final_url:
            raise gr.Error(f"Could not resolve an HTTP URL for {rel} from bases {seeds}.")

        dest = dl_dir / rel
        expected_size = expected_map.get(rel) or None

        logs.append(f"Downloading: {final_url}")
        download_file_exact(final_url, dest, expected_size)
        if not dest.exists():
            raise gr.Error(f"Download failed: {final_url}")
        logs.append(f"Saved: {dest} ({human_bytes(dest.stat().st_size)})")

        if not test_7z_integrity(str(dest)):
            logs.append(f"CRC test failed for {dest.name}, retrying download fresh…")
            download_file_exact(final_url, dest, expected_size, max_attempts=2)
            if not test_7z_integrity(str(dest)):
                logs.append(f"Archive still reports CRC problems: {dest.name}. Will try per-file extraction and skip corrupt members.")
        saved_archives.append(str(dest))

    for apath in saved_archives:
        logs.append(f"Extracting: {apath}")
        count, skipped = safe_extract_7z(apath, str(ex_dir))
        if count == -1:
            logs.append(f"Extracted OK → {ex_dir}")
        else:
            logs.append(f"Extracted {count} members to {ex_dir}")
            if skipped:
                logs.append(f"Skipped {len(skipped)} corrupted member(s):")
                show = skipped[:10]
                logs += [f"  - {s}" for s in show]
                if len(skipped) > 10:
                    logs.append(f"  … and {len(skipped) - 10} more")

    extracted = list_files_recursive(ex_dir)
    if not extracted:
        logs.append("No files extracted (archive may be empty).")
    else:
        logs.append(f"Extracted files: {len(extracted)}")

    log_md = "### Run log\n" + "\n".join(f"- {l}" for l in logs)
    # Return the extracted and base dirs so the JSONL export can use them later.
    return log_md, extracted, (extracted[0] if extracted else ""), str(ex_dir), str(base_dir)

def do_preview(path: str):
    md, _ = preview_path(path)
    return md

# NEW: hook to build JSONL and return a downloadable file
def do_build_jsonl(ex_dir: str, base_dir: str):
    if not ex_dir or not os.path.isdir(ex_dir):
        raise gr.Error("Extraction folder not found. Run the download/extract step first.")
    out_dir = str(pathlib.Path(base_dir) / "exports")
    out_path, html_count, meta_count = build_jsonl_from_extracted(ex_dir, out_dir)
    summary = f"Built JSONL at: `{out_path}`\n- HTML files: {html_count}\n- META files: {meta_count}\n"
    return summary, out_path

# =========================
# UI
# =========================

with gr.Blocks(title="Torrent → 7z → View (HTTP only)") as demo:
    gr.Markdown(
        """
# Torrent → 7z → View (HTTP only)
Paste a **.torrent URL** (with web seeds or DDoSecrets-style layout).
The app downloads `.7z` file(s), verifies size & CRC, extracts them, lets you preview text/csv/json, **and exports all `.html` + `.meta` to a single JSONL**.
        """
    )
    url_in = gr.Textbox(label=".torrent URL", placeholder="https://data.ddosecrets.com/Collection/Collection.torrent")
    go_btn = gr.Button("Download, Extract & List")
    log_out = gr.Markdown()
    files_dd = gr.Dropdown(label="Extracted files", choices=[], interactive=True)
    preview_btn = gr.Button("Preview selected")
    preview_md = gr.Markdown()

    # NEW: export controls
    gr.Markdown("### Export `.html` and `.meta` → combined JSONL")
    build_btn = gr.Button("Build JSONL from extracted")
    build_log = gr.Markdown()
    dl_file = gr.File(label="Download combined JSONL", interactive=False)

    # internal state: extracted dir & base dir for exports
    ex_dir_state = gr.State()
    base_dir_state = gr.State()

    def _go(url):
        log, files, first, ex_dir, base_dir = run_pipeline(url)
        # The dropdown gets both its choices and its selected value in one
        # gr.update, so the component appears only once in the outputs list.
        return (
            log,
            gr.update(choices=files, value=(first if first else None)),
            ex_dir,
            base_dir,
        )

    go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, ex_dir_state, base_dir_state])
    preview_btn.click(fn=do_preview, inputs=[files_dd], outputs=[preview_md])

    build_btn.click(fn=do_build_jsonl, inputs=[ex_dir_state, base_dir_state], outputs=[build_log, dl_file])

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
        allowed_paths=["/mnt/data"]  # allow returning files from /mnt/data
    )