Update app.py
app.py CHANGED
@@ -8,7 +8,10 @@ import requests
 import gradio as gr
 import bencodepy
 import py7zr
-from py7zr.exceptions import CrcError
+from py7zr.exceptions import CrcError
+
+# NEW: HTML parsing
+from bs4 import BeautifulSoup
 
 # =========================
 # Helpers
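The new `from bs4 import BeautifulSoup` line implies the Space also needs `beautifulsoup4` (and optionally `lxml`, which `_parse_html_file` below tries first) in its `requirements.txt`; that file is not part of this commit, so the package names here are an assumption, not something shown in the diff:

```python
# Dependency sanity check (assumed packages; requirements.txt is not in this commit):
import bs4  # installed via the beautifulsoup4 package
try:
    import lxml  # optional: BeautifulSoup(raw, "lxml") prefers it when available
except ImportError:
    lxml = None  # the code falls back to the stdlib "html.parser"
```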
@@ -31,15 +34,6 @@ def fetch_bytes(url: str, timeout: int = 45) -> bytes:
     return r.content
 
 def parse_torrent(raw: bytes) -> Dict:
-    """
-    Return:
-    {
-        "infohash": str,
-        "name": str,
-        "files": [{"path": str, "length": int}, ...],
-        "web_seeds": [str, ...]
-    }
-    """
     data = bencodepy.decode(raw)
     if not isinstance(data, dict) or b"info" not in data:
         raise ValueError("Invalid .torrent (missing 'info').")
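The deleted docstring was the only statement of `parse_torrent`'s return shape (`infohash`, `name`, `files`, `web_seeds`). The hunk elides the body that fills it in; for BitTorrent v1 the infohash is the SHA-1 of the re-bencoded `info` dictionary. A minimal sketch of that step, assuming `bencodepy` round-trips the dict (`compute_infohash` is an illustrative name, not a function in app.py):

```python
import hashlib
import bencodepy

def compute_infohash(raw: bytes) -> str:
    # BitTorrent v1 infohash: SHA-1 over the bencoded b"info" dictionary
    data = bencodepy.decode(raw)
    return hashlib.sha1(bencodepy.encode(data[b"info"])).hexdigest()
```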
@@ -89,7 +83,6 @@ def join_url(base: str, *segs: str) -> str:
     return "/".join(parts)
 
 def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
-    # HEAD
     try:
         r = requests.head(url, timeout=timeout, allow_redirects=True)
         if r.status_code < 400:
@@ -97,7 +90,6 @@ def _head_or_peek(url: str, timeout: int = 20) -> Tuple[bool, Optional[int]]:
             return True, (int(size) if size and size.isdigit() else None)
     except Exception:
         pass
-    # Tiny GET
     try:
         r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
         if r.status_code < 400:
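The two deleted comments (`# HEAD`, `# Tiny GET`) labeled the probe order that is still visible in the code: try HEAD first, then fall back to a streamed GET whose body is never consumed. The trick the tiny GET relies on, as a standalone sketch (`url` is a placeholder):

```python
import requests

url = "https://example.com/file.7z"  # placeholder
r = requests.get(url, stream=True, timeout=20, allow_redirects=True)
size = r.headers.get("Content-Length")  # headers arrive before any payload is read
r.close()  # closing the response abandons the transfer
```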
@@ -137,10 +129,6 @@ def supports_range_and_size(url: str, timeout: int = 30) -> Tuple[bool, Optional
 
 def download_file_exact(url: str, dest_path: pathlib.Path, expected_size: Optional[int],
                         timeout: int = 120, max_attempts: int = 2):
-    """
-    Download to dest_path. If expected_size is known, enforce it.
-    Attempt resume first; if mismatch, retry once with full GET.
-    """
     dest_path.parent.mkdir(parents=True, exist_ok=True)
 
     def _resume_once():
@@ -180,7 +168,7 @@ def download_file_exact(url: str, dest_path: pathlib.Path, expected_size: Option
         _fresh_once()
 
     if expected_size is None:
-        return
+        return
     if dest_path.exists() and dest_path.stat().st_size == expected_size:
         return
 
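`_resume_once` and `_fresh_once` fall outside the hunks, but the deleted docstring summarized the strategy: resume first, then one full retry on a size mismatch. A resume pass with `requests` typically looks like the sketch below; this is the general pattern, not app.py's exact code (`resume_download` is an illustrative name):

```python
import pathlib
import requests

def resume_download(url: str, dest: pathlib.Path, timeout: int = 120) -> None:
    # Request only the remaining byte range and append to the partial file.
    pos = dest.stat().st_size if dest.exists() else 0
    headers = {"Range": f"bytes={pos}-"} if pos else {}
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        # 206 means the server honored the range; otherwise start over from byte 0
        mode = "ab" if pos and r.status_code == 206 else "wb"
        with open(dest, mode) as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
```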
@@ -200,7 +188,7 @@ def preview_path(path_str: str, max_bytes: int = 250_000) -> Tuple[str, Optional
     p = pathlib.Path(path_str)
     suffix = p.suffix.lower()
     try:
-        if suffix in [".csv", ".tsv", ".json", ".ndjson", ".txt", ".log", ".md", ".eml"]:
+        if suffix in [".csv", ".tsv", ".json", ".ndjson", ".txt", ".log", ".md", ".eml", ".html", ".htm", ".meta"]:
             raw = open(p, "rb").read(max_bytes)
             text = raw.decode("utf-8", errors="replace")
             return f"Previewing {p.name} (truncated):\n\n```\n{text}\n```", None
@@ -211,12 +199,6 @@ def preview_path(path_str: str, max_bytes: int = 250_000) -> Tuple[str, Optional
         return f"Error previewing file: {type(e).__name__}: {e}", None
 
 def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
-    """
-    For URLs like:
-        https://data.ddosecrets.com/Collection/Collection.torrent
-    return:
-        ["https://data.ddosecrets.com/Collection"]
-    """
     u = torrent_url.strip()
     if "/" not in u:
         return []
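The deleted docstring carried the only worked example for this helper; the surviving body suggests it keeps everything before the last `/`. The mapping it described:

```python
# Behavior as described by the removed docstring:
url = "https://data.ddosecrets.com/Collection/Collection.torrent"
base = url.rsplit("/", 1)[0]  # one plausible implementation of the elided line
assert base == "https://data.ddosecrets.com/Collection"
```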
@@ -224,12 +206,6 @@ def infer_bases_from_torrent_url(torrent_url: str) -> List[str]:
     return [base]
 
 def resolve_download_url(bases: List[str], root_name: str, rel_path: str) -> Optional[str]:
-    """
-    Try both:
-        base/root_name/rel_path
-        base/rel_path
-    Return the first that responds.
-    """
     candidates = []
     for b in bases:
         candidates.append(join_url(b, root_name, rel_path))
@@ -243,37 +219,28 @@ def resolve_download_url(bases: List[str], root_name: str, rel_path: str) -> Opt
 def test_7z_integrity(archive_path: str) -> bool:
     try:
         with py7zr.SevenZipFile(archive_path, mode="r") as z:
-            z.test()
+            z.test()
         return True
     except Exception:
         return False
 
 def safe_extract_7z(archive_path: str, dest_dir: str) -> Tuple[int, List[str]]:
-    """
-    Extract an archive. If a CRC error occurs, fall back to per-member extraction,
-    skipping only the bad members. Returns (#extracted, skipped_list).
-    """
     extracted_count = 0
     skipped: List[str] = []
     dest = pathlib.Path(dest_dir)
     dest.mkdir(parents=True, exist_ok=True)
 
-    # First try normal extraction (fast path).
     try:
         with py7zr.SevenZipFile(archive_path, mode="r") as z:
             z.extract(path=str(dest))
-        # We don't know exact count from here; return -1 to mean "unknown but success"
         return -1, skipped
     except CrcError:
-        # Fall back to per-member extraction, skipping corrupted ones.
         pass
 
-    # Per-member pass
     with py7zr.SevenZipFile(archive_path, mode="r") as z:
         members = [info.filename for info in z.list() if not info.is_directory]
         for name in members:
             try:
-                # Extract only this member; py7zr streams it to disk
                 z.extract(targets=[name], path=str(dest))
                 extracted_count += 1
             except CrcError:
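With the inline comments gone, the `-1` sentinel in `safe_extract_7z` is easy to miss: it means the whole-archive fast path succeeded and no per-member count was taken. A usage sketch (the archive path is a placeholder):

```python
count, skipped = safe_extract_7z("/mnt/data/work/example/downloads/part1.7z", "/tmp/out")
if count == -1:
    print("clean archive: extracted in one pass (member count unknown)")
else:
    print(f"CRC fallback: {count} members extracted, {len(skipped)} skipped")
```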
@@ -283,6 +250,92 @@ def safe_extract_7z(archive_path: str, dest_dir: str) -> Tuple[int, List[str]]:
 
     return extracted_count, skipped
 
+# =========================
+# NEW: HTML/.meta → JSONL exporter
+# =========================
+
+def _parse_meta_file(path: pathlib.Path) -> Dict:
+    """
+    Try JSON parse; else parse simple 'key: value' lines; else return raw text.
+    """
+    raw = path.read_text(encoding="utf-8", errors="replace")
+    # try JSON
+    try:
+        obj = json.loads(raw)
+        return {"type": "meta", "path": str(path), "content": obj}
+    except Exception:
+        pass
+    # key: value lines
+    data: Dict[str, str] = {}
+    lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
+    for ln in lines:
+        if ":" in ln:
+            k, v = ln.split(":", 1)
+            data[k.strip()] = v.strip()
+    if data:
+        return {"type": "meta", "path": str(path), "content": data}
+    # fallback raw
+    return {"type": "meta", "path": str(path), "content_raw": raw}
+
+def _parse_html_file(path: pathlib.Path) -> Dict:
+    """
+    Extract title, meta[name/content], and plain text.
+    """
+    raw = path.read_text(encoding="utf-8", errors="replace")
+    # Prefer lxml if present; fallback to built-in parser
+    try:
+        soup = BeautifulSoup(raw, "lxml")
+    except Exception:
+        soup = BeautifulSoup(raw, "html.parser")
+    title = (soup.title.string.strip() if soup.title and soup.title.string else "")
+    meta = {}
+    for tag in soup.find_all("meta"):
+        name = tag.get("name") or tag.get("property")
+        content = tag.get("content")
+        if name and content:
+            meta[str(name)] = str(content)
+    text = soup.get_text(separator="\n", strip=True)
+    return {"type": "html", "path": str(path), "title": title, "meta": meta, "text": text}
+
+def build_jsonl_from_extracted(ex_dir: str, out_dir: str, max_records: Optional[int] = None) -> Tuple[str, int, int]:
+    """
+    Walk extracted dir, convert all .html/.htm and .meta files to JSONL.
+    Returns (output_path, html_count, meta_count).
+    """
+    ex_root = pathlib.Path(ex_dir)
+    out_root = pathlib.Path(out_dir)
+    out_root.mkdir(parents=True, exist_ok=True)
+    out_path = out_root / "converted.jsonl"
+
+    html_count = 0
+    meta_count = 0
+    written = 0
+
+    with open(out_path, "w", encoding="utf-8") as fout:
+        for p in ex_root.rglob("*"):
+            if not p.is_file():
+                continue
+            suf = p.suffix.lower()
+            try:
+                if suf in (".html", ".htm"):
+                    rec = _parse_html_file(p)
+                    html_count += 1
+                elif suf == ".meta":
+                    rec = _parse_meta_file(p)
+                    meta_count += 1
+                else:
+                    continue
+                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
+                written += 1
+                if max_records and written >= max_records:
+                    break
+            except Exception as e:
+                # Skip unreadable files but carry on
+                err = {"type": "error", "path": str(p), "error": f"{type(e).__name__}: {e}"}
+                fout.write(json.dumps(err, ensure_ascii=False) + "\n")
+
+    return str(out_path), html_count, meta_count
+
 # =========================
 # Pipeline
 # =========================
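Note that the new functions call `json.loads` and `json.dumps`, so `import json` must already sit in the unshown lines 1-7 of app.py. A usage sketch for the exporter, with placeholder paths; the record shapes follow directly from `_parse_html_file` and `_parse_meta_file`:

```python
import json

out_path, n_html, n_meta = build_jsonl_from_extracted(
    "/mnt/data/work/example/extracted",  # placeholder
    "/mnt/data/work/example/exports",    # placeholder
)
with open(out_path, encoding="utf-8") as f:
    rec = json.loads(f.readline())
# HTML pages:  {"type": "html", "path": ..., "title": ..., "meta": {...}, "text": ...}
# .meta files: {"type": "meta", "path": ..., "content": ...} or {..., "content_raw": ...}
# unreadable inputs are logged inline as {"type": "error", "path": ..., "error": ...}
```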
@@ -291,17 +344,14 @@ def run_pipeline(torrent_url: str):
     if not torrent_url.strip().lower().endswith(".torrent"):
         raise gr.Error("Please provide a direct .torrent URL.")
 
-    # Parse torrent metadata
     raw = fetch_bytes(torrent_url.strip())
     meta = parse_torrent(raw)
 
-    # Seeds: prefer BEP-19 web seeds, else infer from torrent URL folder (DDoSecrets-friendly)
     seeds = list(meta["web_seeds"]) or infer_bases_from_torrent_url(torrent_url)
 
     infohash = meta["infohash"]
     root_name = meta["name"]
 
-    # Expect .7z payloads
     sevenz_files = [f for f in meta["files"] if f["path"].lower().endswith(".7z")]
     if not sevenz_files:
         raise gr.Error("No .7z files listed in the torrent.")
@@ -310,7 +360,6 @@
         raise gr.Error("No HTTP source found to fetch files. "
                        "If this is DDoSecrets, ensure the .torrent sits with the files over HTTPS.")
 
-    # Work dirs
     base_dir = pathlib.Path("/mnt/data/work") / infohash
     dl_dir = base_dir / "downloads"
     ex_dir = base_dir / "extracted"
@@ -320,10 +369,8 @@
     logs = []
     saved_archives = []
 
-    # Expected sizes from torrent metadata
     expected_map = {f["path"]: int(f.get("length", 0)) for f in meta["files"]}
 
-    # Download each .7z over HTTP with verification and retry
     for f in sevenz_files:
         rel = f["path"]
         final_url = None
@@ -343,7 +390,6 @@
             raise gr.Error(f"Download failed: {final_url}")
         logs.append(f"Saved: {dest} ({human_bytes(dest.stat().st_size)})")
 
-        # Integrity test; if fails, re-fetch once fresh (handled inside download_file_exact via attempts)
         if not test_7z_integrity(str(dest)):
             logs.append(f"CRC test failed for {dest.name}, retrying download fresh…")
             download_file_exact(final_url, dest, expected_size, max_attempts=2)
@@ -351,7 +397,6 @@
                 logs.append(f"Archive still reports CRC problems: {dest.name}. Will try per-file extraction and skip corrupt members.")
         saved_archives.append(str(dest))
 
-    # Extract all .7z archives (with resilient per-member fallback)
     for apath in saved_archives:
         logs.append(f"Extracting: {apath}")
         count, skipped = safe_extract_7z(apath, str(ex_dir))
@@ -361,13 +406,11 @@
         logs.append(f"Extracted {count} members to {ex_dir}")
         if skipped:
             logs.append(f"Skipped {len(skipped)} corrupted member(s):")
-            # show up to a few to keep log readable
             show = skipped[:10]
             logs += [f" - {s}" for s in show]
             if len(skipped) > 10:
                 logs.append(f" … and {len(skipped) - 10} more")
 
-    # List extracted files
    extracted = list_files_recursive(ex_dir)
    if not extracted:
        logs.append("No files extracted (archive may be empty).")
@@ -375,12 +418,22 @@ def preview_path(path_str: str, max_bytes: int = 250_000) -> Tuple[str, Optional
     logs.append(f"Extracted files: {len(extracted)}")
 
     log_md = "### Run log\n" + "\n".join(f"- {l}" for l in logs)
-    return log_md, extracted, (extracted[0] if extracted else "")
+    # RETURN the extracted dir so we can build JSON later
+    return log_md, extracted, (extracted[0] if extracted else ""), str(ex_dir), str(base_dir)
 
 def do_preview(path: str):
     md, _ = preview_path(path)
     return md
 
+# NEW: hook to build JSONL and return a downloadable file
+def do_build_jsonl(ex_dir: str, base_dir: str):
+    if not ex_dir or not os.path.isdir(ex_dir):
+        raise gr.Error("Extraction folder not found. Run the download/extract step first.")
+    out_dir = str(pathlib.Path(base_dir) / "exports")
+    out_path, html_count, meta_count = build_jsonl_from_extracted(ex_dir, out_dir)
+    summary = f"Built JSONL at: `{out_path}`\n- HTML files: {html_count}\n- META files: {meta_count}\n"
+    return summary, out_path
+
 # =========================
 # UI
 # =========================
@@ -390,7 +443,7 @@ with gr.Blocks(title="Torrent → 7z → View (HTTP only)") as demo:
     """
 # Torrent → 7z → View (HTTP only)
 Paste a **.torrent URL** (with web seeds or DDoSecrets-style layout).
-The app downloads `.7z` file(s), verifies size & CRC, extracts them,
+The app downloads `.7z` file(s), verifies size & CRC, extracts them, lets you preview text/csv/json, **and exports all `.html` + `.meta` to a single JSONL**.
     """
     )
     url_in = gr.Textbox(label=".torrent URL", placeholder="https://data.ddosecrets.com/Collection/Collection.torrent")
@@ -400,20 +453,34 @@ The app downloads `.7z` file(s), verifies size & CRC, extracts them, and lets yo
     preview_btn = gr.Button("Preview selected")
     preview_md = gr.Markdown()
 
+    # NEW: export controls
+    gr.Markdown("### Export `.html` and `.meta` → combined JSONL")
+    build_btn = gr.Button("Build JSONL from extracted")
+    build_log = gr.Markdown()
+    dl_file = gr.File(label="Download combined JSONL", interactive=False)
+
+    # internal state: extracted dir & base dir for exports
+    ex_dir_state = gr.State()
+    base_dir_state = gr.State()
+
     def _go(url):
-        log, files, first = run_pipeline(url)
+        log, files, first, ex_dir, base_dir = run_pipeline(url)
         return (
             log,
             gr.update(choices=files, value=(first if first else None)),
-            (first if first else "")
+            (first if first else ""),
+            ex_dir,
+            base_dir
         )
 
-    go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, files_dd])
+    go_btn.click(fn=_go, inputs=[url_in], outputs=[log_out, files_dd, files_dd, ex_dir_state, base_dir_state])
     preview_btn.click(fn=do_preview, inputs=[files_dd], outputs=[preview_md])
 
+    build_btn.click(fn=do_build_jsonl, inputs=[ex_dir_state, base_dir_state], outputs=[build_log, dl_file])
+
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=int(os.environ.get("PORT", 7860)),
-        allowed_paths=["/mnt/data"]  # allow returning files from /mnt/data
+        allowed_paths=["/mnt/data"]  # allow returning files from /mnt/data
     )
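One wiring detail survives unchanged and is worth calling out: `files_dd` appears twice in `outputs`, once receiving the `gr.update(choices=...)` and once the plain value, and the two new `gr.State` holders ride along the same way. A minimal sketch of that pattern, assuming the installed Gradio accepts a component listed twice in `outputs` (app.py itself relies on this):

```python
import gradio as gr

with gr.Blocks() as demo:
    dd = gr.Dropdown(label="files")
    st = gr.State()
    btn = gr.Button("go")

    def fill():
        files = ["a.txt", "b.txt"]
        # first output repopulates the choices, second sets the value, third stows state
        return gr.update(choices=files, value=files[0]), files[0], "/some/dir"

    btn.click(fn=fill, inputs=[], outputs=[dd, dd, st])
```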