jeduardogruiz
/

Mixtral_ether

Reinforcement Learning

Model card Files Files and versions

jeduardogruiz committed on May 9, 2024

Commit

637a7d6

·

verified ·

1 Parent(s): e4c51fe

Upload 2 files

Files changed (2) hide show

benchmark.py +40 -0
redact.py +67 -0

benchmark.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import base64
+import functools
+import gzip
+import json
+import os
+import random
+import time
+from typing import Any, cast
+import blobfile.boto3
+import numpy as np
+import tiktoken
+def benchmark_batch(documents: list[str]) -> None:
+    """Time tiktoken's GPT-2 batch encoding of ``documents`` and print the throughput.
+
+    The thread count is read from the ``RAYON_NUM_THREADS`` environment
+    variable, which must hold an integer; a missing variable raises
+    ``KeyError``.
+
+    Args:
+        documents: Texts to encode as a single batch.
+    """
+    thread_count = int(os.environ["RAYON_NUM_THREADS"])
+    # Throughput is reported against the encoded (default UTF-8) size of the input.
+    total_bytes = sum(len(doc.encode()) for doc in documents)
+    print(f"num_threads: {thread_count}, num_bytes: {total_bytes}")
+
+    encoder = tiktoken.get_encoding("gpt2")
+    # Untimed warm-up call before the measured run.
+    encoder.encode("warmup")
+
+    t0 = time.perf_counter_ns()
+    encoder.encode_ordinary_batch(documents, num_threads=thread_count)
+    elapsed_ns = time.perf_counter_ns() - t0
+    print(f"tiktoken \t{total_bytes / elapsed_ns * 1e9} bytes / s")
+import transformers
+
+# Module-level Hugging Face benchmark: runs at import time and times a single
+# GPT-2 fast-tokenizer call on a fixed sample string.
+hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
+hf_enc.model_max_length = 1e30  # silence! (presumably the max-length warning - confirm)
+hf_enc.pad_token = hf_enc.eos_token
+hf_enc.encode("warmup")
+
+_hf_sample = "python github2file.py https://github.com/huggingface/transformers"
+# Use the sample's real encoded size; the previous hard-coded 8 did not match
+# the input, so the reported bytes/s figure was wrong.
+_hf_num_bytes = len(_hf_sample.encode())
+start = time.perf_counter_ns()
+hf_enc(_hf_sample)
+end = time.perf_counter_ns()
+print(f"huggingface \t{_hf_num_bytes / (end - start) * 1e9} bytes / s")

redact.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import argparse
+import re
+import subprocess
+from pathlib import Path
+def redact_file(path: Path, dry_run: bool) -> None:
+    """Delete a file, or strip its redaction-marked sections, and report the outcome.
+
+    A file whose first line contains ``redact`` is deleted outright. Otherwise
+    the text between ``redact-beg`` / ``redact-end`` marker lines (``#`` or
+    ``<!--- -->`` comment flavours) is removed together with the markers.
+    Missing paths, directories and empty files are ignored silently; files
+    that cannot be decoded as text are reported as skipped.
+
+    Args:
+        path: File to inspect.
+        dry_run: When true, only print what would happen; nothing is deleted
+            or rewritten.
+    """
+    if not path.exists() or path.is_dir():
+        return
+    try:
+        text = path.read_text()
+    except UnicodeDecodeError:
+        # Non-text files cannot carry text markers; skip them instead of
+        # aborting the whole run.
+        print(f"Skipped {path}")
+        return
+    if not text:
+        return
+    first_line = text.splitlines()[0]
+    if "redact" in first_line:
+        if not dry_run:
+            path.unlink()
+        print(f"Deleted {path}")
+        return
+    # Each marker may be preceded by spaces; the trailing newline is part of
+    # the match, so the whole marker line is consumed.
+    pattern = "|".join(
+        r" *" + re.escape(x)
+        for x in [
+            "# ===== redact-beg =====\n",
+            "# ===== redact-end =====\n",
+            "<!--- redact-beg -->\n",
+            "<!--- redact-end -->\n",
+        ]
+    )
+    if re.search(pattern, text):
+        # Splitting on the markers alternates kept / removed segments (assuming
+        # markers come in beg/end pairs), so only even-indexed pieces are kept.
+        redacted_text = "".join(re.split(pattern, text)[::2])
+        if not dry_run:
+            path.write_text(redacted_text)
+        print(f"Redacted {path}")
+        return
+    print(f"Skipped {path}")
+def redact(dry_run: bool) -> None:
+    """Apply ``redact_file`` to every file under the enclosing ``tiktoken`` directory.
+
+    The root is taken as two levels above this file and must be a directory
+    named ``tiktoken`` containing ``pyproject.toml``. Files are listed with
+    ``git ls-files``; if git fails or is not installed, the whole tree is
+    walked instead.
+
+    Args:
+        dry_run: Forwarded to ``redact_file``; when true nothing is modified.
+
+    Raises:
+        AssertionError: If the computed root does not look like the tiktoken
+            project directory.
+    """
+    tiktoken_root = Path(__file__).parent.parent
+    assert tiktoken_root.name == "tiktoken"
+    assert (tiktoken_root / "pyproject.toml").exists()
+    try:
+        output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        # git exited with an error or is not installed: walk the tree instead.
+        paths = list(tiktoken_root.glob("**/*"))
+    else:
+        # git prints paths relative to ``cwd``; anchor them at the root so they
+        # resolve correctly whatever the process's working directory is.
+        paths = [tiktoken_root / p for p in output.splitlines()]
+    for path in paths:
+        redact_file(path, dry_run=dry_run)
+def main() -> None:
+    """Parse ``--dry-run`` from the command line and run the redaction.
+
+    Dry-run is on by default and stays on unless the supplied value starts
+    with ``f`` or ``F`` (for example ``--dry-run=false``).
+    """
+
+    def parse_dry_run(raw: str) -> bool:
+        # An empty value, or anything not starting with "f"/"F", means dry-run.
+        return not raw or raw[0].lower() != "f"
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--dry-run", type=parse_dry_run, default=True)
+    options = cli.parse_args()
+    redact(options.dry_run)
+    if options.dry_run:
+        print("Dry run, use --dry-run=false to actually redact files")
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()