Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
77f7fc5
1
Parent(s): 290123d
added finephrase section and moved progress monitoring section there from infra
Browse files- app/scripts/extract_finephrase_samples.py +110 -0
- app/src/content/assets/data/finephrase-samples.jsonl +3 -0
- app/src/content/assets/image/finephrase-progress.png +3 -0
- app/src/content/chapters/5-infrastructure.mdx +0 -35
- app/src/content/chapters/6-finephrase.mdx +169 -1
- app/src/content/embeds/finephrase-explorer.html +443 -0
app/scripts/extract_finephrase_samples.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Extract aligned sample rows from the FinePhrase dataset for the blog data explorer widget.
|
| 3 |
+
|
| 4 |
+
Streams through HuggingFaceFW/finephrase and collects aligned samples where the
|
| 5 |
+
same source document has outputs for all four prompt configs (faq, math, table, tutorial).
|
| 6 |
+
Stops once 1000 aligned samples are found.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python app/scripts/extract_finephrase_samples.py
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import logging
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
from datasets import load_dataset
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
| 20 |
+
|
| 21 |
+
REPO_ID = "HuggingFaceFW/finephrase"
|
| 22 |
+
PROMPTS = ["faq", "math", "table", "tutorial"]
|
| 23 |
+
TARGET_SAMPLES = 1000
|
| 24 |
+
OUTPUT_PATH = Path(__file__).parent.parent / "public" / "data" / "finephrase-samples.jsonl"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def extract_samples() -> list[dict]:
    """Stream all prompts and collect aligned samples until we reach TARGET_SAMPLES."""
    # Rows keyed by document id, one bucket per prompt config.
    rows_by_prompt: dict[str, dict[str, dict]] = {name: {} for name in PROMPTS}
    # Document ids present in every prompt bucket, in discovery order.
    aligned_ids: list[str] = []
    aligned_seen: set[str] = set()

    # Open one streaming iterator per prompt config.
    streams = {}
    for name in PROMPTS:
        logger.info(f"Opening stream for {name}...")
        dataset = load_dataset(REPO_ID, name=name, split="train", streaming=True)
        streams[name] = iter(dataset)

    batch_size = 500
    while len(aligned_ids) < TARGET_SAMPLES:
        # Pull a batch from each prompt stream in turn.
        made_progress = False
        for name in PROMPTS:
            stream = streams[name]
            pulled = 0
            for _ in range(batch_size):
                try:
                    row = next(stream)
                except StopIteration:
                    break
                pulled += 1
                doc_id = row["id"]
                rows_by_prompt[name][doc_id] = row

                # Promote the doc id once every prompt has produced it.
                if doc_id not in aligned_seen and all(
                    doc_id in rows_by_prompt[p] for p in PROMPTS
                ):
                    aligned_seen.add(doc_id)
                    aligned_ids.append(doc_id)
                    if len(aligned_ids) >= TARGET_SAMPLES:
                        break
            if pulled > 0:
                made_progress = True
                logger.info(
                    f" {name}: {len(rows_by_prompt[name])} rows loaded, "
                    f"{len(aligned_ids)} aligned so far"
                )
            if len(aligned_ids) >= TARGET_SAMPLES:
                break

        if not made_progress:
            logger.warning("All streams exhausted before reaching target")
            break

    logger.info(f"Found {len(aligned_ids)} aligned documents")

    samples: list[dict] = []
    for doc_id in aligned_ids[:TARGET_SAMPLES]:
        # Shared document metadata is taken from the first prompt's row.
        reference = rows_by_prompt[PROMPTS[0]][doc_id]

        entry: dict = {
            "id": doc_id,
            "url": reference.get("url", ""),
            "file_path": reference.get("file_path", ""),
            "source": str(reference.get("text", "")),
        }
        for name in PROMPTS:
            rollout = rows_by_prompt[name][doc_id].get("rollout_results", [])
            entry[name] = str(rollout[0]["text"]) if rollout else ""
        samples.append(entry)

    logger.info(f"Built {len(samples)} samples")
    return samples
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def main() -> None:
    """Run the extraction and persist the samples as JSONL at OUTPUT_PATH."""
    samples = extract_samples()
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, "w") as handle:
        handle.writelines(
            json.dumps(sample, ensure_ascii=False) + "\n" for sample in samples
        )
    logger.info(f"Saved {len(samples)} samples to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|
app/src/content/assets/data/finephrase-samples.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20878e20e24b16853b12ed5d5c6232183eaa181c4030fb118a887f6f8ee36039
|
| 3 |
+
size 11165261
|
app/src/content/assets/image/finephrase-progress.png
ADDED
|
Git LFS Details
|
app/src/content/chapters/5-infrastructure.mdx
CHANGED
|
@@ -1,10 +1,8 @@
|
|
| 1 |
-
import Image from "../../components/Image.astro";
|
| 2 |
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
| 3 |
import Sidenote from "../../components/Sidenote.astro";
|
| 4 |
import FigRef from "../../components/FigRef.astro";
|
| 5 |
import Accordion from "../../components/Accordion.astro";
|
| 6 |
import Wide from "../../components/Wide.astro";
|
| 7 |
-
import datasetCardImg from "../assets/image/auto-dataset-card.png";
|
| 8 |
|
| 9 |
## Infrastructure
|
| 10 |
|
|
@@ -189,39 +187,6 @@ Need multiple samples per document? Set `rollouts_per_document` in your `Inferen
|
|
| 189 |
|
| 190 |
**Use case: Best-of-N sampling for code generation.** When generating code solutions, you want multiple attempts per problem to increase the chance of a correct answer. Set `rollouts_per_document=10` and later filter for solutions that pass your test suite.
|
| 191 |
|
| 192 |
-
### Automatic HF Upload and Progress Monitoring
|
| 193 |
-
|
| 194 |
-
We want you to be able to just press a button, let the GPUs go brrrr, and check back in to the finished dataset. DataTrove continuously uploads data to your specified Hugging Face dataset repo whenever a chunk is finished. At the end, the `InferenceDatasetCardGenerator` pipeline step checks the logs directory, collects information about the throughput, and uploads a dataset card to document your new synthetic dataset. <FigRef target="auto-dataset-card" /> shows an example of the auto-generated dataset card.
|
| 195 |
-
|
| 196 |
-
<figure id="auto-dataset-card">
|
| 197 |
-
<Image src={datasetCardImg} alt="Auto-generated dataset card on the Hugging Face Hub" />
|
| 198 |
-
<figcaption>Example of an auto-generated dataset card with throughput metrics, uploaded to the Hugging Face Hub after inference completes.</figcaption>
|
| 199 |
-
</figure>
|
| 200 |
-
|
| 201 |
-
For long-running inference jobs, you can use `InferenceProgressMonitor` to periodically update a HuggingFace dataset card with a progress bar and ETA. After inference completes, `InferenceDatasetCardGenerator` creates a final dataset card with statistics.
|
| 202 |
-
|
| 203 |
-
```python
|
| 204 |
-
params = InferenceDatasetCardParams(
|
| 205 |
-
output_repo_id="your-username/output-dataset",
|
| 206 |
-
input_dataset_name="HuggingFaceFW/fineweb-edu",
|
| 207 |
-
input_dataset_split="train",
|
| 208 |
-
model_name="HuggingFaceTB/SmolLM3-3B",
|
| 209 |
-
# ... other params
|
| 210 |
-
)
|
| 211 |
-
|
| 212 |
-
# Runs in parallel with inference on Slurm
|
| 213 |
-
monitor_pipeline = [
|
| 214 |
-
InferenceProgressMonitor(
|
| 215 |
-
params=params, update_interval=3600
|
| 216 |
-
)
|
| 217 |
-
]
|
| 218 |
-
|
| 219 |
-
# Runs after inference completes
|
| 220 |
-
datacard_pipeline = [
|
| 221 |
-
InferenceDatasetCardGenerator(params=params)
|
| 222 |
-
]
|
| 223 |
-
```
|
| 224 |
-
|
| 225 |
### Throughput Benchmarking
|
| 226 |
|
| 227 |
For synthetic data generation, we may run language model inference for millions of GPU hours. Finding a configuration that maximizes throughput is critical: it can accelerate generation by days and save thousands of dollars. In this section, we describe our experiments to identify optimal parameters for a selection of popular models.
|
|
|
|
|
|
|
| 1 |
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
| 2 |
import Sidenote from "../../components/Sidenote.astro";
|
| 3 |
import FigRef from "../../components/FigRef.astro";
|
| 4 |
import Accordion from "../../components/Accordion.astro";
|
| 5 |
import Wide from "../../components/Wide.astro";
|
|
|
|
| 6 |
|
| 7 |
## Infrastructure
|
| 8 |
|
|
|
|
| 187 |
|
| 188 |
**Use case: Best-of-N sampling for code generation.** When generating code solutions, you want multiple attempts per problem to increase the chance of a correct answer. Set `rollouts_per_document=10` and later filter for solutions that pass your test suite.
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
### Throughput Benchmarking
|
| 191 |
|
| 192 |
For synthetic data generation, we may run language model inference for millions of GPU hours. Finding a configuration that maximizes throughput is critical: it can accelerate generation by days and save thousands of dollars. In this section, we describe our experiments to identify optimal parameters for a selection of popular models.
|
app/src/content/chapters/6-finephrase.mdx
CHANGED
|
@@ -1,2 +1,170 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Image from "../../components/Image.astro";
|
| 2 |
+
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
| 3 |
+
import Sidenote from "../../components/Sidenote.astro";
|
| 4 |
+
import FigRef from "../../components/FigRef.astro";
|
| 5 |
+
import Wide from "../../components/Wide.astro";
|
| 6 |
+
import datasetCardImg from "../assets/image/auto-dataset-card.png";
|
| 7 |
+
import finephraseProgressImg from "../assets/image/finephrase-progress.png";
|
| 8 |
|
| 9 |
+
## Applying the Recipe at Scale
|
| 10 |
+
|
| 11 |
+
We ran 90 experiments to figure out what works. Now we apply those findings to build [FinePhrase](https://huggingface.co/datasets/HuggingFaceFW/finephrase), a large-scale synthetic dataset that rephrases all XXX million documents from [FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu) (sample-350BT) into four structured formats, producing XXX billion tokens of synthetic pretraining data.
|
| 12 |
+
|
| 13 |
+
The recipe is simple: take the best model (SmolLM2-1.7B-Instruct), the best prompts (FAQ, Math, Table, Tutorial), the optimized inference settings from our throughput benchmarks, and the battle-tested DataTrove infrastructure. Launch 100 parallel Slurm workers, each running on a single H100 GPU with suffix-32 speculative decoding. Let it run for about two weeks.
|
| 14 |
+
|
| 15 |
+
To get a sense of the scale: our infrastructure benchmarks showed that SmolLM2-1.7B-Instruct achieves ~9,200 tokens per second per GPU with suffix-32 speculative decoding. With 100 GPUs running in parallel, that is ~920,000 tokens per second, or about 3.3 billion tokens per hour. Rephrasing ~339 million documents four times (once per prompt) at an average of ~XXX tokens per document means roughly XXX trillion tokens of total generation. At our throughput rate, that takes approximately XXX GPU-days, or about XXX wall-clock days with 100 GPUs.
|
| 16 |
+
|
| 17 |
+
### The Recipe
|
| 18 |
+
|
| 19 |
+
Every configuration choice traces back to a finding from our experiments or infrastructure benchmarks:
|
| 20 |
+
|
| 21 |
+
- **Model**: [SmolLM2-1.7B-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct), which dominated all other model families across every prompt in our [model family comparison](#does-the-model-family-matter)
|
| 22 |
+
- **Prompts**: [FAQ](#faq), [Math](#math), [Table](#table), and [Tutorial](#tutorial), the four prompts that [consistently beat DCLM](#can-new-prompts-beat-dclm) in our experiments
|
| 23 |
+
- **Source data**: [FineWeb-Edu sample-350BT](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu), since our experiments showed that [source quality is secondary](#does-the-source-dataset-matter) when paired with a strong mix-in dataset
|
| 24 |
+
- **Inference settings**: tp=1 with suffix-32 speculative decoding, mns=2048, mnbt=16384, gmu=0.90, all derived from the [throughput benchmark](#throughput-benchmarking) that found a 1.75x speedup for SmolLM2-1.7B-Instruct with this configuration
|
| 25 |
+
|
| 26 |
+
The entire FinePhrase production run is defined in a [single script](https://github.com/huggingface/datatrove/blob/main/examples/inference/finephrase.py) that is intentionally thin. It declares the configuration and calls the [`generate_data`](https://github.com/huggingface/datatrove/blob/main/examples/inference/generate_data.py) script introduced in the [Infrastructure](#infrastructure) section (the same script we used for all throughput benchmarking). Here is the core configuration:
|
| 27 |
+
|
| 28 |
+
```python
|
| 29 |
+
KWARGS = {
|
| 30 |
+
"model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
|
| 31 |
+
"model_max_context": 8192,
|
| 32 |
+
"max_tokens": 2048,
|
| 33 |
+
"input_dataset_name": "HuggingFaceFW/fineweb-edu",
|
| 34 |
+
"input_dataset_config": "sample-350BT",
|
| 35 |
+
"output_dataset_name": "HuggingFaceFW/finephrase",
|
| 36 |
+
"max_num_seqs": 2048,
|
| 37 |
+
"max_num_batched_tokens": 16384,
|
| 38 |
+
"gpu_memory_utilization": 0.90,
|
| 39 |
+
"speculative_config": '{"method":"suffix","num_speculative_tokens":32}',
|
| 40 |
+
"enable_monitoring": True,
|
| 41 |
+
"examples_per_chunk": 100_000,
|
| 42 |
+
"workers": 100,
|
| 43 |
+
"tasks": 100,
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
PROMPT_TEMPLATES = {
|
| 47 |
+
"math": "Rewrite the document to create a mathematical word problem ...",
|
| 48 |
+
"table": "Rewrite the document as a structured table ...",
|
| 49 |
+
"faq": "Rewrite the document as a comprehensive FAQ ...",
|
| 50 |
+
"tutorial": "Rewrite the document as a clear, step-by-step tutorial ...",
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
for name, template in PROMPT_TEMPLATES.items():
|
| 54 |
+
generate_data_main(**KWARGS, name=f"finephrase_{name}",
|
| 55 |
+
prompt_template=[name, template])
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
<Sidenote>
|
| 59 |
+
We set `max_tokens=2048` instead of 4096 because SmolLM2-1.7B-Instruct rarely generates more than 2K tokens per document anyway. Halving the max token budget lets vLLM allocate more KV cache for concurrent sequences.
|
| 60 |
+
</Sidenote>
|
| 61 |
+
|
| 62 |
+
All the operational complexity lives in DataTrove itself: chunked processing with checkpoint-based resume, distributed Slurm execution, incremental Hub uploads, and automatic dataset card generation. The [`generate_data`](https://github.com/huggingface/datatrove/blob/main/examples/inference/generate_data.py) script wires these pieces together into a single CLI for synthetic data generation, which is why the FinePhrase production script is under 100 lines of code. Before any GPU time is spent, it runs pre-flight checks: `check_hf_auth()` verifies you have a write token, `ensure_repo_exists()` creates the output dataset repo, and `validate_config()` catches invalid parallelism settings and validates that prompt templates contain the `[[DOCUMENT]]` placeholder. It reads the model's `GenerationConfig` from the Hub to inherit default sampling parameters rather than requiring you to hardcode them. The rollout function automatically truncates documents that exceed the context budget at newline boundaries, which is critical at 339 million documents where some will inevitably be too long.
|
| 63 |
+
|
| 64 |
+
On Slurm, a single `generate_data` call orchestrates three coordinated jobs: the inference job (100 parallel workers doing the actual generation), a monitor job (updating the dataset card with progress bars and ETAs), and a datacard job (generating final statistics after completion). The monitor tracks the inference job ID and stops if inference fails. The datacard job uses Slurm's `afterok` dependency to run only on success.
|
| 65 |
+
|
| 66 |
+
### Automatic HF Upload and Progress Monitoring
|
| 67 |
+
|
| 68 |
+
We want you to be able to just press a button, let the GPUs go brrrr, and check back in to the finished dataset. DataTrove continuously uploads data to your specified Hugging Face dataset repo whenever a chunk is finished, using `ParquetWriter` with `hf://` paths so data appears on the Hub within minutes of generation, not after the full run completes. At the end, the `InferenceDatasetCardGenerator` pipeline step checks the logs directory, collects information about the throughput, and uploads a dataset card to document your new synthetic dataset. <FigRef target="auto-dataset-card" /> shows an example of the auto-generated dataset card.
|
| 69 |
+
|
| 70 |
+
<figure id="auto-dataset-card">
|
| 71 |
+
<Image src={datasetCardImg} alt="Auto-generated dataset card on the Hugging Face Hub" />
|
| 72 |
+
<figcaption>Example of an auto-generated dataset card with throughput metrics, uploaded to the Hugging Face Hub after inference completes.</figcaption>
|
| 73 |
+
</figure>
|
| 74 |
+
|
| 75 |
+
For long-running inference jobs like FinePhrase (which runs for about two weeks), the `InferenceProgressMonitor` runs as a separate Slurm job alongside the inference workers. It periodically scans the output directory, counts completed chunks across all 100 tasks, and updates the dataset card on the Hub with a progress bar and ETA for each prompt template. <FigRef target="finephrase-progress" /> shows the live progress dashboard during the FinePhrase generation run.
|
| 76 |
+
|
| 77 |
+
<figure id="finephrase-progress">
|
| 78 |
+
<Image src={finephraseProgressImg} alt="Live progress monitoring of the FinePhrase generation run" />
|
| 79 |
+
<figcaption>Live progress dashboard for FinePhrase, showing per-prompt completion status, document counts, and ETAs. The monitor runs as a separate Slurm job and updates the dataset card hourly.</figcaption>
|
| 80 |
+
</figure>
|
| 81 |
+
|
| 82 |
+
Both the progress monitor and the dataset card generator are configured through an `InferenceDatasetCardParams` object that captures the full run metadata. The `generate_data` script creates these pipelines automatically, but here is what happens under the hood:
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
params = InferenceDatasetCardParams(
|
| 86 |
+
output_repo_id="HuggingFaceFW/finephrase",
|
| 87 |
+
input_dataset_name="HuggingFaceFW/fineweb-edu",
|
| 88 |
+
input_dataset_split="train",
|
| 89 |
+
model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct",
|
| 90 |
+
# ... other params
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
monitor_pipeline = [
|
| 94 |
+
InferenceProgressMonitor(
|
| 95 |
+
params=params, update_interval=3600
|
| 96 |
+
)
|
| 97 |
+
]
|
| 98 |
+
|
| 99 |
+
datacard_pipeline = [
|
| 100 |
+
InferenceDatasetCardGenerator(params=params)
|
| 101 |
+
]
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
### What's in the Dataset?
|
| 105 |
+
|
| 106 |
+
<FigRef target="finephrase-explorer" /> lets you browse real examples from FinePhrase. Each sample shows the original FineWeb-Edu source document alongside all four rephrased versions. Navigate through samples to see how the same web document becomes a FAQ, a math problem, a structured table, and a step-by-step tutorial.
|
| 107 |
+
|
| 108 |
+
<Wide>
|
| 109 |
+
<HtmlEmbed
|
| 110 |
+
id="finephrase-explorer"
|
| 111 |
+
src="finephrase-explorer.html"
|
| 112 |
+
caption="Browse real examples from the FinePhrase dataset. Each sample shows the original source document alongside all four rephrased versions (FAQ, Math, Table, Tutorial). Use the arrows or Random button to navigate between samples."
|
| 113 |
+
/>
|
| 114 |
+
</Wide>
|
| 115 |
+
|
| 116 |
+
### Improvements to DataTrove
|
| 117 |
+
|
| 118 |
+
Building FinePhrase was not just about running inference at scale. It required hardening DataTrove's inference pipeline to handle the realities of processing 339 million documents across 100 parallel workers over two weeks. Every failure mode you can imagine showed up: documents that crash the model, workers racing to commit to the same repo, Slurm jobs dying on startup, and caches corrupting under contention. We merged over a dozen PRs to make this work. Here are the most impactful ones.
|
| 119 |
+
|
| 120 |
+
#### Graceful error handling for bad documents
|
| 121 |
+
|
| 122 |
+
At 339 million documents, some will inevitably trigger errors: documents too long for the context window even after truncation, malformed content that produces invalid tokens, or edge cases in the tokenizer. Before [PR #450](https://github.com/huggingface/datatrove/pull/450), a single bad document would crash the entire worker, losing all progress for that task. The `skip_bad_requests` option lets the `InferenceRunner` catch provider-side `BadRequestError` exceptions, log the problematic document, and continue processing the rest of the chunk.
|
| 123 |
+
|
| 124 |
+
```python
|
| 125 |
+
InferenceRunner(
|
| 126 |
+
rollout_fn=simple_rollout,
|
| 127 |
+
config=inference_config,
|
| 128 |
+
skip_bad_requests=True, # Log and skip instead of crashing
|
| 129 |
+
)
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
#### Fast resume with checkpoint-aware skipping
|
| 133 |
+
|
| 134 |
+
The first version of `skip_bad_requests` had a subtle problem: skipped documents were not written to checkpoints. This meant chunks containing bad documents never reached completion, `last_chunk` never advanced, and every restart re-parsed the entire checkpoint history from scratch. For FinePhrase with 100,000 documents per chunk, this made restarts painfully slow (sometimes leading to multiple hours of wasted GPU time per worker). [PR #464](https://github.com/huggingface/datatrove/pull/464) fixes this by writing skipped documents to checkpoints with a special marker so they count toward chunk completion but are excluded from the final output. It also speeds up resume by sorting checkpoint files and skipping replay for chunks that are already complete.
|
| 135 |
+
|
| 136 |
+
#### Hardening Hub uploads against transient failures
|
| 137 |
+
|
| 138 |
+
With 100 workers writing to the same Hugging Face Hub repository, transient failures are not rare — they are guaranteed. We encountered three distinct failure modes and fixed each one:
|
| 139 |
+
|
| 140 |
+
- **Commit races** ([PR #448](https://github.com/huggingface/datatrove/pull/448)): Two workers commit simultaneously and one gets `412 Precondition Failed` with "A commit has happened since." The fix adds retry logic with exponential backoff to the `DiskWriter`, which all Hub-writing paths go through.
|
| 141 |
+
- **Transient server errors** ([PR #463](https://github.com/huggingface/datatrove/pull/463)): `503 Service Unavailable` and other transient API errors were not retried consistently. This PR normalizes retry logic across `DiskWriter` and `HuggingFaceDatasetWriter` so all transient errors are handled uniformly.
|
| 142 |
+
- **LFS verification failures** ([PR #455](https://github.com/huggingface/datatrove/pull/455)): Large file uploads occasionally fail LFS verification on the server side. A one-line fix adds `"lfs-verify"` to the list of retryable error messages.
|
| 143 |
+
|
| 144 |
+
#### Isolating the Xet cache per Slurm task
|
| 145 |
+
|
| 146 |
+
Hugging Face Hub uses [Xet](https://huggingface.co/docs/hub/storage-backends#xet-storage-backend) as a storage backend, and its local cache is not designed for concurrent access from 100 parallel processes. Shared cache access caused corruption and failures. [PR #465](https://github.com/huggingface/datatrove/pull/465) gives each Slurm task its own cache directory derived from the job, task, and process IDs:
|
| 147 |
+
|
| 148 |
+
```bash
|
| 149 |
+
export HF_XET_CACHE="/tmp/hf_xet/${SLURM_JOB_ID}_${SLURM_ARRAY_TASK_ID}_${SLURM_PROCID}"
|
| 150 |
+
mkdir -p "$HF_XET_CACHE"
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
#### Multi-config dataset support
|
| 154 |
+
|
| 155 |
+
FinePhrase runs four prompt templates that produce four independent dataset configs (faq, math, table, tutorial). Without config-awareness, all four templates would fight over a single dataset card and progress counters would exceed 100%. [PR #447](https://github.com/huggingface/datatrove/pull/447) adds first-class config support: outputs go to config-specific folders (`hf://datasets/HuggingFaceFW/finephrase/faq/`, `.../math/`, etc.), the dataset card merges information from all configs, and the progress monitor tracks each config independently so you see four separate progress bars (as in <FigRef target="finephrase-progress" />).
|
| 156 |
+
|
| 157 |
+
#### Configurable server startup
|
| 158 |
+
|
| 159 |
+
vLLM server startup time varies wildly depending on model size, optimization level, and cluster load. With `optimization_level=3` (the highest throughput setting), vLLM compiles CUDA graphs during startup, which can take several minutes. Fixed startup timeouts would kill healthy jobs that were simply slow to initialize. [PR #451](https://github.com/huggingface/datatrove/pull/451) makes all startup parameters configurable via `InferenceConfig`: timeout, max attempts, retry delay, and max retries.
|
| 160 |
+
|
| 161 |
+
#### Fixing SLURM CPU binding
|
| 162 |
+
|
| 163 |
+
A one-liner, but without it nothing runs. Slurm's default CPU binding policy conflicts with how DataTrove launches vLLM servers, sometimes causing jobs to fail immediately with `srun: error: CPU binding outside of job step allocation`. [PR #457](https://github.com/huggingface/datatrove/pull/457) passes `--cpu-bind=none` to srun, disabling the restrictive binding policy.
|
| 164 |
+
|
| 165 |
+
```python
|
| 166 |
+
SlurmPipelineExecutor(
|
| 167 |
+
srun_args={"cpu-bind": "none"},
|
| 168 |
+
...
|
| 169 |
+
)
|
| 170 |
+
```
|
app/src/content/embeds/finephrase-explorer.html
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<div class="finephrase-explorer"></div>
<style>
/* FinePhrase sample explorer: one source-document panel + four prompt-output panels. */
.finephrase-explorer {
  font-family: var(--font-body, system-ui, sans-serif);
  color: var(--text-color, #1a1a1a);
  position: relative;
}

/* Navigation row: prev / next / random buttons + sample counter. */
.finephrase-explorer .controls {
  display: flex;
  align-items: center;
  gap: 10px;
  flex-wrap: wrap;
  margin-bottom: 10px;
}
.finephrase-explorer .nav-btn {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  width: 36px;
  height: 36px;
  border: 1px solid var(--border-color, #ddd);
  border-radius: 8px;
  background: var(--surface-bg, #fff);
  color: var(--text-color, #1a1a1a);
  font-size: 16px;
  cursor: pointer;
  transition: background 0.15s, border-color 0.15s;
  user-select: none;
  flex-shrink: 0;
}
.finephrase-explorer .nav-btn:hover {
  background: var(--primary-color, #6366f1);
  color: #fff;
  border-color: var(--primary-color, #6366f1);
}
.finephrase-explorer .random-btn {
  padding: 6px 14px;
  width: auto;
  font-size: 13px;
  font-weight: 600;
}
.finephrase-explorer .sample-counter {
  margin-left: auto;
  font-size: 12px;
  color: var(--muted-color, #888);
  white-space: nowrap;
}

/* Metadata footer pinned to the bottom of the source panel. */
.finephrase-explorer .source-meta {
  padding: 12px 14px;
  font-size: 12px;
  color: var(--muted-color, #888);
  line-height: 1.8;
  border-top: 1px solid var(--border-color, #ddd);
  word-break: break-all;
  flex-shrink: 0;
  margin-top: auto;
}
.finephrase-explorer .source-meta a {
  color: var(--primary-color, #6366f1);
  text-decoration: none;
}
.finephrase-explorer .source-meta a:hover {
  text-decoration: underline;
}
.finephrase-explorer .source-meta .meta-label {
  font-weight: 600;
  color: var(--text-color, #1a1a1a);
  font-size: 11px;
}
.finephrase-explorer .source-meta .meta-value {
  font-family: var(--font-mono, monospace);
  font-size: 11px;
}

/* 3x2 grid: source panel spans both rows of the left column,
   the four prompt panels fill the remaining cells. */
.finephrase-explorer .panels {
  display: grid;
  grid-template-columns: 1fr 1fr 1fr;
  grid-template-rows: 1fr 1fr;
  gap: 14px;
}
/* Merged rule: the original declared .source-panel twice (the layout here,
   the max-height further down the sheet); a single rule is equivalent. */
.finephrase-explorer .source-panel {
  grid-row: 1 / 3;
  grid-column: 1;
  max-height: 760px;
}
.finephrase-explorer .panel {
  border: 1px solid var(--border-color, #ddd);
  border-radius: 10px;
  background: var(--surface-bg, #fff);
  overflow: hidden;
  display: flex;
  flex-direction: column;
  min-height: 0;
}
.finephrase-explorer .panel-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 8px 14px;
  font-size: 12px;
  font-weight: 700;
  border-bottom: 1px solid var(--border-color, #ddd);
  background: var(--surface-bg, #fff);
  flex-shrink: 0;
}
.finephrase-explorer .panel-header .header-stats {
  font-size: 11px;
  font-weight: 400;
  color: var(--muted-color, #888);
  text-transform: none;
  letter-spacing: 0;
  white-space: nowrap;
}
.finephrase-explorer .prompt-tag {
  display: inline-block;
  padding: 3px 10px;
  border-radius: 5px;
  font-size: 11px;
  font-weight: 700;
  text-transform: uppercase;
  letter-spacing: 0.04em;
}
.finephrase-explorer .panel-body {
  padding: 14px;
  overflow-y: auto;
  max-height: 320px;
  font-size: 13px;
  line-height: 1.6;
  white-space: pre-wrap;
  word-wrap: break-word;
}
/* The source panel's body flexes to fill the tall panel instead of capping at 320px. */
.finephrase-explorer .source-panel .panel-body {
  flex: 1;
  min-height: 0;
  max-height: none;
}

/* Rendered-markdown elements inside panel bodies. */
.finephrase-explorer .panel-body table {
  border-collapse: collapse;
  width: 100%;
  font-size: 12px;
  margin: 8px 0;
}
.finephrase-explorer .panel-body th,
.finephrase-explorer .panel-body td {
  border: 1px solid var(--border-color, #ddd);
  padding: 6px 10px;
  text-align: left;
}
.finephrase-explorer .panel-body th {
  background: var(--surface-bg, #f5f5f5);
  font-weight: 600;
}
.finephrase-explorer .panel-body h2,
.finephrase-explorer .panel-body h3 {
  margin: 12px 0 6px;
  font-size: 14px;
  font-weight: 700;
  white-space: normal;
}
.finephrase-explorer .panel-body h2:first-child,
.finephrase-explorer .panel-body h3:first-child {
  margin-top: 0;
}
.finephrase-explorer .panel-body strong {
  font-weight: 700;
}
.finephrase-explorer .error-msg {
  color: #dc2626;
  padding: 20px;
  font-size: 14px;
}

/* Single-column stack on narrow screens. */
@media (max-width: 768px) {
  .finephrase-explorer .panels {
    grid-template-columns: 1fr;
    grid-template-rows: auto;
  }
  .finephrase-explorer .source-panel {
    grid-row: auto;
  }
  .finephrase-explorer .panel-body {
    max-height: 250px;
  }
}
</style>
| 186 |
+
<script>
|
| 187 |
+
(() => {
|
| 188 |
+
// Prompt configs shown in the explorer, in display order.
const PROMPTS = ['faq', 'math', 'table', 'tutorial'];
const PROMPT_LABELS = { faq: 'FAQ', math: 'Math', table: 'Table', tutorial: 'Tutorial' };

// Indices into the full 12-prompt alphabetical order used by experiment charts
const PROMPT_COLOR_INDICES = { faq: 6, math: 8, table: 9, tutorial: 10 };

// Resolve one colour per prompt, preferring the site-wide palette helper when available.
const getPromptColors = () => {
  const fallback = ['#4e79a7','#f28e2b','#e15759','#76b7b2','#59a14f','#edc948','#b07aa1','#ff9da7','#9c755f','#bab0ac','#d37295','#a0cbe8'];
  const palette = window.ColorPalettes
    ? window.ColorPalettes.getColors('categorical', 12)
    : fallback;
  return PROMPTS.reduce((acc, prompt) => {
    acc[prompt] = palette[PROMPT_COLOR_INDICES[prompt]];
    return acc;
  }, {});
};

// Parse a '#rrggbb' string into numeric channels.
const hexToRgb = (hex) => {
  const digits = hex.replace('#', '');
  return {
    r: parseInt(digits.substring(0, 2), 16),
    g: parseInt(digits.substring(2, 4), 16),
    b: parseInt(digits.substring(4, 6), 16),
  };
};
// Perceived brightness of an RGB colour (BT.601 luma weights).
const luminance = ({ r, g, b }) => 0.299 * r + 0.587 * g + 0.114 * b;
|
| 208 |
+
|
| 209 |
+
const bootstrap = () => {
|
| 210 |
+
const scriptEl = document.currentScript;
|
| 211 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 212 |
+
if (!(container && container.classList && container.classList.contains('finephrase-explorer'))) {
|
| 213 |
+
const cs = Array.from(document.querySelectorAll('.finephrase-explorer'))
|
| 214 |
+
.filter(el => !(el.dataset && el.dataset.mounted === 'true'));
|
| 215 |
+
container = cs[cs.length - 1] || null;
|
| 216 |
+
}
|
| 217 |
+
if (!container) return;
|
| 218 |
+
if (container.dataset) {
|
| 219 |
+
if (container.dataset.mounted === 'true') return;
|
| 220 |
+
container.dataset.mounted = 'true';
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
// Split a JSONL payload into parsed objects, ignoring blank lines.
const parseJsonl = (text) =>
  text.trim().split('\n').filter(Boolean).map((line) => JSON.parse(line));

// Try each candidate path in order and return the first payload that loads;
// fetch/parse failures fall through to the next candidate.
const fetchFirstAvailable = async (paths) => {
  for (const candidate of paths) {
    try {
      const resp = await fetch(candidate, { cache: 'no-cache' });
      if (!resp.ok) continue;
      const body = await resp.text();
      return candidate.endsWith('.jsonl') ? parseJsonl(body) : JSON.parse(body);
    } catch (_) {
      // Network or parse error: keep trying the remaining paths.
    }
  }
  throw new Error('Data file not found');
};
|
| 237 |
+
|
| 238 |
+
// Minimal markdown renderer for sample text: escapes HTML, then supports
// pipe tables, ##/### headings, **bold**, and newline -> <br>.
const renderMarkdown = (text) => {
  if (!text) return '';

  // Escape HTML first so document content can never inject markup.
  // (Fix: the replacements must insert entities — the scraped version had
  // been entity-decoded into identity replaces, leaving raw < and > in
  // strings that are later assigned to innerHTML.)
  let html = text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  const lines = html.split('\n');
  const out = [];
  let inTable = false;
  let tableRows = [];

  // Emit buffered pipe-rows as a <table>; fewer than 2 rows is not a table,
  // so those lines are passed through untouched.
  const flushTable = () => {
    if (tableRows.length < 2) {
      out.push(...tableRows.map((r) => r.raw));
      tableRows = [];
      return;
    }
    let t = '<table>';
    tableRows.forEach((row, i) => {
      if (row.isSep) return; // skip |---|---| separator rows
      const tag = i === 0 ? 'th' : 'td'; // first row becomes the header
      t += '<tr>' + row.cells.map((c) => `<${tag}>${c.trim()}</${tag}>`).join('') + '</tr>';
    });
    t += '</table>';
    out.push(t);
    tableRows = [];
  };

  for (const line of lines) {
    const trimmed = line.trim();
    if (trimmed.startsWith('|') && trimmed.endsWith('|')) {
      const cells = trimmed.slice(1, -1).split('|');
      const isSep = cells.every((c) => /^[\s\-:]+$/.test(c));
      tableRows.push({ cells, isSep, raw: line });
      inTable = true;
    } else {
      if (inTable) { flushTable(); inTable = false; }
      out.push(line);
    }
  }
  if (inTable) flushTable();

  html = out.join('\n');
  html = html.replace(/^### (.+)$/gm, '<h3>$1</h3>');
  html = html.replace(/^## (.+)$/gm, '<h2>$1</h2>');
  html = html.replace(/\*\*([^*]+)\*\*/g, '<strong>$1</strong>');
  // Preserve line breaks, but don't insert <br> immediately before a tag
  // (e.g. after a table or heading we just emitted).
  html = html.replace(/\n(?!<)/g, '<br>');
  return html;
};
|
| 288 |
+
|
| 289 |
+
// Word count: whitespace-delimited tokens; empty/undefined input counts as 0.
const wc = (text) => {
  if (!text) return 0;
  return text.split(/\s+/).filter(Boolean).length;
};
// Thousands-separated number formatting.
const fmtNum = (n) => n.toLocaleString('en-US');
// "N words · M chars" summary shown in panel headers.
const statsText = (text) => `${fmtNum(wc(text))} words · ${fmtNum(text.length)} chars`;
|
| 292 |
+
|
| 293 |
+
// Build a coloured pill label for a panel header; the text colour flips to
// dark when the background is bright enough (luminance > 160) for contrast.
const makeTag = (label, color) => {
  const pill = document.createElement('span');
  pill.className = 'prompt-tag';
  const fg = luminance(hexToRgb(color)) > 160 ? '#111' : '#fff';
  pill.style.cssText = `background:${color};color:${fg};`;
  pill.textContent = label;
  return pill;
};
|
| 301 |
+
|
| 302 |
+
// Render sample `idx` of `data` into the container: a controls row, the
// source-document panel (with metadata footer), and one panel per prompt.
// Mutates the shared `currentIdx` from its button/keyboard handlers.
const render = (data, idx, promptColors) => {
  const total = data.length;
  const entry = data[idx];
  container.innerHTML = '';

  // --- Controls row: prev / next / random + "i / total" counter ---
  const controls = document.createElement('div');
  controls.className = 'controls';

  // Factory for the three nav buttons; onClick updates currentIdx and repaints.
  const navButton = (className, label, title, onClick) => {
    const btn = document.createElement('button');
    btn.className = className;
    btn.textContent = label;
    btn.title = title;
    btn.addEventListener('click', onClick);
    return btn;
  };

  const prevBtn = navButton('nav-btn', '←', 'Previous sample', () => {
    currentIdx = (currentIdx - 1 + total) % total;
    render(data, currentIdx, promptColors);
  });
  const nextBtn = navButton('nav-btn', '→', 'Next sample', () => {
    currentIdx = (currentIdx + 1) % total;
    render(data, currentIdx, promptColors);
  });
  const randBtn = navButton('nav-btn random-btn', 'Random', 'Random sample', () => {
    currentIdx = Math.floor(Math.random() * total);
    render(data, currentIdx, promptColors);
  });

  const counter = document.createElement('span');
  counter.className = 'sample-counter';
  counter.textContent = `${idx + 1} / ${total}`;

  controls.appendChild(prevBtn);
  controls.appendChild(nextBtn);
  controls.appendChild(randBtn);
  controls.appendChild(counter);
  container.appendChild(controls);

  // HTML-escape untrusted strings before interpolating into innerHTML.
  // (Fix: the replacements must insert entities — the scraped version had
  // been entity-decoded into identity replaces, i.e. no escaping at all.)
  const esc = (s) => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');

  // --- Panels grid ---
  const panels = document.createElement('div');
  panels.className = 'panels';

  // Source panel (spans both grid rows on wide layouts).
  const sourceText = entry.source || '';
  const sourcePanel = document.createElement('div');
  sourcePanel.className = 'panel source-panel';

  const sourceHeader = document.createElement('div');
  sourceHeader.className = 'panel-header';
  const sourceTag = makeTag('FineWeb-Edu', 'var(--primary-color, #6366f1)');
  // Theme-variable colours can't go through the hex contrast check, so the
  // background/foreground are set explicitly here.
  sourceTag.style.cssText = 'background:var(--primary-color, #6366f1);color:var(--on-primary, #fff);';
  const sourceStats = document.createElement('span');
  sourceStats.className = 'header-stats';
  sourceStats.textContent = statsText(sourceText);
  sourceHeader.appendChild(sourceTag);
  sourceHeader.appendChild(sourceStats);

  const sourceBody = document.createElement('div');
  sourceBody.className = 'panel-body';
  sourceBody.innerHTML = renderMarkdown(sourceText);
  sourcePanel.appendChild(sourceHeader);
  sourcePanel.appendChild(sourceBody);

  // Metadata footer: document id, original URL, CommonCrawl path (when present).
  // The URL is escaped in the href attribute too — it comes from crawled data.
  const sourceMeta = document.createElement('div');
  sourceMeta.className = 'source-meta';
  const metaLines = [];
  if (entry.id) metaLines.push(`<span class="meta-label">ID:</span> <span class="meta-value">${esc(entry.id)}</span>`);
  if (entry.url) metaLines.push(`<span class="meta-label">Original Website:</span> <a href="${esc(entry.url)}" target="_blank" rel="noopener">${esc(entry.url)}</a>`);
  if (entry.file_path) metaLines.push(`<span class="meta-label">CommonCrawl S3:</span> <span class="meta-value">${esc(entry.file_path)}</span>`);
  sourceMeta.innerHTML = metaLines.join('<br>');
  sourcePanel.appendChild(sourceMeta);
  panels.appendChild(sourcePanel);

  // One panel per prompt config, in display order.
  for (const prompt of PROMPTS) {
    const genText = entry[prompt] || '';
    const panel = document.createElement('div');
    panel.className = 'panel';

    const header = document.createElement('div');
    header.className = 'panel-header';
    header.appendChild(makeTag(PROMPT_LABELS[prompt], promptColors[prompt]));
    const stats = document.createElement('span');
    stats.className = 'header-stats';
    stats.textContent = statsText(genText);
    header.appendChild(stats);

    const body = document.createElement('div');
    body.className = 'panel-body';
    body.innerHTML = renderMarkdown(genText);

    panel.appendChild(header);
    panel.appendChild(body);
    panels.appendChild(panel);
  }

  container.appendChild(panels);
};
|
| 401 |
+
|
| 402 |
+
// Mutable explorer state shared with render() and the handlers below.
let currentIdx = 0;
const promptColors = getPromptColors();

// Re-resolve colours and repaint when the site palette changes.
document.addEventListener('palettes:updated', () => {
  Object.assign(promptColors, getPromptColors());
  if (loadedData) render(loadedData, currentIdx, promptColors);
});

let loadedData = null;

fetchFirstAvailable([
  '/data/finephrase-samples.jsonl',
  '/data/finephrase-samples.json',
  './assets/data/finephrase-samples.jsonl',
  './assets/data/finephrase-samples.json',
  '../assets/data/finephrase-samples.jsonl',
  '../../assets/data/finephrase-samples.jsonl',
]).then((data) => {
  loadedData = data;
  // Start on a random sample so repeat visitors see variety.
  currentIdx = Math.floor(Math.random() * data.length);
  render(data, currentIdx, promptColors);
  // Arrow keys page through samples once the data is available.
  document.addEventListener('keydown', (e) => {
    if (e.key !== 'ArrowLeft' && e.key !== 'ArrowRight') return;
    const step = e.key === 'ArrowLeft' ? -1 : 1;
    currentIdx = (currentIdx + step + data.length) % data.length;
    render(data, currentIdx, promptColors);
  });
}).catch((err) => {
  container.innerHTML = `<pre class="error-msg">Failed to load sample data: ${err.message}</pre>`;
});
|
| 435 |
+
};
|
| 436 |
+
|
| 437 |
+
// Mount once the DOM exists; run immediately if it is already parsed.
if (document.readyState !== 'loading') {
  bootstrap();
} else {
  document.addEventListener('DOMContentLoaded', bootstrap, { once: true });
}
|
| 442 |
+
})();
|
| 443 |
+
</script>
|