import argparse
import json
import re
from pathlib import Path


def preprocess_gutenberg(raw_dir: str, output_path: str):
    """
    Read all files from raw_dir (Gutenberg books), split each into paragraphs,
    and write a single JSONL file where each line is one book:

        {"file_name": "3046.ref", "text": ["paragraph 1", "paragraph 2", ...]}

    Sections are delimited by separator lines of equals signs (==========).
    Within each section, the first line is kept as a heading paragraph and the
    remainder of the section as a body paragraph.
    """
    raw = Path(raw_dir)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_books = 0
    with open(out, "w", encoding="utf-8") as fout:
        for filepath in sorted(raw.iterdir()):
            if filepath.is_dir():
                continue
            text = filepath.read_text(encoding="utf-8")

            # Split on ========== separator lines, swallowing surrounding newlines.
            sections = re.split(r"\n*==========\n*", text)

            paragraphs = []
            for section in sections:
                section = section.strip()
                if not section:
                    continue

                first_newline = section.find("\n")
                if first_newline == -1:
                    # Single-line section: the whole thing is one paragraph.
                    paragraphs.append(section)
                else:
                    # First line is the heading, the rest is the body.
                    heading = section[:first_newline].strip()
                    body = section[first_newline + 1:].strip()
                    if heading:
                        paragraphs.append(heading)
                    if body:
                        paragraphs.append(body)

            if not paragraphs:
                continue

            record = {"file_name": filepath.name, "text": paragraphs}
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            n_books += 1

    print(f"Wrote {n_books:,} books to {out}")
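# A minimal sketch of the expected input, assuming the "3046.ref"-style layout
# described in the docstring (the exact file layout is an assumption):
#
#     ==========
#     CHAPTER I.
#     It was the best of times, it was the worst of times, ...
#
# which preprocess_gutenberg writes out as
#
#     {"file_name": "3046.ref", "text": ["CHAPTER I.", "It was the best of times, ..."]}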


def preprocess_ted(input_dir: str, output_dir: str, max_paragraphs: int = 60):
    """
    Process all TED split files (e.g. train.jsonl, val.jsonl, test.jsonl) in
    input_dir, drop samples with more than max_paragraphs paragraphs, and
    write each split as {name}_preprocessed.jsonl into output_dir.
    """
    inp = Path(input_dir)
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    for split_file in sorted(inp.glob("*.jsonl")):
        out_file = out / f"{split_file.stem}_preprocessed.jsonl"
        n_kept = 0
        n_dropped = 0
        with open(split_file, encoding="utf-8") as fin, open(out_file, "w", encoding="utf-8") as fout:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                paragraphs = obj.get("text", [])
                # Drop overly long talks; everything else passes through unchanged.
                if len(paragraphs) > max_paragraphs:
                    n_dropped += 1
                    continue
                fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
                n_kept += 1
        print(f"{split_file.name}: kept {n_kept:,}, dropped {n_dropped:,} "
              f"with >{max_paragraphs} paragraphs → {out_file}")
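# Illustrative effect with the default threshold: a talk whose "text" list
# holds 61 paragraphs is dropped, while a 60-paragraph talk is kept verbatim.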


def preprocess_jsonl_paragraphs(input_path: str, output_path: str, text_field: str = "text"):
    """
    Read a raw JSONL file, split the text field on double newlines into a
    list of paragraphs, and write the result as a new JSONL file. All other
    fields on each record are carried through unchanged.

    Output format per line:

        {"document_idx": 0, "text": ["paragraph 1", "paragraph 2", ...], ...}
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = obj.get(text_field, "")
            # Blank-line-separated blocks become paragraphs; empty blocks are dropped.
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj[text_field] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")
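# For example, the (hypothetical) input line
#
#     {"document_idx": 0, "text": "First para.\n\nSecond para."}
#
# becomes
#
#     {"document_idx": 0, "text": ["First para.", "Second para."]}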


def _strip_yaml_frontmatter(text: str) -> str:
    """Remove YAML frontmatter (--- ... ---) from the beginning of text."""
    if text.startswith("---"):
        # Find the closing "---" delimiter and cut everything up to and
        # including it, plus any blank lines that follow.
        end = text.find("\n---", 3)
        if end != -1:
            text = text[end + 4:].lstrip("\n")
    return text
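# For instance, a document beginning with
#
#     ---
#     title: Some Article
#     ---
#     Body text ...
#
# comes back as just "Body text ..."; text without a leading "---" is
# returned untouched.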


def preprocess_pubmed(input_path: str, output_path: str):
    """
    Read a raw PubMed JSONL file, strip any YAML frontmatter from each
    record's text, split the remainder on double newlines into a list of
    paragraphs, and write the result as a new JSONL file.
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = _strip_yaml_frontmatter(obj.get("text", ""))
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj["text"] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")
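# Combined effect on a (hypothetical) record: a line whose "text" is
# "---\npmid: 123\n---\nIntro.\n\nMethods." is written out with
# "text": ["Intro.", "Methods."].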


def preprocess_wikipedia(input_path: str, output_path: str):
    """Wikipedia needs no handling beyond the generic paragraph split."""
    preprocess_jsonl_paragraphs(input_path, output_path, text_field="text")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preprocess raw data into JSONL.")
    sub = parser.add_subparsers(dest="method", required=True)

    gut = sub.add_parser("gutenberg", help="Convert raw Gutenberg books to JSONL")
    gut.add_argument("--raw_dir", required=True, help="Directory with raw Gutenberg text files")
    gut.add_argument("--output", required=True, help="Output JSONL path")

    ted = sub.add_parser("ted", help="Filter TED talks JSONL by paragraph count")
    ted.add_argument("--input_dir", required=True, help="Directory with TED split JSONL files")
    ted.add_argument("--output_dir", required=True, help="Output directory for preprocessed files")
    ted.add_argument("--max_paragraphs", type=int, default=60,
                     help="Drop talks with more than this many paragraphs (default: 60)")

    pub = sub.add_parser("pubmed", help="Split PubMed text into paragraphs")
    pub.add_argument("--input", required=True, help="Input raw JSONL path")
    pub.add_argument("--output", required=True, help="Output JSONL path")

    wiki = sub.add_parser("wikipedia", help="Split Wikipedia text into paragraphs")
    wiki.add_argument("--input", required=True, help="Input raw JSONL path")
    wiki.add_argument("--output", required=True, help="Output JSONL path")

    args = parser.parse_args()

    if args.method == "gutenberg":
        preprocess_gutenberg(args.raw_dir, args.output)
    elif args.method == "ted":
        preprocess_ted(args.input_dir, args.output_dir, args.max_paragraphs)
    elif args.method == "pubmed":
        preprocess_pubmed(args.input, args.output)
    elif args.method == "wikipedia":
        preprocess_wikipedia(args.input, args.output)
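# Example invocations (script name and paths are placeholders):
#
#     python preprocess.py gutenberg --raw_dir data/gutenberg/raw --output data/gutenberg.jsonl
#     python preprocess.py ted --input_dir data/ted --output_dir data/ted_preprocessed
#     python preprocess.py pubmed --input data/pubmed_raw.jsonl --output data/pubmed.jsonl
#     python preprocess.py wikipedia --input data/wiki_raw.jsonl --output data/wiki.jsonl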