import argparse
import json
import re
from pathlib import Path


def preprocess_gutenberg(raw_dir: str, output_path: str):
    """
    Read all files from raw_dir (Gutenberg books), split each into paragraphs,
    and write a single JSONL file where each line is one book:

        {"file_name": "3046.ref", "text": ["paragraph 1", "paragraph 2", ...]}

    In the raw files, paragraph boundaries are marked by double newlines
    (\\n\\n), by lines starting with capitalised text or a number, or by
    ``==========`` separators; this function splits on the ``==========``
    separators, keeping each section's first line as a heading and the
    remainder as its body.
    """
    raw = Path(raw_dir)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_books = 0
    with open(out, "w", encoding="utf-8") as fout:
        for filepath in sorted(raw.iterdir()):
            if filepath.is_dir():
                continue
            text = filepath.read_text(encoding="utf-8")

            # Split on ========== separator lines into sections.
            sections = re.split(r"\n*==========\n*", text)
            paragraphs = []
            for section in sections:
                section = section.strip()
                if not section:
                    continue
                # The first line is the heading, the rest is body text.
                first_newline = section.find("\n")
                if first_newline == -1:
                    # The section is just a heading with no body.
                    paragraphs.append(section)
                else:
                    heading = section[:first_newline].strip()
                    body = section[first_newline + 1:].strip()
                    if heading:
                        paragraphs.append(heading)
                    if body:
                        paragraphs.append(body)

            if not paragraphs:
                continue
            record = {"file_name": filepath.name, "text": paragraphs}
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            n_books += 1

    print(f"Wrote {n_books:,} books to {out}")


def preprocess_ted(input_dir: str, output_dir: str, max_paragraphs: int = 60):
    """
    Process all TED split files (train.jsonl, val.jsonl, test.jsonl) in
    input_dir, drop samples with more than max_paragraphs paragraphs, and
    write each as {name}_preprocessed.jsonl into output_dir.
    """
    inp = Path(input_dir)
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    for split_file in sorted(inp.glob("*.jsonl")):
        out_file = out / f"{split_file.stem}_preprocessed.jsonl"
        n_kept = 0
        n_dropped = 0
        with open(split_file, encoding="utf-8") as fin, open(out_file, "w", encoding="utf-8") as fout:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                paragraphs = obj.get("text", [])
                if len(paragraphs) > max_paragraphs:
                    n_dropped += 1
                    continue
                fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
                n_kept += 1
        print(f"{split_file.name}: kept {n_kept:,}, dropped {n_dropped:,} with >{max_paragraphs} paragraphs → {out_file}")


def preprocess_jsonl_paragraphs(input_path: str, output_path: str, text_field: str = "text"):
    """
    Read a raw JSONL file, split the text field on double newlines into a
    list of paragraphs, and write the result as a new JSONL file.

    Output format per line:
        {"document_idx": 0, "text": ["paragraph 1", "paragraph 2", ...], ...}
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = obj.get(text_field, "")
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj[text_field] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")
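# A minimal sketch of the double-newline split used by
# preprocess_jsonl_paragraphs (the sample text is hypothetical, for
# illustration only):
#
#   >>> raw = "First paragraph.\n\nSecond paragraph.\n\n\n"
#   >>> [p.strip() for p in raw.split("\n\n") if p.strip()]
#   ['First paragraph.', 'Second paragraph.']
#
# Applied to a whole record, this turns
#   {"document_idx": 0, "text": "First paragraph.\n\nSecond paragraph."}
# into
#   {"document_idx": 0, "text": ["First paragraph.", "Second paragraph."]}

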
def _strip_yaml_frontmatter(text: str) -> str:
    """Remove YAML frontmatter (--- ... ---) from the beginning of text."""
    if text.startswith("---"):
        end = text.find("\n---", 3)
        if end != -1:
            # Skip past the closing "---" line and any blank lines after it.
            text = text[end + 4:].lstrip("\n")
    return text


def preprocess_pubmed(input_path: str, output_path: str):
    """
    Read a raw PubMed JSONL file, strip any YAML frontmatter from each
    document, split the text on double newlines into paragraphs, and write
    the result as a new JSONL file.
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = _strip_yaml_frontmatter(obj.get("text", ""))
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj["text"] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")


def preprocess_wikipedia(input_path: str, output_path: str):
    """Split Wikipedia text into paragraphs (thin wrapper around preprocess_jsonl_paragraphs)."""
    preprocess_jsonl_paragraphs(input_path, output_path, text_field="text")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preprocess raw data into JSONL.")
    sub = parser.add_subparsers(dest="method", required=True)

    gut = sub.add_parser("gutenberg", help="Convert raw Gutenberg books to JSONL")
    gut.add_argument("--raw_dir", required=True, help="Directory with raw Gutenberg text files")
    gut.add_argument("--output", required=True, help="Output JSONL path")

    ted = sub.add_parser("ted", help="Filter TED talks JSONL by paragraph count")
    ted.add_argument("--input_dir", required=True, help="Directory with TED split JSONL files")
    ted.add_argument("--output_dir", required=True, help="Output directory for preprocessed files")
    ted.add_argument("--max_paragraphs", type=int, default=60,
                     help="Drop talks with more than this many paragraphs (default: 60)")

    pub = sub.add_parser("pubmed", help="Split PubMed text into paragraphs")
    pub.add_argument("--input", required=True, help="Input raw JSONL path")
    pub.add_argument("--output", required=True, help="Output JSONL path")

    wiki = sub.add_parser("wikipedia", help="Split Wikipedia text into paragraphs")
    wiki.add_argument("--input", required=True, help="Input raw JSONL path")
    wiki.add_argument("--output", required=True, help="Output JSONL path")

    args = parser.parse_args()
    if args.method == "gutenberg":
        preprocess_gutenberg(args.raw_dir, args.output)
    elif args.method == "ted":
        preprocess_ted(args.input_dir, args.output_dir, args.max_paragraphs)
    elif args.method == "pubmed":
        preprocess_pubmed(args.input, args.output)
    elif args.method == "wikipedia":
        preprocess_wikipedia(args.input, args.output)
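
# Example invocations of each subcommand (the script name and file paths
# below are placeholders, not references to real files):
#
#   python preprocess.py gutenberg --raw_dir data/gutenberg/raw --output data/gutenberg/books.jsonl
#   python preprocess.py ted --input_dir data/ted --output_dir data/ted/preprocessed --max_paragraphs 60
#   python preprocess.py pubmed --input data/pubmed/raw.jsonl --output data/pubmed/paragraphs.jsonl
#   python preprocess.py wikipedia --input data/wiki/raw.jsonl --output data/wiki/paragraphs.jsonl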