# src/datasets/preprocess.py
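"""Dataset preprocessing entry point.

Each subcommand converts one raw corpus (Gutenberg books, TED talks,
PubMed, Wikipedia) into JSONL whose "text" field is a list of
paragraphs. Example invocations (paths are illustrative):

    python preprocess.py gutenberg --raw_dir raw/gutenberg --output gutenberg.jsonl
    python preprocess.py ted --input_dir raw/ted --output_dir ted/ --max_paragraphs 60
    python preprocess.py pubmed --input raw/pubmed.jsonl --output pubmed.jsonl
    python preprocess.py wikipedia --input raw/wiki.jsonl --output wiki.jsonl
"""
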
import argparse
import json
import re
from pathlib import Path
def preprocess_gutenberg(raw_dir: str, output_path: str):
"""
Read all files from raw_dir (Gutenberg books), split each into paragraphs,
and write a single JSONL file where each line is one book:
{"file_name": "3046.ref", "text": ["paragraph 1", "paragraph 2", ...]}
    Sections are delimited by ========== lines; within each section the
    first line is treated as a heading and the rest as body text, and each
    non-empty heading/body becomes its own paragraph.
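
    For example, a section like:

        CHAPTER I
        It was the best of times, it was the worst of times, ...

    yields two paragraphs: the heading "CHAPTER I" and the body text.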
"""
raw = Path(raw_dir)
out = Path(output_path)
out.parent.mkdir(parents=True, exist_ok=True)
n_books = 0
with open(out, "w", encoding="utf-8") as fout:
for filepath in sorted(raw.iterdir()):
if filepath.is_dir():
continue
text = filepath.read_text(encoding="utf-8")
# split on ========== into sections
sections = re.split(r"\n*==========\n*", text)
paragraphs = []
for section in sections:
section = section.strip()
if not section:
continue
# first line is the heading, rest is body text
first_newline = section.find("\n")
if first_newline == -1:
# section is just a heading with no body
paragraphs.append(section)
else:
heading = section[:first_newline].strip()
body = section[first_newline + 1:].strip()
if heading:
paragraphs.append(heading)
if body:
paragraphs.append(body)
if not paragraphs:
continue
record = {"file_name": filepath.name, "text": paragraphs}
fout.write(json.dumps(record, ensure_ascii=False) + "\n")
n_books += 1
print(f"Wrote {n_books:,} books to {out}")
def preprocess_ted(input_dir: str, output_dir: str, max_paragraphs: int = 60):
"""
Process all TED split files (train.jsonl, val.jsonl, test.jsonl) in input_dir,
drop samples with more than max_paragraphs paragraphs, and write each as
{name}_preprocessed.jsonl into output_dir.
"""
inp = Path(input_dir)
out = Path(output_dir)
out.mkdir(parents=True, exist_ok=True)
for split_file in sorted(inp.glob("*.jsonl")):
out_file = out / f"{split_file.stem}_preprocessed.jsonl"
n_kept = 0
n_dropped = 0
with open(split_file, encoding="utf-8") as fin, open(out_file, "w", encoding="utf-8") as fout:
for line in fin:
line = line.strip()
if not line:
continue
obj = json.loads(line)
paragraphs = obj.get("text", [])
if len(paragraphs) > max_paragraphs:
n_dropped += 1
continue
fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
n_kept += 1
print(f"{split_file.name}: kept {n_kept:,}, dropped {n_dropped:,} with >{max_paragraphs} paragraphs → {out_file}")
def preprocess_jsonl_paragraphs(input_path: str, output_path: str, text_field: str = "text"):
"""
Read a raw JSONL file, split the text field on double newlines into
a list of paragraphs, and write the result as a new JSONL file.
    All other fields in each record are preserved; only the text field is
    replaced, e.g.:
        {"document_idx": 0, "text": ["paragraph 1", "paragraph 2", ...], ...}
"""
inp = Path(input_path)
out = Path(output_path)
out.parent.mkdir(parents=True, exist_ok=True)
n_docs = 0
with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
for line in fin:
line = line.strip()
if not line:
continue
obj = json.loads(line)
raw_text = obj.get(text_field, "")
paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
obj[text_field] = paragraphs
fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
n_docs += 1
print(f"Wrote {n_docs:,} documents to {out}")
def _strip_yaml_frontmatter(text: str) -> str:
"""Remove YAML frontmatter (--- ... ---) from the beginning of text."""
if text.startswith("---"):
end = text.find("\n---", 3)
if end != -1:
text = text[end + 4:].lstrip("\n")
return text
def preprocess_pubmed(input_path: str, output_path: str):
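    """
    Read a PubMed JSONL file, strip YAML frontmatter (--- ... ---) from each
    record's "text" field, split the remainder on double newlines into a
    list of paragraphs, and write the updated records to output_path.
    """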
inp = Path(input_path)
out = Path(output_path)
out.parent.mkdir(parents=True, exist_ok=True)
n_docs = 0
with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
for line in fin:
line = line.strip()
if not line:
continue
obj = json.loads(line)
raw_text = _strip_yaml_frontmatter(obj.get("text", ""))
paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
obj["text"] = paragraphs
fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
n_docs += 1
print(f"Wrote {n_docs:,} documents to {out}")
def preprocess_wikipedia(input_path: str, output_path: str):
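    """Split the "text" field of each Wikipedia record on double newlines."""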
preprocess_jsonl_paragraphs(input_path, output_path, text_field="text")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Preprocess raw data into JSONL.")
sub = parser.add_subparsers(dest="method", required=True)
gut = sub.add_parser("gutenberg", help="Convert raw Gutenberg books to JSONL")
gut.add_argument("--raw_dir", required=True, help="Directory with raw Gutenberg text files")
gut.add_argument("--output", required=True, help="Output JSONL path")
ted = sub.add_parser("ted", help="Filter TED talks JSONL by paragraph count")
ted.add_argument("--input_dir", required=True, help="Directory with TED split JSONL files")
ted.add_argument("--output_dir", required=True, help="Output directory for preprocessed files")
ted.add_argument("--max_paragraphs", type=int, default=60,
help="Drop talks with more than this many paragraphs (default: 60)")
pub = sub.add_parser("pubmed", help="Split PubMed text into paragraphs")
pub.add_argument("--input", required=True, help="Input raw JSONL path")
pub.add_argument("--output", required=True, help="Output JSONL path")
wiki = sub.add_parser("wikipedia", help="Split Wikipedia text into paragraphs")
wiki.add_argument("--input", required=True, help="Input raw JSONL path")
wiki.add_argument("--output", required=True, help="Output JSONL path")
args = parser.parse_args()
if args.method == "gutenberg":
preprocess_gutenberg(args.raw_dir, args.output)
elif args.method == "ted":
preprocess_ted(args.input_dir, args.output_dir, args.max_paragraphs)
elif args.method == "pubmed":
preprocess_pubmed(args.input, args.output)
elif args.method == "wikipedia":
preprocess_wikipedia(args.input, args.output)