import argparse
import json
import re
from pathlib import Path


def preprocess_gutenberg(raw_dir: str, output_path: str):
    """
    Read all files from raw_dir (Gutenberg books), split each into paragraphs,
    and write a single JSONL file where each line is one book:

        {"file_name": "3046.ref", "text": ["paragraph 1", "paragraph 2", ...]}

    Sections are delimited by separator lines of equals signs (==========).
    Within each section, the first line is kept as a heading paragraph and the
    remainder of the section as a body paragraph.
    """
    raw = Path(raw_dir)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_books = 0
    with open(out, "w", encoding="utf-8") as fout:
        for filepath in sorted(raw.iterdir()):
            if filepath.is_dir():
                continue
            text = filepath.read_text(encoding="utf-8")

            # Split on ========== separator lines, swallowing surrounding newlines.
            sections = re.split(r"\n*==========\n*", text)

            paragraphs = []
            for section in sections:
                section = section.strip()
                if not section:
                    continue

                first_newline = section.find("\n")
                if first_newline == -1:
                    # Single-line section: the whole thing is one paragraph.
                    paragraphs.append(section)
                else:
                    # First line is the heading, the rest is the body.
                    heading = section[:first_newline].strip()
                    body = section[first_newline + 1:].strip()
                    if heading:
                        paragraphs.append(heading)
                    if body:
                        paragraphs.append(body)

            if not paragraphs:
                continue

            record = {"file_name": filepath.name, "text": paragraphs}
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            n_books += 1

    print(f"Wrote {n_books:,} books to {out}")
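# A minimal sketch of the expected input, assuming the "3046.ref"-style layout
# described in the docstring (the exact file layout is an assumption):
#
#     ==========
#     CHAPTER I.
#     It was the best of times, it was the worst of times, ...
#
# which preprocess_gutenberg writes out as
#
#     {"file_name": "3046.ref", "text": ["CHAPTER I.", "It was the best of times, ..."]}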


def preprocess_ted(input_dir: str, output_dir: str, max_paragraphs: int = 60):
    """
    Process all TED split files (e.g. train.jsonl, val.jsonl, test.jsonl) in
    input_dir, drop samples with more than max_paragraphs paragraphs, and
    write each split as {name}_preprocessed.jsonl into output_dir.
    """
    inp = Path(input_dir)
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    for split_file in sorted(inp.glob("*.jsonl")):
        out_file = out / f"{split_file.stem}_preprocessed.jsonl"
        n_kept = 0
        n_dropped = 0
        with open(split_file, encoding="utf-8") as fin, open(out_file, "w", encoding="utf-8") as fout:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                paragraphs = obj.get("text", [])
                # Drop overly long talks; everything else passes through unchanged.
                if len(paragraphs) > max_paragraphs:
                    n_dropped += 1
                    continue
                fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
                n_kept += 1
        print(f"{split_file.name}: kept {n_kept:,}, dropped {n_dropped:,} "
              f"with >{max_paragraphs} paragraphs → {out_file}")
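# Illustrative effect with the default threshold: a talk whose "text" list
# holds 61 paragraphs is dropped, while a 60-paragraph talk is kept verbatim.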


def preprocess_jsonl_paragraphs(input_path: str, output_path: str, text_field: str = "text"):
    """
    Read a raw JSONL file, split the text field on double newlines into a
    list of paragraphs, and write the result as a new JSONL file. All other
    fields on each record are carried through unchanged.

    Output format per line:

        {"document_idx": 0, "text": ["paragraph 1", "paragraph 2", ...], ...}
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = obj.get(text_field, "")
            # Blank-line-separated blocks become paragraphs; empty blocks are dropped.
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj[text_field] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")
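# For example, the (hypothetical) input line
#
#     {"document_idx": 0, "text": "First para.\n\nSecond para."}
#
# becomes
#
#     {"document_idx": 0, "text": ["First para.", "Second para."]}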


def _strip_yaml_frontmatter(text: str) -> str:
    """Remove YAML frontmatter (--- ... ---) from the beginning of text."""
    if text.startswith("---"):
        # Find the closing "---" delimiter and cut everything up to and
        # including it, plus any blank lines that follow.
        end = text.find("\n---", 3)
        if end != -1:
            text = text[end + 4:].lstrip("\n")
    return text
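# For instance, a document beginning with
#
#     ---
#     title: Some Article
#     ---
#     Body text ...
#
# comes back as just "Body text ..."; text without a leading "---" is
# returned untouched.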


def preprocess_pubmed(input_path: str, output_path: str):
    """
    Read a raw PubMed JSONL file, strip any YAML frontmatter from each
    record's text, split the remainder on double newlines into a list of
    paragraphs, and write the result as a new JSONL file.
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = _strip_yaml_frontmatter(obj.get("text", ""))
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj["text"] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")
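# Combined effect on a (hypothetical) record: a line whose "text" is
# "---\npmid: 123\n---\nIntro.\n\nMethods." is written out with
# "text": ["Intro.", "Methods."].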


def preprocess_wikipedia(input_path: str, output_path: str):
    """Wikipedia needs no handling beyond the generic paragraph split."""
    preprocess_jsonl_paragraphs(input_path, output_path, text_field="text")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preprocess raw data into JSONL.")
    sub = parser.add_subparsers(dest="method", required=True)

    gut = sub.add_parser("gutenberg", help="Convert raw Gutenberg books to JSONL")
    gut.add_argument("--raw_dir", required=True, help="Directory with raw Gutenberg text files")
    gut.add_argument("--output", required=True, help="Output JSONL path")

    ted = sub.add_parser("ted", help="Filter TED talks JSONL by paragraph count")
    ted.add_argument("--input_dir", required=True, help="Directory with TED split JSONL files")
    ted.add_argument("--output_dir", required=True, help="Output directory for preprocessed files")
    ted.add_argument("--max_paragraphs", type=int, default=60,
                     help="Drop talks with more than this many paragraphs (default: 60)")

    pub = sub.add_parser("pubmed", help="Split PubMed text into paragraphs")
    pub.add_argument("--input", required=True, help="Input raw JSONL path")
    pub.add_argument("--output", required=True, help="Output JSONL path")

    wiki = sub.add_parser("wikipedia", help="Split Wikipedia text into paragraphs")
    wiki.add_argument("--input", required=True, help="Input raw JSONL path")
    wiki.add_argument("--output", required=True, help="Output JSONL path")

    args = parser.parse_args()

    if args.method == "gutenberg":
        preprocess_gutenberg(args.raw_dir, args.output)
    elif args.method == "ted":
        preprocess_ted(args.input_dir, args.output_dir, args.max_paragraphs)
    elif args.method == "pubmed":
        preprocess_pubmed(args.input, args.output)
    elif args.method == "wikipedia":
        preprocess_wikipedia(args.input, args.output)
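# Example invocations (script name and paths are placeholders):
#
#     python preprocess.py gutenberg --raw_dir data/gutenberg/raw --output data/gutenberg.jsonl
#     python preprocess.py ted --input_dir data/ted --output_dir data/ted_preprocessed
#     python preprocess.py pubmed --input data/pubmed_raw.jsonl --output data/pubmed.jsonl
#     python preprocess.py wikipedia --input data/wiki_raw.jsonl --output data/wiki.jsonl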