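"""Preprocess raw corpora (Gutenberg, TED, PubMed, Wikipedia) into paragraph-level JSONL.

Example invocations (the script filename and data paths are illustrative, not
taken from the repository; the subcommands and flags match the parser below):

    python preprocess.py gutenberg --raw_dir data/gutenberg_raw --output data/gutenberg.jsonl
    python preprocess.py ted --input_dir data/ted --output_dir data/ted_clean --max_paragraphs 60
    python preprocess.py pubmed --input data/pubmed_raw.jsonl --output data/pubmed.jsonl
    python preprocess.py wikipedia --input data/wiki_raw.jsonl --output data/wiki.jsonl
"""
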
import argparse
import json
import re
from pathlib import Path


def preprocess_gutenberg(raw_dir: str, output_path: str):
    """
    Read all files from raw_dir (Gutenberg books), split each into paragraphs,
    and write a single JSONL file where each line is one book:

        {"file_name": "3046.ref", "text": ["paragraph 1", "paragraph 2", ...]}

    Paragraph boundaries are determined by splitting each file into sections
    on ``==========`` delimiter lines; within each section, the first line is
    treated as a heading and the remainder as the body, and each non-empty
    heading and body becomes one paragraph.
    """
    raw = Path(raw_dir)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_books = 0
    with open(out, "w", encoding="utf-8") as fout:
        for filepath in sorted(raw.iterdir()):
            if filepath.is_dir():
                continue
            text = filepath.read_text(encoding="utf-8")

            # split on ========== into sections
            sections = re.split(r"\n*==========\n*", text)

            paragraphs = []
            for section in sections:
                section = section.strip()
                if not section:
                    continue
                # first line is the heading, rest is body text
                first_newline = section.find("\n")
                if first_newline == -1:
                    # section is just a heading with no body
                    paragraphs.append(section)
                else:
                    heading = section[:first_newline].strip()
                    body = section[first_newline + 1:].strip()
                    if heading:
                        paragraphs.append(heading)
                    if body:
                        paragraphs.append(body)

            if not paragraphs:
                continue

            record = {"file_name": filepath.name, "text": paragraphs}
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            n_books += 1

    print(f"Wrote {n_books:,} books to {out}")


def preprocess_ted(input_dir: str, output_dir: str, max_paragraphs: int = 60):
    """
    Process every TED split file (e.g. train.jsonl, val.jsonl, test.jsonl) in
    input_dir, drop talks with more than max_paragraphs paragraphs, and write
    each split as {stem}_preprocessed.jsonl into output_dir.
    """
    inp = Path(input_dir)
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    for split_file in sorted(inp.glob("*.jsonl")):
        out_file = out / f"{split_file.stem}_preprocessed.jsonl"
        n_kept = 0
        n_dropped = 0
        with open(split_file, encoding="utf-8") as fin, open(out_file, "w", encoding="utf-8") as fout:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                paragraphs = obj.get("text", [])
                if len(paragraphs) > max_paragraphs:
                    n_dropped += 1
                    continue
                fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
                n_kept += 1
        print(f"{split_file.name}: kept {n_kept:,}, dropped {n_dropped:,} with >{max_paragraphs} paragraphs → {out_file}")


def preprocess_jsonl_paragraphs(input_path: str, output_path: str, text_field: str = "text"):
    """
    Read a raw JSONL file, split the text field on double newlines into
    a list of paragraphs, and write the result as a new JSONL file.

    Output format per line:
        {"document_idx": 0, "text": ["paragraph 1", "paragraph 2", ...], ...}
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = obj.get(text_field, "")
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj[text_field] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")


def _strip_yaml_frontmatter(text: str) -> str:
    """Remove YAML frontmatter (--- ... ---) from the beginning of text."""
    if text.startswith("---"):
        end = text.find("\n---", 3)
        if end != -1:
            text = text[end + 4:].lstrip("\n")
    return text


def preprocess_pubmed(input_path: str, output_path: str):
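    """
    Split each PubMed record's text into paragraphs on double newlines,
    stripping any leading YAML frontmatter first, and write the result
    as a new JSONL file.
    """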
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            raw_text = _strip_yaml_frontmatter(obj.get("text", ""))
            paragraphs = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            obj["text"] = paragraphs
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")


def preprocess_wikipedia(input_path: str, output_path: str):
    preprocess_jsonl_paragraphs(input_path, output_path, text_field="text")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preprocess raw data into JSONL.")
    sub = parser.add_subparsers(dest="method", required=True)

    gut = sub.add_parser("gutenberg", help="Convert raw Gutenberg books to JSONL")
    gut.add_argument("--raw_dir", required=True, help="Directory with raw Gutenberg text files")
    gut.add_argument("--output", required=True, help="Output JSONL path")

    ted = sub.add_parser("ted", help="Filter TED talks JSONL by paragraph count")
    ted.add_argument("--input_dir", required=True, help="Directory with TED split JSONL files")
    ted.add_argument("--output_dir", required=True, help="Output directory for preprocessed files")
    ted.add_argument("--max_paragraphs", type=int, default=60,
                     help="Drop talks with more than this many paragraphs (default: 60)")

    pub = sub.add_parser("pubmed", help="Split PubMed text into paragraphs")
    pub.add_argument("--input", required=True, help="Input raw JSONL path")
    pub.add_argument("--output", required=True, help="Output JSONL path")

    wiki = sub.add_parser("wikipedia", help="Split Wikipedia text into paragraphs")
    wiki.add_argument("--input", required=True, help="Input raw JSONL path")
    wiki.add_argument("--output", required=True, help="Output JSONL path")

    args = parser.parse_args()

    if args.method == "gutenberg":
        preprocess_gutenberg(args.raw_dir, args.output)
    elif args.method == "ted":
        preprocess_ted(args.input_dir, args.output_dir, args.max_paragraphs)
    elif args.method == "pubmed":
        preprocess_pubmed(args.input, args.output)
    elif args.method == "wikipedia":
        preprocess_wikipedia(args.input, args.output)