taraky's picture
Upload folder using huggingface_hub
b7f3196 verified
raw
history blame
811 Bytes
import json, pathlib
from .data_schemas import Doc
def load_jsonl(path: str, text_fields=("question", "answer")):
    """Load a JSON Lines file into a list of ``Doc`` records.

    Each non-blank line of the file is parsed as one JSON object (a "row").
    The document text is the row's ``"text"`` value when that key is present
    and truthy; otherwise it is the values of ``text_fields`` joined with a
    single space and stripped of surrounding whitespace.

    Args:
        path: Location of the ``.jsonl`` file.
        text_fields: Row keys whose values are joined to form the text when
            the row has no usable ``"text"`` value. Missing or ``null``
            fields contribute an empty string; non-string values are
            converted with ``str()``.

    Returns:
        A list with one ``Doc`` per non-blank line, in file order. Each
        ``Doc`` is built with ``id`` (the row's ``"id"`` as a string, or
        ``"<file stem>:<zero-based line index>"`` when absent), ``text``,
        ``title`` (the row's ``"title"``, else ``"category"``, else ``""``)
        and ``meta`` (the full parsed row).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    p = pathlib.Path(path)
    docs = []
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
    # which json.loads would otherwise reject on the first line.
    with p.open(encoding="utf-8-sig") as f:
        # `i` counts physical lines (blank ones included) so fallback ids
        # always point at the real line position in the file.
        for i, line in enumerate(f):
            # Blank lines (typically a trailing newline at end of file) are
            # not JSON documents; skip them instead of failing the load.
            if not line.strip():
                continue
            row = json.loads(line)
            # Collect fields; allow either "text" or joined fields
            if "text" in row and row["text"]:
                combined = row["text"]
            else:
                parts = []
                for tf in text_fields:
                    value = row.get(tf)
                    # A JSON null or missing key counts as empty; numbers and
                    # other scalars are stringified so the join cannot raise.
                    parts.append("" if value is None else str(value))
                combined = " ".join(parts).strip()
            title = row.get("title") or row.get("category") or ""
            docs.append(Doc(
                id=str(row.get("id", f"{p.stem}:{i}")),
                text=combined,
                title=title,
                meta=row,
            ))
    return docs