"""Helpers for loading JSON records from ``DATA_DIR`` and chunking their text."""

import json
import re
from pathlib import Path

DATA_DIR = Path("data")

# A sentence boundary is any run of whitespace (spaces, tabs, newlines) that
# follows '.', '!' or '?'. The look-behind keeps the punctuation attached to
# the sentence that precedes the boundary.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


def load_json(filename):
    """Load a JSON file from ``DATA_DIR`` and return its list of records.

    Args:
        filename: Path of the file, joined onto ``DATA_DIR``.

    Returns:
        The list stored under the top-level ``"results"`` key when the
        document is an object containing that key, or the document itself
        when it is a top-level list. Any other shape (including a
        ``"results"`` value that is not a list) yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(DATA_DIR / filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict) and "results" in data:
        data = data["results"]
    # Apply the list check *after* unwrapping so a malformed "results" value
    # (null, object, scalar) cannot leak out of a function that promises a list.
    return data if isinstance(data, list) else []


def extract_text(item):
    """Extract the textual fields of a JSON record.

    Args:
        item: A record as decoded from JSON. Non-dict values carry no named
            fields and produce an empty result.

    Returns:
        A list of strings: the truthy values of the ``text``,
        ``description``, ``body``, ``content`` and ``name`` keys (in that
        order, converted with ``str``), followed by every non-blank string
        value found directly inside a dict-valued ``content_json`` key.
    """
    if not isinstance(item, dict):
        return []

    texts = [
        str(item[key])
        for key in ("text", "description", "body", "content", "name")
        if key in item and item[key]
    ]

    nested = item.get("content_json")
    if isinstance(nested, dict):
        # Only direct string values are collected; nested containers are not
        # traversed.
        texts.extend(
            value
            for value in nested.values()
            if isinstance(value, str) and value.strip()
        )
    return texts


def chunk_text(text, max_words=80, min_words=5):
    """Split long text into chunks of whole sentences.

    Sentences are detected at whitespace following ``.``, ``!`` or ``?``.
    Consecutive sentences are packed into a chunk until adding the next one
    would exceed ``max_words``; the sentences of a chunk are joined with a
    single space.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_words: Word budget per chunk. A single sentence longer than this
            is still emitted whole, as its own chunk.
        min_words: Sentences with fewer words than this are discarded.

    Returns:
        A list of chunk strings, in their original order.
    """
    if not text:
        return []

    chunks = []
    current = []
    current_words = 0
    for sentence in _SENTENCE_BOUNDARY.split(text):
        n_words = len(sentence.split())
        if n_words < min_words:
            continue
        # Flush only when something is already buffered, so an over-long
        # sentence never produces an empty chunk.
        if current and current_words + n_words > max_words:
            chunks.append(" ".join(current))
            current, current_words = [sentence], n_words
        else:
            current.append(sentence)
            current_words += n_words

    if current:
        chunks.append(" ".join(current))
    return chunks