File size: 1,367 Bytes
597fb81
 
01458ca
 
 
 
 
597fb81
 
01458ca
 
 
 
 
 
597fb81
01458ca
597fb81
 
 
 
 
 
 
01458ca
 
 
597fb81
01458ca
 
 
 
597fb81
 
01458ca
 
 
 
597fb81
 
 
 
01458ca
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import json
import re
from pathlib import Path

# Base directory that data filenames are joined onto (see load_json).
# It is a relative path, so it resolves against the current working directory.
DATA_DIR = Path("data")

def load_json(filename):
    """Read ``DATA_DIR / filename`` as UTF-8 JSON and return its records.

    The shape of the decoded document decides what is returned:

    * a top-level list is returned unchanged;
    * a top-level object holding a ``"results"`` key yields that key's value;
    * anything else yields an empty list.
    """
    source = DATA_DIR / filename
    with open(source, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "results" in payload:
        return payload["results"]
    return []

def extract_text(item):
    """Gather the text-bearing values of a record as a list of strings.

    The keys ``text``, ``description``, ``body``, ``content`` and ``name``
    are checked in that order; every truthy value found is converted with
    ``str``. If ``content_json`` holds a dict, each of its values that is a
    string with non-whitespace content is appended afterwards, unmodified.
    """
    primary_keys = ("text", "description", "body", "content", "name")
    collected = [
        str(item[key]) for key in primary_keys if key in item and item[key]
    ]

    nested = item["content_json"] if "content_json" in item else None
    if isinstance(nested, dict):
        collected.extend(
            value
            for value in nested.values()
            if isinstance(value, str) and value.strip()
        )
    return collected

def chunk_text(text, max_words=80, min_words=5):
    """Split text into sentence-aligned chunks of at most ``max_words`` words.

    The text is cut after sentence-ending punctuation (``.``, ``!``, ``?``)
    followed by any run of whitespace, so newline- or tab-separated
    sentences are recognised as well as space-separated ones. Sentences are
    then packed greedily, in order, into chunks joined by single spaces.

    Args:
        text: The text to split. An empty or ``None`` value yields ``[]``.
        max_words: Upper bound on the number of words per chunk.
        min_words: Sentences with fewer words than this are discarded as
            fragments. Empty sentences are always discarded.

    Returns:
        A list of chunk strings, each containing at most ``max_words`` words.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")
    if not text:
        return []

    # Build the packing units: whole sentences where they fit, otherwise
    # fixed-size word windows so that no single unit can exceed the limit.
    units = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        words = sentence.split()
        if not words or len(words) < min_words:
            continue
        if len(words) <= max_words:
            units.append((sentence, len(words)))
            continue
        # The fragment filter was applied to the whole sentence above, so a
        # short trailing window is kept rather than silently dropped.
        for start in range(0, len(words), max_words):
            window = words[start:start + max_words]
            units.append((" ".join(window), len(window)))

    chunks, current, count = [], [], 0
    for unit, size in units:
        # Close the running chunk before it would overflow the word budget.
        if current and count + size > max_words:
            chunks.append(" ".join(current))
            current, count = [], 0
        current.append(unit)
        count += size
    if current:
        chunks.append(" ".join(current))
    return chunks