File size: 597 Bytes
b127d35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import json
def merge_short_lines(file_path, min_length: int = 32) -> list:
    """Merge consecutive short records of a JSON Lines file into longer ones.

    Each non-blank line of ``file_path`` is parsed as a JSON object and its
    ``"text"`` value is stripped and appended, space-separated, to a pending
    buffer. Once the buffer reaches ``min_length`` it is emitted as one
    ``{"text": ...}`` record and the buffer is reset. Any non-empty text left
    over at end of file is emitted as a final record.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file to read.
        min_length: Buffer length at which a merged record is emitted. The
            length counts one separator character per appended record, so a
            lone record of ``min_length - 1`` characters is already emitted.

    Returns:
        A list of ``{"text": str}`` dicts, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``"text"`` key.
    """
    merged = []
    parts = []
    # Running length of the pending text, counting one separating space per
    # part (the same measure as concatenating " " + text for every record).
    pending_len = 0

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record;
                # passing them to json.loads would raise.
                continue
            text = json.loads(line)["text"].strip()
            parts.append(text)
            pending_len += len(text) + 1
            if pending_len >= min_length:
                merged.append({"text": " ".join(parts).strip()})
                parts = []
                pending_len = 0

    # Flush whatever never reached the threshold, unless it is only whitespace.
    leftover = " ".join(parts).strip()
    if leftover:
        merged.append({"text": leftover})

    print(f"Merged {len(merged)} lines")
    return merged