| import json | |
def merge_short_lines(file_path, min_length=32):
    """Merge consecutive short ``text`` records of a JSON Lines file.

    Each non-blank line of the file is parsed as a JSON object and its
    ``"text"`` value is appended (space-separated) to a running buffer.
    Once the buffer holds at least ``min_length`` characters it is emitted
    as one ``{"text": ...}`` record and the buffer is reset. Whatever is
    left in the buffer at end of file is emitted as a final record.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file to read.
        min_length: Minimum length, in characters, of the merged text
            (measured without surrounding whitespace) before it is emitted.

    Returns:
        A list of ``{"text": str}`` dicts in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed record has no ``"text"`` key.
    """
    merged = []
    buffer = ""
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            text = json.loads(line)["text"].strip()
            # Strip after joining so the separator never counts toward the
            # threshold and empty texts cannot grow the buffer.
            buffer = f"{buffer} {text}".strip()
            if len(buffer) >= min_length:
                merged.append({"text": buffer})
                buffer = ""
    if buffer:
        # Flush the remainder as a string (call strip-equivalent value,
        # never the bound method object).
        merged.append({"text": buffer})
    print(f"Merged {len(merged)} lines")
    return merged