MiniGPT / localscripts /mergelines.py
CreatedNull's picture
Upload folder using huggingface_hub
b127d35 verified
raw
history blame contribute delete
597 Bytes
import json
def merge_short_lines(file_path, min_length=32):
    """Merge consecutive short JSONL records into longer text entries.

    Every non-blank line of ``file_path`` is parsed as a JSON object and the
    stripped value of its ``"text"`` key is added to a running buffer, with
    pieces separated by a single space. As soon as the buffer reaches
    ``min_length`` characters it is emitted as one merged entry and the
    buffer is reset. Whatever remains after the last line is emitted as a
    final entry if it is non-empty.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON object per
            line; each object must have a string ``"text"`` key.
        min_length: Buffer length at which a merged entry is emitted. The
            length counts one separator character per buffered piece
            (including the first).

    Returns:
        A list of ``{"text": merged_string}`` dictionaries, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    merged = []
    parts = []
    # Tracks the length the space-prefixed concatenation of ``parts`` would
    # have, so the threshold test does not require rebuilding the string.
    buffered_len = 0

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            text = json.loads(line)["text"].strip()
            parts.append(text)
            buffered_len += len(text) + 1
            if buffered_len >= min_length:
                merged.append({"text": " ".join(parts).strip()})
                parts = []
                buffered_len = 0

    # Flush the leftover pieces that never reached the threshold.
    tail = " ".join(parts).strip()
    if tail:
        merged.append({"text": tail})

    print(f"Merged {len(merged)} lines")
    return merged