Delete mergelines.py
Browse files- mergelines.py +0 -20
mergelines.py
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
def merge_short_lines(file_path, min_length=32):
    """Merge consecutive short JSON-lines records into longer text records.

    Each non-blank line of *file_path* is parsed as JSON and its ``"text"``
    value is stripped and appended, space-separated, to a pending buffer.
    As soon as the buffered text is at least *min_length* characters long it
    is emitted as one ``{"text": ...}`` record and the buffer is reset. A
    non-empty remainder left at end of file is emitted as a final record,
    which may be shorter than *min_length*.

    Args:
        file_path: Path of a UTF-8 text file with one JSON object per line.
        min_length: Minimum length, in characters, a merged text must reach
            before it is emitted.

    Returns:
        A list of ``{"text": str}`` dicts, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``"text"`` key.
    """
    merged = []
    parts = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; feeding them to json.loads
                # would raise instead of being ignored.
                continue
            parts.append(json.loads(line)["text"].strip())
            # Measure the text exactly as it will be emitted, so separator
            # whitespace at the edges does not count toward min_length.
            text = " ".join(parts).strip()
            if len(text) >= min_length:
                merged.append({"text": text})
                parts = []

    # Flush whatever is left over; call strip() so the record holds the
    # string itself rather than the bound method object.
    remainder = " ".join(parts).strip()
    if remainder:
        merged.append({"text": remainder})

    print(f"Merged {len(merged)} lines")
    return merged
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|