File size: 1,525 Bytes
#!/usr/bin/env python3
"""Concatenate three translation JSON files and remove instances with any null attribute."""
import json
from pathlib import Path
# Directory the input shards listed in FILES are read from.
INPUT_DIR = Path(
    "/home/mshahidul/readctrl/data/translated_data/translation_testing_3396"
)

# Directory and full path of the merged output file.
OUTPUT_DIR = Path("/home/mshahidul/readctrl/data/translated_data")
OUTPUT_FILE = OUTPUT_DIR.joinpath("translation_testing_3396_merged.json")

# Input shard file names, processed (and therefore concatenated) in this order.
FILES = [
    "multiclinsum_test_en2bn_gemma(0_1000)_3396.json",
    "multiclinsum_test_en2bn_gemma(1000_2000)_3396.json",
    "multiclinsum_test_en2bn_gemma(2000_3396)_3396.json",
]

# Keys that has_any_null() checks on every record.
REQUIRED_ATTRS = [
    "id",
    "fulltext",
    "summary",
    "translated_fulltext",
    "translated_summary",
]
def has_any_null(obj, attrs=None):
    """Return True if any required attribute of *obj* is None/null.

    A key that is absent from *obj* counts as null as well, because
    ``obj.get`` returns None for it.

    Args:
        obj: Record exposing ``.get(key)`` (e.g. a dict decoded from JSON).
        attrs: Optional iterable of keys to check. When omitted (None), the
            module-level REQUIRED_ATTRS list is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        True when at least one checked key is missing or None, else False.
        An empty *attrs* iterable yields False.
    """
    if attrs is None:
        attrs = REQUIRED_ATTRS
    # `is None` (not truthiness) so that 0, "" and [] count as present values.
    return any(obj.get(attr) is None for attr in attrs)
def main():
    """Merge the shards named in FILES into OUTPUT_FILE, skipping null records.

    Every shard is loaded from INPUT_DIR as JSON. Records for which
    has_any_null() returns True are counted and left out; the rest are
    appended in shard order. The merged list is written to OUTPUT_FILE and a
    three-line summary is printed.
    """
    merged = []
    removed = 0

    for shard_name in FILES:
        with (INPUT_DIR / shard_name).open("r", encoding="utf-8") as src:
            records = json.load(src)

        complete = [rec for rec in records if not has_any_null(rec)]
        # Whatever did not survive the filter was rejected for a null field.
        removed += len(records) - len(complete)
        merged.extend(complete)

    # Create the destination directory on demand before writing.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with OUTPUT_FILE.open("w", encoding="utf-8") as dst:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json.dump(merged, dst, ensure_ascii=False, indent=4)

    print(f"Total instances: {len(merged)}")
    print(f"Removed (null in any attr): {removed}")
    print(f"Saved to: {OUTPUT_FILE}")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|