File size: 1,525 Bytes
030876e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python3
"""Concatenate three translation JSON files and remove instances with any null attribute."""

import json
from pathlib import Path

# Directory holding the three partial translation outputs to be merged.
INPUT_DIR = Path(
    "/home/mshahidul/readctrl/data/translated_data/translation_testing_3396"
)

# The merged result is written into OUTPUT_DIR under a fixed file name.
OUTPUT_DIR = Path("/home/mshahidul/readctrl/data/translated_data")
OUTPUT_FILE = OUTPUT_DIR.joinpath("translation_testing_3396_merged.json")

# Input shards, listed in the order they are concatenated.
FILES = [
    "multiclinsum_test_en2bn_gemma(0_1000)_3396.json",
    "multiclinsum_test_en2bn_gemma(1000_2000)_3396.json",
    "multiclinsum_test_en2bn_gemma(2000_3396)_3396.json",
]

# Keys that must be present and non-null for a record to be kept.
REQUIRED_ATTRS = [
    "id",
    "fulltext",
    "summary",
    "translated_fulltext",
    "translated_summary",
]


def has_any_null(obj, required_attrs=None):
    """Return True if any required attribute of *obj* is None/null.

    A key that is absent from *obj* is treated the same as a key whose value
    is None, because ``obj.get(key)`` returns None for missing keys. Other
    falsy values (empty string, 0, empty list) are NOT considered null.

    Args:
        obj: Record exposing ``.get(key)``, e.g. a dict parsed from JSON.
        required_attrs: Iterable of keys to check. When omitted (None), the
            module-level ``REQUIRED_ATTRS`` list is used.

    Returns:
        True when at least one checked key is missing or maps to None,
        otherwise False.
    """
    if required_attrs is None:
        required_attrs = REQUIRED_ATTRS
    return any(obj.get(attr) is None for attr in required_attrs)


def main():
    """Merge the input shards into one JSON file, dropping incomplete records.

    Each file named in FILES is loaded from INPUT_DIR in order. Records for
    which has_any_null() is true are discarded and counted; the rest are
    appended to a single list that is written to OUTPUT_FILE. A short summary
    is printed at the end.
    """
    kept = []
    dropped = 0

    for shard_name in FILES:
        shard_path = INPUT_DIR / shard_name
        with shard_path.open("r", encoding="utf-8") as handle:
            records = json.load(handle)

        complete = [record for record in records if not has_any_null(record)]
        # Whatever did not survive the filter was removed from this shard.
        dropped += len(records) - len(complete)
        kept.extend(complete)

    # Create the destination directory on first use.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with OUTPUT_FILE.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
        json.dump(kept, handle, ensure_ascii=False, indent=4)

    print(f"Total instances: {len(kept)}")
    print(f"Removed (null in any attr): {dropped}")
    print(f"Saved to: {OUTPUT_FILE}")


# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()