File size: 715 Bytes
daafb32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import json
from config.settings import CHUNKS_DIR

# Running totals reported at the end of the script.
fixed_files = 0
fixed_chunks = 0

# Visit every semantic-chunk JSON file and backfill the category field.
for chunk_path in CHUNKS_DIR.glob("*_semantic.json"):
    with open(chunk_path, "r", encoding="utf-8") as src:
        records = json.load(src)

    # Entries whose "primary_category" is absent or falsy (None, "", ...).
    missing = [rec for rec in records if not rec.get("primary_category")]
    if not missing:
        # Nothing to patch: leave the file on disk untouched.
        continue

    # "cs.LG" is applied as a blanket default; no other field of the
    # chunk is consulted to derive a more specific category.
    for rec in missing:
        rec["primary_category"] = "cs.LG"
    fixed_chunks += len(missing)

    # Persist the patched list back to the same path.
    with open(chunk_path, "w", encoding="utf-8") as dst:
        json.dump(records, dst, indent=2, ensure_ascii=False)
    fixed_files += 1

print(f"Fixed {fixed_chunks} chunks across {fixed_files} files")