|
|
import json |
|
|
import sys |
|
|
from typing import List, Dict, Any |
|
|
|
|
|
def clean_datum(item: Dict[str, Any]) -> Dict[str, Any]:
    """Clean the category labels of a single data item.

    The categories are read from ``item["labels"]["categories"]`` and:

    1. Deduplicated, keeping the first occurrence of each category so the
       original order is preserved and the output is reproducible.
    2. Stripped of ``"none"`` when at least one other category remains.

    Args:
        item: A data item. It is modified in place: when cleaning applies,
            ``item["labels"]["categories"]`` is rebound to a new list.

    Returns:
        The same ``item`` object. Items without a ``labels`` mapping, without
        a ``categories`` entry, or whose ``categories`` is not a list are
        returned unchanged.
    """
    labels = item.get("labels")
    if not isinstance(labels, dict) or "categories" not in labels:
        return item

    raw_cats = labels["categories"]
    if not isinstance(raw_cats, list):
        # Nothing sensible to deduplicate (e.g. null or a bare string).
        return item

    # dict.fromkeys deduplicates while keeping first-seen order; a plain set
    # would reorder the categories differently from run to run.
    cats = list(dict.fromkeys(raw_cats))

    # "none" is only meaningful on its own; drop it when real categories exist.
    if len(cats) > 1 and "none" in cats:
        cats.remove("none")

    labels["categories"] = cats
    return item
|
|
|
|
|
def clean_file(input_path: str, output_path: str) -> None:
    """Clean every item of a JSON / JSONL file and write the result as JSONL.

    The input is parsed as a single JSON array when its stripped content
    starts with ``[`` and ends with ``]``; otherwise each non-blank line is
    parsed as one JSON document. Every item is passed through
    ``clean_datum`` and written to ``output_path``, one JSON object per line.

    Progress and a summary are printed to stdout. If the input file is empty
    (or whitespace only), "Empty file" is printed and no output is written.

    Args:
        input_path: Path of the JSON or JSONL file to read.
        output_path: Path of the JSONL file to write.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON / JSONL.
    """
    print(f"Cleaning {input_path} -> {output_path}")
    cleaned_count = 0
    data = []

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    if not content:
        print("Empty file")
        return

    if content.startswith('[') and content.endswith(']'):
        raw_data = json.loads(content)
    else:
        raw_data = [json.loads(line) for line in content.split('\n') if line.strip()]

    for item in raw_data:
        # Record the size before cleaning: cleaning only ever removes entries,
        # so a shorter list means duplicates and/or 'none' were dropped.
        # (Comparing sets would miss items that only lost duplicates.)
        original_len = len((item.get("labels") or {}).get("categories") or [])
        cleaned_item = clean_datum(item)
        # Items without labels/categories pass through clean_datum untouched,
        # so they must be read defensively here as well.
        new_len = len((cleaned_item.get("labels") or {}).get("categories") or [])

        if new_len != original_len:
            cleaned_count += 1

        data.append(cleaned_item)

    with open(output_path, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

    print(f"Processed {len(data)} items. Cleaned {cleaned_count} items (removed 'none' or duplicates).")
|
|
|
|
|
def default_output_path(input_file: str) -> str:
    """Derive the default output path for *input_file*.

    A trailing ``.json`` or ``.jsonl`` extension is replaced by
    ``_cleaned.jsonl``; any other file name simply gets ``_cleaned.jsonl``
    appended, so the result never equals the input path.
    """
    import os  # local import: only this helper needs it

    root, ext = os.path.splitext(input_file)
    if ext.lower() not in (".json", ".jsonl"):
        # Unknown or missing extension: keep the full name as the stem.
        root = input_file
    return f"{root}_cleaned.jsonl"


if __name__ == "__main__":
    # Command-line entry point: input_file is required, output_file optional.
    if len(sys.argv) < 2:
        print("Usage: python clean_data.py input_file [output_file]")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else default_output_path(input_file)

    clean_file(input_file, output_file)
|
|
|
|
|
|