MuratcanKoylan's picture
Upload folder using huggingface_hub
685d968 verified
import json
import sys
from typing import List, Dict, Any
def clean_datum(item: Dict[str, Any]) -> Dict[str, Any]:
"""
Clean a single data item:
1. Remove 'none' if other categories are present.
2. Deduplicate categories.
3. Ensure consistent formatting.
"""
if "labels" not in item or "categories" not in item["labels"]:
return item
cats = item["labels"]["categories"]
# Deduplicate
cats = list(set(cats))
# Remove 'none' if other categories exist
if len(cats) > 1 and "none" in cats:
cats.remove("none")
# Update the item
item["labels"]["categories"] = cats
return item
def clean_file(input_path: str, output_path: str):
print(f"Cleaning {input_path} -> {output_path}")
cleaned_count = 0
data = []
# Read input
with open(input_path, 'r') as f:
content = f.read().strip()
if not content:
print("Empty file")
return
# Handle JSONL or list of JSON
if content.startswith('[') and content.endswith(']'):
raw_data = json.loads(content)
else:
raw_data = [json.loads(line) for line in content.split('\n') if line.strip()]
# Process
for item in raw_data:
original_cats = item.get("labels", {}).get("categories", [])
cleaned_item = clean_datum(item)
new_cats = cleaned_item["labels"]["categories"]
if set(original_cats) != set(new_cats):
cleaned_count += 1
data.append(cleaned_item)
# Write output (always as JSONL for training)
with open(output_path, 'w') as f:
for item in data:
f.write(json.dumps(item) + '\n')
print(f"Processed {len(data)} items. Cleaned {cleaned_count} items (removed 'none' or duplicates).")
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python clean_data.py input_file [output_file]")
sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2] if len(sys.argv) > 2 else input_file.replace('.json', '_cleaned.jsonl').replace('.jsonl', '_cleaned.jsonl')
clean_file(input_file, output_file)