# Deduplicate processed corpus files by MD5 hash of their 'text' content.
import glob
import hashlib
import json
import os
# Root directory of processed documents; every *.json file here is expected
# to hold one record whose 'text' field is read below — TODO confirm schema.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
| def get_hash(text): | |
| """Compute MD5 hash of text.""" | |
| return hashlib.md5(text.encode('utf-8')).hexdigest() | |
def main(directory=None):
    """Remove duplicate JSON files in *directory* by content hash.

    Scans ``*.json`` files, hashes each file's ``'text'`` field with
    :func:`get_hash`, and deletes every file whose text was already seen
    in an earlier file of the scan.

    Args:
        directory: Folder to scan. Defaults to ``PROCESSED_DIR`` so the
            existing zero-argument ``main()`` call behaves as before.

    Returns:
        int: Number of duplicate files removed.
    """
    if directory is None:
        directory = PROCESSED_DIR
    # Sort so that which file survives in each duplicate group is
    # deterministic — raw glob order is filesystem-dependent.
    files = sorted(glob.glob(os.path.join(directory, "*.json")))
    seen_hashes = set()
    removed_count = 0
    print(f"Checking {len(files)} processed files for duplicates...")
    for filepath in files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            content_hash = get_hash(data['text'])
        except (json.JSONDecodeError, KeyError, OSError) as e:
            # One corrupt/unreadable file must not abort the whole run.
            print(f"Skipping unreadable file {os.path.basename(filepath)}: {e}")
            continue
        if content_hash in seen_hashes:
            print(f"Removing duplicate: {os.path.basename(filepath)}")
            os.remove(filepath)
            removed_count += 1
        else:
            seen_hashes.add(content_hash)
    print(f"Deduplication complete. Removed {removed_count} duplicate files.")
    return removed_count


if __name__ == "__main__":
    main()