"""Remove processed JSON files whose 'text' field duplicates an earlier file."""

import glob
import hashlib
import json
import os

PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"


def get_hash(text):
    """Return the hexadecimal MD5 digest of *text* encoded as UTF-8.

    MD5 is used here only as a content fingerprint for duplicate detection,
    not for any security purpose.
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()


def main(processed_dir=PROCESSED_DIR):
    """Delete ``*.json`` files in *processed_dir* that repeat an earlier 'text'.

    Files are visited in sorted path order, so for any group of files with
    identical 'text' content the one that sorts first is kept and the others
    are deleted. Files that cannot be read, are not valid JSON, or do not hold
    a JSON object with a string 'text' field are reported and left untouched.

    Args:
        processed_dir: Directory to scan. Defaults to ``PROCESSED_DIR``.

    Returns:
        The number of duplicate files that were removed.
    """
    # Escape the directory so characters such as '[' in the path are matched
    # literally; only the "*.json" part is meant to be a pattern.
    pattern = os.path.join(glob.escape(processed_dir), "*.json")
    # glob order is arbitrary; sorting makes the surviving file deterministic.
    files = sorted(glob.glob(pattern))

    seen_hashes = set()
    removed_count = 0

    print(f"Checking {len(files)} processed files for duplicates...")

    for filepath in files:
        name = os.path.basename(filepath)

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping unreadable file: {name} ({exc})")
            continue

        text = data.get("text") if isinstance(data, dict) else None
        if not isinstance(text, str):
            print(f"Skipping file without a string 'text' field: {name}")
            continue

        content_hash = get_hash(text)
        if content_hash not in seen_hashes:
            seen_hashes.add(content_hash)
            continue

        print(f"Removing duplicate: {name}")
        try:
            os.remove(filepath)
        except OSError as exc:
            # Keep going: one undeletable file should not abort the whole run.
            print(f"Could not remove {name}: {exc}")
            continue
        removed_count += 1

    print(f"Deduplication complete. Removed {removed_count} duplicate files.")
    return removed_count


if __name__ == "__main__":
    main()