Spaces:
Sleeping
Sleeping
File size: 969 Bytes
f9b0dca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import os
import json
import hashlib
import glob
# Directory whose "*.json" files are scanned for duplicate content (hard-coded absolute path).
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
def get_hash(text):
    """Return the hexadecimal MD5 digest of ``text`` after UTF-8 encoding."""
    hasher = hashlib.md5()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()
def main(processed_dir=None):
    """Delete processed JSON files whose ``text`` field duplicates an earlier file.

    Every ``*.json`` file directly inside the target directory is loaded, the
    hash of its ``text`` value is computed with ``get_hash``, and any file
    whose hash has already been seen is removed from disk.

    Args:
        processed_dir: Directory to scan. When omitted, ``PROCESSED_DIR`` is
            used, so the original zero-argument call keeps working.

    Files are visited in sorted path order so the surviving copy is the same
    on every run (``glob.glob`` does not guarantee any ordering). A file that
    cannot be read, is not valid JSON, or lacks a string ``text`` field is
    reported and left in place instead of aborting the run after some files
    have already been deleted.
    """
    if processed_dir is None:
        processed_dir = PROCESSED_DIR

    # Sort so that "first seen" (the copy that is kept) is deterministic.
    files = sorted(glob.glob(os.path.join(processed_dir, "*.json")))
    seen_hashes = set()
    removed_count = 0
    skipped_count = 0

    print(f"Checking {len(files)} processed files for duplicates...")

    for filepath in files:
        name = os.path.basename(filepath)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping unreadable file: {name} ({exc})")
            skipped_count += 1
            continue

        text = data.get('text') if isinstance(data, dict) else None
        if not isinstance(text, str):
            print(f"Skipping file without a string 'text' field: {name}")
            skipped_count += 1
            continue

        content_hash = get_hash(text)
        if content_hash in seen_hashes:
            print(f"Removing duplicate: {name}")
            os.remove(filepath)
            removed_count += 1
        else:
            seen_hashes.add(content_hash)

    print(f"Deduplication complete. Removed {removed_count} duplicate files.")
    if skipped_count:
        print(f"Skipped {skipped_count} files that could not be checked.")
# Run the deduplication pass only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|