# chatbot_nihe/src/preprocessing/deduplication.py
# Auto Deploy Script — auto deploy from local machine
# commit: f9b0dca
import os
import json
import hashlib
import glob
# Directory holding the processed per-document JSON files to deduplicate.
# NOTE(review): absolute Windows path — assumes a specific local machine; confirm before deploying elsewhere.
PROCESSED_DIR = "d:/NLP_KMH/Chatbot_NIHE_v2/data/processed"
def get_hash(text):
    """Return the hexadecimal MD5 digest of *text* (UTF-8 encoded).

    Used as a content fingerprint for duplicate detection, not for security.
    """
    encoded = text.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()
def main(processed_dir=None):
    """Delete JSON files whose 'text' content duplicates an earlier file.

    Scans ``processed_dir`` (default: module-level ``PROCESSED_DIR``) for
    ``*.json`` files, computes an MD5 fingerprint of each file's ``text``
    field, and removes any file whose fingerprint was already seen. The
    first file encountered with a given content is kept; which duplicate
    survives depends on ``glob`` ordering.

    Args:
        processed_dir: Directory to scan. ``None`` falls back to
            ``PROCESSED_DIR`` (backward compatible with the old no-arg call).

    Side effects:
        Permanently deletes duplicate files and prints progress to stdout.
    """
    if processed_dir is None:
        processed_dir = PROCESSED_DIR
    files = glob.glob(os.path.join(processed_dir, "*.json"))
    seen_hashes = set()
    removed_count = 0
    print(f"Checking {len(files)} processed files for duplicates...")
    for filepath in files:
        # A single corrupt or schema-less file should not abort the whole
        # run (the original crashed mid-scan, leaving dedup half-done):
        # report it and move on.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            text = data['text']
        except (json.JSONDecodeError, KeyError, TypeError, UnicodeDecodeError, OSError) as e:
            print(f"Skipping unreadable file {os.path.basename(filepath)}: {e}")
            continue
        content_hash = get_hash(text)
        if content_hash in seen_hashes:
            print(f"Removing duplicate: {os.path.basename(filepath)}")
            os.remove(filepath)
            removed_count += 1
        else:
            seen_hashes.add(content_hash)
    print(f"Deduplication complete. Removed {removed_count} duplicate files.")
# Script entry point: run deduplication only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()