# senti-beta / scratch / final_sanitization.py
# joseph njoroge kariuki
# Deploy Senti AI to Hugging Face Spaces
# 021e065
import os
import json
import re
# Directory that validate_and_sanitize() scans for sub-folders whose names
# start with "senti".
# NOTE(review): hard-coded, machine-specific Windows path — confirm it is
# valid on every machine/environment where this script is run.
SENTI_AI_ROOT = r"C:\Users\LENOVO\Desktop\senti_ai"
def _strip_wiki_artifacts(text):
    """Return *text* with leftover wiki markup removed and whitespace collapsed."""
    # Drop everything from the first trailing-section heading to the end.
    text = re.sub(
        r'==\s*(See also|References|Further reading|External links|Notes)\s*==.*',
        '',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Remove any remaining "== heading ==" markers.
    text = re.sub(r'==.*?==', '', text)
    # Replace [[target|label]] / [[target]] links with their visible text.
    text = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]', r'\1', text)
    # Remove {{template}} calls.
    text = re.sub(r'\{\{[^\}]*\}\}', '', text)
    # Remove "Ref<digits>" markers.
    text = re.sub(r'Ref[0-9]+', '', text)
    # Collapse every whitespace run to a single space.
    return re.sub(r'\s+', ' ', text).strip()


def validate_and_sanitize(root=None):
    """Run a final clean-up sweep over every ``senti*/cleaned`` directory.

    For each sub-folder of *root* whose name starts with ``"senti"`` and that
    contains a ``cleaned`` directory:

    * regular files that do not end in ``.json`` are deleted;
    * ``.json`` files that cannot be parsed, or whose structure is not the
      expected ``{"content": {"cleaned_text": <str>}}`` shape, are deleted;
    * valid files have ``content.cleaned_text`` stripped of leftover wiki
      markup and are rewritten only when the text actually changed.

    Progress and a final summary are printed to stdout.

    Args:
        root: Directory to scan. Defaults to the module-level
            ``SENTI_AI_ROOT`` when omitted or ``None``.

    Returns:
        None.
    """
    if root is None:
        root = SENTI_AI_ROOT

    print("=== FINAL DATA SANITIZATION SWEEP ===")

    total_cleaned = 0
    total_deleted = 0

    for folder in os.listdir(root):
        folder_path = os.path.join(root, folder)
        # os.listdir() yields bare names, so the directory test must use the
        # joined path; testing the bare name would resolve against the
        # current working directory instead of *root*.
        if not (folder.startswith("senti") and os.path.isdir(folder_path)):
            continue

        cleaned_dir = os.path.join(folder_path, "cleaned")
        if not os.path.isdir(cleaned_dir):
            continue

        for file in os.listdir(cleaned_dir):
            file_path = os.path.join(cleaned_dir, file)

            # os.remove() cannot delete directories; leave anything that is
            # not a regular file alone instead of crashing the sweep.
            if not os.path.isfile(file_path):
                continue

            # Delete non-json files
            if not file.endswith(".json"):
                print(f"Deleting non-JSON file: {file_path}")
                os.remove(file_path)
                total_deleted += 1
                continue

            # Validate JSON and content
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                content = data.get('content', {}).get('cleaned_text', '')
                # Extra aggressive cleaning if anything slipped through
                new_content = _strip_wiki_artifacts(content)
            except OSError as e:
                # An I/O failure says nothing about the file's validity, so
                # the data is kept rather than destroyed.
                print(f"Skipping unreadable file: {file_path} (Error: {e})")
                continue
            except (ValueError, AttributeError, TypeError) as e:
                # ValueError covers JSONDecodeError and UnicodeDecodeError;
                # AttributeError/TypeError mean the JSON has the wrong shape.
                print(f"Deleting invalid JSON: {file_path} (Error: {e})")
                os.remove(file_path)
                total_deleted += 1
                continue

            if new_content == content:
                continue

            data['content']['cleaned_text'] = new_content
            # Serialise before opening for write so a serialisation problem
            # can never leave a truncated file behind.
            payload = json.dumps(data, indent=2)
            try:
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(payload)
            except OSError as e:
                print(f"Could not rewrite file: {file_path} (Error: {e})")
                continue
            total_cleaned += 1

    print("\n=== SWEEP COMPLETED ===")
    print(f"Files re-sanitized: {total_cleaned}")
    print(f"Invalid files deleted: {total_deleted}")
# Run the sweep only when this file is executed as a script, not on import.
if __name__ == "__main__":
    validate_and_sanitize()