import json
import os
import re

SENTI_AI_ROOT = r"C:\Users\LENOVO\Desktop\senti_ai"

# Compiled once at import time; applied in this order by _strip_wiki_artifacts.
# 1. A trailing-section heading and everything after it (DOTALL: to end of text).
_TRAILING_SECTIONS_RE = re.compile(
    r'==\s*(See also|References|Further reading|External links|Notes)\s*==.*',
    flags=re.DOTALL | re.IGNORECASE,
)
# 2. Any remaining "== heading ==" marker on a single line.
_HEADING_RE = re.compile(r'==.*?==')
# 3. "[[target|label]]" or "[[label]]" links, keeping only the label.
_WIKI_LINK_RE = re.compile(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]')
# 4. "{{...}}" templates (non-nested).
_TEMPLATE_RE = re.compile(r'\{\{[^\}]*\}\}')
# 5. Leftover "Ref<digits>" markers.
_REF_MARKER_RE = re.compile(r'Ref[0-9]+')
# 6. Runs of whitespace, collapsed to one space.
_WHITESPACE_RE = re.compile(r'\s+')


def _strip_wiki_artifacts(text):
    """Return *text* with leftover wiki markup removed and whitespace collapsed."""
    text = _TRAILING_SECTIONS_RE.sub('', text)
    text = _HEADING_RE.sub('', text)
    text = _WIKI_LINK_RE.sub(r'\1', text)
    text = _TEMPLATE_RE.sub('', text)
    text = _REF_MARKER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def _remove_file(file_path):
    """Delete *file_path*; return True on success, False if the OS refused."""
    try:
        os.remove(file_path)
    except OSError as e:
        # A locked or protected file must not abort the rest of the sweep.
        print(f"Could not delete {file_path} (Error: {e})")
        return False
    return True


def validate_and_sanitize(root=None):
    """Sweep every ``senti*/cleaned`` directory under *root* and sanitize it.

    For each regular file directly inside ``<root>/senti*/cleaned``:

    * files whose name does not end in ``.json`` are deleted;
    * ``.json`` files that cannot be parsed, or whose payload does not have
      the expected ``{"content": {"cleaned_text": <str>}}`` shape, are deleted;
    * valid files have ``content.cleaned_text`` re-cleaned and are rewritten
      only when the cleaned text differs from what was stored.

    Subdirectories inside ``cleaned`` are left untouched. Files that cannot
    be read or rewritten because of an OS error are reported and kept, since
    an I/O failure says nothing about whether the JSON is valid.

    Args:
        root: Directory to sweep. Defaults to ``SENTI_AI_ROOT``.

    Raises:
        OSError: If *root* itself cannot be listed.
    """
    if root is None:
        root = SENTI_AI_ROOT

    print("=== FINAL DATA SANITIZATION SWEEP ===")

    total_cleaned = 0
    total_deleted = 0

    for folder in os.listdir(root):
        # os.listdir yields bare names, so the directory test must be made
        # against the full path rather than the current working directory.
        folder_path = os.path.join(root, folder)
        if not (folder.startswith("senti") and os.path.isdir(folder_path)):
            continue

        cleaned_dir = os.path.join(folder_path, "cleaned")
        if not os.path.isdir(cleaned_dir):
            continue

        for file in os.listdir(cleaned_dir):
            file_path = os.path.join(cleaned_dir, file)

            # os.remove cannot delete directories; skip anything that is
            # not a regular file instead of crashing mid-sweep.
            if not os.path.isfile(file_path):
                continue

            # Delete non-json files
            if not file.endswith(".json"):
                print(f"Deleting non-JSON file: {file_path}")
                if _remove_file(file_path):
                    total_deleted += 1
                continue

            # Validate JSON and content
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                content = data.get('content', {}).get('cleaned_text', '')
                new_content = _strip_wiki_artifacts(content)
            except (ValueError, AttributeError, TypeError) as e:
                # ValueError covers JSONDecodeError and UnicodeDecodeError;
                # AttributeError/TypeError mean the payload is not the
                # expected object-of-objects with a string cleaned_text.
                print(f"Deleting invalid JSON: {file_path} (Error: {e})")
                if _remove_file(file_path):
                    total_deleted += 1
                continue
            except OSError as e:
                print(f"Skipping unreadable file: {file_path} (Error: {e})")
                continue

            if new_content == content:
                continue

            data['content']['cleaned_text'] = new_content
            # Serialize before opening for write so the file is never
            # truncated unless there is a complete payload to put in it.
            payload = json.dumps(data, indent=2)
            try:
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(payload)
            except OSError as e:
                print(f"Could not rewrite {file_path} (Error: {e})")
                continue
            total_cleaned += 1

    print("\n=== SWEEP COMPLETED ===")
    print(f"Files re-sanitized: {total_cleaned}")
    print(f"Invalid files deleted: {total_deleted}")


if __name__ == "__main__":
    validate_and_sanitize()