Spaces:
Running
Running
| import os | |
| import json | |
| import re | |
# Root directory whose "senti*" sub-folders are scanned by
# validate_and_sanitize(). The default is the original hard-coded Windows
# path; set the SENTI_AI_ROOT environment variable to run the sweep against a
# different location without editing this file. An empty variable is ignored.
SENTI_AI_ROOT = os.environ.get("SENTI_AI_ROOT") or r"C:\Users\LENOVO\Desktop\senti_ai"
| def validate_and_sanitize(): | |
| print("=== FINAL DATA SANITIZATION SWEEP ===") | |
| total_cleaned = 0 | |
| total_deleted = 0 | |
| for folder in os.listdir(SENTI_AI_ROOT): | |
| if os.path.isdir(folder) and folder.startswith("senti"): | |
| cleaned_dir = os.path.join(SENTI_AI_ROOT, folder, "cleaned") | |
| if not os.path.exists(cleaned_dir): continue | |
| for file in os.listdir(cleaned_dir): | |
| file_path = os.path.join(cleaned_dir, file) | |
| # Delete non-json files | |
| if not file.endswith(".json"): | |
| print(f"Deleting non-JSON file: {file_path}") | |
| os.remove(file_path) | |
| total_deleted += 1 | |
| continue | |
| # Validate JSON and content | |
| try: | |
| with open(file_path, 'r', encoding='utf-8') as f: | |
| data = json.load(f) | |
| content = data.get('content', {}).get('cleaned_text', '') | |
| # Extra aggressive cleaning if anything slipped through | |
| # Remove any remaining Wikipedia artifacts | |
| new_content = re.sub(r'==\s*(See also|References|Further reading|External links|Notes)\s*==.*', '', content, flags=re.DOTALL | re.IGNORECASE) | |
| new_content = re.sub(r'==.*?==', '', new_content) | |
| new_content = re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]', r'\1', new_content) | |
| new_content = re.sub(r'\{\{[^\}]*\}\}', '', new_content) | |
| new_content = re.sub(r'Ref[0-9]+', '', new_content) | |
| new_content = re.sub(r'\s+', ' ', new_content).strip() | |
| if new_content != content: | |
| data['content']['cleaned_text'] = new_content | |
| with open(file_path, 'w', encoding='utf-8') as f: | |
| json.dump(data, f, indent=2) | |
| total_cleaned += 1 | |
| except Exception as e: | |
| print(f"Deleting invalid JSON: {file_path} (Error: {e})") | |
| os.remove(file_path) | |
| total_deleted += 1 | |
| print(f"\n=== SWEEP COMPLETED ===") | |
| print(f"Files re-sanitized: {total_cleaned}") | |
| print(f"Invalid files deleted: {total_deleted}") | |
# Script entry point: run the sanitization sweep only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    validate_and_sanitize()