"""Deep data audit for the Senti AI packs.

For every pack named in ``PACKS`` the script inspects the directory
``<SENTI_AI_ROOT>/<pack>/cleaned``, counts its ``.json`` records grouped by
filename prefix, and runs a quick quality check on one sample record.
The report is written to stdout.
"""
import json
import os

SENTI_AI_ROOT = r"C:\Users\LENOVO\Desktop\senti_ai"
PACKS = [
    'sentiglobal',
    'sentilaw',
    'sentibanking',
    'sentitax',
    'sentibiz',
    'senticredit',
    'sentirisk',
    'senticoach',
    'sentiplan',
    'sentiwealth',
    'sentiinsurance',
    'senticommunity',
    'senticorporate',
    'sentiaccounting',
]

# Substrings whose presence marks a record as "MESSY".
# NOTE(review): presumably leftover wiki-style markup (headings, links,
# templates) -- confirm against the cleaning pipeline.
_MESSY_MARKERS = ("==", "[[", "{{")


def classify_files(filenames):
    """Split record filenames into ``(global, wild, other)`` lists.

    A name starting with ``global_`` goes to the first list, one starting
    with ``wild_`` to the second, and everything else to the third.  The
    input order is preserved within each list.
    """
    global_files = [f for f in filenames if f.startswith('global_')]
    wild_files = [f for f in filenames if f.startswith('wild_')]
    other_files = [
        f for f in filenames if not f.startswith(('global_', 'wild_'))
    ]
    return global_files, wild_files, other_files


def is_clean_text(text) -> bool:
    """Return True when *text* contains none of the messy markers."""
    return not any(marker in text for marker in _MESSY_MARKERS)


def check_sample(path, sample_file):
    """Return the quality-check report line for one record file.

    The record is expected to be a JSON object whose ``content`` object
    holds a ``cleaned_text`` string; missing keys fall back to an empty
    string.  Any failure to open, decode, parse or interpret the file is
    reported as a ``FAILED TO READ`` line instead of being raised, so one
    bad record cannot abort the audit of the remaining packs.
    """
    try:
        # open() is deliberately inside the try: a missing or locked file
        # must be reported like any other unreadable record.
        with open(os.path.join(path, sample_file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        content = data.get('content', {}).get('cleaned_text', '')
        if not isinstance(content, str):
            # A list or dict would otherwise be "measured" with len() and
            # produce a misleading character count.
            raise TypeError(
                f"cleaned_text is {type(content).__name__}, expected str"
            )
    except (OSError, ValueError, AttributeError, TypeError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
        # AttributeError covers a top-level or ``content`` value that is
        # not a JSON object.
        return f" - Quality Check: FAILED TO READ ({e})"

    status = 'CLEAN' if is_clean_text(content) else 'MESSY'
    return f" - Quality Check ({sample_file}): {status} | {len(content)} chars"


def audit_pack(pack, root=SENTI_AI_ROOT):
    """Return the report lines for a single pack.

    Args:
        pack: Pack directory name (an entry of ``PACKS``).
        root: Directory containing the pack directories.

    Returns:
        A list of lines.  When ``<root>/<pack>/cleaned`` is not a directory
        the list holds only the ``FOLDER MISSING`` line; otherwise it holds
        the record counts, the sample quality check (if any record exists)
        and a closing separator.
    """
    path = os.path.join(root, pack, "cleaned")
    # isdir rather than exists: a plain file named "cleaned" cannot be listed.
    if not os.path.isdir(path):
        return [f"{pack.ljust(20)} : FOLDER MISSING"]

    # os.listdir returns entries in arbitrary order; sorting makes the
    # sampled record (and therefore the whole report) reproducible.
    all_files = sorted(f for f in os.listdir(path) if f.endswith('.json'))
    global_files, wild_files, other_files = classify_files(all_files)

    lines = [
        f"{pack.ljust(20)} : {len(all_files)} total records",
        f" - Global Crawl: {len(global_files)}",
        f" - Wild Ingests: {len(wild_files)}",
        f" - Base/Kaggle : {len(other_files)}",
    ]

    # Cleanliness check (Sample 1)
    if all_files:
        lines.append(check_sample(path, all_files[0]))

    lines.append("-" * 30)
    return lines


def main():
    """Print the audit report for every pack in ``PACKS``."""
    print("=== DEEP DATA AUDIT REPORT ===")
    for pack in PACKS:
        for line in audit_pack(pack):
            print(line)


# Executed at module level, exactly as the original flat script did, so that
# any existing way of launching the audit keeps producing the report.
main()