Spaces:
Running
Running
import os
import json

SENTI_AI_ROOT = r"C:\Users\LENOVO\Desktop\senti_ai"
PACKS = ['sentiglobal', 'sentilaw', 'sentibanking', 'sentitax', 'sentibiz', 'senticredit', 'sentirisk', 'senticoach', 'sentiplan', 'sentiwealth', 'sentiinsurance', 'senticommunity', 'senticorporate', 'sentiaccounting']

# Substrings whose presence marks a sampled text as "MESSY".
MARKUP_TOKENS = ("==", "[[", "{{")


def quality_check_line(folder, sample_file):
    """Return the "Quality Check" report line for one sampled JSON file.

    The file is expected to hold an object with the text under
    ``content -> cleaned_text``; a missing key counts as empty text.  The
    text is reported as CLEAN when it contains none of ``MARKUP_TOKENS``.

    Args:
        folder: Directory that contains the sample file.
        sample_file: File name (not a full path) of the JSON file to inspect.

    Returns:
        A single report line: either the CLEAN/MESSY verdict with the
        character count, or a "FAILED TO READ" line carrying the error.
    """
    try:
        # open() is inside the try so an unreadable file is reported instead
        # of aborting the audit for every remaining pack.
        with open(os.path.join(folder, sample_file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        content = data.get('content', {}).get('cleaned_text', '')
        is_clean = not any(token in content for token in MARKUP_TOKENS)
        char_count = len(content)
    except (OSError, ValueError, AttributeError, TypeError) as e:
        # OSError: cannot open/read; ValueError: invalid JSON or bad UTF-8;
        # AttributeError/TypeError: JSON does not have the expected shape.
        return f" - Quality Check: FAILED TO READ ({e})"
    return f" - Quality Check ({sample_file}): {'CLEAN' if is_clean else 'MESSY'} | {char_count} chars"


def audit_pack(root, pack):
    """Build the report lines for one pack's ``cleaned`` folder.

    Args:
        root: Directory that contains one sub-folder per pack.
        pack: Name of the pack sub-folder to audit.

    Returns:
        The list of lines to print for this pack.  When the ``cleaned``
        folder is missing the list holds only the FOLDER MISSING line
        (no trailing separator).
    """
    path = os.path.join(root, pack, "cleaned")
    # isdir rather than exists: a regular file at this path cannot be listed.
    if not os.path.isdir(path):
        return [f"{pack.ljust(20)} : FOLDER MISSING"]

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # sampled file (the first one) stable between runs.
    all_files = sorted(f for f in os.listdir(path) if f.endswith('.json'))
    global_files = [f for f in all_files if f.startswith('global_')]
    wild_files = [f for f in all_files if f.startswith('wild_')]
    other_files = [f for f in all_files if not f.startswith(('global_', 'wild_'))]

    lines = [
        f"{pack.ljust(20)} : {len(all_files)} total records",
        f" - Global Crawl: {len(global_files)}",
        f" - Wild Ingests: {len(wild_files)}",
        f" - Base/Kaggle : {len(other_files)}",
    ]

    # Cleanliness check (Sample 1)
    if all_files:
        lines.append(quality_check_line(path, all_files[0]))

    lines.append("-" * 30)
    return lines


def run_audit(root=SENTI_AI_ROOT, packs=None):
    """Print the deep data audit report for every pack under ``root``.

    Args:
        root: Directory that contains the pack sub-folders.
        packs: Iterable of pack names; defaults to the module-level ``PACKS``.
    """
    print("=== DEEP DATA AUDIT REPORT ===")
    for p in (PACKS if packs is None else packs):
        for line in audit_pack(root, p):
            print(line)


# Executed at module level, exactly as the original flat script was.
run_audit()