# senti-beta / scratch /deep_data_audit.py
# joseph njoroge kariuki
# Deploy Senti AI to Hugging Face Spaces
# 021e065
import os
import json
SENTI_AI_ROOT = r"C:\Users\LENOVO\Desktop\senti_ai"
PACKS = [
    'sentiglobal', 'sentilaw', 'sentibanking', 'sentitax', 'sentibiz',
    'senticredit', 'sentirisk', 'senticoach', 'sentiplan', 'sentiwealth',
    'sentiinsurance', 'senticommunity', 'senticorporate', 'sentiaccounting',
]

# A sample containing any of these substrings is reported as MESSY.
# NOTE(review): these look like leftover wiki-style markup tokens -- confirm.
_MARKUP_MARKERS = ("==", "[[", "{{")


def _quality_check(path, sample_file):
    """Return the one-line quality report for ``sample_file`` inside ``path``.

    The file is parsed as JSON and the text under ``content.cleaned_text`` is
    scanned for the substrings in ``_MARKUP_MARKERS``.  Any failure while
    opening, decoding, parsing or inspecting the file yields a
    "FAILED TO READ" line instead of raising.
    """
    try:
        # open() lives inside the try so that an unopenable sample (e.g. a
        # directory whose name ends in .json) is reported, not fatal.
        with open(os.path.join(path, sample_file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        content = data.get('content', {}).get('cleaned_text', '')
        is_clean = not any(marker in content for marker in _MARKUP_MARKERS)
        char_count = len(content)
    except Exception as e:
        # Deliberately broad: this is a best-effort report, and one bad sample
        # must not stop the audit of the remaining packs.
        return f" - Quality Check: FAILED TO READ ({e})"
    verdict = 'CLEAN' if is_clean else 'MESSY'
    return f" - Quality Check ({sample_file}): {verdict} | {char_count} chars"


def _audit_pack(root, pack):
    """Return the report lines for one pack's ``cleaned`` folder under ``root``.

    A pack whose ``cleaned`` entry is absent or is not a directory produces a
    single "FOLDER MISSING" line.  Otherwise the ``.json`` files are counted by
    filename prefix (``global_``, ``wild_``, anything else), one sample file is
    quality-checked, and a separator line closes the section.
    """
    path = os.path.join(root, pack, "cleaned")
    # isdir rather than exists: a regular file named "cleaned" cannot be listed.
    if not os.path.isdir(path):
        return [f"{pack.ljust(20)} : FOLDER MISSING"]

    # os.listdir returns entries in arbitrary order; sorting makes the sampled
    # file (the first one) the same on every run.
    all_files = sorted(f for f in os.listdir(path) if f.endswith('.json'))
    global_files = [f for f in all_files if f.startswith('global_')]
    wild_files = [f for f in all_files if f.startswith('wild_')]
    other_files = [f for f in all_files if not f.startswith(('global_', 'wild_'))]

    lines = [
        f"{pack.ljust(20)} : {len(all_files)} total records",
        f" - Global Crawl: {len(global_files)}",
        f" - Wild Ingests: {len(wild_files)}",
        f" - Base/Kaggle : {len(other_files)}",
    ]
    # Cleanliness check (Sample 1)
    if all_files:
        lines.append(_quality_check(path, all_files[0]))
    lines.append("-" * 30)
    return lines


def main(root=SENTI_AI_ROOT, packs=PACKS):
    """Print the deep data audit report for every pack in ``packs``.

    Args:
        root: Directory that holds one sub-folder per pack.
        packs: Iterable of pack folder names to audit, in report order.
    """
    print("=== DEEP DATA AUDIT REPORT ===")
    for pack in packs:
        for line in _audit_pack(root, pack):
            print(line)


# Executed unconditionally, as in the original flat script, so that both
# running the file and importing it produce the report.
main()