File size: 2,982 Bytes
511a4f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
daafb32
511a4f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
Verify chunk quality across the full dataset.
Run this before embedding to catch any data issues early.
"""


import json
from pathlib import Path
from config.settings import CHUNKS_DIR
from src.utils.logger import setup_logger, get_logger


# Initialise logging through the project's helper, then create the logger
# used by this module. Both calls run at import time.
# NOTE(review): what setup_logger() configures (handlers, levels, sinks) is
# defined in src.utils.logger and not visible here — confirm it is safe to
# call more than once if other scripts also call it.
setup_logger()
logger = get_logger(__name__)



def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def main():
    """Print a chunk quality report and evaluate the quality gates.

    Loads every ``*_semantic.json`` file found directly in ``CHUNKS_DIR``.
    Each file is iterated as a collection of chunk mappings; ``word_count``
    and ``text`` are read from every chunk, and ``paper_id`` and
    ``chunking_strategy`` are additionally read from the first three chunks,
    which are shown as samples.

    The report (totals, average words per chunk, share of tiny / giant
    chunks, share of chunks ending in sentence punctuation) and the
    pass/fail status of each gate are written to stdout.

    An empty dataset (no files, or files without chunks) is reported with
    zeroed statistics and failing gates instead of raising
    ``ZeroDivisionError``.

    Returns:
        None.

    Raises:
        KeyError: if a chunk lacks one of the keys listed above.
        json.JSONDecodeError: if a chunk file is not valid JSON.
    """
    # Sorted because glob order depends on the filesystem; without it the
    # sample chunks (taken from the first files read) vary between runs.
    chunk_files = sorted(CHUNKS_DIR.glob("*_semantic.json"))
    logger.info(f"Checking {len(chunk_files)} chunk files...")
    if not chunk_files:
        logger.warning(f"No *_semantic.json files found in {CHUNKS_DIR}")

    total_chunks = 0
    total_words = 0
    tiny_chunks = 0      # < 50 words
    giant_chunks = 0     # > 600 words
    clean_endings = 0    # text ends with '.', '!' or '?'
    sample_chunks = []   # first few chunks, kept for display

    for chunk_file in chunk_files:
        with open(chunk_file, encoding="utf-8") as f:
            chunks = json.load(f)

        for chunk in chunks:
            total_chunks += 1
            word_count = chunk["word_count"]
            total_words += word_count

            if word_count < 50:
                tiny_chunks += 1
            if word_count > 600:
                giant_chunks += 1
            if chunk["text"].rstrip().endswith((".", "!", "?")):
                clean_endings += 1

            if len(sample_chunks) < 3:
                sample_chunks.append(chunk)

    # Every ratio goes through _safe_div so an empty dataset yields 0.0
    # rather than a ZeroDivisionError.
    avg_words = _safe_div(total_words, total_chunks)
    tiny_pct = _safe_div(100 * tiny_chunks, total_chunks)
    giant_pct = _safe_div(100 * giant_chunks, total_chunks)
    clean_pct = _safe_div(100 * clean_endings, total_chunks)

    rule = "=" * 55

    print(f"\n{rule}")
    print("  CHUNK QUALITY REPORT")
    print(rule)
    print(f"  Total chunk files:     {len(chunk_files)}")
    print(f"  Total chunks:          {total_chunks:,}")
    print(f"  Avg words per chunk:   {avg_words:.0f}")
    print(f"  Tiny chunks (<50w):    {tiny_chunks} ({tiny_pct:.1f}%)")
    print(f"  Giant chunks (>600w):  {giant_chunks} ({giant_pct:.1f}%)")
    print(f"  Clean endings:         {clean_endings} ({clean_pct:.1f}%)")
    print()

    print("  SAMPLE CHUNKS:")
    print(f"  {'-' * 50}")
    for i, chunk in enumerate(sample_chunks, start=1):
        # Flatten newlines so each preview stays on a single report line.
        preview = chunk["text"][:120].replace("\n", " ")
        print(f"  [{i}] Paper: {chunk['paper_id']}")
        print(
            f"       Words: {chunk['word_count']}"
            f" | Strategy: {chunk['chunking_strategy']}"
        )
        print(f"       Text: {preview}...")
        print()

    # Quality gates - these thresholds indicate healthy chunking
    print(rule)
    print("  QUALITY GATES")
    print(rule)

    gates = [
        ("Total chunks > 10,000", total_chunks > 10_000),
        ("Avg words 100-400", 100 <= avg_words <= 400),
        ("Tiny chunks < 15%", _safe_div(tiny_chunks, total_chunks) < 0.15),
        ("Clean endings > 70%", _safe_div(clean_endings, total_chunks) > 0.70),
    ]

    for name, passed in gates:
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"  {status}  {name}")
    all_pass = all(passed for _, passed in gates)

    print()

    if all_pass:
        print("  ✅ All quality gates passed. Ready for Phase 6.")
    else:
        print("  ⚠️  Some gates failed. Review before proceeding.")

# Run the quality check only when this file is executed as a script;
# importing the module does not trigger the report.
if __name__ == "__main__":
    main()