File size: 7,356 Bytes
90645a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""Calculate total token counts over data/ folder.

Reports three tiers:
  1. Raw           β€” full OpenAlex JSON as-is on disk
  2. Optimised     β€” only the fields the RAG pipeline actually reads
                     (id, title, abstract, authorships, topics, keywords,
                      referenced_works, cited_by_count, publication_year, doi,
                      best_oa_location.pdf_url)
  3. LLM context   β€” what actually reaches the embedding model and the LLM:
                     plain-text "Title: …\\n\\nAbstract: …" per paper
"""

import json
import sys
from pathlib import Path

# data/ lives at the project root, two levels up from utils/
DATA_DIR = Path(__file__).parents[1] / "data"

# Approximate token ratio (1 token β‰ˆ 4 chars for English text β€” GPT/Llama heuristic)
CHARS_PER_TOKEN = 4

# Fields the pipeline actually uses
_KEEP_KEYS = {
    "id", "title", "display_name", "abstract_inverted_index",
    "publication_year", "cited_by_count",
    "authorships", "topics", "keywords", "referenced_works",
    "primary_topic", "doi", "type",
}


def reconstruct_abstract(inv_index: dict) -> str:
    """Rebuild plain-text abstract from OpenAlex inverted index."""
    if not inv_index:
        return ""
    pairs = []
    for word, positions in inv_index.items():
        for pos in positions:
            pairs.append((pos, word))
    pairs.sort()
    return " ".join(w for _, w in pairs)


def slim_authorships(authorships: list) -> list:
    """Keep only author name + position (drop institutions, affiliations, etc.)."""
    return [
        {
            "name": a.get("author", {}).get("display_name", ""),
            "position": a.get("author_position", ""),
        }
        for a in (authorships or [])
    ]


def optimise_paper(raw: dict) -> dict:
    """Strip a raw OpenAlex JSON to only pipeline-relevant fields."""
    optimised = {k: raw[k] for k in _KEEP_KEYS if k in raw}

    # Replace inverted index with plain text abstract
    if "abstract_inverted_index" in optimised:
        optimised["abstract"] = reconstruct_abstract(optimised.pop("abstract_inverted_index"))

    # Slim authorships
    if "authorships" in optimised:
        optimised["authorships"] = slim_authorships(optimised["authorships"])

    # Slim topics to just display_name
    if "topics" in optimised:
        optimised["topics"] = [t.get("display_name", "") for t in (optimised["topics"] or [])]

    # Slim keywords
    if "keywords" in optimised:
        optimised["keywords"] = [k.get("display_name", "") for k in (optimised["keywords"] or [])]

    # Slim referenced_works to just IDs
    if "referenced_works" in optimised:
        optimised["referenced_works"] = [
            r.rsplit("/", 1)[-1] if "/" in r else r
            for r in (optimised["referenced_works"] or [])
        ]

    return optimised


def count_tokens(text: str) -> dict:
    chars = len(text)
    words = len(text.split())
    tokens_est = chars // CHARS_PER_TOKEN
    return {"chars": chars, "words": words, "tokens_est": tokens_est}


def main():
    if not DATA_DIR.exists():
        print(f"ERROR: {DATA_DIR} not found. Expected: {DATA_DIR.resolve()}")
        sys.exit(1)

    files = sorted(DATA_DIR.glob("*.json"))
    total = len(files)
    print(f"Found {total} JSON files in {DATA_DIR.resolve()}\n")

    raw_total = {"chars": 0, "words": 0, "tokens_est": 0, "bytes": 0}
    opt_total = {"chars": 0, "words": 0, "tokens_est": 0, "bytes": 0}
    llm_total = {"chars": 0, "words": 0, "tokens_est": 0, "bytes": 0}

    for f in files:
        raw_text = f.read_text(encoding="utf-8")
        raw_total["bytes"] += len(raw_text.encode("utf-8"))
        stats = count_tokens(raw_text)
        raw_total["chars"] += stats["chars"]
        raw_total["words"] += stats["words"]
        raw_total["tokens_est"] += stats["tokens_est"]

        try:
            data = json.loads(raw_text)

            # Tier 2: optimised JSON (all pipeline-read fields)
            optimised = optimise_paper(data)
            opt_text = json.dumps(optimised, ensure_ascii=False)
            opt_total["bytes"] += len(opt_text.encode("utf-8"))
            opt_stats = count_tokens(opt_text)
            opt_total["chars"] += opt_stats["chars"]
            opt_total["words"] += opt_stats["words"]
            opt_total["tokens_est"] += opt_stats["tokens_est"]

            # Tier 3: LLM context β€” the exact text sent to the embedding model and LLM
            # Mirrors indexer.py _doc_text() and setup.py embed text construction
            title = data.get("title") or ""
            abstract = reconstruct_abstract(data.get("abstract_inverted_index") or {})
            llm_text = f"Title: {title}\n\nAbstract: {abstract}"
            llm_total["bytes"] += len(llm_text.encode("utf-8"))
            llm_stats = count_tokens(llm_text)
            llm_total["chars"] += llm_stats["chars"]
            llm_total["words"] += llm_stats["words"]
            llm_total["tokens_est"] += llm_stats["tokens_est"]

        except json.JSONDecodeError:
            pass

    def fmt(n):
        return f"{n:,}"

    print("=" * 60)
    print("  TIER 1 β€” RAW (full OpenAlex JSON as-is on disk)")
    print("=" * 60)
    print(f"  Files:            {fmt(total)}")
    print(f"  Total bytes:      {fmt(raw_total['bytes'])}  ({raw_total['bytes'] / 1e6:.1f} MB)")
    print(f"  Total chars:      {fmt(raw_total['chars'])}")
    print(f"  Total words:      {fmt(raw_total['words'])}")
    print(f"  Est. tokens:      {fmt(raw_total['tokens_est'])}  (~{raw_total['tokens_est'] / 1e6:.2f}M)")
    print()
    print("=" * 60)
    print("  TIER 2 β€” OPTIMISED (pipeline-relevant fields only)")
    print("  Fields: id, title, abstract, authorships, topics,")
    print("          keywords, referenced_works, doi, year,")
    print("          cited_by_count, best_oa_location.pdf_url")
    print("=" * 60)
    print(f"  Files:            {fmt(total)}")
    print(f"  Total bytes:      {fmt(opt_total['bytes'])}  ({opt_total['bytes'] / 1e6:.1f} MB)")
    print(f"  Total chars:      {fmt(opt_total['chars'])}")
    print(f"  Total words:      {fmt(opt_total['words'])}")
    print(f"  Est. tokens:      {fmt(opt_total['tokens_est'])}  (~{opt_total['tokens_est'] / 1e6:.2f}M)")
    print()
    print("=" * 60)
    print("  TIER 3 β€” LLM CONTEXT (what reaches the embedding model + LLM)")
    print('  Format: "Title: …\\n\\nAbstract: …" per paper')
    print("=" * 60)
    print(f"  Files:            {fmt(total)}")
    print(f"  Total bytes:      {fmt(llm_total['bytes'])}  ({llm_total['bytes'] / 1e6:.1f} MB)")
    print(f"  Total chars:      {fmt(llm_total['chars'])}")
    print(f"  Total words:      {fmt(llm_total['words'])}")
    print(f"  Est. tokens:      {fmt(llm_total['tokens_est'])}  (~{llm_total['tokens_est'] / 1e6:.2f}M)")
    print()

    r2_pct = (1 - opt_total["bytes"] / raw_total["bytes"]) * 100 if raw_total["bytes"] else 0
    r3_pct = (1 - llm_total["bytes"] / raw_total["bytes"]) * 100 if raw_total["bytes"] else 0
    print(f"  Tier 1 β†’ Tier 2 reduction: {r2_pct:.1f}%  "
          f"({fmt(raw_total['tokens_est'] - opt_total['tokens_est'])} tokens saved)")
    print(f"  Tier 1 β†’ Tier 3 reduction: {r3_pct:.1f}%  "
          f"({fmt(raw_total['tokens_est'] - llm_total['tokens_est'])} tokens saved)")


if __name__ == "__main__":
    main()