"""Calculate total token counts over data/ folder. Reports three tiers: 1. Raw — full OpenAlex JSON as-is on disk 2. Optimised — only the fields the RAG pipeline actually reads (id, title, abstract, authorships, topics, keywords, referenced_works, cited_by_count, publication_year, doi, best_oa_location.pdf_url) 3. LLM context — what actually reaches the embedding model and the LLM: plain-text "Title: …\\n\\nAbstract: …" per paper """ import json import sys from pathlib import Path # data/ lives at the project root, two levels up from utils/ DATA_DIR = Path(__file__).parents[1] / "data" # Approximate token ratio (1 token ≈ 4 chars for English text — GPT/Llama heuristic) CHARS_PER_TOKEN = 4 # Fields the pipeline actually uses _KEEP_KEYS = { "id", "title", "display_name", "abstract_inverted_index", "publication_year", "cited_by_count", "authorships", "topics", "keywords", "referenced_works", "primary_topic", "doi", "type", } def reconstruct_abstract(inv_index: dict) -> str: """Rebuild plain-text abstract from OpenAlex inverted index.""" if not inv_index: return "" pairs = [] for word, positions in inv_index.items(): for pos in positions: pairs.append((pos, word)) pairs.sort() return " ".join(w for _, w in pairs) def slim_authorships(authorships: list) -> list: """Keep only author name + position (drop institutions, affiliations, etc.).""" return [ { "name": a.get("author", {}).get("display_name", ""), "position": a.get("author_position", ""), } for a in (authorships or []) ] def optimise_paper(raw: dict) -> dict: """Strip a raw OpenAlex JSON to only pipeline-relevant fields.""" optimised = {k: raw[k] for k in _KEEP_KEYS if k in raw} # Replace inverted index with plain text abstract if "abstract_inverted_index" in optimised: optimised["abstract"] = reconstruct_abstract(optimised.pop("abstract_inverted_index")) # Slim authorships if "authorships" in optimised: optimised["authorships"] = slim_authorships(optimised["authorships"]) # Slim topics to just display_name if "topics" in optimised: optimised["topics"] = [t.get("display_name", "") for t in (optimised["topics"] or [])] # Slim keywords if "keywords" in optimised: optimised["keywords"] = [k.get("display_name", "") for k in (optimised["keywords"] or [])] # Slim referenced_works to just IDs if "referenced_works" in optimised: optimised["referenced_works"] = [ r.rsplit("/", 1)[-1] if "/" in r else r for r in (optimised["referenced_works"] or []) ] return optimised def count_tokens(text: str) -> dict: chars = len(text) words = len(text.split()) tokens_est = chars // CHARS_PER_TOKEN return {"chars": chars, "words": words, "tokens_est": tokens_est} def main(): if not DATA_DIR.exists(): print(f"ERROR: {DATA_DIR} not found. Expected: {DATA_DIR.resolve()}") sys.exit(1) files = sorted(DATA_DIR.glob("*.json")) total = len(files) print(f"Found {total} JSON files in {DATA_DIR.resolve()}\n") raw_total = {"chars": 0, "words": 0, "tokens_est": 0, "bytes": 0} opt_total = {"chars": 0, "words": 0, "tokens_est": 0, "bytes": 0} llm_total = {"chars": 0, "words": 0, "tokens_est": 0, "bytes": 0} for f in files: raw_text = f.read_text(encoding="utf-8") raw_total["bytes"] += len(raw_text.encode("utf-8")) stats = count_tokens(raw_text) raw_total["chars"] += stats["chars"] raw_total["words"] += stats["words"] raw_total["tokens_est"] += stats["tokens_est"] try: data = json.loads(raw_text) # Tier 2: optimised JSON (all pipeline-read fields) optimised = optimise_paper(data) opt_text = json.dumps(optimised, ensure_ascii=False) opt_total["bytes"] += len(opt_text.encode("utf-8")) opt_stats = count_tokens(opt_text) opt_total["chars"] += opt_stats["chars"] opt_total["words"] += opt_stats["words"] opt_total["tokens_est"] += opt_stats["tokens_est"] # Tier 3: LLM context — the exact text sent to the embedding model and LLM # Mirrors indexer.py _doc_text() and setup.py embed text construction title = data.get("title") or "" abstract = reconstruct_abstract(data.get("abstract_inverted_index") or {}) llm_text = f"Title: {title}\n\nAbstract: {abstract}" llm_total["bytes"] += len(llm_text.encode("utf-8")) llm_stats = count_tokens(llm_text) llm_total["chars"] += llm_stats["chars"] llm_total["words"] += llm_stats["words"] llm_total["tokens_est"] += llm_stats["tokens_est"] except json.JSONDecodeError: pass def fmt(n): return f"{n:,}" print("=" * 60) print(" TIER 1 — RAW (full OpenAlex JSON as-is on disk)") print("=" * 60) print(f" Files: {fmt(total)}") print(f" Total bytes: {fmt(raw_total['bytes'])} ({raw_total['bytes'] / 1e6:.1f} MB)") print(f" Total chars: {fmt(raw_total['chars'])}") print(f" Total words: {fmt(raw_total['words'])}") print(f" Est. tokens: {fmt(raw_total['tokens_est'])} (~{raw_total['tokens_est'] / 1e6:.2f}M)") print() print("=" * 60) print(" TIER 2 — OPTIMISED (pipeline-relevant fields only)") print(" Fields: id, title, abstract, authorships, topics,") print(" keywords, referenced_works, doi, year,") print(" cited_by_count, best_oa_location.pdf_url") print("=" * 60) print(f" Files: {fmt(total)}") print(f" Total bytes: {fmt(opt_total['bytes'])} ({opt_total['bytes'] / 1e6:.1f} MB)") print(f" Total chars: {fmt(opt_total['chars'])}") print(f" Total words: {fmt(opt_total['words'])}") print(f" Est. tokens: {fmt(opt_total['tokens_est'])} (~{opt_total['tokens_est'] / 1e6:.2f}M)") print() print("=" * 60) print(" TIER 3 — LLM CONTEXT (what reaches the embedding model + LLM)") print(' Format: "Title: …\\n\\nAbstract: …" per paper') print("=" * 60) print(f" Files: {fmt(total)}") print(f" Total bytes: {fmt(llm_total['bytes'])} ({llm_total['bytes'] / 1e6:.1f} MB)") print(f" Total chars: {fmt(llm_total['chars'])}") print(f" Total words: {fmt(llm_total['words'])}") print(f" Est. tokens: {fmt(llm_total['tokens_est'])} (~{llm_total['tokens_est'] / 1e6:.2f}M)") print() r2_pct = (1 - opt_total["bytes"] / raw_total["bytes"]) * 100 if raw_total["bytes"] else 0 r3_pct = (1 - llm_total["bytes"] / raw_total["bytes"]) * 100 if raw_total["bytes"] else 0 print(f" Tier 1 → Tier 2 reduction: {r2_pct:.1f}% " f"({fmt(raw_total['tokens_est'] - opt_total['tokens_est'])} tokens saved)") print(f" Tier 1 → Tier 3 reduction: {r3_pct:.1f}% " f"({fmt(raw_total['tokens_est'] - llm_total['tokens_est'])} tokens saved)") if __name__ == "__main__": main()