# Source: balochi-tokenizers / Code / Renyi_Entropy_Analysis.py (uploaded by hafeez007; revision e899795, "Update tokenizer models and README"; 9.15 kB)
# -*- coding: utf-8 -*-
"""
Rényi Entropy Analysis — Balochi Tokenizer Vocabulary Optimization
===================================================================
Analyzes vocabulary efficiency using information-theoretic measures
(Shannon entropy, Rényi entropy, effective vocabulary size).
Relies on the Tokenizers_Comparison module (load_ablation_tokenizers,
tokenize_text) for tokenizer loading and tokenization.
"""
import os
import sys
import time
from collections import Counter
from datetime import datetime
# Force UTF-8 output on Windows consoles; characters that still cannot be
# encoded are replaced instead of raising an encoding error.
if sys.platform == "win32":
    for _stream in (sys.stdout, sys.stderr):
        _stream.reconfigure(encoding='utf-8', errors='replace')
    del _stream
# Third-party and project-local dependencies. If either import fails, report
# the failure and stop with exit status 1 instead of showing a traceback.
try:
    import numpy as np
    from Tokenizers_Comparison import load_ablation_tokenizers, tokenize_text
except ImportError as import_error:
    print(f"❌ Error importing dependencies: {import_error}")
    sys.exit(1)
# ============================================
# 1. Configuration
# ============================================
# Every path below is resolved relative to the directory holding this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = SCRIPT_DIR

# Destination for the generated report and CSV; created at import time.
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "Output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Evaluation corpora, looked up in the sibling "Tokens" directory.
EVAL_TEXTS = [
    os.path.join(BASE_DIR, "..", "Tokens", file_name)
    for file_name in ("Tahir Hakim.txt", "liberal capitalism.txt")
]

# Orders (alpha) at which the Rényi entropy is evaluated for every tokenizer.
RENYI_ORDERS = [0, 0.5, 2, 3, 10, float('inf')]
# ============================================
# 2. Information-Theoretic Metrics
# ============================================
def compute_token_distribution(tokens):
    """Count token occurrences and derive their empirical probabilities.

    Args:
        tokens: Sized collection of hashable tokens.

    Returns:
        A ``(counter, probs)`` pair. ``counter`` is a ``Counter`` mapping each
        distinct token to its number of occurrences; ``probs`` is a NumPy
        array holding ``count / len(tokens)`` for every entry, in the
        counter's iteration order.
    """
    frequencies = Counter(tokens)
    n_tokens = len(tokens)
    counts = np.array(list(frequencies.values()), dtype=float)
    return frequencies, counts / n_tokens
def shannon_entropy(probs):
    """Return the Shannon entropy, in bits, of a probability array.

    Args:
        probs: NumPy array of probabilities. Entries that are not strictly
            positive are dropped before the logarithm is taken.

    Returns:
        ``-sum(p * log2(p))`` over the strictly positive entries.
    """
    positive = probs[probs > 0]
    log_positive = np.log2(positive)
    return -np.sum(positive * log_positive)
def renyi_entropy(probs, alpha):
    """Return the Rényi entropy of order ``alpha``, in bits.

    Args:
        probs: NumPy array of probabilities; entries that are not strictly
            positive are ignored.
        alpha: Order of the entropy. Three orders are special-cased:
            ``0`` gives ``log2`` of the number of positive entries (Hartley),
            ``1`` delegates to ``shannon_entropy``, and ``float('inf')``
            gives ``-log2`` of the largest probability (min-entropy).

    Returns:
        For any other order, ``log2(sum(p ** alpha)) / (1 - alpha)`` computed
        over the positive entries.
    """
    support = probs[probs > 0]

    if alpha == 0:
        return np.log2(len(support))
    if alpha == 1:
        return shannon_entropy(support)
    if alpha == float('inf'):
        return -np.log2(np.max(support))

    power_sum = np.sum(support ** alpha)
    return (1.0 / (1.0 - alpha)) * np.log2(power_sum)
def zipf_fit(counter, top_n=100):
    """Fit a power law to the highest counts by log-log least squares.

    The ``top_n`` largest values of ``counter`` are ranked 1, 2, 3, ... and
    a straight line is fitted to ``log(count)`` against ``log(rank)``.

    Args:
        counter: Mapping whose values are occurrence counts.
        top_n: Number of largest counts used for the fit.

    Returns:
        ``(exponent, r_squared)`` where ``exponent`` is the negated slope of
        the fitted line and ``r_squared`` its coefficient of determination
        (``0`` when the log-counts have no variance). ``(0.0, 0.0)`` is
        returned when fewer than five counts are available or the regression
        denominator is zero.
    """
    ordered = sorted(counter.values(), reverse=True)
    head = ordered[:top_n]
    if len(head) < 5:
        return 0.0, 0.0

    x = np.log(np.arange(1, len(head) + 1, dtype=float))
    y = np.log(np.array(head, dtype=float))
    n = len(x)

    # Closed-form ordinary least squares for y = slope * x + intercept.
    sx = np.sum(x)
    sy = np.sum(y)
    sxy = np.sum(x * y)
    sxx = np.sum(x ** 2)
    denominator = n * sxx - sx ** 2
    if denominator == 0:
        return 0.0, 0.0

    slope = (n * sxy - sx * sy) / denominator
    intercept = (sy - slope * sx) / n

    fitted = slope * x + intercept
    residual_ss = np.sum((y - fitted) ** 2)
    total_ss = np.sum((y - np.mean(y)) ** 2)
    if total_ss > 0:
        r_squared = 1 - (residual_ss / total_ss)
    else:
        r_squared = 0
    return -slope, r_squared
# ============================================
# 3. Core Analysis
# ============================================
def analyze_tokenizer_on_text(tok_name, tok_info, text, text_name):
    """Tokenize one text with one tokenizer and summarise its token statistics.

    Args:
        tok_name: Tokenizer name; also used to infer the algorithm family
            from substrings such as "BPE", "WordPiece" or "SentencePiece".
        tok_info: Mapping describing the tokenizer. Its optional
            ``"vocab_size"`` entry is read here; the mapping is otherwise
            passed through to ``tokenize_text``.
        text: Text to tokenize.
        text_name: Label stored with the result to identify the text.

    Returns:
        ``None`` when tokenization yields no tokens. Otherwise a dict with
        the token/type counts, hapax statistics, Shannon entropy, effective
        vocabulary size (``2 ** H1``), vocabulary efficiency (``None`` when
        no vocabulary size is known), Zipf fit, and one ``H_<alpha>`` entry
        per order in ``RENYI_ORDERS``.
    """
    # Only the first element returned by tokenize_text is needed here.
    tokens, _ = tokenize_text(tok_name, tok_info, text)
    if not tokens:
        return None

    counter, probs = compute_token_distribution(tokens)
    entropy_bits = shannon_entropy(probs)
    effective_vocab = 2.0 ** entropy_bits
    declared_vocab = tok_info.get("vocab_size", None)

    spectrum = {
        ("H_inf" if order == float('inf') else f"H_{order}"): renyi_entropy(probs, order)
        for order in RENYI_ORDERS
    }
    zipf_exponent, zipf_r2 = zipf_fit(counter)

    # The first family with a marker in the tokenizer name wins.
    families = (
        ("BPE", ("BPE",)),
        ("WordPiece", ("WordPiece", "BERT")),
        ("SentencePiece", ("SentencePiece", "Gemma")),
    )
    algorithm = next(
        (
            family
            for family, markers in families
            if any(marker in tok_name for marker in markers)
        ),
        "Unknown",
    )

    n_tokens = len(tokens)
    n_types = len(counter)
    hapax_count = sum(1 for occurrences in counter.values() if occurrences == 1)

    # Key order matters: generate_csv derives its header from the first result.
    summary = {
        "tokenizer": tok_name,
        "text": text_name,
        "algorithm": algorithm,
        "vocab_size": declared_vocab,
        "total_tokens": n_tokens,
        "unique_types": n_types,
        "type_token_ratio": n_types / n_tokens,
        "hapax_types": hapax_count,
        "hapax_rate": hapax_count / n_types,
        "shannon_entropy_H1": entropy_bits,
        "effective_vocab_V_eff": effective_vocab,
        "vocab_efficiency_eta": (effective_vocab / declared_vocab) if declared_vocab else None,
        "zipf_exponent": zipf_exponent,
        "zipf_r_squared": zipf_r2,
    }
    summary.update(spectrum)
    return summary
# ============================================
# 4. Report Generation
# ============================================
def generate_markdown_report(results, output_path):
    """Render the analysis results as a Markdown report and write it to disk.

    One section is produced per distinct ``"text"`` value (in sorted order).
    Each section holds a table with one column per tokenizer, ordered by
    descending effective vocabulary size, and one row per metric. Values
    that are missing or ``None`` are shown as an em dash.

    Args:
        results: List of result dicts as produced by
            ``analyze_tokenizer_on_text``.
        output_path: Path of the Markdown file to write (UTF-8).

    Returns:
        The full report text that was written.
    """
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    report = [
        "# 📐 Rényi Entropy Analysis — Balochi Tokenizer Vocabulary Optimization\n",
        f"**Generated on:** {generated_at}\n\n",
        "## 1. Purpose\n",
        "Applies information-theoretic measures to formally evaluate vocabulary efficiency.\n",
    ]

    # (row label, result key, format spec) for each table row, in display order.
    metric_rows = (
        ("Vocab Size", "vocab_size", "{:,}"),
        ("Total Tokens", "total_tokens", "{:,}"),
        ("Unique Types", "unique_types", "{:,}"),
        ("Type-Token Ratio", "type_token_ratio", "{:.4f}"),
        ("**Shannon H₁**", "shannon_entropy_H1", "**{:.4f}**"),
        ("H₀ (Hartley)", "H_0", "{:.4f}"),
        ("H₂ (Collision)", "H_2", "{:.4f}"),
        ("H∞ (Min-entropy)", "H_inf", "{:.4f}"),
        ("**Effective Vocab (V_eff)**", "effective_vocab_V_eff", "**{:.1f}**"),
        ("**Vocab Efficiency (η)**", "vocab_efficiency_eta", "**{:.4f}**"),
        ("Zipf Exponent (s)", "zipf_exponent", "{:.3f}"),
    )

    def render_cell(entry, key, fmt):
        value = entry.get(key, None)
        return "—" if value is None else fmt.format(value)

    for text_name in sorted({entry["text"] for entry in results}):
        report.append(f"## 2. Results — `{text_name}`\n")
        report.append("### Shannon & Rényi Entropy Spectrum\n")

        # Largest effective vocabulary first, for a cleaner display.
        columns = sorted(
            (entry for entry in results if entry["text"] == text_name),
            key=lambda entry: entry["effective_vocab_V_eff"],
            reverse=True,
        )
        names = [entry["tokenizer"] for entry in columns]
        report.append("| Metric | " + " | ".join(names) + " |")
        report.append("|--------|" + "|".join("-" * (len(name) + 2) for name in names) + "|")

        for label, key, fmt in metric_rows:
            cells = [render_cell(entry, key, fmt) for entry in columns]
            report.append(f"| {label} | " + " | ".join(cells) + " |")
        report.append("")

    report_text = "\n".join(report)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(report_text)
    return report_text
def _format_csv_value(value):
    """Render one result value as the text stored in its CSV cell."""
    if isinstance(value, float):
        # Positive infinity (e.g. a degenerate entropy) is written as "inf";
        # every other float is written with six decimal places.
        if value == float('inf'):
            return "inf"
        return f"{value:.6f}"
    if value is None:
        return ""
    return str(value)


def generate_csv(results, output_path):
    """Write the analysis results to a CSV file (UTF-8 with BOM).

    The header is taken from the keys of the first result; every row is
    written in that column order, with an empty cell for a missing key.
    Fields are emitted through ``csv.writer`` so that values containing a
    comma, a double quote or a line break (for example a tokenizer name such
    as ``"BPE, 16k"``) are quoted instead of silently shifting columns.

    Args:
        results: List of result dicts. Nothing is written when it is empty.
        output_path: Path of the CSV file to create.

    Returns:
        None.
    """
    # Imported locally so the block does not depend on a change to the
    # module-level import section.
    import csv

    if not results:
        return

    columns = list(results[0].keys())
    with open(output_path, "w", encoding="utf-8-sig") as handle:
        # "\n" keeps the line endings of the previous hand-written output;
        # the text-mode file object still applies the platform translation.
        writer = csv.writer(handle, lineterminator="\n")
        writer.writerow(columns)
        for row in results:
            writer.writerow([_format_csv_value(row.get(column, "")) for column in columns])
# ============================================
# 5. Main Execution
# ============================================
def main():
    """Run the full entropy analysis and write the report and CSV.

    Reads every file in ``EVAL_TEXTS`` that exists, loads the ablation
    tokenizers, analyses each tokenizer on each text, and writes a Markdown
    report and a CSV file into ``OUTPUT_DIR``. Returns early, after printing
    a message, when no evaluation text or no tokenizer is available.
    """
    banner = "=" * 80
    print(banner)
    print(" 📐 Rényi Entropy Analysis — Balochi Tokenizers")
    print(banner)

    # 1. Load evaluation texts; paths that do not exist are skipped.
    texts = {}
    for path in EVAL_TEXTS:
        if not os.path.exists(path):
            continue
        label = os.path.basename(path)
        with open(path, "r", encoding="utf-8") as handle:
            texts[label] = handle.read()
    if not texts:
        print("❌ No evaluation texts found.")
        return

    # 2. Load ablation tokenizers.
    print("\n Loading ablation tokenizers...")
    tokenizers = load_ablation_tokenizers()
    if not tokenizers:
        print("\n ❌ Required ablation tokenizers not found. Exiting.")
        return

    # 3. Analyse every (tokenizer, text) pair, timing each run.
    print("\n Running entropy analysis...")
    results = []
    for tok_name, tok_info in tokenizers.items():
        for text_name, text_content in texts.items():
            print(f" Analyzing {tok_name} on {text_name}...", end="")
            started = time.perf_counter()
            outcome = analyze_tokenizer_on_text(tok_name, tok_info, text_content, text_name)
            if outcome:
                results.append(outcome)
            elapsed = time.perf_counter() - started
            print(f" done ({elapsed:.3f}s)")

    # 4. Write the Markdown report and the CSV data file.
    report_path = os.path.join(OUTPUT_DIR, "Renyi_Entropy_Report.md")
    csv_path = os.path.join(OUTPUT_DIR, "renyi_entropy_data.csv")
    generate_markdown_report(results, report_path)
    generate_csv(results, csv_path)

    print("\n ✅ ANALYSIS COMPLETE")
    print(f" Report: {report_path}")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()