Code/update_readmes.py · hafeez007/balochi-tokenizers at main

balochi-tokenizers / Code /update_readmes.py

Update tokenizer models and README

e899795 verified 27 days ago

10.6 kB

	import os
	import re

	# README files to refresh. The paths are assembled with os.path.join so
	# they resolve on POSIX as well as on Windows; a hard-coded "\" is only a
	# path separator on Windows.
	README_PATHS = [
	    os.path.join("..", "..", "Balochi Tokenizer_HF", "README.md"),
	    os.path.join("..", "..", "Final Tokenizers", "README.md"),
	    os.path.join("..", "..", "Final Tokenizers-4", "README.md"),
	    os.path.join("..", "README_Git.md"),
	]

	# Reports the fresh numbers are read from.
	# NOTE(review): like the README paths these are relative, so they are
	# resolved against the current working directory at run time.
	EXTENDED_REPORT_PATH = "Output/Tokenizer_Comparison_Extended_Report.md"
	ABLATION_REPORT_PATH = "Output/Ablation/Vocab_Ablation_Report.md"

	def parse_markdown_table(md_text, section_header):
	    """Return the rows of the first Markdown table after a section header.

	    Args:
	        md_text: Markdown document (or fragment) to search.
	        section_header: Exact header text, e.g.
	            ``"## 4. Master Metrics Table"``. It has to be the last thing
	            on its line (trailing spaces/tabs are tolerated).

	    Returns:
	        The stripped table lines (header row, separator row, data rows) in
	        document order, or ``[]`` when the header or a table is not found.
	    """
	    header = re.search(
	        rf"{re.escape(section_header)}[ \t]*(?:\r?\n|\Z)", md_text
	    )
	    if header is None:
	        return []

	    rows = []
	    for raw_line in md_text[header.end():].splitlines():
	        line = raw_line.strip()
	        if line.startswith("|"):
	            rows.append(line)
	        elif rows:
	            # First non-table line after the table: the table is complete.
	            # Ending here, instead of requiring a blank line, also handles
	            # a table that runs to the very end of the text.
	            break
	    return rows

	def extract_master_metrics():
	    """Read the transposed master metrics table from the extended report.

	    The table under ``## 4. Master Metrics Table`` has one row per metric;
	    its first column holds the metric name and every further column
	    belongs to one tokenizer.

	    Returns:
	        ``{tokenizer: {metric: value}}`` with the values kept as the raw
	        cell strings, or ``{}`` when the table is missing or has no data
	        rows.

	    Raises:
	        OSError: If ``EXTENDED_REPORT_PATH`` cannot be opened.
	    """
	    with open(EXTENDED_REPORT_PATH, "r", encoding="utf-8") as f:
	        content = f.read()

	    lines = parse_markdown_table(content, "## 4. Master Metrics Table")
	    # A usable table needs a header row, a separator row and >= 1 data row.
	    if len(lines) < 3:
	        return {}

	    # Splitting "| a | b |" on "|" yields empty strings at both ends;
	    # [1:-1] keeps only the real cells.
	    headers = [col.strip() for col in lines[0].split("|")[1:-1]]
	    tokenizers = headers[1:]  # First col is 'Metric'

	    data = {tok: {} for tok in tokenizers}

	    for row in lines[2:]:  # lines[1] is the |---| separator row
	        cols = [col.strip() for col in row.split("|")[1:-1]]
	        if not cols:
	            continue
	        metric_name = cols[0].replace("**", "").strip()
	        # zip stops at the shorter side, so surplus cells are ignored.
	        for tok, val in zip(tokenizers, cols[1:]):
	            data[tok][metric_name] = val

	    return data

	def extract_ablation_metrics(algo_header, eval_text_header):
	    """Read one algorithm's ablation table for one evaluation text.

	    Args:
	        algo_header: Header of the algorithm sub-section, e.g.
	            ``"### BPE Algorithm"``.
	        eval_text_header: Header of the evaluation-text section that the
	            algorithm sub-section is looked up in.

	    Returns:
	        ``{size: {metric: value}}`` where ``size`` is a column title of the
	        table (all columns after the first) and ``value`` is the raw cell
	        string; ``{}`` when either header or the table is not found.

	    Raises:
	        OSError: If ``ABLATION_REPORT_PATH`` cannot be opened.
	    """
	    with open(ABLATION_REPORT_PATH, "r", encoding="utf-8") as f:
	        content = f.read()

	    # Narrow the search to the text that follows the evaluation-text header.
	    text_split = content.split(eval_text_header)
	    if len(text_split) < 2:
	        return {}
	    text_content = text_split[1]

	    lines = parse_markdown_table(text_content, algo_header)
	    # Header row + separator row + at least one data row.
	    if len(lines) < 3:
	        return {}

	    # Splitting "| a | b |" on "|" yields empty strings at both ends;
	    # [1:-1] keeps the real cells and the final [1:] drops the first
	    # (metric-name) column title.
	    sizes = [col.strip() for col in lines[0].split("|")[1:-1]][1:]

	    data = {size: {} for size in sizes}
	    for row in lines[2:]:  # lines[1] is the |---| separator row
	        cols = [col.strip() for col in row.split("|")[1:-1]]
	        if not cols:
	            continue
	        metric_name = cols[0].replace("**", "").strip()
	        # zip stops at the shorter side, so surplus cells are ignored.
	        for size, val in zip(sizes, cols[1:]):
	            data[size][metric_name] = val

	    return data

	def extract_diminishing_returns(algo_header):
	    """Read one algorithm's table from the diminishing-returns section.

	    Args:
	        algo_header: Header of the algorithm sub-section inside
	            ``## 3. Diminishing Returns Analysis``, e.g. ``"### BPE"``.

	    Returns:
	        ``{size: {"Avg Compression": ..., "Avg Fertility": ...,
	        "Avg Tokens": ...}}`` keyed by the first cell of each data row,
	        with the raw cell strings as values; ``{}`` when the section or
	        the table is not found.

	    Raises:
	        OSError: If ``ABLATION_REPORT_PATH`` cannot be opened.
	    """
	    with open(ABLATION_REPORT_PATH, "r", encoding="utf-8") as f:
	        content = f.read()

	    text_split = content.split("## 3. Diminishing Returns Analysis")
	    if len(text_split) < 2:
	        return {}

	    lines = parse_markdown_table(text_split[1], algo_header)
	    # Header row + separator row + at least one data row.
	    if len(lines) < 3:
	        return {}

	    data = {}
	    for row in lines[2:]:  # lines[1] is the |---| separator row
	        # Splitting "| a | b |" on "|" yields empty strings at both ends.
	        cols = [col.strip() for col in row.split("|")[1:-1]]
	        if len(cols) >= 4:
	            data[cols[0]] = {
	                "Avg Compression": cols[1],
	                "Avg Fertility": cols[2],
	                "Avg Tokens": cols[3],
	            }
	    return data

	# README display name -> tokenizer column name used in the report tables.
	_README_TO_REPORT_TOKENIZER = {
	    "Balochi BPE": "Balochi_BPE",
	    "Balochi WordPiece": "Balochi_WordPiece",
	    "Balochi SentencePiece": "Balochi_SentencePiece",
	    "NLTK (baseline)": "NLTK",
	    "BERT Multilingual": "BERT",
	    "Gemma 2B": "Gemma",
	    "Balochi_30K (HF)": "Balochi_30K",
	    "AraBERT_v2": "AraBERT_v2",
	    "CAMeLBERT_MSA": "CAMeLBERT_MSA",
	    "ARBERT": "ARBERT",
	    "AraGPT2": "AraGPT2",
	    "ParsBERT": "ParsBERT",
	    "PersianBERT_FA": "PersianBERT_FA",
	    "PersianBPE": "PersianBPE",
	    "UrduBERT": "UrduBERT",
	}

	# Keys of ``dr_data`` in the order their values are applied.
	_DIMINISHING_RETURNS_KEYS = ("### SENTENCEPIECE", "### BPE", "### WORDPIECE")


	def _format_cell(val, bold=False):
	    """Render ``val`` as a padded table cell, wrapped in ``**`` when bold."""
	    # Drop emphasis carried over from the report so it is never doubled;
	    # the README row decides what is bold.
	    text = str(val).replace("**", "").strip()
	    return f" **{text}** " if bold else f" {text} "


	def _set_cell(cols, index, metrics, metric_name, bold=False):
	    """Overwrite ``cols[index]`` with ``metrics[metric_name]`` if both exist."""
	    last = len(cols) - 1
	    if not cols[last].strip():
	        # The empty string after the closing pipe is not a cell; writing to
	        # it would append a column to the row.
	        last -= 1
	    val = metrics.get(metric_name, "")
	    # A metric that is missing from the report leaves the cell untouched
	    # instead of blanking it.
	    if 1 <= index <= last and val != "":
	        cols[index] = _format_cell(val, bold)


	def _update_master_table(content, master_data):
	    """Refresh the rows of the ``### 5.2 Master Performance Results`` table."""
	    lines = content.split("\n")
	    in_table = False
	    for i, line in enumerate(lines):
	        if "### 5.2 Master Performance Results" in line:
	            in_table = True
	        elif in_table and (
	            line.startswith("## ")
	            or (line.startswith("### ") and "5.2" not in line)
	        ):
	            # The next "##"/"###" heading ends the section; "####"
	            # sub-headings do not.
	            in_table = False

	        stripped = line.strip()
	        if (
	            not in_table
	            or not stripped.startswith("|")
	            or stripped.startswith(("|:", "| Tokenizer"))
	        ):
	            continue

	        cols = line.split("|")
	        name_cell = cols[1]
	        report_name = _README_TO_REPORT_TOKENIZER.get(
	            name_cell.replace("**", "").strip()
	        )
	        metrics = master_data.get(report_name)
	        if not metrics:
	            continue

	        # A bold tokenizer name marks a row whose main metrics are bold too.
	        row_bold = "**" in name_cell
	        unk_rate_bold = len(cols) > 6 and "**" in cols[6]
	        _set_cell(cols, 2, metrics, "Token Count", row_bold)
	        _set_cell(cols, 3, metrics, "Compression Ratio", row_bold)
	        _set_cell(cols, 4, metrics, "Fertility", row_bold)
	        _set_cell(cols, 5, metrics, "UNK Count")
	        _set_cell(cols, 6, metrics, "UNK Rate (%)", unk_rate_bold)
	        _set_cell(cols, 7, metrics, "Continuation Rate")
	        lines[i] = "|".join(cols)

	    return "\n".join(lines)


	def _update_ablation_row(line, has_unique_tokens, data_source):
	    """Return ``line`` with its ablation metrics refreshed, if data exists."""
	    cols = line.split("|")
	    if len(cols) < 4:
	        return line

	    algo = cols[1].replace("**", "").strip()
	    size = cols[2].replace("**", "").strip()
	    size_key = size.replace("K", "") + "K"

	    if "BPE" in algo:
	        algo_key = "BPE"
	    elif "WP" in algo or "wordpiece" in algo.lower():
	        algo_key = "WORDPIECE"
	    else:
	        algo_key = "SENTENCEPIECE"

	    metrics = data_source.get(algo_key, {}).get(size_key)
	    if not metrics:
	        return line

	    bold = "**" in cols[3]
	    # With a "Unique Tokens" column (index 4, left as is) the remaining
	    # metrics sit one cell further right.
	    indices = (3, 5, 6) if has_unique_tokens else (3, 4, 5)
	    names = ("Token Count", "Compression Ratio", "Fertility")
	    for index, metric_name in zip(indices, names):
	        _set_cell(cols, index, metrics, metric_name, bold)
	    return "|".join(cols)


	def _update_ablation_table(content, header, data_source):
	    """Refresh the ablation table rows in the section titled ``header``."""
	    lines = content.split("\n")
	    in_table = False
	    prev_was_row = False
	    has_unique_tokens = False
	    for i, line in enumerate(lines):
	        if header in line:
	            in_table = True
	        elif in_table and line.startswith("##") and not line.startswith("####"):
	            in_table = False

	        stripped = line.strip()
	        is_row = in_table and stripped.startswith("|")
	        if is_row and not prev_was_row:
	            # First row of a table is its header row: it only tells us
	            # which column layout the data rows use.
	            has_unique_tokens = "Unique Tokens" in line
	        elif is_row and not stripped.startswith("|:"):
	            lines[i] = _update_ablation_row(line, has_unique_tokens, data_source)
	        prev_was_row = is_row

	    return "\n".join(lines)


	def _update_diminishing_returns(content, dr_data):
	    """Refresh ``| size | compression | fertility | tokens |`` rows."""
	    # NOTE(review): rows are matched by vocabulary size anywhere in the
	    # README, not per algorithm section. If several algorithms list the
	    # same size, the one applied last overwrites the others - confirm the
	    # README layout before relying on this.
	    for report_key in _DIMINISHING_RETURNS_KEYS:
	        for size_key, vals in dr_data.get(report_key, {}).items():
	            size = size_key.replace("**", "").strip()
	            if not size:
	                continue
	            # e.g. | 32,000 | 3.9982 | 1.1748 | 5,244 | ... |
	            pattern = re.compile(
	                rf"(\|[ \t]*{re.escape(size)}[ \t]*\|)"
	                r"[ \t]*[\d.]+[ \t]*\|[ \t]*[\d.]+[ \t]*\|[ \t]*[\d,]+[ \t]*\|"
	            )
	            new_cells = (
	                f" {vals['Avg Compression']} |"
	                f" {vals['Avg Fertility']} |"
	                f" {vals['Avg Tokens']} |"
	            )
	            # A function replacement keeps backslashes in the values literal;
	            # the default argument binds this iteration's cells.
	            content = pattern.sub(
	                lambda m, cells=new_cells: m.group(1) + cells, content
	            )
	    return content


	def process_readme(filepath, master_data, labzank_ablation, lib_cap_ablation, dr_data):
	    """Rewrite one README in place with the freshly extracted numbers.

	    Three kinds of rows are refreshed: the master performance table, the
	    ``liberal capitalism.txt`` ablation table and the diminishing-returns
	    rows. Cells keep the bold styling the README already had, and a metric
	    that is missing from the extracted data leaves its cell unchanged.

	    Args:
	        filepath: README file to read and overwrite.
	        master_data: ``{tokenizer: {metric: value}}`` for the master table.
	        labzank_ablation: Accepted for interface compatibility; not used.
	        lib_cap_ablation: ``{algorithm: {size: {metric: value}}}`` with the
	            algorithm keys ``"BPE"``, ``"WORDPIECE"``, ``"SENTENCEPIECE"``.
	        dr_data: ``{"### BPE" | "### WORDPIECE" | "### SENTENCEPIECE":
	            {size: {"Avg Compression", "Avg Fertility", "Avg Tokens"}}}``.

	    Raises:
	        OSError: If ``filepath`` cannot be read or written.
	    """
	    with open(filepath, "r", encoding="utf-8") as f:
	        content = f.read()

	    # 1. Update Master Performance Results
	    content = _update_master_table(content, master_data)

	    # 2. Update Ablation Results
	    content = _update_ablation_table(
	        content, "#### `liberal capitalism.txt` Results", lib_cap_ablation
	    )

	    # 3. Update Diminishing Returns
	    content = _update_diminishing_returns(content, dr_data)

	    # Save
	    with open(filepath, "w", encoding="utf-8") as f:
	        f.write(content)
	    print(f"Updated {filepath}")


	def main():
	    """Pull the new numbers out of the reports and refresh every README."""
	    print("Extracting new data...")
	    master_data = extract_master_metrics()

	    # Ablation tables of the `liberal capitalism.txt` evaluation text,
	    # one lookup per algorithm sub-section.
	    results_header = "## 2. Results — `liberal capitalism.txt`"
	    algo_sections = (
	        ("BPE", "### BPE Algorithm"),
	        ("WORDPIECE", "### WORDPIECE Algorithm"),
	        ("SENTENCEPIECE", "### SENTENCEPIECE Algorithm"),
	    )
	    lib_cap_ablation = {
	        key: extract_ablation_metrics(section, results_header)
	        for key, section in algo_sections
	    }

	    # Diminishing-returns tables, keyed by the header they were found under.
	    dr_data = {
	        section: extract_diminishing_returns(section)
	        for section in ("### BPE", "### WORDPIECE", "### SENTENCEPIECE")
	    }

	    print("Updating README files...")
	    for readme in README_PATHS:
	        full_path = os.path.abspath(readme)
	        if not os.path.exists(full_path):
	            print(f"Warning: Could not find {full_path}")
	            continue
	        # The second ablation argument is passed as an empty dict.
	        process_readme(full_path, master_data, {}, lib_cap_ablation, dr_data)

	# Run the update only when this file is executed as a script, so that
	# importing the module has no side effects.
	if __name__ == "__main__":
	    main()