# Source: balochi-tokenizers / Code / update_readmes.py
# Last change: "Update tokenizer models and README" (e899795, verified), hafeez007
# File size: 10.6 kB
import os
import re
# Paths to the READMEs, resolved against the current working directory.
# Forward slashes are used (matching the report paths below) because they are
# accepted on Windows as well as POSIX, whereas backslashes only work on
# Windows.
README_PATHS = [
    "../../Balochi Tokenizer_HF/README.md",
    "../../Final Tokenizers/README.md",
    "../../Final Tokenizers-4/README.md",
    "../README_Git.md",
]
# Paths to the new reports
EXTENDED_REPORT_PATH = "Output/Tokenizer_Comparison_Extended_Report.md"
ABLATION_REPORT_PATH = "Output/Ablation/Vocab_Ablation_Report.md"
def parse_markdown_table(md_text, section_header):
    """Return the table lines that follow ``section_header`` in ``md_text``.

    The search starts at the first occurrence of ``section_header`` that is
    immediately followed by a newline, skips ahead to the first ``|``
    character, and stops at the next blank line or at the end of the text,
    whichever comes first.

    Args:
        md_text: Markdown document (or fragment) to search.
        section_header: Literal header text; it is regex-escaped before use.

    Returns:
        The stripped lines of the matched span that start with ``|`` (header
        row, separator row and data rows), or ``[]`` when the header or a
        following ``|`` cannot be found.
    """
    # ``\Z`` lets a table that ends the document match even when no blank
    # line follows it (e.g. the file ends with a single newline).
    pattern = re.compile(
        rf"{re.escape(section_header)}\n+.*?\|(.*?)(?:\n\n|\Z)", re.DOTALL
    )
    match = pattern.search(md_text)
    if match is None:
        return []
    return [
        line.strip()
        for line in match.group(0).split('\n')
        if line.strip().startswith('|')
    ]
def extract_master_metrics():
    """Read the master metrics table from the extended comparison report.

    The table under "## 4. Master Metrics Table" is read as one metric per
    row and one tokenizer per column (the first column holds the metric name).

    Returns:
        ``{tokenizer: {metric_name: value}}`` with every value kept as the
        stripped cell text, or ``{}`` when the table has fewer than three
        lines (header, separator, at least one data row).
    """
    with open(EXTENDED_REPORT_PATH, "r", encoding="utf-8") as report:
        report_text = report.read()

    table_rows = parse_markdown_table(report_text, "## 4. Master Metrics Table")
    if len(table_rows) < 3:
        return {}

    def split_cells(row):
        # Drop the empty fragments outside the leading and trailing pipes.
        return [cell.strip() for cell in row.split('|')[1:-1]]

    # The first header cell is the 'Metric' label; the rest are tokenizers.
    tokenizer_names = split_cells(table_rows[0])[1:]
    metrics_by_tokenizer = {name: {} for name in tokenizer_names}

    # table_rows[1] is the separator row, so data starts at index 2.
    for row in table_rows[2:]:
        cells = split_cells(row)
        if not cells:
            continue
        metric = cells[0].replace("**", "").strip()
        # zip() stops at the shorter side, so surplus cells are ignored.
        for name, value in zip(tokenizer_names, cells[1:]):
            metrics_by_tokenizer[name][metric] = value

    return metrics_by_tokenizer
def extract_ablation_metrics(algo_header, eval_text_header):
    """Read one algorithm's ablation table for one evaluation text.

    Args:
        algo_header: Header that precedes the algorithm's table.
        eval_text_header: Header that opens the evaluation-text section; only
            the text following its first occurrence (up to the next
            occurrence, if any) is searched.

    Returns:
        ``{size: {metric_name: value}}`` keyed by the table's column headers
        (all but the first), with values kept as stripped cell text. ``{}``
        when either header is missing or the table has fewer than three lines.
    """
    with open(ABLATION_REPORT_PATH, "r", encoding="utf-8") as report:
        report_text = report.read()

    # Narrow the search to the evaluation-text section first.
    pieces = report_text.split(eval_text_header)
    if len(pieces) < 2:
        return {}

    table_rows = parse_markdown_table(pieces[1], algo_header)
    if len(table_rows) < 3:
        return {}

    def cells_of(row):
        return [cell.strip() for cell in row.split('|')[1:-1]]

    vocab_sizes = cells_of(table_rows[0])[1:]
    metrics_by_size = {size: {} for size in vocab_sizes}

    for row in table_rows[2:]:
        cells = cells_of(row)
        if not cells:
            continue
        metric = cells[0].replace("**", "").strip()
        for size, value in zip(vocab_sizes, cells[1:]):
            metrics_by_size[size][metric] = value

    return metrics_by_size
def extract_diminishing_returns(algo_header):
    """Read one algorithm's table from the diminishing-returns section.

    Only the text after "## 3. Diminishing Returns Analysis" in the ablation
    report is searched for ``algo_header``.

    Returns:
        ``{size: {"Avg Compression": ..., "Avg Fertility": ...,
        "Avg Tokens": ...}}`` taken from the first four cells of each data
        row (rows with fewer than four cells are skipped). ``{}`` when the
        section or table is missing.
    """
    with open(ABLATION_REPORT_PATH, "r", encoding="utf-8") as report:
        report_text = report.read()

    pieces = report_text.split("## 3. Diminishing Returns Analysis")
    if len(pieces) < 2:
        return {}

    table_rows = parse_markdown_table(pieces[1], algo_header)
    if len(table_rows) < 3:
        return {}

    value_names = ("Avg Compression", "Avg Fertility", "Avg Tokens")
    results = {}
    for row in table_rows[2:]:
        cells = [cell.strip() for cell in row.split('|')[1:-1]]
        if len(cells) < 4:
            continue
        # cells[0] is the size; the next three cells map onto value_names.
        results[cells[0]] = dict(zip(value_names, cells[1:4]))
    return results
# README display name -> tokenizer column name used in the extended report.
_README_TO_REPORT_TOKENIZER = {
    "Balochi BPE": "Balochi_BPE",
    "Balochi WordPiece": "Balochi_WordPiece",
    "Balochi SentencePiece": "Balochi_SentencePiece",
    "NLTK (baseline)": "NLTK",
    "BERT Multilingual": "BERT",
    "Gemma 2B": "Gemma",
    "Balochi_30K (HF)": "Balochi_30K",
    "AraBERT_v2": "AraBERT_v2",
    "CAMeLBERT_MSA": "CAMeLBERT_MSA",
    "ARBERT": "ARBERT",
    "AraGPT2": "AraGPT2",
    "ParsBERT": "ParsBERT",
    "PersianBERT_FA": "PersianBERT_FA",
    "PersianBPE": "PersianBPE",
    "UrduBERT": "UrduBERT",
}

# (name as spelled in README headings, key into ``dr_data``), listed in the
# order the substitutions are applied when a section cannot be attributed.
_DR_ALGORITHMS = (
    ("SentencePiece", "### SENTENCEPIECE"),
    ("BPE", "### BPE"),
    ("WordPiece", "### WORDPIECE"),
)

_HEADING_RE = re.compile(r"#{1,6}\s")


def _fmt_cell(val, bold=False):
    """Render a cell value padded with one space each side, optionally bold."""
    return f" **{val}** " if bold else f" {val} "


def _set_cell(cols, idx, metrics, metric_name, bold):
    """Overwrite ``cols[idx]`` with a metric, leaving it alone if absent."""
    # Skipping absent metrics keeps the README's current value instead of
    # blanking the cell when the report uses a different metric name.
    if metric_name in metrics:
        cols[idx] = _fmt_cell(metrics[metric_name], bold)


def _replace_master_row(row, master_data):
    """Return ``row`` with its metric cells refreshed from ``master_data``."""
    cols = row.split('|')
    tok_key = _README_TO_REPORT_TOKENIZER.get(cols[1].replace("**", "").strip())
    if tok_key is None or tok_key not in master_data:
        return row

    metrics = master_data[tok_key]
    # A bold tokenizer name marks a highlighted row; keep its numbers bold.
    is_bold = "**" in cols[1]
    try:
        _set_cell(cols, 2, metrics, "Token Count", is_bold)
        _set_cell(cols, 3, metrics, "Compression Ratio", is_bold)
        _set_cell(cols, 4, metrics, "Fertility", is_bold)
        _set_cell(cols, 5, metrics, "UNK Count", False)
        _set_cell(cols, 6, metrics, "UNK Rate (%)", "**" in cols[6])
        # Splitting "| a | b |" on '|' leaves an empty fragment after the
        # closing pipe. Index 7 is a real cell only if something follows it
        # or it carries text (row written without a closing pipe).
        if len(cols) > 8 or (len(cols) == 8 and cols[7].strip()):
            _set_cell(cols, 7, metrics, "Continuation Rate", False)
    except IndexError:
        print(f"Error on row: {row}")
        print(cols)
        raise
    return "|".join(cols)


def _update_master_table(content, master_data):
    """Refresh the rows under "### 5.2 Master Performance Results"."""
    lines = content.split('\n')
    in_master_table = False
    for i, line in enumerate(lines):
        if "### 5.2 Master Performance Results" in line:
            in_master_table = True
        elif in_master_table and (
            line.startswith("## ")
            or (line.startswith("### ") and "5.2" not in line)
        ):
            in_master_table = False

        stripped = line.strip()
        if (
            in_master_table
            and stripped.startswith('|')
            and not stripped.startswith(('|:', '| Tokenizer'))
        ):
            lines[i] = _replace_master_row(line, master_data)
    return '\n'.join(lines)


def _update_ablation_table(content, header, data_source):
    """Refresh the ablation rows that follow ``header``.

    ``data_source`` maps "BPE" / "WORDPIECE" / "SENTENCEPIECE" to
    ``{size: {metric: value}}``.
    """
    lines = content.split('\n')
    in_table = False
    # None until the first table row after the header has been inspected.
    has_unique_tokens = None
    for i, line in enumerate(lines):
        if header in line:
            in_table = True
            has_unique_tokens = None
        elif in_table and line.startswith("##") and not line.startswith("####"):
            in_table = False

        stripped = line.strip()
        if not (in_table and stripped.startswith('|')):
            continue
        if has_unique_tokens is None:
            # The first table row is the column header; it tells us whether
            # a "Unique Tokens" column shifts the metric cells by one.
            has_unique_tokens = "Unique Tokens" in line
        if stripped.startswith(('|:', '| Algorithm')):
            continue

        cols = line.split('|')
        if len(cols) < 3:
            continue
        algo = cols[1].strip()
        size_key = cols[2].replace("**", "").strip().replace("K", "") + "K"
        if "BPE" in algo:
            algo_key = "BPE"
        elif "WP" in algo or "wordpiece" in algo.lower():
            # Accept the spelled-out name as well as the "WP" abbreviation.
            algo_key = "WORDPIECE"
        else:
            algo_key = "SENTENCEPIECE"

        metrics = data_source.get(algo_key, {}).get(size_key)
        if metrics is None:
            continue

        if has_unique_tokens:
            # ... | Token Count | Unique Tokens | Compression | Fertility | ...
            if len(cols) > 6:
                is_bold = "**" in cols[3]
                _set_cell(cols, 3, metrics, "Token Count", is_bold)
                _set_cell(cols, 5, metrics, "Compression Ratio", is_bold)
                _set_cell(cols, 6, metrics, "Fertility", is_bold)
        elif len(cols) > 5:
            # ... | Token Count | Compression | Fertility | ...
            is_bold = "**" in cols[3]
            _set_cell(cols, 3, metrics, "Token Count", is_bold)
            _set_cell(cols, 4, metrics, "Compression Ratio", is_bold)
            _set_cell(cols, 5, metrics, "Fertility", is_bold)
        lines[i] = "|".join(cols)
    return '\n'.join(lines)


def _rewrite_dr_rows(text, algo_data):
    """Replace the three numeric cells of every row whose size is in ``algo_data``."""
    for size_key, vals in algo_data.items():
        # Strip bold markers and escape the key so it is matched literally.
        size = size_key.replace("*", "").strip()
        if not size:
            continue
        # e.g., | **32,000** | 3.9982 | 1.1748 | 5,244 | — |
        pattern = re.compile(
            rf"\|\s*\**{re.escape(size)}\**\s*\|\s*[\d\.]+\s*\|\s*[\d\.]+\s*\|\s*[\d,]+\s*\|"
        )

        def repl(m, vals=vals):
            parts = m.group(0).split('|')
            parts[2] = f" {vals['Avg Compression']} "
            parts[3] = f" {vals['Avg Fertility']} "
            parts[4] = f" {vals['Avg Tokens']} "
            return "|".join(parts)

        text = pattern.sub(repl, text)
    return text


def _update_diminishing_returns(content, dr_data):
    """Refresh diminishing-returns rows, one README section at a time.

    A section is a heading line plus everything up to the next heading. When
    the heading names exactly one algorithm, only that algorithm's numbers
    are applied, so tables for different algorithms no longer overwrite each
    other.
    """
    sections = []
    for line in content.split('\n'):
        if _HEADING_RE.match(line) or not sections:
            sections.append([])
        sections[-1].append(line)

    rewritten = []
    for section in sections:
        heading = section[0].lower() if _HEADING_RE.match(section[0]) else ""
        named = [key for display, key in _DR_ALGORITHMS if display.lower() in heading]
        # NOTE(review): sections whose heading names no single algorithm keep
        # the previous behaviour (every algorithm applied in turn, last one
        # wins) because the intended owner cannot be determined here.
        keys = named if len(named) == 1 else [key for _, key in _DR_ALGORITHMS]
        text = '\n'.join(section)
        for key in keys:
            text = _rewrite_dr_rows(text, dr_data.get(key, {}))
        rewritten.append(text)
    return '\n'.join(rewritten)


def process_readme(filepath, master_data, labzank_ablation, lib_cap_ablation, dr_data):
    """Rewrite one README in place with freshly extracted metrics.

    Three areas are updated: the rows under "### 5.2 Master Performance
    Results", the ablation rows under "#### `liberal capitalism.txt`
    Results", and any diminishing-returns rows (size followed by three
    numeric cells).

    Args:
        filepath: README to read and overwrite (UTF-8).
        master_data: ``{report tokenizer name: {metric: value}}``.
        labzank_ablation: Not used by this function; kept so existing callers
            keep working.
        lib_cap_ablation: ``{"BPE" | "WORDPIECE" | "SENTENCEPIECE":
            {size: {metric: value}}}``.
        dr_data: ``{"### BPE" | "### WORDPIECE" | "### SENTENCEPIECE":
            {size: {"Avg Compression", "Avg Fertility", "Avg Tokens"}}}``.

    Raises:
        IndexError: A master-table row for a known tokenizer has too few
            cells (the row is printed first).
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # 1. Update Master Performance Results
    content = _update_master_table(content, master_data)
    # 2. Update Ablation Results
    content = _update_ablation_table(
        content, "#### `liberal capitalism.txt` Results", lib_cap_ablation
    )
    # 3. Update Diminishing Returns
    content = _update_diminishing_returns(content, dr_data)

    # Save
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Updated {filepath}")
def main():
    """Extract metrics from the reports and push them into every README.

    READMEs listed in ``README_PATHS`` that do not exist are reported with a
    warning and skipped.
    """
    print("Extracting new data...")
    master_data = extract_master_metrics()

    results_header = "## 2. Results — `liberal capitalism.txt`"
    ablation_sections = (
        ("BPE", "### BPE Algorithm"),
        ("WORDPIECE", "### WORDPIECE Algorithm"),
        ("SENTENCEPIECE", "### SENTENCEPIECE Algorithm"),
    )
    lib_cap_ablation = {
        algo: extract_ablation_metrics(section, results_header)
        for algo, section in ablation_sections
    }

    # Keyed by the report heading itself, which is how process_readme looks
    # the data up.
    dr_data = {
        heading: extract_diminishing_returns(heading)
        for heading in ("### BPE", "### WORDPIECE", "### SENTENCEPIECE")
    }

    print("Updating README files...")
    for path in README_PATHS:
        full_path = os.path.abspath(path)
        if not os.path.exists(full_path):
            print(f"Warning: Could not find {full_path}")
            continue
        # The third argument (labzank_ablation) is passed as an empty dict.
        process_readme(full_path, master_data, {}, lib_cap_ablation, dr_data)


if __name__ == "__main__":
    main()