| import os |
| import re |
|
|
| |
# README files whose tables are refreshed by this script.  The paths are
# relative (they are resolved with os.path.abspath before use) and are built
# with os.path.join so the separators are correct on every platform; on
# Windows the resulting strings are identical to the former r"..\.." literals.
README_PATHS = [
    os.path.join(os.pardir, os.pardir, "Balochi Tokenizer_HF", "README.md"),
    os.path.join(os.pardir, os.pardir, "Final Tokenizers", "README.md"),
    os.path.join(os.pardir, os.pardir, "Final Tokenizers-4", "README.md"),
    os.path.join(os.pardir, "README_Git.md"),
]

# Markdown reports the fresh numbers are read from.
EXTENDED_REPORT_PATH = "Output/Tokenizer_Comparison_Extended_Report.md"
ABLATION_REPORT_PATH = "Output/Ablation/Vocab_Ablation_Report.md"
|
|
def parse_markdown_table(md_text, section_header):
    """Return the rows of the first Markdown table that follows a header.

    The header is matched literally and must be followed by a line break
    (trailing spaces or tabs before the break are tolerated).  Everything
    between the header and the table is skipped; the table is the first run
    of consecutive lines that start with ``|`` once stripped.

    Args:
        md_text: Markdown document (or fragment) to search.
        section_header: Header text that precedes the wanted table.

    Returns:
        The stripped table lines in document order (header row, separator
        row, data rows), or an empty list when the header is missing or no
        table follows it.
    """
    header = re.search(re.escape(section_header) + r"[ \t]*\r?\n", md_text)
    if header is None:
        return []

    rows = []
    for raw_line in md_text[header.end():].split("\n"):
        line = raw_line.strip()
        if line.startswith("|"):
            rows.append(line)
        elif rows:
            # First non-table line after the table: the table is complete.
            # Ending here (rather than requiring a blank line) also handles a
            # table that is the very last thing in the document.
            break
    return rows
|
|
def extract_master_metrics():
    """Read the master metrics table from the extended comparison report.

    The table under "## 4. Master Metrics Table" is transposed: its header
    row lists the tokenizers and each following row holds one metric.

    Returns:
        ``{tokenizer: {metric_name: cell_text}}``; an empty dict when the
        table has fewer than three lines (header, separator, one data row).
    """
    with open(EXTENDED_REPORT_PATH, "r", encoding="utf-8") as report:
        document = report.read()

    table = parse_markdown_table(document, "## 4. Master Metrics Table")
    if len(table) < 3:
        return {}

    def cells_of(row):
        # Drop the empty fragments outside the outer pipes, trim the rest.
        return [cell.strip() for cell in row.split('|')[1:-1]]

    # First header cell labels the metric column; the rest are tokenizers.
    tokenizer_names = cells_of(table[0])[1:]
    metrics_by_tokenizer = {name: {} for name in tokenizer_names}

    # table[1] is the separator row, so data starts at index 2.
    for row in table[2:]:
        cells = cells_of(row)
        if not cells:
            continue
        metric_name = cells[0].replace("**", "").strip()
        # zip() stops at the shorter sequence, ignoring surplus cells.
        for name, value in zip(tokenizer_names, cells[1:]):
            metrics_by_tokenizer[name][metric_name] = value

    return metrics_by_tokenizer
|
|
def extract_ablation_metrics(algo_header, eval_text_header):
    """Read one algorithm's ablation table for one evaluation text.

    Args:
        algo_header: Header that introduces the algorithm's table.
        eval_text_header: Header of the evaluation-text section; the table
            is searched for in the text that follows its first occurrence.

    Returns:
        ``{vocab_size: {metric_name: cell_text}}`` where the vocabulary
        sizes come from the table's header row; an empty dict when the
        section or a table of at least three lines cannot be found.
    """
    with open(ABLATION_REPORT_PATH, "r", encoding="utf-8") as report:
        document = report.read()

    pieces = document.split(eval_text_header)
    if len(pieces) < 2:
        return {}

    table = parse_markdown_table(pieces[1], algo_header)
    if len(table) < 3:
        return {}

    def cells_of(row):
        return [cell.strip() for cell in row.split('|')[1:-1]]

    # Header row: metric-label column first, then one column per size.
    vocab_sizes = cells_of(table[0])[1:]
    metrics_by_size = {size: {} for size in vocab_sizes}

    for row in table[2:]:
        cells = cells_of(row)
        if not cells:
            continue
        metric_name = cells[0].replace("**", "").strip()
        for size, value in zip(vocab_sizes, cells[1:]):
            metrics_by_size[size][metric_name] = value

    return metrics_by_size
|
|
def extract_diminishing_returns(algo_header):
    """Read one algorithm's table from the diminishing-returns section.

    Args:
        algo_header: Header that introduces the algorithm's table inside
            "## 3. Diminishing Returns Analysis".

    Returns:
        A dict keyed by the first cell of each data row, each value holding
        the next three cells under "Avg Compression", "Avg Fertility" and
        "Avg Tokens".  Rows with fewer than four cells are ignored.  An
        empty dict is returned when the section or table is missing.
    """
    with open(ABLATION_REPORT_PATH, "r", encoding="utf-8") as report:
        document = report.read()

    pieces = document.split("## 3. Diminishing Returns Analysis")
    if len(pieces) < 2:
        return {}

    table = parse_markdown_table(pieces[1], algo_header)
    if len(table) < 3:
        return {}

    averages_by_size = {}
    for row in table[2:]:
        cells = [cell.strip() for cell in row.split('|')[1:-1]]
        if len(cells) < 4:
            continue
        size, compression, fertility, tokens = cells[:4]
        averages_by_size[size] = {
            "Avg Compression": compression,
            "Avg Fertility": fertility,
            "Avg Tokens": tokens,
        }
    return averages_by_size
|
|
# Tokenizer names as displayed in the README master table, mapped to the
# keys under which their metrics are looked up in ``master_data``.
_TOKENIZER_KEYS = {
    "Balochi BPE": "Balochi_BPE",
    "Balochi WordPiece": "Balochi_WordPiece",
    "Balochi SentencePiece": "Balochi_SentencePiece",
    "NLTK (baseline)": "NLTK",
    "BERT Multilingual": "BERT",
    "Gemma 2B": "Gemma",
    "Balochi_30K (HF)": "Balochi_30K",
    "AraBERT_v2": "AraBERT_v2",
    "CAMeLBERT_MSA": "CAMeLBERT_MSA",
    "ARBERT": "ARBERT",
    "AraGPT2": "AraGPT2",
    "ParsBERT": "ParsBERT",
    "PersianBERT_FA": "PersianBERT_FA",
    "PersianBPE": "PersianBPE",
    "UrduBERT": "UrduBERT",
}

_MASTER_HEADING = "### 5.2 Master Performance Results"
_LIB_CAP_HEADING = "#### `liberal capitalism.txt` Results"
# Keys of ``dr_data``, in the order their values are applied.
_DR_HEADINGS = ("### SENTENCEPIECE", "### BPE", "### WORDPIECE")


def _cell_count(segments):
    """Return how many real cells a row split on ``|`` contains.

    ``segments[0]`` is the text before the first pipe.  A whitespace-only
    last segment is what follows the closing pipe, not a cell.
    """
    last = len(segments) - 1
    if last >= 1 and not segments[last].strip():
        last -= 1
    return last


def _set_cell(segments, index, metrics, key, bold=False):
    """Write ``metrics[key]`` into cell ``index``, optionally in bold.

    The row is left untouched when it has no such cell or when the metric
    is not available, so existing README values are never blanked and the
    text after the closing pipe is never overwritten.
    """
    if index > _cell_count(segments) or key not in metrics:
        return
    value = metrics[key]
    segments[index] = f" **{value}** " if bold else f" {value} "


def _update_master_row(row, master_data):
    """Return ``row`` with its metric cells refreshed from ``master_data``."""
    segments = row.split('|')
    display_name = segments[1].replace("**", "").strip()
    metrics = master_data.get(_TOKENIZER_KEYS.get(display_name))
    if metrics is None:
        return row

    # A bold tokenizer name keeps its first three metrics bold; the UNK rate
    # keeps whatever emphasis its own cell already had.
    name_is_bold = "**" in segments[1]
    unk_rate_is_bold = _cell_count(segments) >= 6 and "**" in segments[6]

    _set_cell(segments, 2, metrics, "Token Count", name_is_bold)
    _set_cell(segments, 3, metrics, "Compression Ratio", name_is_bold)
    _set_cell(segments, 4, metrics, "Fertility", name_is_bold)
    _set_cell(segments, 5, metrics, "UNK Count")
    _set_cell(segments, 6, metrics, "UNK Rate (%)", unk_rate_is_bold)
    _set_cell(segments, 7, metrics, "Continuation Rate")
    return "|".join(segments)


def _update_master_table(content, master_data):
    """Refresh every data row in the "5.2 Master Performance Results" section."""
    lines = content.split('\n')
    in_section = False
    for i, line in enumerate(lines):
        if _MASTER_HEADING in line:
            in_section = True
        elif in_section and (
            line.startswith("## ")
            or (line.startswith("### ") and "5.2" not in line)
        ):
            in_section = False

        stripped = line.strip()
        if (
            in_section
            and stripped.startswith('|')
            and not stripped.startswith(('|:', '| Tokenizer'))
        ):
            lines[i] = _update_master_row(line, master_data)
    return '\n'.join(lines)


def _ablation_algo_key(label):
    """Map an ablation row's algorithm cell to its key in the ablation data."""
    if "BPE" in label:
        return "BPE"
    if "WP" in label or "wordpiece" in label.lower():
        return "WORDPIECE"
    # Anything else is treated as SentencePiece.
    return "SENTENCEPIECE"


def _update_ablation_table(content, header, data_source):
    """Refresh the ablation table located under ``header``.

    ``data_source`` maps "BPE" / "WORDPIECE" / "SENTENCEPIECE" to
    ``{size: {metric: value}}``.  Rows are expected to start with an
    algorithm cell and a size cell, followed by the metric cells.
    """
    lines = content.split('\n')
    in_section = False
    has_unique_tokens = None  # unknown until the table's first row is seen
    for i, line in enumerate(lines):
        if header in line:
            in_section = True
            has_unique_tokens = None
        elif in_section and line.startswith("##") and not line.startswith("####"):
            in_section = False

        stripped = line.strip()
        if not (in_section and stripped.startswith('|')):
            continue

        if has_unique_tokens is None:
            # The first table line is the header row; decide once whether an
            # extra "Unique Tokens" column shifts the metric columns.
            has_unique_tokens = "Unique Tokens" in line
        if stripped.startswith(('|:', '| Algorithm')):
            continue

        segments = line.split('|')
        if _cell_count(segments) < 3:
            continue

        size = segments[2].strip()
        size_key = size.replace("K", "") + "K"
        algo_key = _ablation_algo_key(segments[1].strip())
        metrics = data_source.get(algo_key, {}).get(size_key)
        if metrics is None:
            continue

        bold = "**" in segments[3]
        if has_unique_tokens:
            # Cell 4 is "Unique Tokens", which is not refreshed.
            targets = ((3, "Token Count"), (5, "Compression Ratio"), (6, "Fertility"))
        else:
            targets = ((3, "Token Count"), (4, "Compression Ratio"), (5, "Fertility"))
        for index, key in targets:
            _set_cell(segments, index, metrics, key, bold)

        lines[i] = "|".join(segments)
    return '\n'.join(lines)


def _dr_replacer(vals):
    """Build a ``re.sub`` callback that writes ``vals`` into a matched row."""
    def replace(match):
        parts = match.group(0).split('|')
        parts[2] = f" {vals['Avg Compression']} "
        parts[3] = f" {vals['Avg Fertility']} "
        parts[4] = f" {vals['Avg Tokens']} "
        return "|".join(parts)
    return replace


def _update_diminishing_returns(content, dr_data):
    """Refresh diminishing-returns rows (size, compression, fertility, tokens).

    NOTE(review): rows are matched anywhere in the document, not only under
    the corresponding algorithm's heading, so when several algorithms share
    a size label the values applied last (WORDPIECE) end up in every
    matching row.  Scoping the replacement needs knowledge of the README
    layout that is not available here - confirm against the READMEs.
    """
    for heading in _DR_HEADINGS:
        for size_key, vals in dr_data.get(heading, {}).items():
            # Bold markers are matched by the pattern itself, so strip them
            # from the key; escaping keeps other characters literal.
            label = size_key.replace("*", "").strip()
            if not label:
                continue
            pattern = re.compile(
                r"\|\s*\**" + re.escape(label)
                + r"\**\s*\|\s*[\d\.]+\s*\|\s*[\d\.]+\s*\|\s*[\d,]+\s*\|"
            )
            content = pattern.sub(_dr_replacer(vals), content)
    return content


def process_readme(filepath, master_data, labzank_ablation, lib_cap_ablation, dr_data):
    """Rewrite the metric tables of one README in place.

    Three kinds of tables are refreshed: the master results table, the
    `liberal capitalism.txt` ablation table and the diminishing-returns
    rows.  Cells for which no new value is available keep their current
    text.  The file is always written back and a confirmation is printed.

    Args:
        filepath: Path of the README to update (read and written as UTF-8).
        master_data: ``{tokenizer_key: {metric: value}}`` for the master table.
        labzank_ablation: Accepted for interface compatibility; not used.
        lib_cap_ablation: ``{algorithm_key: {size: {metric: value}}}`` for
            the `liberal capitalism.txt` ablation table.
        dr_data: ``{"### ALGO": {size: {"Avg Compression": ...,
            "Avg Fertility": ..., "Avg Tokens": ...}}}``.

    Raises:
        OSError: If the file cannot be read or written.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    content = _update_master_table(content, master_data)
    content = _update_ablation_table(content, _LIB_CAP_HEADING, lib_cap_ablation)
    content = _update_diminishing_returns(content, dr_data)

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Updated {filepath}")
|
|
|
|
def main():
    """Collect the latest metrics from the reports and refresh each README.

    README paths that do not exist are reported with a warning and skipped.
    """
    print("Extracting new data...")
    master_data = extract_master_metrics()

    eval_header = "## 2. Results — `liberal capitalism.txt`"
    algo_headers = {
        "BPE": "### BPE Algorithm",
        "WORDPIECE": "### WORDPIECE Algorithm",
        "SENTENCEPIECE": "### SENTENCEPIECE Algorithm",
    }
    lib_cap_ablation = {
        key: extract_ablation_metrics(header, eval_header)
        for key, header in algo_headers.items()
    }

    dr_data = {
        header: extract_diminishing_returns(header)
        for header in ("### BPE", "### WORDPIECE", "### SENTENCEPIECE")
    }

    print("Updating README files...")
    for path in README_PATHS:
        full_path = os.path.abspath(path)
        if not os.path.exists(full_path):
            print(f"Warning: Could not find {full_path}")
            continue
        # The third argument is passed as an empty dict.
        process_readme(full_path, master_data, {}, lib_cap_ablation, dr_data)


if __name__ == "__main__":
    main()
|
|