"""Copy tokenizer benchmark numbers from the generated reports into the READMEs.

The script reads the extended comparison report and the vocabulary-ablation
report, extracts their Markdown tables, and rewrites the matching table cells
in every README listed in ``README_PATHS``.
"""

import os
import re

# Paths to the READMEs (relative to the current working directory).
# Built with os.path.join so they resolve on every platform; on Windows the
# resulting strings are identical to the former hard-coded backslash paths.
README_PATHS = [
    os.path.join("..", "..", "Balochi Tokenizer_HF", "README.md"),
    os.path.join("..", "..", "Final Tokenizers", "README.md"),
    os.path.join("..", "..", "Final Tokenizers-4", "README.md"),
    os.path.join("..", "README_Git.md"),
]

# Paths to the new reports
EXTENDED_REPORT_PATH = "Output/Tokenizer_Comparison_Extended_Report.md"
ABLATION_REPORT_PATH = "Output/Ablation/Vocab_Ablation_Report.md"

# README heading that introduces the master results table.
_MASTER_SECTION_HEADING = "### 5.2 Master Performance Results"

# README heading that introduces the ablation table that gets refreshed.
_ABLATION_README_HEADING = "#### `liberal capitalism.txt` Results"

# Mapping between README tokenizer names and new report names
_TOKENIZER_NAME_MAP = {
    "Balochi BPE": "Balochi_BPE",
    "Balochi WordPiece": "Balochi_WordPiece",
    "Balochi SentencePiece": "Balochi_SentencePiece",
    "NLTK (baseline)": "NLTK",
    "BERT Multilingual": "BERT",
    "Gemma 2B": "Gemma",
    "Balochi_30K (HF)": "Balochi_30K",
    "AraBERT_v2": "AraBERT_v2",
    "CAMeLBERT_MSA": "CAMeLBERT_MSA",
    "ARBERT": "ARBERT",
    "AraGPT2": "AraGPT2",
    "ParsBERT": "ParsBERT",
    "PersianBERT_FA": "PersianBERT_FA",
    "PersianBPE": "PersianBPE",
    "UrduBERT": "UrduBERT",
}

# (cell index after splitting a row on '|', metric name in the report)
_MASTER_COLUMNS = (
    (2, "Token Count"),
    (3, "Compression Ratio"),
    (4, "Fertility"),
    (5, "UNK Count"),
    (6, "UNK Rate (%)"),
    (7, "Continuation Rate"),
)
_ABLATION_COLUMNS = (
    (3, "Token Count"),
    (4, "Compression Ratio"),
    (5, "Fertility"),
)
# Same metrics when the table has an extra "Unique Tokens" column at index 4.
_ABLATION_COLUMNS_WITH_UNIQUE = (
    (3, "Token Count"),
    (5, "Compression Ratio"),
    (6, "Fertility"),
)


def _read_text(path):
    """Return the full UTF-8 text of *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def _split_row(row):
    """Split one Markdown table row into stripped cell strings.

    A single leading and a single trailing pipe are removed when present, so
    rows written without a trailing pipe do not lose their last cell.
    """
    text = row.strip()
    if text.startswith('|'):
        text = text[1:]
    if text.endswith('|'):
        text = text[:-1]
    if not text.strip():
        return []
    return [cell.strip() for cell in text.split('|')]


def _last_cell_index(cols):
    """Return the index of the last real cell in ``row.split('|')`` output.

    When a row ends with a pipe, the final split element is only the
    (whitespace) remainder after that pipe and must not be treated as a cell.
    """
    return len(cols) - 1 if cols[-1].strip() else len(cols) - 2


def _fmt_cell(val, bold=False):
    """Pad *val* as a table cell, optionally wrapped in Markdown bold."""
    return f" **{val}** " if bold else f" {val} "


def parse_markdown_table(md_text, section_header):
    """Find the table under a specific header and return its rows.

    The header must end a line of *md_text*. Starting from the line after the
    first such header, every line that begins with ``|`` is collected (and
    stripped) until the first blank line that follows a collected row, or the
    end of the text. A table that is the last thing in the document is
    therefore found even without a trailing blank line.

    Args:
        md_text: Markdown document (or fragment) to search.
        section_header: Heading text that precedes the table.

    Returns:
        The stripped table lines (header row, separator row, data rows), or
        an empty list when the header or a table below it is not found.
    """
    if not md_text:
        return []
    wanted = section_header.rstrip()
    lines = md_text.split('\n')
    start = next(
        (i for i, line in enumerate(lines) if line.rstrip().endswith(wanted)),
        None,
    )
    if start is None:
        return []

    rows = []
    for line in lines[start + 1:]:
        stripped = line.strip()
        if stripped.startswith('|'):
            rows.append(stripped)
        elif rows and not stripped:
            # A blank line after the first row ends the table.
            break
    return rows


def _parse_transposed_table(lines):
    """Turn a "metric per row, item per column" table into nested dicts.

    Args:
        lines: Table lines as returned by ``parse_markdown_table``; line 0 is
            the header row and line 1 the separator row.

    Returns:
        ``{column_name: {metric_name: cell_text}}``; empty when the table has
        fewer than three lines.
    """
    if len(lines) < 3:
        return {}
    # The first header cell labels the metric column, not an item.
    columns = _split_row(lines[0])[1:]
    data = {column: {} for column in columns}
    for row in lines[2:]:
        cells = _split_row(row)
        if not cells:
            continue
        metric_name = cells[0].replace("**", "").strip()
        # zip() ignores surplus cells beyond the known columns.
        for column, val in zip(columns, cells[1:]):
            data[column][metric_name] = val
    return data


def extract_master_metrics():
    """Extract the transposed master metrics table from the extended report.

    Returns:
        ``{tokenizer_name: {metric_name: value_text}}``, or ``{}`` when the
        "## 4. Master Metrics Table" section has no usable table.

    Raises:
        OSError: If ``EXTENDED_REPORT_PATH`` cannot be read.
    """
    content = _read_text(EXTENDED_REPORT_PATH)
    lines = parse_markdown_table(content, "## 4. Master Metrics Table")
    return _parse_transposed_table(lines)


def extract_ablation_metrics(algo_header, eval_text_header):
    """Extract ablation metrics for a specific algorithm and evaluation text.

    Args:
        algo_header: Heading of the algorithm sub-section inside the
            evaluation-text section.
        eval_text_header: Heading of the evaluation-text section in the
            ablation report.

    Returns:
        ``{vocab_size_label: {metric_name: value_text}}``, or ``{}`` when
        either heading or the table is missing.

    Raises:
        OSError: If ``ABLATION_REPORT_PATH`` cannot be read.
    """
    content = _read_text(ABLATION_REPORT_PATH)

    # First find the text section
    text_split = content.split(eval_text_header)
    if len(text_split) < 2:
        return {}

    # Then find the algorithm section
    lines = parse_markdown_table(text_split[1], algo_header)
    return _parse_transposed_table(lines)


def extract_diminishing_returns(algo_header):
    """Extract one algorithm's table from the diminishing-returns section.

    Args:
        algo_header: Heading of the algorithm's table inside the
            "## 3. Diminishing Returns Analysis" section.

    Returns:
        ``{size_cell_text: {"Avg Compression": ..., "Avg Fertility": ...,
        "Avg Tokens": ...}}`` built from the first four cells of each data
        row; ``{}`` when the section or table is missing.

    Raises:
        OSError: If ``ABLATION_REPORT_PATH`` cannot be read.
    """
    content = _read_text(ABLATION_REPORT_PATH)

    text_split = content.split("## 3. Diminishing Returns Analysis")
    if len(text_split) < 2:
        return {}

    lines = parse_markdown_table(text_split[1], algo_header)
    if len(lines) < 3:
        return {}

    data = {}
    for row in lines[2:]:
        cols = _split_row(row)
        if len(cols) >= 4:
            data[cols[0]] = {
                "Avg Compression": cols[1],
                "Avg Fertility": cols[2],
                "Avg Tokens": cols[3],
            }
    return data


def _replace_master_row(row, master_data):
    """Return *row* with its metric cells refreshed from *master_data*.

    The row is returned unchanged when its tokenizer is unknown, when the
    report has no data for it, or when it has fewer cells than the master
    table layout requires. A metric that is absent from the report keeps the
    value already present in the README.
    """
    cols = row.split('|')
    tok_display = cols[1].replace("**", "").strip()
    metrics = master_data.get(_TOKENIZER_NAME_MAP.get(tok_display))
    if not metrics:
        return row

    last_cell = _last_cell_index(cols)
    if last_cell < 6:
        # Not the expected layout (cells up to "UNK Rate (%)" are required).
        return row

    # Format exactly as original: the first three metrics follow the name
    # cell's boldness, UNK Rate keeps its own, the others are never bold.
    name_is_bold = "**" in cols[1]
    bold_by_index = {
        2: name_is_bold,
        3: name_is_bold,
        4: name_is_bold,
        5: False,
        6: "**" in cols[6],
        7: False,
    }
    for index, metric in _MASTER_COLUMNS:
        if index > last_cell:
            # The table has no such column; never write past the last pipe.
            continue
        val = metrics.get(metric)
        if val:
            cols[index] = _fmt_cell(val, bold_by_index[index])
    return "|".join(cols)


def _update_master_table(content, master_data):
    """Refresh the rows of the README's master performance table."""
    lines = content.split('\n')
    in_master_table = False
    for i, line in enumerate(lines):
        if _MASTER_SECTION_HEADING in line:
            in_master_table = True
        elif in_master_table and (
            line.startswith("## ")
            or (line.startswith("### ") and "5.2" not in line)
        ):
            in_master_table = False

        stripped = line.strip()
        if (
            in_master_table
            and stripped.startswith('|')
            and not stripped.startswith(('|:', '| Tokenizer'))
        ):
            lines[i] = _replace_master_row(line, master_data)
    return '\n'.join(lines)


def _ablation_algo_key(algo_cell):
    """Map a README algorithm label to its key in the ablation data.

    Both the short ("WP") and the spelled-out ("WordPiece") labels select
    WordPiece; anything that is neither BPE nor WordPiece is treated as
    SentencePiece.
    """
    label = algo_cell.upper()
    if "BPE" in label:
        return "BPE"
    if "WP" in label or "WORDPIECE" in label:
        return "WORDPIECE"
    return "SENTENCEPIECE"


def _update_ablation_table(content, header, data_source):
    """Refresh the ablation tables found under *header* in the README.

    Args:
        content: Full README text.
        header: Heading text that starts the section to update.
        data_source: ``{algo_key: {size_key: {metric_name: value_text}}}``.

    Returns:
        The README text with matching rows updated.
    """
    lines = content.split('\n')
    in_table = False
    previous_was_row = False
    has_unique_tokens = False
    for i, line in enumerate(lines):
        if header in line:
            in_table = True
        elif in_table and line.startswith("##") and not line.startswith("####"):
            in_table = False

        stripped = line.strip()
        is_row = in_table and stripped.startswith('|')
        starts_table = is_row and not previous_was_row
        previous_was_row = is_row
        if not is_row:
            continue
        if starts_table:
            # First row of a table is its header; some tables have an extra
            # "Unique Tokens" column that shifts the later metrics right.
            has_unique_tokens = "Unique Tokens" in stripped
            continue
        if stripped.startswith(('|:', '| Algorithm')):
            continue

        cols = line.split('|')
        targets = (
            _ABLATION_COLUMNS_WITH_UNIQUE
            if has_unique_tokens
            else _ABLATION_COLUMNS
        )
        if _last_cell_index(cols) < targets[-1][0]:
            continue

        algo_key = _ablation_algo_key(cols[1])
        # Accept "32", "32K" and bold "**32K**" alike.
        size_key = cols[2].replace("*", "").strip().replace("K", "") + "K"
        metrics = data_source.get(algo_key, {}).get(size_key)
        if not metrics:
            continue

        is_bold = "**" in cols[3]
        for index, metric in targets:
            val = metrics.get(metric)
            if val:
                cols[index] = _fmt_cell(val, is_bold)
        lines[i] = "|".join(cols)
    return '\n'.join(lines)


def _update_diminishing_returns(content, dr_data):
    """Refresh diminishing-returns rows (size, compression, fertility, tokens).

    NOTE(review): the substitution is applied to the whole document and is
    not scoped to an algorithm's own section, so when several algorithms
    share a size label the algorithm processed last (WordPiece) provides the
    final numbers for every matching row. The README layout is not visible
    from this script, so that behaviour is kept as-is - confirm whether
    per-algorithm scoping is needed.
    """
    for dr_algo_name in ("### SENTENCEPIECE", "### BPE", "### WORDPIECE"):
        for size_key, vals in dr_data.get(dr_algo_name, {}).items():
            # Drop Markdown bold and escape the rest: raw asterisks in the
            # key would otherwise make the pattern invalid.
            size = size_key.replace("*", "").strip()
            if not size:
                continue
            # e.g., | **32,000** | 3.9982 | 1.1748 | 5,244 |
            pattern = re.compile(
                rf"\|\s*\**{re.escape(size)}\**\s*\|\s*[\d\.]+\s*\|\s*[\d\.]+\s*\|\s*[\d,]+\s*\|"
            )

            def repl(m, vals=vals):
                parts = m.group(0).split('|')
                parts[2] = f" {vals['Avg Compression']} "
                parts[3] = f" {vals['Avg Fertility']} "
                parts[4] = f" {vals['Avg Tokens']} "
                return "|".join(parts)

            content = pattern.sub(repl, content)
    return content


def process_readme(filepath, master_data, labzank_ablation, lib_cap_ablation, dr_data):
    """Rewrite one README in place with the freshly extracted numbers.

    Three areas are updated: the master performance table, the
    ``liberal capitalism.txt`` ablation table and the diminishing-returns
    rows.

    Args:
        filepath: README file to read and overwrite.
        master_data: Output of ``extract_master_metrics``.
        labzank_ablation: Accepted for interface compatibility; currently
            not used by any update step.
        lib_cap_ablation: ``{"BPE" | "WORDPIECE" | "SENTENCEPIECE":
            output of extract_ablation_metrics}``.
        dr_data: ``{"### BPE" | "### WORDPIECE" | "### SENTENCEPIECE":
            output of extract_diminishing_returns}``.

    Raises:
        OSError: If the README cannot be read or written.
    """
    content = _read_text(filepath)

    # 1. Update Master Performance Results
    content = _update_master_table(content, master_data)

    # 2. Update Ablation Results
    content = _update_ablation_table(
        content, _ABLATION_README_HEADING, lib_cap_ablation
    )

    # 3. Update Diminishing Returns
    content = _update_diminishing_returns(content, dr_data)

    # Save
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Updated {filepath}")


def main():
    """Extract the report data and update every README that exists."""
    print("Extracting new data...")
    master_data = extract_master_metrics()

    eval_text_header = "## 2. Results — `liberal capitalism.txt`"
    lib_cap_ablation = {
        "BPE": extract_ablation_metrics("### BPE Algorithm", eval_text_header),
        "WORDPIECE": extract_ablation_metrics("### WORDPIECE Algorithm", eval_text_header),
        "SENTENCEPIECE": extract_ablation_metrics("### SENTENCEPIECE Algorithm", eval_text_header),
    }

    dr_data = {
        "### BPE": extract_diminishing_returns("### BPE"),
        "### WORDPIECE": extract_diminishing_returns("### WORDPIECE"),
        "### SENTENCEPIECE": extract_diminishing_returns("### SENTENCEPIECE"),
    }

    print("Updating README files...")
    for path in README_PATHS:
        full_path = os.path.abspath(path)
        if os.path.exists(full_path):
            process_readme(full_path, master_data, {}, lib_cap_ablation, dr_data)
        else:
            print(f"Warning: Could not find {full_path}")


if __name__ == "__main__":
    main()