| import streamlit as st | |
| import os | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from collections import Counter | |
| import json | |
| from io import StringIO, BytesIO | |
| import tempfile | |
| import re | |
| import base64 | |
| from tokenizers_trainer import train_bpe, train_wordpiece, train_unigram | |
| from tokenizers_analysis import calculate_oov | |
# --- Page setup -------------------------------------------------------------
st.set_page_config(page_title='Tokenizer Explorer', layout="wide")
st.title('Tokenizer Explorer')

# Working directory for the temporary corpus and the exported report.
UPLOAD_DIR = 'uploads'
os.makedirs(UPLOAD_DIR, exist_ok=True)
# Bundled example corpus offered as an alternative to uploading a file.
SAMPLE_DATA_PATH = 'core/united_core.txt'


def _load_corpus(content):
    """Store the raw text in the session and return its non-empty lines.

    Args:
        content: The full text of the selected data source.

    Returns:
        A list with every line of *content* stripped of surrounding
        whitespace; lines that are empty after stripping are dropped.
    """
    st.session_state['raw_text'] = content
    return [stripped for stripped in map(str.strip, content.splitlines()) if stripped]


# --- Data loading -----------------------------------------------------------
st.sidebar.header('Data loading')
data_source = st.sidebar.radio('Data source', ['Upload your file', 'Use example'])
text_lines = []
if data_source == 'Upload your file':
    uploaded_file = st.sidebar.file_uploader('Upload file (.txt)', type=['txt'])
    if uploaded_file is None:
        st.info('Upload your file.')
    else:
        try:
            # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading
            # byte-order mark so it cannot end up glued to the first token.
            content = uploaded_file.read().decode('utf-8-sig')
        except UnicodeDecodeError:
            # Report the problem in the UI instead of crashing the script run.
            st.error('Could not decode the uploaded file: it must be UTF-8 encoded text.')
        else:
            text_lines = _load_corpus(content)
else:
    # EAFP: open directly instead of checking os.path.exists() first, which
    # leaves a window in which the file can disappear.
    try:
        with open(SAMPLE_DATA_PATH, encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        st.error(f'Example file not found: {SAMPLE_DATA_PATH}')
    else:
        text_lines = _load_corpus(content)
        st.sidebar.success(f'Example uploaded: {SAMPLE_DATA_PATH}')

# Nothing to analyse yet (no upload, unreadable file, or empty text).
if not text_lines:
    st.stop()

# --- Tokenizer settings -----------------------------------------------------
st.sidebar.header('Settings')
vocab_size = st.sidebar.slider('Vocabulary size', 5000, 50000, 20000, step=1000)
min_freq = st.sidebar.slider('Minimal token frequency', 1, 10, 2)
model_type = st.sidebar.selectbox('Tokenizer', ['BPE', 'WordPiece', 'Unigram'])
normalize_text = st.sidebar.checkbox('Normalize text', True)
def normalize(line):
    """Return *line* stripped, lower-cased and de-punctuated when enabled.

    The module-level ``normalize_text`` checkbox value decides whether the
    lower-casing and punctuation removal are applied; surrounding whitespace
    is stripped in either case.
    """
    if not normalize_text:
        return line.strip()
    lowered = line.lower()
    # Drop every character that is neither a word character nor whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.strip()
# Normalize each input line exactly once and keep only the non-empty results
# (previously normalize() ran twice per line: once to filter, once to keep).
texts = [cleaned for cleaned in map(normalize, text_lines) if cleaned]
if not texts:
    st.error('Text is empty after normalization.')
    st.stop()

# The trainers are given a file path, so the corpus is written to disk.
corpus_path = os.path.join(UPLOAD_DIR, 'temp_corpus.txt')
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(texts))

st.sidebar.header('Training')
if st.sidebar.button('Train tokenizer'):
    # One trainer per entry of the 'Tokenizer' select box; all three are
    # called with the same (vocab_size, min_freq, corpus_path) arguments.
    trainers = {
        'BPE': train_bpe,
        'WordPiece': train_wordpiece,
        'Unigram': train_unigram,
    }
    with st.spinner('training...'):
        try:
            # Only the training call itself sits in the try body.
            trained_tokenizer = trainers[model_type](vocab_size, min_freq, corpus_path)
        except Exception as e:
            # UI boundary: show the failure in the sidebar instead of crashing.
            # A previously trained tokenizer (if any) stays in the session.
            st.sidebar.error(f'Training error: {e}')
        else:
            st.session_state['tokenizer'] = trained_tokenizer
            st.session_state['model_name'] = model_type
            st.sidebar.success('Training complete')

# Without a trained tokenizer in the session there is nothing to report.
if 'tokenizer' not in st.session_state:
    st.info('Setup parameters and press "Train tokenizer" on left panel')
    st.stop()
tokenizer = st.session_state['tokenizer']
model_name = st.session_state['model_name']
def tokenize_text(text):
    """Split *text* into tokens with the tokenizer held in the session.

    For the 'BPE' and 'WordPiece' models the tokens come from the ``tokens``
    attribute of ``tokenizer.encode(text)``; for any other model name
    ``tokenizer.encode_as_pieces(text)`` is returned.
    """
    if model_name not in ['BPE', 'WordPiece']:
        return tokenizer.encode_as_pieces(text)
    encoding = tokenizer.encode(text)
    return encoding.tokens
def get_vocabulary(tokenizer):
    """Return the tokenizer's vocabulary as a token -> id mapping.

    Objects exposing ``get_vocab()`` are asked directly. Otherwise the
    mapping is rebuilt from ``get_piece_size()`` and ``id_to_piece(i)``.
    """
    if not hasattr(tokenizer, 'get_vocab'):
        vocabulary = {}
        for piece_id in range(tokenizer.get_piece_size()):
            vocabulary[tokenizer.id_to_piece(piece_id)] = piece_id
        return vocabulary
    return tokenizer.get_vocab()
# Token statistics are computed on at most the first 1000 normalized lines.
all_tokens = []
for line in texts[:1000]:
    all_tokens.extend(tokenize_text(line))
token_counter = Counter(all_tokens)
df_tokens = pd.DataFrame(
    token_counter.items(), columns=['token', 'frequency']
).sort_values('frequency', ascending=False)

st.header(f'Report: {model_name} (Vocab={vocab_size}, MinFreq={min_freq})')
col1, col2 = st.columns(2)

with col1:
    st.subheader('Token length distribution')
    token_lengths = [len(t) for t in all_tokens]
    fig1, ax1 = plt.subplots()
    try:
        sns.histplot(token_lengths, bins=30, kde=True, ax=ax1)
        ax1.set_xlabel('Token length, chars')
        ax1.set_ylabel('Frequency')
        st.pyplot(fig1)
    finally:
        # Streamlit reruns this script on every interaction; a figure that is
        # not closed stays registered with pyplot and piles up in memory.
        plt.close(fig1)

with col2:
    st.subheader('Most frequent tokens')
    top20 = df_tokens.head(20)
    fig2, ax2 = plt.subplots(figsize=(8, 6))
    try:
        sns.barplot(data=top20, x='frequency', y='token', ax=ax2)
        ax2.set_xlabel('Frequency')
        ax2.set_ylabel('Token')
        st.pyplot(fig2)
    finally:
        plt.close(fig2)

st.subheader('Out-of-Vocabulary percentage')
# The OOV rate is measured on the whole normalized corpus, not just the
# 1000-line sample used for the charts.
oov_rate = calculate_oov(' '.join(texts), get_vocabulary(tokenizer))
# NOTE(review): Streamlit's docs discourage an empty label; consider a real
# label hidden via label_visibility once the minimum version is confirmed.
st.metric(label='', value=f'{oov_rate:.2%}')
st.sidebar.header('Export')
if st.sidebar.button('Export as HTML'):
    # Local import keeps this block self-contained; it is needed because
    # tokens come from user-supplied text and must not be emitted as markup.
    from html import escape

    def fig_to_base64(fig):
        """Render *fig* as a PNG and return an ``<img>`` tag with the image inlined.

        Args:
            fig: The matplotlib figure to serialize.

        Returns:
            An HTML ``<img>`` element whose ``src`` is a base64 data URI.
        """
        with BytesIO() as buf:
            fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
            img_str = base64.b64encode(buf.getvalue()).decode()
        return f'<img src="data:image/png;base64,{img_str}" style="max-width:100%;">'

    # The charts are rebuilt (with titles) for the report. try/finally makes
    # sure each figure is closed even when rendering fails.
    token_lengths = [len(t) for t in all_tokens]
    fig1, ax1 = plt.subplots(figsize=(6, 4))
    try:
        sns.histplot(token_lengths, bins=30, kde=True, ax=ax1)
        ax1.set_xlabel('Token length, chars')
        ax1.set_ylabel('Frequency')
        ax1.set_title('Token Length Distribution')
        chart1_html = fig_to_base64(fig1)
    finally:
        plt.close(fig1)

    top20 = df_tokens.head(20)
    fig2, ax2 = plt.subplots(figsize=(8, 6))
    try:
        sns.barplot(data=top20, x='frequency', y='token', ax=ax2)
        ax2.set_xlabel('Frequency')
        ax2.set_ylabel('Token')
        ax2.set_title('Top 20 Most Frequent Tokens')
        chart2_html = fig_to_base64(fig2)
    finally:
        plt.close(fig2)

    # `oov_rate` was already computed for the on-screen metric earlier in this
    # same script run, from the same corpus and tokenizer, so it is reused
    # here instead of joining and scanning the whole corpus a second time.

    # Escaped once and used in both the <title> and the <h1>.
    safe_model_name = escape(str(model_name))

    header_html = f'''
<html>
<head>
<meta charset="UTF-8">
<title>Tokenizer Report: {safe_model_name}</title>
<style>
body {{
font-family: Arial, sans-serif;
margin: 40px;
line-height: 1.6;
color: #333;
}}
h1, h2, h3 {{
color: #2c3e50;
}}
table {{
border-collapse: collapse;
width: 100%;
margin: 20px 0;
}}
table th, table td {{
border: 1px solid #bdc3c7;
padding: 8px;
text-align: left;
}}
table th {{
background-color: #ecf0f1;
}}
.chart {{
margin: 30px 0;
}}
.info-box {{
background-color: #f9f9f9;
border-left: 4px solid #3498db;
padding: 15px;
margin: 20px 0;
}}
footer {{
margin-top: 50px;
font-size: 0.9em;
color: #7f8c8d;
text-align: center;
}}
</style>
</head>
<body>
<h1>Tokenizer Report: {safe_model_name}</h1>
<p><strong>Generated on:</strong> {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<h2>Model Parameters</h2>
<ul>
<li><strong>Vocabulary size:</strong> {vocab_size}</li>
<li><strong>Minimum frequency:</strong> {min_freq}</li>
<li><strong>Normalization:</strong> {'Yes' if normalize_text else 'No'}</li>
<li><strong>Total tokens processed:</strong> {len(all_tokens)}</li>
<li><strong>Unique tokens found:</strong> {len(token_counter)}</li>
<li><strong>Out-of-Vocabulary rate:</strong> {oov_rate:.2%}</li>
</ul>
<h2>Token Length Distribution</h2>
<div class="chart">
{chart1_html}
</div>
<h2>Most Frequent Tokens (Top 20)</h2>
<div class="chart">
{chart2_html}
</div>
<h2>Top 10 Most Frequent Tokens</h2>
<table>
<tr><th>Token</th><th>Frequency</th></tr>
'''

    # Table rows are built with str.join instead of repeated `+=`, and each
    # token is HTML-escaped so characters such as '<' or '&' show up as text.
    top10_rows = ''.join(
        f'<tr><td>{escape(str(row.token))}</td><td>{row.frequency:,}</td></tr>'
        for row in df_tokens.head(10).itertuples(index=False)
    )
    top100_rows = ''.join(
        f'<tr><td>{rank}</td><td>{escape(str(row.token))}</td><td>{row.frequency:,}</td></tr>'
        for rank, row in enumerate(df_tokens.head(100).itertuples(index=False), start=1)
    )

    report_html = ''.join([
        header_html,
        top10_rows,
        '</table>',
        '''
<h2>Dictionary (First 100 Tokens)</h2>
<table>
<tr><th>Rank</th><th>Token</th><th>Frequency</th></tr>
''',
        top100_rows,
        '''
</table>
</body>
</html>
''',
    ])

    # A copy of the report is still written to the working directory.
    html_path = os.path.join(UPLOAD_DIR, 'tokenizer_report.html')
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(report_html)

    # The in-memory string is offered for download directly; reading the
    # file back would only yield the same text again.
    st.sidebar.download_button(
        'Download Full Report',
        report_html,
        file_name='tokenizer_report.html',
        mime='text/html'
    )
# Collapsible preview of the 100 most frequent tokens.
with st.expander('View dictionary'):
    dictionary_preview = df_tokens.head(100)
    st.dataframe(dictionary_preview)

# Collapsible summary of the current settings and token counts.
with st.expander('Info'):
    summary_lines = (
        f'- Method: {model_name}',
        f'- Vocabulary size: {vocab_size}',
        f'- Min. frequency: {min_freq}',
        f'- Normalization: {"Yes" if normalize_text else "No"}',
        f'- Unique tokens: {len(token_counter)}',
        f'- Total tokens: {len(all_tokens)}',
    )
    for summary_line in summary_lines:
        st.write(summary_line)