Spaces:

theformatisvalid
/

tokenizers-training

Sleeping

App Files Files Community

theformatisvalid committed on Oct 13, 2025

Commit

34f32f7

verified ·

1 Parent(s): 0463151

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +294 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,296 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import Counter
+import json
+from io import StringIO, BytesIO
+import tempfile
+import re
+import base64
+from tokenizers_trainer import train_bpe, train_wordpiece, train_unigram
+from tokenizers_analysis import calculate_oov
+# Page chrome: wide layout and the main title.
+st.set_page_config(page_title='Tokenizer Explorer', layout="wide")
+st.title('Tokenizer Explorer')
+# Working directory for the temporary corpus file and the exported HTML
+# report written further down; created on each run if it does not exist.
+UPLOAD_DIR = 'uploads'
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+# Path of the example corpus that is read when 'Use example' is selected.
+SAMPLE_DATA_PATH = 'core/united_core.txt'
+# Sidebar: let the user pick between an uploaded .txt file and the example.
+st.sidebar.header('Data loading')
+data_source = st.sidebar.radio('Data source', ['Upload your file', 'Use example'])
+# Load the raw corpus text from the chosen source. `content` stays None when
+# nothing usable was loaded, in which case the script stops below.
+text_lines = []
+content = None
+if data_source == 'Upload your file':
+    uploaded_file = st.sidebar.file_uploader('Upload file (.txt)', type=['txt'])
+    if uploaded_file is None:
+        st.info('Upload your file.')
+    else:
+        try:
+            # 'utf-8-sig' decodes plain UTF-8 too, but also drops a leading
+            # BOM, which str.strip() would otherwise leave glued to the
+            # first line of the corpus.
+            content = uploaded_file.read().decode('utf-8-sig')
+        except UnicodeDecodeError:
+            # Report the problem in the UI instead of crashing the script.
+            st.error('Could not decode the uploaded file: it must be UTF-8 encoded text.')
+elif os.path.exists(SAMPLE_DATA_PATH):
+    with open(SAMPLE_DATA_PATH, encoding='utf-8') as f:
+        content = f.read()
+    st.sidebar.success(f'Example uploaded: {SAMPLE_DATA_PATH}')
+else:
+    st.error(f'Example file not found: {SAMPLE_DATA_PATH}')
+if content is not None:
+    # Keep only non-empty lines, stripped of surrounding whitespace.
+    text_lines = [line.strip() for line in content.splitlines() if line.strip()]
+    st.session_state['raw_text'] = content
+# Nothing to work with (no upload yet, decode failure, or empty file).
+if not text_lines:
+    st.stop()
+# Sidebar controls for the training run and for text preprocessing.
+st.sidebar.header('Settings')
+# Vocabulary size handed to the trainer: 5000-50000 in steps of 1000, default 20000.
+vocab_size = st.sidebar.slider('Vocabulary size', 5000, 50000, 20000, step=1000)
+# Minimal token frequency handed to the trainer: 1-10, default 2.
+min_freq = st.sidebar.slider('Minimal token frequency', 1, 10, 2)
+# Selects which trainer runs when 'Train tokenizer' is pressed.
+model_type = st.sidebar.selectbox('Tokenizer', ['BPE', 'WordPiece', 'Unigram'])
+# When checked (the default), normalize() lowercases each line and removes
+# every character that is neither a word character nor whitespace.
+normalize_text = st.sidebar.checkbox('Normalize text', True)
+def normalize(line):
+    """Return *line* prepared for training, honouring the 'Normalize text' checkbox.
+
+    With normalization enabled the line is lowercased and every character
+    that is neither a word character nor whitespace is removed. In both
+    cases surrounding whitespace is stripped from the result.
+    """
+    if not normalize_text:
+        return line.strip()
+    lowered = line.lower()
+    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
+    return without_punctuation.strip()
+# Normalize every line exactly once (the regex pass is not free on a large
+# corpus) and drop the lines that end up empty.
+texts = [cleaned for cleaned in map(normalize, text_lines) if cleaned]
+if not texts:
+    st.error('Text is empty after normalization.')
+    st.stop()
+# The trainers read the corpus from disk, one normalized line per row.
+corpus_path = os.path.join(UPLOAD_DIR, 'temp_corpus.txt')
+with open(corpus_path, 'w', encoding='utf-8') as f:
+    f.write("\n".join(texts))
+st.sidebar.header('Training')
+if st.sidebar.button('Train tokenizer'):
+    # One trainer per selectable model type; all share the same call signature.
+    trainers = {
+        'BPE': train_bpe,
+        'WordPiece': train_wordpiece,
+        'Unigram': train_unigram,
+    }
+    with st.spinner('training...'):
+        try:
+            trainer = trainers.get(model_type)
+            if trainer is not None:
+                # Store the tokenizer first: if training raises, the model
+                # name recorded in the session is left untouched.
+                st.session_state['tokenizer'] = trainer(vocab_size, min_freq, corpus_path)
+                st.session_state['model_name'] = model_type
+            st.sidebar.success('Training complete')
+        except Exception as e:
+            st.sidebar.error(f'Training error: {e}')
+# Everything below needs a trained tokenizer kept in the session state.
+if 'tokenizer' not in st.session_state:
+    st.info('Setup parameters and press "Train tokenizer" on left panel')
+    st.stop()
+tokenizer = st.session_state['tokenizer']
+model_name = st.session_state['model_name']
+def tokenize_text(text):
+    """Split *text* into tokens with the tokenizer stored in the session.
+
+    BPE and WordPiece tokenizers are queried through ``encode(text).tokens``;
+    any other model is queried through ``encode_as_pieces(text)``.
+    """
+    uses_encode_api = model_name in ['BPE', 'WordPiece']
+    if not uses_encode_api:
+        return tokenizer.encode_as_pieces(text)
+    return tokenizer.encode(text).tokens
+def get_vocabulary(tokenizer):
+    """Return the vocabulary of *tokenizer*.
+
+    If the object exposes ``get_vocab()``, its result is returned unchanged.
+    Otherwise a ``{piece: id}`` dict is built from ``get_piece_size()`` and
+    ``id_to_piece(i)``.
+    """
+    if not hasattr(tokenizer, 'get_vocab'):
+        piece_count = tokenizer.get_piece_size()
+        return {tokenizer.id_to_piece(piece_id): piece_id for piece_id in range(piece_count)}
+    return tokenizer.get_vocab()
+# Token statistics are computed on at most the first 1000 corpus lines.
+sample_lines = texts[:1000]
+all_tokens = [token for line in sample_lines for token in tokenize_text(line)]
+token_counter = Counter(all_tokens)
+# One row per distinct token, most frequent first.
+df_tokens = pd.DataFrame(token_counter.items(), columns=['token', 'frequency'])
+df_tokens = df_tokens.sort_values('frequency', ascending=False)
+# On-page report: two charts side by side, then the OOV metric.
+# NOTE(review): the header shows the current slider values, which may differ
+# from the ones the stored tokenizer was trained with - confirm this is intended.
+st.header(f'Report: {model_name} (Vocab={vocab_size}, MinFreq={min_freq})')
+col1, col2 = st.columns(2)
+with col1:
+    st.subheader('Token length distribution')
+    token_lengths = [len(t) for t in all_tokens]
+    fig1, ax1 = plt.subplots()
+    sns.histplot(token_lengths, bins=30, kde=True, ax=ax1)
+    ax1.set_xlabel('Token length, chars')
+    ax1.set_ylabel('Frequency')
+    st.pyplot(fig1)
+    # pyplot keeps every figure alive until it is closed; without this each
+    # script rerun would leave another figure behind.
+    plt.close(fig1)
+with col2:
+    st.subheader('Most frequent tokens')
+    top20 = df_tokens.head(20)
+    fig2, ax2 = plt.subplots(figsize=(8, 6))
+    sns.barplot(data=top20, x='frequency', y='token', ax=ax2)
+    ax2.set_xlabel('Frequency')
+    ax2.set_ylabel('Token')
+    st.pyplot(fig2)
+    plt.close(fig2)
+st.subheader('Out-of-Vocabulary percentage')
+oov_rate = calculate_oov(' '.join(texts), get_vocabulary(tokenizer))
+# Streamlit discourages empty labels; give the metric a real label and hide
+# it, since the subheader above already names the value.
+st.metric(label='Out-of-Vocabulary rate', value=f'{oov_rate:.2%}', label_visibility='collapsed')
+# Export: build a self-contained HTML report (charts embedded as base64 PNGs)
+# and offer it for download from the sidebar.
+st.sidebar.header('Export')
+if st.sidebar.button('Export as HTML'):
+    # Local import: escaping is only needed while the report is being built.
+    import html
+    def fig_to_base64(fig):
+        """Render *fig* to PNG and return an <img> tag with an inline base64 data URI."""
+        buf = BytesIO()
+        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+        buf.seek(0)
+        img_str = base64.b64encode(buf.read()).decode()
+        buf.close()
+        return f'<img src="data:image/png;base64,{img_str}" style="max-width:100%;">'
+    def token_cell(token):
+        """Return *token* escaped for HTML so pieces such as '<unk>' stay visible."""
+        return html.escape(str(token))
+    token_lengths = [len(t) for t in all_tokens]
+    fig1, ax1 = plt.subplots(figsize=(6, 4))
+    sns.histplot(token_lengths, bins=30, kde=True, ax=ax1)
+    ax1.set_xlabel('Token length, chars')
+    ax1.set_ylabel('Frequency')
+    ax1.set_title('Token Length Distribution')
+    chart1_html = fig_to_base64(fig1)
+    plt.close(fig1)
+    top20 = df_tokens.head(20)
+    fig2, ax2 = plt.subplots(figsize=(8, 6))
+    sns.barplot(data=top20, x='frequency', y='token', ax=ax2)
+    ax2.set_xlabel('Frequency')
+    ax2.set_ylabel('Token')
+    ax2.set_title('Top 20 Most Frequent Tokens')
+    chart2_html = fig_to_base64(fig2)
+    plt.close(fig2)
+    oov_rate = calculate_oov(' '.join(texts), get_vocabulary(tokenizer))
+    report_html = f'''
+    <html>
+    <head>
+        <meta charset="UTF-8">
+        <title>Tokenizer Report: {model_name}</title>
+        <style>
+            body {{
+                font-family: Arial, sans-serif;
+                margin: 40px;
+                line-height: 1.6;
+                color: #333;
+            }}
+            h1, h2, h3 {{
+                color: #2c3e50;
+            }}
+            table {{
+                border-collapse: collapse;
+                width: 100%;
+                margin: 20px 0;
+            }}
+            table th, table td {{
+                border: 1px solid #bdc3c7;
+                padding: 8px;
+                text-align: left;
+            }}
+            table th {{
+                background-color: #ecf0f1;
+            }}
+            .chart {{
+                margin: 30px 0;
+            }}
+            .info-box {{
+                background-color: #f9f9f9;
+                border-left: 4px solid #3498db;
+                padding: 15px;
+                margin: 20px 0;
+            }}
+            footer {{
+                margin-top: 50px;
+                font-size: 0.9em;
+                color: #7f8c8d;
+                text-align: center;
+            }}
+        </style>
+    </head>
+    <body>
+        <h1>Tokenizer Report: {model_name}</h1>
+        <p><strong>Generated on:</strong> {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
+        <h2>Model Parameters</h2>
+        <ul>
+            <li><strong>Vocabulary size:</strong> {vocab_size}</li>
+            <li><strong>Minimum frequency:</strong> {min_freq}</li>
+            <li><strong>Normalization:</strong> {'Yes' if normalize_text else 'No'}</li>
+            <li><strong>Total tokens processed:</strong> {len(all_tokens)}</li>
+            <li><strong>Unique tokens found:</strong> {len(token_counter)}</li>
+            <li><strong>Out-of-Vocabulary rate:</strong> {oov_rate:.2%}</li>
+        </ul>
+        <h2>Token Length Distribution</h2>
+        <div class="chart">
+            {chart1_html}
+        </div>
+        <h2>Most Frequent Tokens (Top 20)</h2>
+        <div class="chart">
+            {chart2_html}
+        </div>
+        <h2>Top 10 Most Frequent Tokens</h2>
+        <table>
+            <tr><th>Token</th><th>Frequency</th></tr>
+    '''
+    # Token text is arbitrary corpus/tokenizer output, so it is escaped
+    # before being placed inside table cells.
+    top10_rows = ''.join(
+        f'<tr><td>{token_cell(row["token"])}</td><td>{row["frequency"]:,}</td></tr>'
+        for _, row in df_tokens.head(10).iterrows()
+    )
+    report_html += top10_rows + '</table>'
+    report_html += '''
+    <h2>Dictionary (First 100 Tokens)</h2>
+    <table>
+        <tr><th>Rank</th><th>Token</th><th>Frequency</th></tr>
+    '''
+    report_html += ''.join(
+        f'<tr><td>{rank}</td><td>{token_cell(row["token"])}</td><td>{row["frequency"]:,}</td></tr>'
+        for rank, (_, row) in enumerate(df_tokens.head(100).iterrows(), start=1)
+    )
+    report_html += '''
+    </table>
+    </body>
+    </html>
+    '''
+    # A copy of the report is still written next to the corpus file.
+    html_path = os.path.join(UPLOAD_DIR, 'tokenizer_report.html')
+    with open(html_path, 'w', encoding='utf-8') as f:
+        f.write(report_html)
+    # The download is served from memory; reading the file back is unnecessary.
+    st.sidebar.download_button(
+        'Download Full Report',
+        report_html,
+        file_name='tokenizer_report.html',
+        mime='text/html'
+    )
+# Collapsible details: the 100 most frequent tokens and a run summary.
+with st.expander('View dictionary'):
+    st.dataframe(df_tokens.head(100))
+with st.expander('Info'):
+    info_lines = [
+        f'- Method: {model_name}',
+        f'- Vocabulary size: {vocab_size}',
+        f'- Min. frequency: {min_freq}',
+        f'- Normalization: {"Yes" if normalize_text else "No"}',
+        f'- Unique tokens: {len(token_counter)}',
+        f'- Total tokens: {len(all_tokens)}',
+    ]
+    for info_line in info_lines:
+        st.write(info_line)