Spaces:
Sleeping
Sleeping
| from datasets import load_dataset | |
| import pandas as pd | |
| import numpy as np | |
| import streamlit as st | |
| from transformers import AutoTokenizer | |
| import matplotlib.pyplot as plt | |
# Switch the page to Streamlit's "wide" layout.
st.set_page_config(layout="wide")

# Sidebar control selecting which split of the loaded dataset to inspect.
subset = st.sidebar.selectbox('subset', ('dev', 'devtest'))

# Statements under st.echo() are executed AND rendered as source code in the
# app, so they are kept verbatim: editing them changes what the page displays.
# NOTE(review): indentation was lost in the copy under review; the echo block
# is assumed to run through the two token-count lines -- confirm.
with st.echo():
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
    dataset = flores[subset]
    eng_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_eng_Latn'])['input_ids'])})['num_tokens']
    ukr_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_ukr_Cyrl'])['input_ids'])})['num_tokens']
# Sidebar: per-sentence token-count histograms for both languages, with the
# total token count of the selected split in each title.
with st.sidebar:
    # 2 rows x 1 column, so the axes are stacked vertically (top / bottom).
    fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(3, 6))

    ax_top.hist(eng_num_tokens)
    ax_top.set_title(f'eng mistral tokens ({np.sum(eng_num_tokens)} total)')

    ax_bottom.hist(ukr_num_tokens)
    ax_bottom.set_title(f'ukr mistral tokens ({np.sum(ukr_num_tokens)} total)')

    st.pyplot(fig)
    # Streamlit re-executes the whole script on every widget interaction, and
    # pyplot keeps each figure registered until it is explicitly closed.
    # Close it once rendered so figures do not pile up across reruns.
    plt.close(fig)
# Main area: optional substring filter over both sentence columns, followed by
# a table of the (possibly filtered) rows.
# NOTE(review): indentation was lost in the copy under review; the text input
# is assumed to sit in the main area rather than the sidebar -- confirm.
keyword = st.text_input("Filter by text", value="")

if keyword:
    # Keep a row when the keyword occurs (case-sensitively) in either sentence.
    shown = dataset.filter(
        lambda x: keyword in x['sentence_eng_Latn'] or keyword in x['sentence_ukr_Cyrl']
    )
else:
    shown = dataset

# Dataset.to_pandas() converts the underlying table directly and keeps the
# column schema even when the filter matches nothing; pd.DataFrame(dataset)
# iterates row by row and yields a column-less frame for an empty result.
st.dataframe(shown.to_pandas())