Spaces:
Runtime error
Runtime error
| import random | |
| import streamlit as st | |
| from bs4 import BeautifulSoup | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from transformers import pipeline | |
| from transformers_interpret import SequenceClassificationExplainer | |
# Location of the model on the Hugging Face Hub: the URL is linked from the
# page text, the repository id is what the pipeline is asked to load.
model_hub_url = 'https://huggingface.co/ml6team/distilbert-base-german-cased-toxic-comments'
model_name = 'ml6team/distilbert-base-german-cased-toxic-comments'

# Markdown used for the "About" entry of the page menu.
about_page_markdown = """# 🤬 Toxic Comment Detection Space
Made by [ML6](https://ml6.eu/).
Token attribution is performed using [transformers-interpret](https://github.com/cdpierse/transformers-interpret).
"""

# Emoji pools; one entry is picked at random per classified comment.
#
# NOTE(review): the emoji literals in this file arrived mis-encoded (they look
# like UTF-8 bytes read through a single-byte Greek code page, with the
# non-printable bytes dropped). Entries whose distinguishing bytes all
# survived have been restored below. The remaining entries lost bytes, so the
# intended emoji cannot be determined from this file alone; they are kept
# verbatim and must be restored from the upstream source.
regular_emojis = [
    'π', 'π', 'πΆ', 'π',
]
undecided_emojis = [
    '🤨', 'π§', '🥸', '🥴', '🤷',
]
potty_mouth_emojis = [
    'π€', 'πΏ', 'π‘', '🤬', 'β οΈ', 'β£οΈ', 'β’οΈ',
]
# Page setup
# Streamlit requires set_page_config to be the first Streamlit command of the
# script, so this stays ahead of everything else that touches `st`.
st.set_page_config(
    page_title="Toxic Comment Detection Space",
    # Restored from a mis-encoded literal: page_icon has to be a real emoji
    # (or a shortcode / image), not an arbitrary string.
    page_icon="🤬",
    layout="centered",
    initial_sidebar_state="auto",
    menu_items={
        # None hides the corresponding menu entry.
        'Get help': None,
        'Report a bug': None,
        'About': about_page_markdown,
    },
)
# Model setup
@st.cache_resource(show_spinner=False)
def load_pipeline():
    """Build the toxicity classifier and its token-attribution explainer.

    Streamlit re-executes this script on every user interaction. Without
    caching, the model and tokenizer would be rebuilt on each rerun, so the
    result is cached as a shared resource. The built-in cache spinner is
    disabled because the function shows its own, more descriptive one.

    NOTE(review): cached resources are shared between sessions; the explainer
    keeps the attributions of its latest call, so concurrent users could
    interleave -- confirm this is acceptable for the deployment.

    Returns:
        A ``(toxicity_pipeline, cls_explainer)`` tuple: the
        ``text-classification`` pipeline for ``model_name`` and a
        ``SequenceClassificationExplainer`` built from that pipeline's model
        and tokenizer.
    """
    with st.spinner('Loading the model (this might take a while)...'):
        toxicity_pipeline = pipeline(
            'text-classification',
            model=model_name,
            tokenizer=model_name)
        # Reuse the already loaded model/tokenizer instead of loading twice.
        cls_explainer = SequenceClassificationExplainer(
            toxicity_pipeline.model,
            toxicity_pipeline.tokenizer)
    return toxicity_pipeline, cls_explainer


toxicity_pipeline, cls_explainer = load_pipeline()
# Auxiliary functions
def format_explainer_html(html_string):
    """Extract tokens with attribution-based background color.

    Parses the explainer's HTML visualisation, takes the ``<mark>`` elements
    of the last ``<td>`` cell and regroups them into one ``<span>`` per word,
    where sub-tokens prefixed with ``##`` are glued to the preceding token.

    Args:
        html_string: HTML produced by the explainer's visualisation.

    Returns:
        A BeautifulSoup ``<p>`` tag containing the word spans separated by
        single spaces. The paragraph is empty when the markup has no table
        cell or no token between the first and the last ``<mark>``.
    """
    inside_token_prefix = '##'
    soup = BeautifulSoup(html_string, 'html.parser')
    p = soup.new_tag('p',
                     attrs={'style': 'color: black; background-color: white;'})

    cells = soup.find_all('td')
    if not cells:
        # Nothing to extract; previously this raised IndexError.
        return p

    # Select token elements and remove model specific tokens: the first and
    # the last <mark> are dropped (presumably [CLS]/[SEP] -- confirm).
    tokens = cells[-1].find_all('mark')[1:-1]

    current_word = None
    for token in tokens:
        font = token.font
        # Fall back to the mark's own text if the <font> wrapper is missing.
        text = (font.text if font is not None else token.get_text()).strip()

        is_continuation = text.startswith(inside_token_prefix)
        if is_continuation:
            text = text[len(inside_token_prefix):]

        # Create a new span for each word (sequence of sub-tokens). A leading
        # continuation token also opens a span, since there is no previous
        # word to attach it to.
        if current_word is None or not is_continuation:
            if current_word is not None:
                p.append(current_word)
                p.append(' ')
            current_word = soup.new_tag('span')

        base_style = token.attrs.get('style')
        style_prefix = f"{base_style}; " if base_style else ''
        token.attrs['style'] = f"{style_prefix}padding: 0.2em 0em;"
        token.string = text
        current_word.append(token)

    # Add last word (absent when no tokens were found).
    if current_word is not None:
        p.append(current_word)

    # Add left and right-padding to each word. For a single-token word both
    # paddings end up on the same <mark>, left first.
    for span in p.find_all('span'):
        marks = span.find_all('mark')
        marks[0].attrs['style'] = (
            f"{marks[0].attrs['style']} padding-left: 0.2em;")
        marks[-1].attrs['style'] = (
            f"{marks[-1].attrs['style']} padding-right: 0.2em;")
    return p
def classify_comment(comment, confidence_threshold=0.1):
    """Classify the given comment and augment with additional information.

    Runs the toxicity pipeline on ``comment``, attaches the token
    attributions and their HTML rendering, picks an emoji reaction and
    appends the enriched result to ``st.session_state.results``.

    Args:
        comment: Text to classify.
        confidence_threshold: Score the predicted label must exceed for the
            label-specific emoji pool to be used; otherwise an "undecided"
            emoji is chosen. Defaults to the previously hard-coded 0.1.

    Returns:
        None. The result is only stored in the session state. Nothing is
        returned on purpose: the call site is a bare expression statement,
        which Streamlit would render if it produced a value.
    """
    result = toxicity_pipeline(comment)[0]

    # Add explanation (attributions are computed for the "non_toxic" class).
    result['word_attribution'] = cls_explainer(comment, class_name="non_toxic")
    # The key spelling is kept as is so already stored results stay readable.
    result['visualitsation_html'] = cls_explainer.visualize()._repr_html_()
    result['tokens_with_background'] = format_explainer_html(
        result['visualitsation_html'])

    # Choose emoji reaction.
    # NOTE(review): if the score is the top probability of a two-class
    # softmax it is never below 0.5, so with the default threshold the
    # undecided pool would only be hit for unexpected labels -- confirm the
    # intended cut-off.
    label, score = result['label'], result['score']
    if label == 'toxic' and score > confidence_threshold:
        emoji = random.choice(potty_mouth_emojis)
    elif label == 'non_toxic' and score > confidence_threshold:
        emoji = random.choice(regular_emojis)
    else:
        emoji = random.choice(undecided_emojis)
    result.update({'text': comment, 'emoji': emoji})

    # Add result to session; create the history if it does not exist yet so
    # the function does not depend on having been initialised elsewhere.
    if 'results' not in st.session_state:
        st.session_state.results = []
    st.session_state.results.append(result)
# Start session
# Create the per-session history of classified comments on first run.
if 'results' not in st.session_state:
    st.session_state['results'] = []
# Page
# NOTE(review): the title emoji was restored from a mis-encoded literal. The
# glyphs before "Model Hub" and after "Try it yourself!" lost bytes in the
# same way and cannot be recovered from this file alone; they are kept
# verbatim and should be restored from the upstream source.
st.title('🤬 German Toxic Comment Detection')
st.markdown("""This demo showcases the German toxic comment detection model.""")

# Introduction
st.markdown(f"""The model was trained using a sequence classification task on a combination of multiple German datasets containing toxicity, profanity, and hate speech. For a more comprehensive overview of the model check out the [model card on π€ Model Hub]({model_hub_url}).
""")

# The colour legend uses inline HTML, hence unsafe_allow_html; the markup is
# a fixed literal and contains no user input.
st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
<font color="black">
<span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
</font>
tokens indicate toxicity whereas
<font color="black">
<span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
</font> tokens indicate the opposite.
Try it yourself! π""",
            unsafe_allow_html=True)
# Demo
# The form batches the text area and the button, so the script only reacts
# when "Classify" is pressed; the input is cleared after each submission.
with st.form("german-toxic-comment-detection-input", clear_on_submit=True):
    text = st.text_area(
        label='Enter the comment you want to classify below (in German):')
    # Narrow right-hand column pushes the button to the right edge.
    _, rightmost_col = st.columns([6, 1])
    submitted = rightmost_col.form_submit_button("Classify",
                                                 help="Classify comment")

# Listener
# NOTE(review): the original indentation was lost, so whether this section
# sat inside the form container could not be determined -- confirm placement.
if submitted:
    # Whitespace-only input is treated as empty: it carries nothing to
    # classify and yields no tokens for the attribution view.
    comment = text.strip()
    if comment:
        with st.spinner('Analysing comment...'):
            classify_comment(comment)
    else:
        st.error('**Error**: No comment to classify. Please provide a comment.')
# Results
# Show every comment classified in this session, newest first, with a
# horizontal rule between consecutive entries.
history = st.session_state.results if 'results' in st.session_state else []
for position, entry in enumerate(reversed(history)):
    if position:
        st.markdown("---")
    st.markdown(f"Text:\n> {entry['text']}")
    emoji_col, label_col, score_col = st.columns([1, 2, 2])
    emoji_col.metric(label='', value=f"{entry['emoji']}")
    label_col.metric(label='Label', value=f"{entry['label']}")
    score_col.metric(label='Score', value=f"{entry['score']:.3f}")
    # The attribution markup is HTML, so it has to be rendered unescaped.
    st.markdown(f"Token Attribution:\n{entry['tokens_with_background']}",
                unsafe_allow_html=True)