Spaces:

twigs
/

simplifier

Runtime error

App Files Community

twigs committed on Jul 18, 2022

Commit

e38de8a

1 Parent(s): 2b20905

update app

Browse files

Files changed (1) hide show

app.py +43 -32

app.py CHANGED Viewed

@@ -2,23 +2,26 @@ import streamlit as st
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, BartTokenizer, BartForConditionalGeneration, pipeline
 import numpy as np
 import torch
 from textstat import textstat
 MAX_LEN = 256
 NUM_BEAMS = 4
 EARLY_STOPPING = True
 N_OUT = 4
 cwi_tok = AutoTokenizer.from_pretrained('twigs/cwi-regressor')
-cwi_model = AutoModelForSequenceClassification.from_pretrained('twigs/cwi-regressor')
 simpl_tok = BartTokenizer.from_pretrained('twigs/bart-text2text-simplifier')
-simpl_model = BartForConditionalGeneration.from_pretrained('twigs/bart-text2text-simplifier')
-cwi_pipe = pipeline('text-classification', model=cwi_model, tokenizer=cwi_tok, function_to_apply='none', device=0)
-fill_pipe = pipeline('fill-mask', model=simpl_model, tokenizer=simpl_tok, top_k=1, device=0)
 def id_replace_complex(s, threshold=0.4):
@@ -43,7 +46,8 @@ def id_replace_complex(s, threshold=0.4):
 def generate_candidate_text(s, model, tokenizer, tokenized=False):
-  out = simpl_tok([s], max_length=256, padding="max_length",  truncation=True, return_tensors='pt').to('cuda') if not tokenized else s
   generated_ids = model.generate(
       input_ids=out['input_ids'],
@@ -56,39 +60,38 @@ def generate_candidate_text(s, model, tokenizer, tokenized=False):
       num_return_sequences=N_OUT
   )
-  return  [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[
       1:] for ids in generated_ids]
 def rank_candidate_text(sentences):
-    """ Currently being done with simple FKGL """
     fkgl_scores = [textstat.flesch_kincaid_grade(s) for s in sentences]
     return sentences[np.argmin(fkgl_scores)]
 def full_pipeline(source, simpl_model, simpl_tok, tokens, lexical=False):
-  modified, complex_words  = id_replace_complex(source, threshold=0.2) if lexical else source, None
   cands = generate_candidate_text(tokens+modified, simpl_model, simpl_tok)
   output = rank_candidate_text(cands)
   return output, complex_words
-aug_tok = ['c_', 'lev_', 'dep_', 'rank_', 'rat_', 'n_syl_']
-tokens = ['CharRatio', 'LevSim', 'DependencyTreeDepth',
-          'WordComplexity', 'WordRatio']
-default_values = [0.8, 0.6, 0.9, 0.8, 0.9, 1.9]
-user_values = default_values
-tok_values = dict((t, default_values[idx]) for idx, t in enumerate(tokens))
-example_sentences = ["A matchbook is a small cardboard folder (matchcover) enclosing a quantity of matches and having a coarse striking surface on the exterior.",
-                     "If there are no strong land use controls, buildings are built along a bypass, converting it into an ordinary town road, and the bypass may eventually become as congested as the local streets it was intended to avoid.",
-                     "Plot Captain Caleb Holt (Kirk Cameron) is a firefighter in Albany, Georgia and firmly keeps the cardinal rule of all firemen, \"Never leave your partner behind\".",
-                     "Britpop emerged from the British independent music scene of the early 1990s and was characterised by bands influenced by British guitar pop music of the 1960s and 1970s."]
-def main():
     st.title("Make it Simple")
@@ -96,7 +99,8 @@ def main():
         for s in example_sentences:
             st.code(body=s)
-    with st.form(key="form"):
         input_sentence = st.text_area("Original sentence")
         tok = st.multiselect(
             label="Tokens to augment the sentence", options=tokens, default=tokens)
@@ -110,13 +114,20 @@ def main():
         if (submit):
             tokens = [t+str(v) for t, v in zip(aug_tok, user_values)]
-            output, words = full_pipeline(input_sentence, simpl_model, simpl_tok, tokens)
-            with st.container():
-                st.write("Original sentence:")
-                st.write(input_sentence)
-                st.write("Output sentence:")
-                st.write(output)
 if __name__ == '__main__':

 from transformers import AutoTokenizer, AutoModelForSequenceClassification, BartTokenizer, BartForConditionalGeneration, pipeline
 import numpy as np
 import torch
+import re
 from textstat import textstat
 MAX_LEN = 256
 NUM_BEAMS = 4
 EARLY_STOPPING = True
 N_OUT = 4
 cwi_tok = AutoTokenizer.from_pretrained('twigs/cwi-regressor')
+cwi_model = AutoModelForSequenceClassification.from_pretrained(
+    'twigs/cwi-regressor')
 simpl_tok = BartTokenizer.from_pretrained('twigs/bart-text2text-simplifier')
+simpl_model = BartForConditionalGeneration.from_pretrained(
+    'twigs/bart-text2text-simplifier')
+cwi_pipe = pipeline('text-classification', model=cwi_model,
+                    tokenizer=cwi_tok, function_to_apply='none', device=0)
+fill_pipe = pipeline('fill-mask', model=simpl_model,
+                     tokenizer=simpl_tok, top_k=1, device=0)
 def id_replace_complex(s, threshold=0.4):
 def generate_candidate_text(s, model, tokenizer, tokenized=False):
+  out = simpl_tok([s], max_length=256, padding="max_length",  truncation=True,
+                  return_tensors='pt').to('cuda') if not tokenized else s
   generated_ids = model.generate(
       input_ids=out['input_ids'],
       num_return_sequences=N_OUT
   )
+  return [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[
       1:] for ids in generated_ids]
 def rank_candidate_text(sentences):
     """Return the sentence with the lowest Flesch-Kincaid grade level.

     Each entry of `sentences` is scored with
     `textstat.flesch_kincaid_grade`; the entry at the position of the
     minimum score is returned.
     """
     grades = []
     for candidate in sentences:
         grades.append(textstat.flesch_kincaid_grade(candidate))
     best_idx = np.argmin(grades)
     return sentences[best_idx]
 def full_pipeline(source, simpl_model, simpl_tok, tokens, lexical=False):
   """Simplify `source` and return the best-ranked candidate.

   Args:
     source: Sentence to simplify.
     simpl_model: Model forwarded to `generate_candidate_text`.
     simpl_tok: Tokenizer forwarded to `generate_candidate_text`.
     tokens: Value concatenated in front of the text with `+`.
     lexical: When true, `source` is first passed through
       `id_replace_complex` (threshold 0.2).

   Returns:
     A pair `(output, complex_words)`. `complex_words` is None when
     `lexical` is false.
   """
   # The previous one-liner
   #   modified, complex_words = f(...) if lexical else source, None
   # parsed as `((f(...) if lexical else source), None)`, so with
   # lexical=True `complex_words` was always None and `modified` received
   # the whole return value of id_replace_complex. Branch explicitly.
   if lexical:
     # NOTE(review): assumes id_replace_complex returns a
     # (modified_text, complex_words) pair -- confirm against its body.
     modified, complex_words = id_replace_complex(source, threshold=0.2)
   else:
     modified, complex_words = source, None
   # NOTE(review): `tokens + modified` requires both operands to support
   # `+` together (e.g. str + str) -- verify what callers pass as `tokens`.
   cands = generate_candidate_text(tokens+modified, simpl_model, simpl_tok)
   output = rank_candidate_text(cands)
   return output, complex_words
+def main():
+    aug_tok = ['c_', 'lev_', 'dep_', 'rank_', 'rat_', 'n_syl_']
+    tokens = ['CharRatio', 'LevSim', 'DependencyTreeDepth',
+            'WordComplexity', 'WordRatio', 'NumberOfSyllables']
+    default_values = [0.8, 0.6, 0.9, 0.8, 0.9, 1.9]
+    user_values = default_values
+    tok_values = dict((t, default_values[idx]) for idx, t in enumerate(tokens))
+    example_sentences = ["A matchbook is a small cardboard folder (matchcover) enclosing a quantity of matches and having a coarse striking surface on the exterior.",
+                        "If there are no strong land use controls, buildings are built along a bypass, converting it into an ordinary town road, and the bypass may eventually become as congested as the local streets it was intended to avoid.",
+                        "Plot Captain Caleb Holt (Kirk Cameron) is a firefighter in Albany, Georgia and firmly keeps the cardinal rule of all firemen, \"Never leave your partner behind\".",
+                        "Britpop emerged from the British independent music scene of the early 1990s and was characterised by bands influenced by British guitar pop music of the 1960s and 1970s."]
     st.title("Make it Simple")
         for s in example_sentences:
             st.code(body=s)
+    with st.form(key="simplify"):
         input_sentence = st.text_area("Original sentence")
         tok = st.multiselect(
             label="Tokens to augment the sentence", options=tokens, default=tokens)
         if (submit):
             tokens = [t+str(v) for t, v in zip(aug_tok, user_values)]
+            #output, words = full_pipeline(input_sentence, simpl_model, simpl_tok, tokens)
+            output, words = full_pipeline(input_sentence)
+            c1, c2 = st.columns([1,2])
+            with c1:
+                st.markdown("#### Words identified as complex")
+                for w in words:
+                    st.markdown(f"* {w}")
+            with c2:
+                st.markdown(f"#### Original Sentence:\n > {input_sentence}")
+                st.markdown(f"#### Output Sentence:\n > {output}")
 if __name__ == '__main__':