# Load the packages import torch import streamlit as st from transformers import GPT2Tokenizer, GPT2LMHeadModel,BartTokenizer,BartForConditionalGeneration import spacy import spacy.cli spacy.cli.download("en_core_web_sm") nlp=spacy.load("en_core_web_sm") nlp=spacy.load("en_core_web_sm") from spacy import displacy #---Sidebar Design----- st.sidebar.subheader("Select from the dropdown list") # add the subheader of sidebar st.sidebar.text("") # add line space option_lang = st.sidebar.selectbox( 'What is your native language?', ('Japanese', 'Mandarin')) # add a dropdown list for native languages st.sidebar.write('You selected:', option_lang) # display the selected native language st.sidebar.text("") # add line space option_model=st.sidebar.selectbox( 'Which language model would like to use?', ('GPT-2', 'BART')) # add a dropdown list for language model st.sidebar.write('You selected:', option_model) # display the selected language model #---Main Body Design----- st.title('Make Friends with English 🤝') # add a title for the web app st.text("") # add line space st.markdown('This web app is designed for ESL speakers who may face difficulty in communicating context in English.') st.text("") # add line space st.markdown('

Enter your sentence 👇

',unsafe_allow_html=True) # add a subtitle original = st.text_input('', '',label_visibility="collapsed") # add a textbox to input original sentence go = st.button('Generate') # add a 'Generate button' to run the selected language model # Define the output directory if option_model=='GPT-2' and option_lang == 'Japanese': model_dir = "amyyang/80K-GPT2-v2" token_dir = "amyyang/token-80K-GPT2-v2" elif option_model == 'GPT-2' and option_lang == 'Mandarin': model_dir = "amyyang/40K-GPT2-MDN-v2" token_dir = "amyyang/token-40K-GPT2-MDN-v2" elif option_model == 'BART' and option_lang == 'Mandarin': model_dir = "amyyang/60K-BART-MDN-v2" token_dir = "amyyang/token-60K-BART-MDN-v2" else: model_dir = "amyyang/80K-BART-v2" token_dir = "amyyang/token-80K-BART-v2" # Assign cuda to the device to use for training if torch.cuda.is_available(): dev = "cuda:0" print("This model will run on CUDA") # elif torch.backends.mps.is_available(): # dev = "mps:0" # print("This model will run on MPS") else: dev = "cpu" print("This model will run on CPU") device = torch.device(dev) # Define the function to generate corrected sentence using GPT-2 model def generate_prediction(prompt, max_length=100, temperature=1.0, top_p=1.0): model = GPT2LMHeadModel.from_pretrained(model_dir).to(device) tokenizer = GPT2Tokenizer.from_pretrained(token_dir) input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device) attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device) with torch.no_grad(): output = model.generate( input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, temperature=temperature, top_p=top_p, ) return tokenizer.decode(output[0], skip_special_tokens=True) # Define the function to extract the output (corrected sentence) def model_running(model): if go and model=='GPT-2': try: tokenizer = GPT2Tokenizer.from_pretrained(token_dir) prompt = f"input: {original} output:" prompt_length = len(tokenizer.encode(prompt)) dynamic_max_length = int(1.5 * len(original.split())) + prompt_length # Generate prediction prediction = generate_prediction(prompt, max_length=dynamic_max_length, temperature=0.8, top_p=0.8) # Extract the actual generated output generated_output = prediction.split("output:")[1].strip() return generated_output except Exception as e: st.exception("Exception: %s\n" % e) elif go and model=='BART': try: model = BartForConditionalGeneration.from_pretrained(model_dir) tokenizer = BartTokenizer.from_pretrained(token_dir) # Tokenize the input text input_ids = tokenizer.encode(original, return_tensors='pt') # Generate text with the fine-tuned BART model output_ids = model.generate(input_ids) # Decode the output text generated_output = tokenizer.decode(output_ids[0], skip_special_tokens=True) return generated_output except Exception as e: st.exception("Exception: %s\n" % e) output=model_running(option_model) # Add the warning message based on the output if output is None: st.markdown('Note: Please enter your sentence and click **Generate** button!',unsafe_allow_html=True) else: st.text("") st.markdown('

Recommended sentence 💡

',unsafe_allow_html=True) # add a subtitle st.text(output) # display the corrected sentence st.text("") # add line space st.markdown('

Part-of-speech Tagging 🏷

',unsafe_allow_html=True) # add a subtitle # Add the POS tags if original!='' and output is not None: doc=nlp(output) for token in doc: st.write(token,token.pos_) st.text("") # add line space st.markdown('

Dependency Tree 🌳

',unsafe_allow_html=True) # add a subtitle # Add a html wrapper to hold the html file of dependency tree HTML_WRAPPER = """
{}
""" # Add the dependency tree if original!='' and output is not None: doc=nlp(output) docs = [span.as_doc() for span in doc.sents] html=displacy.render(docs,style='dep') st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True) st.markdown('___') st.markdown('by [A very beta ChatGPT-4.5](https://github.com/danish-sven/anlp-at2-gpt45/)') # add the author # # The code below is to generate corrected sentences with GPT-2 or BART model. # if go and option_model=='GPT-2': # try: # model = GPT2LMHeadModel.from_pretrained(output_dir).to(device) # tokenizer = GPT2Tokenizer.from_pretrained(output_dir) # def generate_prediction(prompt, max_length=100, temperature=1.0, top_p=1.0): # input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device) # attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device) # with torch.no_grad(): # output = model.generate( # input_ids, # attention_mask=attention_mask, # max_length=max_length, # num_return_sequences=1, # no_repeat_ngram_size=2, # temperature=temperature, # top_p=top_p, # ) # return tokenizer.decode(output[0], skip_special_tokens=True) # # Set max_length dynamically based on the length of the original text # prompt = f"input: {original} output:" # prompt_length = len(tokenizer.encode(prompt)) # dynamic_max_length = int(1.5 * len(original.split())) + prompt_length # # Generate prediction # prediction = generate_prediction(prompt, max_length=dynamic_max_length, temperature=0.8, top_p=0.8) # # Extract the actual generated output # generated_output = prediction.split("output:")[1].strip() # st.text(generated_output) # except Exception as e: # st.exception("Exception: %s\n" % e) # elif go and option_model=='BART': # try: # model = BartForConditionalGeneration.from_pretrained(output_dir) # tokenizer = BartTokenizer.from_pretrained(output_dir) # # Tokenize the input text # input_ids = tokenizer.encode(original, return_tensors='pt') # # Generate text with the fine-tuned BART model # output_ids = model.generate(input_ids) # # Decode the output text # generated_output = tokenizer.decode(output_ids[0], skip_special_tokens=True) # st.text(generated_output) # except Exception as e: # st.exception("Exception: %s\n" % e)