Spaces:

robzchhangte
/

Mizo-MLM

Runtime error

App Files Files Community

robzchhangte committed on Sep 15, 2024

Commit

e4ecad3

verified ·

1 Parent(s): 4653e6f

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -91

app.py CHANGED Viewed

@@ -1,98 +1,48 @@
 import os
-import torch
 import gradio as gr
-from transformers import AutoTokenizer, BertForMaskedLM
-import torch.nn.functional as F
-# Load model and tokenizer with authentication token
-token = os.getenv("hf_token")
-model = BertForMaskedLM.from_pretrained("robzchhangte/mMizBERT", use_auth_token=token)
-tokenizer = AutoTokenizer.from_pretrained("robzchhangte/mMizBERT", use_auth_token=token)
-# Function to clean subword tokens (remove '##' and filter out special tokens)
-def clean_token(token):
-    return token.replace('##', '')
-# Function to predict the masked word with cleaned tokens and scores
-def predict_masked_word(text):
-    # Tokenize input text
-    inputs = tokenizer(text, return_tensors="pt")
-    # Find the index of the [MASK] token
-    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
-    # Get predictions from the model
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-    # Extract logits for the [MASK] token
-    mask_token_logits = logits[0, mask_token_index, :]
-    # Convert logits to probabilities using softmax
-    probs = F.softmax(mask_token_logits, dim=-1)
-    # Get the top 5 predictions (tokens) and their probabilities
-    top_5_tokens = torch.topk(probs, 5, dim=1).indices[0].tolist()
-    top_5_probs = torch.topk(probs, 5, dim=1).values[0].tolist()
-    # Convert token ids to words, clean them, and pair them with their probabilities
-    predicted_words_with_scores = [(clean_token(tokenizer.decode([token_id]).strip()), round(score, 4))
-                                   for token_id, score in zip(top_5_tokens, top_5_probs)]
-    # Filter out unwanted special characters or blank tokens
-    predicted_words_with_scores = [(word, score) for word, score in predicted_words_with_scores if word.isalpha()]
-    # Format the output as "word - score"
-    result = '\n'.join([f"{word} - {score}" for word, score in predicted_words_with_scores])
-    return result
-# Define examples for testing
-examples = [
-    ["nimin khan Mizoram pumah buaina leh [MASK] lian tham awm loin neih a ni a"],
-    ["vote thlak tawh dan enin, kum hmasa lam aiin kan tha dawn lo deuhin a lang a, thahnemngai lehzuala vote thlak [MASK] mipui kan ngen nawn leh a ni"]
-]
-# Description for the app
-description = """This is mBERT fine-tuned using Mizo Corpus. It is not the same as the model proposed in the MizBERT paper. You can find Mizo text here for testing the model: https://dipr.mizoram.gov.in/category/mizo-press-releases"""
-# Define custom CSS for UI
-css = """
-footer {display:none !important}
-.output-markdown {display:none !important}
-.gr-button-primary {
-    z-index: 14;
-    height: 43px;
-    width: 130px;
-    padding: 0px;
-    cursor: pointer !important;
-    background: rgb(17, 20, 45) !important;
-    border: none !important;
-    font-family: Poppins !important;
-    font-size: 14px !important;
-    font-weight: 500 !important;
-    color: rgb(255, 255, 255) !important;
-    border-radius: 12px !important;
-    transition: box-shadow 200ms ease, background 200ms ease !important;
-}
-.gr-button-primary:hover {
-    background: rgb(66, 133, 244) !important;
-    box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
-}
-"""
-# Gradio interface
-interface = gr.Interface(
-    fn=predict_masked_word,
-    inputs=gr.Textbox(lines=2, placeholder="Enter a sentence with [MASK] token"),
-    outputs=gr.Textbox(),  # Display output as plain text
-    title="Masked Language Model Prediction for Mizo",
-    examples=examples,
-    css=css,
-    theme="light",
-    description=description
-)
-# Launch the app
-interface.launch(share=False)

 import os
 import gradio as gr
+from transformers import pipeline, HfApi
+# Set your Hugging Face token here
+# HUGGINGFACE_TOKEN = "your_huggingface_token_here"
+# Authenticate with Hugging Face
+# HfApi().set_access_token(HUGGINGFACE_TOKEN)
+token = os.getenv("hf_token")
+# Instantiate the model
+model = pipeline(task="fill-mask",
+                 model="robzchhangte/mMizBERT",
+                 tokenizer="robzchhangte/mMizBERT",
+                 use_auth_token=token)  # Use the token to authenticate
+def fill_the_mask(text):
+    if "[MASK]" not in text:
+        return "You did not enter \"[MASK]\" in the text. Please write your text again!"
+    else:
+        # Apply the model
+        model_out = model(text)
+        # First sort the list of dictionaries according to the score
+        model_out = sorted(model_out, key=lambda x: x['score'], reverse=True)
+        # Create a dictionary to store the model output
+        out_dict = {}
+        # Iterate over the list of dictionaries and get the required output
+        for sub_dict in model_out:
+            out_dict[sub_dict["sequence"]] = round(sub_dict["score"], 3)
+        return out_dict
+# Create a Gradio user interface
+my_interface = gr.Interface(
+    title="Masked Language Model APP\n(by Umair Akram)",
+    description="This App uses a fine-tuned DistilBERT-Base-Uncased Masked Language Model to predict the missed word in a sentence.\nEnter your text and put \"[MASK]\" at the word which you want to predict, as shown in the following example: Can we [MASK] to Paris?",
+    fn=fill_the_mask,
+    inputs="text",
+    outputs="label"
+)
+# Define the main function
+if __name__ == "__main__":
+    # Launch the Gradio interface
+    my_interface.launch()