robzchhangte committed on
Commit
e4ecad3
·
verified ·
1 Parent(s): 4653e6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -91
app.py CHANGED
@@ -1,98 +1,48 @@
1
  import os
2
- import torch
3
  import gradio as gr
4
- from transformers import AutoTokenizer, BertForMaskedLM
5
- import torch.nn.functional as F
6
 
7
- # Load model and tokenizer with authentication token
8
# Hub credential is read from the "hf_token" environment variable (None when unset).
token = os.environ.get("hf_token")

# The tokenizer and the masked-LM weights are both fetched from the same Hub repository.
tokenizer = AutoTokenizer.from_pretrained("robzchhangte/mMizBERT", use_auth_token=token)
model = BertForMaskedLM.from_pretrained("robzchhangte/mMizBERT", use_auth_token=token)
11
-
12
- # Function to clean subword tokens (remove '##' and filter out special tokens)
13
def clean_token(token):
    """Return ``token`` with every ``'##'`` marker removed.

    All occurrences of the two-character sequence are dropped, wherever they
    appear in the string; nothing else is altered.
    """
    # Splitting on the marker and re-joining the pieces deletes each occurrence.
    pieces = token.split("##")
    return "".join(pieces)
15
-
16
- # Function to predict the masked word with cleaned tokens and scores
17
def predict_masked_word(text):
    """Predict replacements for the first mask token in ``text``.

    Args:
        text: Input sentence; it should contain the tokenizer's mask token.

    Returns:
        A newline-separated string of ``"word - score"`` lines, one for each of
        the five most probable tokens that is purely alphabetic after cleaning.
        When ``text`` contains no mask token, a short explanatory message is
        returned instead.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Column positions of every mask token in the encoded input.
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_token_index.numel() == 0:
        # Without a mask the top-k tensors below would be empty and indexing
        # their first row would raise IndexError.
        return "No [MASK] token found in the text. Please add one and try again."

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Probabilities over the vocabulary, one row per mask position.
    probs = F.softmax(logits[0, mask_token_index, :], dim=-1)

    # A single top-k call provides both the token ids and their probabilities;
    # row 0 corresponds to the first mask token.
    top = torch.topk(probs, 5, dim=1)
    top_ids = top.indices[0].tolist()
    top_probs = top.values[0].tolist()

    # Decode each id, strip whitespace and '##' markers, and pair it with its score.
    candidates = (
        (clean_token(tokenizer.decode([token_id]).strip()), round(score, 4))
        for token_id, score in zip(top_ids, top_probs)
    )

    # Keep only purely alphabetic words and format them as "word - score".
    return "\n".join(
        f"{word} - {score}" for word, score in candidates if word.isalpha()
    )
50
-
51
- # Define examples for testing
52
# Example inputs offered in the UI; each one contains a [MASK] token.
examples = [
    ["nimin khan Mizoram pumah buaina leh [MASK] lian tham awm loin neih a ni a"],
    [
        "vote thlak tawh dan enin, kum hmasa lam aiin kan tha dawn lo deuhin a lang a, "
        "thahnemngai lehzuala vote thlak [MASK] mipui kan ngen nawn leh a ni"
    ],
]

# Text shown under the title of the app.
description = (
    "This is mBERT fine-tuned using Mizo Corpus. "
    "It is not the same as the model proposed in the MizBERT paper. "
    "You can find Mizo text here for testing the model: "
    "https://dipr.mizoram.gov.in/category/mizo-press-releases"
)

# Custom CSS: hides the footer and the markdown output, and restyles the primary button.
css = """
footer {display:none !important}
.output-markdown {display:none !important}
.gr-button-primary {
z-index: 14;
height: 43px;
width: 130px;
padding: 0px;
cursor: pointer !important;
background: rgb(17, 20, 45) !important;
border: none !important;
font-family: Poppins !important;
font-size: 14px !important;
font-weight: 500 !important;
color: rgb(255, 255, 255) !important;
border-radius: 12px !important;
transition: box-shadow 200ms ease, background 200ms ease !important;
}
.gr-button-primary:hover {
background: rgb(66, 133, 244) !important;
box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
}
"""

# Input and output components are built first, then wired to the prediction function.
_prompt_box = gr.Textbox(lines=2, placeholder="Enter a sentence with [MASK] token")
_result_box = gr.Textbox()  # predictions are displayed as plain text

interface = gr.Interface(
    fn=predict_masked_word,
    inputs=_prompt_box,
    outputs=_result_box,
    title="Masked Language Model Prediction for Mizo",
    description=description,
    examples=examples,
    theme="light",
    css=css,
)

# Start the app without requesting a public share link.
interface.launch(share=False)
 
1
  import os
 
2
  import gradio as gr
3
+ from transformers import pipeline, HfApi
 
4
 
5
+ # Set your Hugging Face token here
6
+ # HUGGINGFACE_TOKEN = "your_huggingface_token_here"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Authenticate with Hugging Face
9
+ # HfApi().set_access_token(HUGGINGFACE_TOKEN)
10
# Hub credential is read from the "hf_token" environment variable (None when unset).
token = os.environ.get("hf_token")

# A single fill-mask pipeline; weights and tokenizer come from the same Hub repository,
# and the token is passed along to authenticate the download.
model = pipeline(
    "fill-mask",
    model="robzchhangte/mMizBERT",
    tokenizer="robzchhangte/mMizBERT",
    use_auth_token=token,
)
16
+
17
def fill_the_mask(text):
    """Run the fill-mask pipeline on ``text`` and rank the completed sentences.

    Args:
        text: Input sentence; it must contain the literal ``[MASK]`` marker.

    Returns:
        A dict mapping each completed sequence to its score rounded to three
        decimals, inserted in descending score order. When ``text`` contains
        no ``[MASK]``, a message string asking the user to try again is
        returned instead.
    """
    if "[MASK]" not in text:
        return "You did not enter \"[MASK]\" in the text. Please write your text again!"

    model_out = model(text)

    # With several [MASK] tokens the pipeline returns one candidate list per
    # mask (a list of lists); flatten it so every candidate can be ranked
    # together instead of failing on the 'score' lookup.
    if model_out and isinstance(model_out[0], list):
        model_out = [candidate for per_mask in model_out for candidate in per_mask]

    # Highest-scoring candidates first.
    ranked = sorted(model_out, key=lambda candidate: candidate["score"], reverse=True)

    return {candidate["sequence"]: round(candidate["score"], 3) for candidate in ranked}
35
+
36
+ # Create a Gradio user interface
37
# Build the Gradio UI: a free-text input wired to fill_the_mask, shown as a label output.
my_interface = gr.Interface(
    fn=fill_the_mask,
    inputs="text",
    outputs="label",
    title="Masked Language Model APP\n(by Umair Akram)",
    # The description names the model this app actually loads
    # (robzchhangte/mMizBERT), not the DistilBERT model it previously claimed.
    description=(
        "This App uses mMizBERT (mBERT fine-tuned on a Mizo corpus) as a Masked Language Model "
        "to predict the missed word in a sentence.\n"
        "Enter your text and put \"[MASK]\" at the word which you want to predict, "
        "as shown in the following example: Can we [MASK] to Paris?"
    ),
)

# Launch the interface only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    my_interface.launch()