import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the IndicNER tokenizer and token-classification model once at module
# import time (downloads from the Hugging Face Hub on first run).
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")
def get_ner(sentence):
    """Tag named entities in *sentence* with the IndicNER model.

    Parameters
    ----------
    sentence : str
        Input sentence in one of the 11 supported Indian languages.

    Returns
    -------
    list[tuple[str, str]]
        One ``(word, label)`` pair per whitespace-separated word, where
        ``label`` is the model's BIO tag (e.g. ``B-PER``, ``O``) taken from
        the first sub-token of each word.
    """
    tok_sentence = tokenizer(sentence, return_tensors='pt')

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**tok_sentence).logits.argmax(-1)

    # Per-sub-token label names for the single sentence in the batch.
    predicted_tokens_classes = [
        model.config.id2label[t.item()] for t in logits[0]]

    # Keep only the label of each word's FIRST sub-token; skip special
    # tokens (word_id is None) and continuation sub-tokens (same word_id
    # as the previous sub-token).
    # NOTE: initialized to None, not 0 — the original init of 0 would drop
    # the first word's label if the tokenizer adds no leading special token.
    predicted_labels = []
    previous_word_id = None
    for index, word_id in enumerate(tok_sentence.word_ids()):
        if word_id is not None and word_id != previous_word_id:
            predicted_labels.append(predicted_tokens_classes[index])
        previous_word_id = word_id

    # Pair whitespace-split words with their labels. zip() guards against a
    # length mismatch between the tokenizer's word segmentation and naive
    # space splitting (the original indexed by position and could raise
    # IndexError); extra unmatched items on either side are dropped.
    return list(zip(sentence.split(' '), predicted_labels))
# Build and serve the Gradio demo: a single textbox in, highlighted
# named-entity spans out, with two example sentences (Hindi and Kannada).
iface = gr.Interface(
    fn=get_ner,
    inputs=gr.Textbox(placeholder="Enter sentence here..."),
    outputs=["highlight"],
    title='IndicNER',
    description='The 11 languages covered by IndicNER are: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Oriya, Punjabi, Tamil, Telugu.',
    article='IndicNER is a model trained to complete the task of identifying named entities from sentences in Indian languages. Our model is specifically fine-tuned to the 11 Indian languages mentioned above over millions of sentences. The model is then benchmarked over a human annotated testset and multiple other publicly available Indian NER datasets.',
    examples=[
        'लगातार हमलावर हो रहे शिवपाल और राजभर को सपा की दो टूक, चिट्ठी जारी कर कहा- जहां जाना चाहें जा सकते हैं',
        'ಶರಣ್ ರ ನೀವು ನೋಡಲೇಬೇಕಾದ ಟಾಪ್ 5 ಕಾಮಿಡಿ ಚಲನಚಿತ್ರಗಳು',
    ],
)

# Queue incoming requests so concurrent users are served in order.
iface.launch(enable_queue=True)