| This model can be used for sentence compression (aka extractive sentence summarization). | |
| For each word, it predicts whether the word can be dropped from the sentence without severely affecting its meaning. | |
| The resulting sentences are often ungrammatical, but they can still be useful. | |
| The model is [rubert-tiny2](https://huggingface.co/cointegrated/rubert-tiny2) fine-tuned on the dataset from the paper | |
| [Sentence compression for Russian: dataset and baselines](https://www.dialog-21.ru/media/5106/kuvshinovat-050.pdf) | |
| (the data can be found [here](https://drive.google.com/drive/folders/1WWqq187pN4aHHbRUwlhaKW4JP1FZ_9zh)). | |
| Example usage: | |
| ```python | |
| import torch | |
| from transformers import AutoModelForTokenClassification, AutoTokenizer | |
| model_name = 'cointegrated/rubert-tiny2-sentence-compression' | |
| model = AutoModelForTokenClassification.from_pretrained(model_name) | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| def compress(text, threshold=0.5, keep_ratio=None): | |
| """ Compress a sentence by removing the least important words. | |
| Parameters: | |
| threshold: cutoff for predicted probabilities of word removal | |
| keep_ratio: proportion of words to preserve | |
| By default, threshold of 0.5 is used. | |
| """ | |
| with torch.inference_mode(): | |
| tok = tokenizer(text, return_tensors='pt').to(model.device) | |
| proba = torch.softmax(model(**tok).logits, -1).cpu().numpy()[0, :, 1] | |
| if keep_ratio is not None: | |
| threshold = sorted(proba)[int(len(proba) * keep_ratio)] | |
| kept_toks = [] | |
| keep = False | |
| prev_word_id = None | |
| for word_id, score, token in zip(tok.word_ids(), proba, tok.input_ids[0]): | |
| if word_id is None: | |
| keep = True | |
| elif word_id != prev_word_id: | |
| keep = score < threshold | |
| if keep: | |
| kept_toks.append(token) | |
| prev_word_id = word_id | |
| return tokenizer.decode(kept_toks, skip_special_tokens=True) | |
| text = 'Кроме того, можно взять идею, рожденную из сердца, и выразить ее в рамках одной '\ | |
| 'из этих структур, без потери искренности идеи и смысла песни.' | |
| print(compress(text)) | |
| print(compress(text, threshold=0.3)) | |
| print(compress(text, threshold=0.1)) | |
| # можно взять идею, рожденную из сердца, и выразить ее в рамках одной из этих структур. | |
| # можно взять идею, рожденную из сердца выразить ее в рамках одной из этих структур. | |
| # можно взять идею рожденную выразить структур. | |
| print(compress(text, keep_ratio=0.5)) | |
| # можно взять идею, рожденную из сердца выразить ее в рамках структур. | |
| ``` |