| | import torch |
| | from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline |
| | import streamlit as st |
| | from keybert import KeyBERT |
| | import re |
| |
|
| |
|
| | |
def create_nest_sentences(document:str, token_max_length = 1024):
    """Group the sentences of *document* into chunks that fit a token budget.

    Newlines are replaced by spaces, then the text is split at runs of
    spaces that follow a ``.`` or ``?`` and precede an uppercase letter
    (the lookbehind additionally requires that the character two positions
    before the punctuation is not an uppercase letter).  Each sentence is
    tokenized with the ``facebook/bart-large-mnli`` tokenizer and sentences
    are accumulated into a chunk while the running token count stays below
    ``token_max_length``.

    Args:
        document: Text to split.
        token_max_length: Token count at which a new chunk is started.

    Returns:
        A list of chunks, each chunk being a non-empty list of sentence
        strings in document order.  A single sentence whose own token count
        reaches the limit is kept whole in a chunk of its own.
    """
    nested = []
    sent = []
    length = 0
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

    flattened = document.replace("\n", ' ')
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', flattened):
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0]
        sentence_length = len(tokens_in_sentence)

        # Close the current chunk before it would reach the limit.  The
        # `sent` guard prevents emitting an empty chunk when the very first
        # sentence is already at or over the limit.
        if sent and length + sentence_length >= token_max_length:
            nested.append(sent)
            sent = []
            length = 0

        # The sentence that opens a chunk counts toward that chunk's budget.
        sent.append(sentence)
        length += sentence_length

    if sent:
        nested.append(sent)
    return nested
| |
|
| | |
@st.cache(allow_output_mutation=True)
def load_keyword_model():
    """Return a KeyBERT instance built with its default settings.

    The function is wrapped in ``st.cache(allow_output_mutation=True)``.
    """
    return KeyBERT()
| |
|
def keyword_gen(kw_model, sequence:str):
    """Extract keywords from *sequence* using *kw_model*.

    Args:
        kw_model: Object exposing ``extract_keywords`` (e.g. the value
            returned by ``load_keyword_model``).
        sequence: Text to extract keywords from.

    Returns:
        Whatever ``kw_model.extract_keywords`` returns for the fixed
        settings below.
    """
    # Fixed extraction settings: single-word keyphrases, English stop words,
    # MMR with diversity 0.5, and ten results requested.
    extraction_options = {
        'keyphrase_ngram_range': (1, 1),
        'stop_words': 'english',
        'use_mmr': True,
        'diversity': 0.5,
        'top_n': 10,
    }
    return kw_model.extract_keywords(sequence, **extraction_options)
| |
|
| |
|
| |
|
| | |
@st.cache(allow_output_mutation=True)
def load_summary_model():
    """Return a summarization pipeline for ``facebook/bart-large-cnn``.

    The function is wrapped in ``st.cache(allow_output_mutation=True)``.
    """
    return pipeline(task='summarization', model="facebook/bart-large-cnn")
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:int):
    """Summarize *sequence* with *summarizer* and return the summary text.

    Args:
        summarizer: Callable invoked with the text and generation keyword
            arguments (e.g. the value returned by ``load_summary_model``).
        sequence: Text to summarize.
        maximum_tokens: Forwarded as ``max_length``.
        minimum_tokens: Forwarded as ``min_length``.

    Returns:
        The ``'summary_text'`` entry of the first result, or ``None`` when
        that key is absent.
    """
    # Beam search with 4 beams, no sampling, and no repeated trigrams.
    generation_options = dict(
        num_beams=4,
        length_penalty=2.0,
        max_length=maximum_tokens,
        min_length=minimum_tokens,
        do_sample=False,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )
    results = summarizer(sequence, **generation_options)
    first_result = results[0]
    return first_result.get('summary_text')
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
@st.cache(allow_output_mutation=True)
def load_model():
    """Return a zero-shot-classification pipeline for ``facebook/bart-large-mnli``.

    The tokenizer and the sequence-classification model are loaded from the
    same checkpoint name and passed to the pipeline with the PyTorch
    framework selected.  The function is wrapped in
    ``st.cache(allow_output_mutation=True)``.
    """
    checkpoint = "facebook/bart-large-mnli"
    mnli_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    mnli_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline(
        task='zero-shot-classification',
        model=mnli_model,
        tokenizer=mnli_tokenizer,
        framework='pt',
    )
| |
|
def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
    """Classify *sequence* against *labels* and return the labels and scores.

    Args:
        classifier: Callable invoked as
            ``classifier(sequence, labels, multi_label=...)`` (e.g. the
            value returned by ``load_model``).
        sequence: Text to classify.
        labels: Candidate labels.
        multi_class: Forwarded to the classifier as ``multi_label``.

    Returns:
        A ``(labels, scores)`` tuple taken from the ``'labels'`` and
        ``'scores'`` entries of the classifier output.
    """
    prediction = classifier(sequence, labels, multi_label=multi_class)
    ranked_labels = prediction['labels']
    ranked_scores = prediction['scores']
    return ranked_labels, ranked_scores
| |
|
| |
|