# Hugging Face Spaces status banner (page-scrape residue): "Spaces: Runtime error"
def get_all_models():
    """Return the Hugging Face model names pinned in requirements.txt.

    A requirement line such as ``git+https://huggingface.co/<org>/<name>``
    splits on "/" into ``[..., "huggingface.co", "<org>", "<name>"]``, so
    the model name sits at index 4.

    Returns:
        list[str]: model names. Fixes the original behavior, which kept
            the trailing newline from ``readlines()`` in each name and
            raised IndexError on malformed (short) URLs.
    """
    models = []
    with open("requirements.txt") as f:
        for line in f:
            if "huggingface.co" not in line:
                continue
            parts = line.strip().split("/")
            # Guard against malformed/short URLs instead of IndexError.
            if len(parts) > 4:
                models.append(parts[4])
    return models
def clear_input():
    """Reset both Gradio text fields to empty strings."""
    empty = ""
    return empty, empty
def camembert_generate_summary(article_text):
    """Summarize *article_text* with the module-level CamemBERT seq2seq
    model (``cmb_model`` / ``cmb_tokenizer``), truncating/padding the
    input to 512 tokens and moving tensors to ``device``.
    """
    encoded = cmb_tokenizer(
        [article_text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    generated = cmb_model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    return cmb_tokenizer.decode(generated[0], skip_special_tokens=True)
def t5_generate_summary(article_text):
    """Summarize *article_text* with the module-level T5 model.

    The text is normalized by ``WHITESPACE_HANDLER``, truncated/padded to
    512 input tokens, and decoded from a 4-beam search capped at 84 output
    tokens with repeated 2-grams blocked.
    """
    encoded = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    summary_ids = t5_model.generate(
        input_ids=encoded["input_ids"],
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )
    return t5_tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
def summarizer(dropdown_model, article_text):
    """Return a summary of *article_text* from the selected pretrained model.

    Args:
        dropdown_model (str): model choice from the UI dropdown;
            either ``'camembert'`` or ``'T5'``.
        article_text (str): full article text to summarize.

    Returns:
        str: the generated summary.

    Raises:
        ValueError: if *dropdown_model* is not a known choice. (The
            original code fell through and hit UnboundLocalError on
            ``summary`` instead.)
    """
    if dropdown_model == 'camembert':
        return camembert_generate_summary(article_text)
    if dropdown_model == 'T5':
        return t5_generate_summary(article_text)
    raise ValueError(f"Unknown model choice: {dropdown_model!r}")
class keyWordExtractor():
    """Extract keyword candidates from a (French) article.

    Pipeline, run eagerly in ``__init__``:
      1. ``CountVectorizer`` builds n-gram candidate tokens from the article.
      2. A spaCy pipeline (*ner*) keeps only NOUN / PROPN candidates.
      3. The *similarity_model* ranks noun candidates by cosine similarity
         to the whole article and keeps the *top_n* most similar.

    Attributes:
        candidates: all n-gram candidates from CountVectorizer.
        noun_candidates / proper_noun_candidates: POS-filtered candidates.
        top_n_keywords: the *top_n* noun candidates closest to the article.
    """

    def __init__(self,
                 article_text,
                 similarity_model,
                 n_gram=1,
                 top_n=3,
                 french_stopwords=None,
                 ner=None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)

    def count_vectorizer(self, n_gram):
        """Return the article's n-gram tokens (stopwords removed)."""
        n_gram_range = (n_gram, n_gram)
        # Extract candidate words/phrases from the single-document corpus.
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=self.french_stopwords).fit([self.article_text])
        return count.get_feature_names_out()

    def slice_only_noun_token(self, ner, token_list):
        """Split *token_list* into noun and proper-noun tokens.

        Args:
            ner: spaCy pipeline used to read each ``token.pos_``.
            token_list (iterable of str): candidate tokens from the article.

        Returns:
            tuple[list, list]: (NOUN tokens, PROPN tokens).
        """
        noun_slice_list = []
        proper_noun_slice_list = []
        # Idiomatic fix: iterate values directly instead of range(len(...)).
        for candidate in token_list:
            for token in ner(candidate):
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)
        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        """Return the *top_n* noun candidates most similar to the article.

        NOTE(review): assumes ``self.noun_candidates`` is non-empty —
        ``model.encode([])`` / cosine_similarity would fail otherwise.
        """
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        # argsort is ascending, so the last top_n entries are the closest.
        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]
        return keywords
def extract_top_3(article):
    """Return (top-3 keywords, proper nouns) extracted from *article*,
    each joined into a single comma-separated string.

    Note: both the spaCy pipeline and the similarity model are loaded on
    every call — slow but stateless.
    """
    nlp = spacy.load("fr_core_news_md")
    # BUG FIX: this line was commented out, so `model` below was undefined
    # and every call raised NameError (the Space's "Runtime error").
    # Assumes `SentenceTransformer` is imported at the top of the file,
    # as the previously commented-out call implies — TODO confirm.
    model = SentenceTransformer("dangvantuan/sentence-camembert-large")
    extractor = keyWordExtractor(article,
                                 n_gram=1,
                                 top_n=3,
                                 ner=nlp,
                                 similarity_model=model)
    keyword = ", ".join(extractor.top_n_keywords)  # ['a', 'b'] -> "a, b"
    proper_nouns = ", ".join(extractor.proper_noun_candidates)
    return keyword, proper_nouns
def runall(dropdown_model, article_text):
    """Run summarization and keyword extraction for the Gradio UI.

    Returns:
        tuple: (summary, keywords, proper_nouns), matching the app's
        three output widgets.
    """
    keywords, proper_nouns = extract_top_3(article_text)
    return summarizer(dropdown_model, article_text), keywords, proper_nouns