# Hugging Face Spaces status banner (page-scrape residue): "Spaces: Runtime error"
def get_all_models():
    """Return the Hugging Face model names pinned in requirements.txt.

    A requirement line such as ``git+https://huggingface.co/<org>/<name>``
    splits on "/" into ``[..., "huggingface.co", "<org>", "<name>"]``, so
    the model name sits at index 4.

    Returns:
        list[str]: model names. Fixes the original behavior, which kept
            the trailing newline from ``readlines()`` in each name and
            raised IndexError on malformed (short) URLs.
    """
    models = []
    with open("requirements.txt") as f:
        for line in f:
            if "huggingface.co" not in line:
                continue
            parts = line.strip().split("/")
            # Guard against malformed/short URLs instead of IndexError.
            if len(parts) > 4:
                models.append(parts[4])
    return models
def clear_input():
    """Reset both Gradio text fields to empty strings."""
    empty = ""
    return empty, empty
def camembert_generate_summary(article_text):
    """Summarize *article_text* with the module-level CamemBERT seq2seq
    model (``cmb_model`` / ``cmb_tokenizer``), truncating/padding the
    input to 512 tokens and moving tensors to ``device``.
    """
    encoded = cmb_tokenizer(
        [article_text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    generated = cmb_model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    return cmb_tokenizer.decode(generated[0], skip_special_tokens=True)
def t5_generate_summary(article_text):
    """Summarize *article_text* with the module-level T5 model.

    The text is normalized by ``WHITESPACE_HANDLER``, truncated/padded to
    512 input tokens, and decoded from a 4-beam search capped at 84 output
    tokens with repeated 2-grams blocked.
    """
    encoded = t5_tokenizer(
        [WHITESPACE_HANDLER(article_text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    summary_ids = t5_model.generate(
        input_ids=encoded["input_ids"],
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )
    return t5_tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
def summarizer(dropdown_model, article_text):
    """Return a summary of *article_text* from the selected pretrained model.

    Args:
        dropdown_model (str): model choice from the UI dropdown;
            either ``'camembert'`` or ``'T5'``.
        article_text (str): full article text to summarize.

    Returns:
        str: the generated summary.

    Raises:
        ValueError: if *dropdown_model* is not a known choice. (The
            original code fell through and hit UnboundLocalError on
            ``summary`` instead.)
    """
    if dropdown_model == 'camembert':
        return camembert_generate_summary(article_text)
    if dropdown_model == 'T5':
        return t5_generate_summary(article_text)
    raise ValueError(f"Unknown model choice: {dropdown_model!r}")
class keyWordExtractor():
    """Extract keyword candidates from a (French) article.

    Pipeline, run eagerly in ``__init__``:
      1. ``CountVectorizer`` builds n-gram candidate tokens from the article.
      2. A spaCy pipeline (*ner*) keeps only NOUN / PROPN candidates.
      3. The *similarity_model* ranks noun candidates by cosine similarity
         to the whole article and keeps the *top_n* most similar.

    Attributes:
        candidates: all n-gram candidates from CountVectorizer.
        noun_candidates / proper_noun_candidates: POS-filtered candidates.
        top_n_keywords: the *top_n* noun candidates closest to the article.
    """

    def __init__(self,
                 article_text,
                 similarity_model,
                 n_gram=1,
                 top_n=3,
                 french_stopwords=None,
                 ner=None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)

    def count_vectorizer(self, n_gram):
        """Return the article's n-gram tokens (stopwords removed)."""
        n_gram_range = (n_gram, n_gram)
        # Extract candidate words/phrases from the single-document corpus.
        count = CountVectorizer(ngram_range=n_gram_range,
                                stop_words=self.french_stopwords).fit([self.article_text])
        return count.get_feature_names_out()

    def slice_only_noun_token(self, ner, token_list):
        """Split *token_list* into noun and proper-noun tokens.

        Args:
            ner: spaCy pipeline used to read each ``token.pos_``.
            token_list (iterable of str): candidate tokens from the article.

        Returns:
            tuple[list, list]: (NOUN tokens, PROPN tokens).
        """
        noun_slice_list = []
        proper_noun_slice_list = []
        # Idiomatic fix: iterate values directly instead of range(len(...)).
        for candidate in token_list:
            for token in ner(candidate):
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)
        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        """Return the *top_n* noun candidates most similar to the article.

        NOTE(review): assumes ``self.noun_candidates`` is non-empty —
        ``model.encode([])`` / cosine_similarity would fail otherwise.
        """
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        # argsort is ascending, so the last top_n entries are the closest.
        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]
        return keywords
def extract_top_3(article):
    """Return (top-3 keywords, proper nouns) extracted from *article*,
    each joined into a single comma-separated string.

    Note: both the spaCy pipeline and the similarity model are loaded on
    every call — slow but stateless.
    """
    nlp = spacy.load("fr_core_news_md")
    # BUG FIX: this line was commented out, so `model` below was undefined
    # and every call raised NameError (the Space's "Runtime error").
    # Assumes `SentenceTransformer` is imported at the top of the file,
    # as the previously commented-out call implies — TODO confirm.
    model = SentenceTransformer("dangvantuan/sentence-camembert-large")
    extractor = keyWordExtractor(article,
                                 n_gram=1,
                                 top_n=3,
                                 ner=nlp,
                                 similarity_model=model)
    keyword = ", ".join(extractor.top_n_keywords)  # ['a', 'b'] -> "a, b"
    proper_nouns = ", ".join(extractor.proper_noun_candidates)
    return keyword, proper_nouns
def runall(dropdown_model, article_text):
    """Run summarization and keyword extraction for the Gradio UI.

    Returns:
        tuple: (summary, keywords, proper_nouns), matching the app's
        three output widgets.
    """
    keywords, proper_nouns = extract_top_3(article_text)
    return summarizer(dropdown_model, article_text), keywords, proper_nouns