from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import gradio as gr
import torch
import spacy
import re

nlp = spacy.load("en_core_sci_sm")
# ----------------------------------------------
# Step 1. Read the input file into sentence-level JSON
# ----------------------------------------------
def read_text_to_json(path):
    paper = {}
    with open(path, 'r', encoding='utf-8') as txt:
        key = None
        for line in txt:
            line = line.strip()
            if line.startswith('@Paper') or line.startswith('@Section'):
                key = line.split()[1]
                paper[key] = []
            elif key and line:
                paper[key].append(line)
    return paper
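# An illustrative sketch of the expected input format (the exact marker tokens
# beyond '@Paper'/'@Section' are inferred from the keys checked in is_valid_format):
#
#   @Paper title
#   A Study of Something Interesting
#   @Section I
#   First paragraph of the introduction ...
#   @Section M
#   ...
#
# read_text_to_json maps each marker's second token ('title', 'I', 'M', 'R', 'D')
# to the list of non-empty lines that follow it.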
def is_valid_format(paper):
    for key in ['title', 'I', 'M', 'R', 'D']:
        if key not in paper or len(paper[key]) == 0:
            return False
    return True
def remove_parentheses_with_useless_tokens(text):
    return re.sub(r'\s*\(\s*(?:table|fig|http|www)[^()]*\)', '', text, flags=re.I)  # re.I: case-insensitive
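# For example (illustrative):
#   remove_parentheses_with_useless_tokens("Cells grew for 3 h (Fig. 2) at 37C.")
#   -> "Cells grew for 3 h at 37C."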
def segment_sentences(section, pos_para=False):
    sents = []
    sents_break = [".", "?", "!"]
    start = para_i = pre_para_i = 0
    conn = False
    for para in section:
        para = remove_parentheses_with_useless_tokens(para).strip()  # strip trailing whitespace so it is not treated as a token and does not block the sentence break
        doc = nlp(para)
        for sent in doc.sents:
            if any(t in sents_break for t in sent[-1].text):  # some sentence-final tokens (e.g. "3h.") are not split off, so it suffices that the last token contains a break character
                para_i += 1
                text = "".join(t.text_with_ws for t in doc[start:sent.end])  # original string
                tokenize_text = " ".join(t.text for t in doc[start:sent.end])  # tokenized string
                sentence = {"text": text, "tokenize_text": tokenize_text, "pos": pre_para_i + para_i}  # build the sentence record
                if pos_para: sentence['pos_para'] = para_i  # pos: sentence position in the section; pos_para: position within its paragraph
                sents.append(sentence)
                start = sent.end
                conn = False
            else:
                start = start if conn else sent.start  # the sentence does not end in a break character: record its start and keep it unchanged until the next break
                conn = True
        pre_para_i += para_i
        start = para_i = 0
    return sents
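# A minimal sketch of the output (token boundaries depend on the loaded spaCy
# model, so the tokenized form shown is illustrative):
#   segment_sentences(["We tested 5 mice. All survived."], pos_para=True)
#   -> [{'text': 'We tested 5 mice. ', 'tokenize_text': 'We tested 5 mice .', 'pos': 1, 'pos_para': 1},
#       {'text': 'All survived.', 'tokenize_text': 'All survived .', 'pos': 2, 'pos_para': 2}]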
def convert_to_sentence_json(paper):
    sentJson = {
        'title': paper['title'],
        'body': {}
    }
    for key in ['I', 'M', 'R', 'D']:
        sentJson['body'][key] = segment_sentences(paper[key], True)
    return sentJson
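# Resulting structure (schematic):
#   {'title': ['<title line>'],
#    'body': {'I': [<sentence dicts>], 'M': [...], 'R': [...], 'D': [...]}}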
# ----------------------------------------------
# Step 2. Sentence-level feature extraction
# ----------------------------------------------
# List of sentence texts
def sent_lst(sents):
    return [sent['text'] for sent in sents]
# Remove stop words and punctuation
def clean_token(doc):
    return [token for token in doc if not (token.is_stop or token.is_punct)]
# Total number of sentences in each paragraph
def add_num_sents_para(sents):
    reset = True
    for index, sent in reversed(list(enumerate(sents))):
        if reset: ptr = sent['pos_para']  # the last sentence of a paragraph carries that paragraph's sentence count
        reset = sent['pos_para'] == 1
        sents[index]['ns_para'] = ptr
    return sents
# Positional importance
def position_imp(cur, ns):
    imp = 1 if cur == 1 else (ns - cur) / ns
    return imp
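# The first sentence gets full weight; later sentences decay linearly, e.g.:
#   position_imp(1, 10) -> 1
#   position_imp(3, 10) -> 0.7   # (10 - 3) / 10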
# Title word list
def title_wlst(txt):
    doc = nlp(txt)
    wlst = [token.text.lower() for token in clean_token(doc)]
    return list(set(wlst))
# Fraction of title words that appear in the sentence
def title_word_count(doc, wlst):
    titleLen = len(wlst)
    score = 0 if titleLen == 0 else len([token for token in doc if token.text.lower() in wlst]) / titleLen
    return score
# Number of tokens with the given part-of-speech tag
def pos_token(doc, pos_type):
    return len([token for token in doc if token.pos_ == pos_type])
# Custom tokenizer
def custom_tokenizer(txt):
    doc = nlp(txt)
    words = [token.lemma_.lower() for token in doc if not (token.is_stop or token.is_punct or token.is_digit)]
    return words
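# For example (lemmas depend on the loaded model, so the output is illustrative):
#   custom_tokenizer("The mice were tested in 2020.")
#   -> ['mouse', 'test']   # stop words, punctuation, and digits removed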
# Term frequency-inverse sentence frequency
def Tfisf(lst):
    tf = TfidfVectorizer(tokenizer=custom_tokenizer, lowercase=False)
    tfisf_matrix = tf.fit_transform(lst)
    word_count = (tfisf_matrix != 0).sum(1)
    with np.errstate(divide='ignore', invalid='ignore'):
        mean_score = np.where(word_count == 0, 0, np.divide(tfisf_matrix.sum(1), word_count)).flatten()
    return mean_score
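# Each row of tfisf_matrix holds one sentence's TF-ISF weights; a sentence's
# score is the mean of its non-zero weights. Usage sketch:
#   scores = Tfisf(["We tested mice.", "The mice survived."])  # one score per sentence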
# Cosine similarity
def similarity(lst, ptm):
    model = SentenceTransformer(ptm)
    embeddings = model.encode(lst, convert_to_tensor=True)
    cosine = util.cos_sim(embeddings, embeddings)
    cosine = cosine.sum(1) - 1  # subtract each sentence's self-similarity (1.0)
    cosine = torch.divide(cosine, torch.max(cosine)).cpu().numpy()  # .cpu() is a no-op on CPU and required when the embeddings live on GPU
    return cosine
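# Usage sketch: each sentence's score is the sum of its cosine similarities to
# all other sentences, scaled by the maximum over the list:
#   sims = similarity(["Sentence one.", "Sentence two."], "sentence-transformers/all-MiniLM-L6-v2")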
# Feature extraction
def feature_extraction(title, section, sents):
    lst = sent_lst(sents)
    tfisf = Tfisf(lst)
    cosine = similarity(lst, "sentence-transformers/all-MiniLM-L6-v2")
    # Number of sentences
    ns = len(sents)
    sents = add_num_sents_para(sents)
    # Extract the features of each sentence
    arr = np.empty((0, 9))
    for index, sent in enumerate(sents):
        doc = nlp(sent["text"])
        doc = clean_token(doc)
        F1 = len(doc)                                          # Sentence Length (raw count; normalized below by the longest sentence)
        F2 = position_imp(sent["pos"], ns)                     # Sentence Position
        F3 = position_imp(sent["pos_para"], sent["ns_para"])   # Sentence Position (in paragraph)
        F4 = title_word_count(doc, title)                      # Title Word
        F5 = 0 if F1 == 0 else pos_token(doc, "PROPN") / F1    # Proper Noun
        F6 = 0 if F1 == 0 else pos_token(doc, "NUM") / F1      # Numerical Token
        F7 = tfisf[index]                                      # Term Frequency-Inverse Sentence Frequency
        F10 = cosine[index]                                    # Cosine Similarity
        feat = np.array([[section, F1, F2, F3, F4, F5, F6, F7, F10]])
        arr = np.append(arr, feat, axis=0)
    # F1: normalize by the longest sentence length
    maxLen = np.amax(arr[:, 1])
    arr[:, 1] = arr[:, 1] / maxLen
    return arr
# Set column dtypes
def set_dtypes(df):
    df = df.astype({'section': 'int8', 'F1': 'float32', 'F2': 'float32',
                    'F3': 'float32', 'F4': 'float32', 'F5': 'float32',
                    'F6': 'float32', 'F7': 'float32', 'F10': 'float32'})
    return df
# Sentence features for the paper's IMRD sections
def feature_from_imrd(body, title):
    paper = np.empty((0, 9))
    for index, key in enumerate(['I', 'M', 'R', 'D'], start=1):
        paper = np.append(paper, feature_extraction(title, index, body[key]), axis=0)
    df = pd.DataFrame(paper, columns=['section', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10'])
    return set_dtypes(df)

def extract_sentence_features(sentJson):
    title = title_wlst(sentJson['title'][0])
    sentFeat = feature_from_imrd(sentJson['body'], title)
    return sentFeat
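# End-to-end usage sketch ("paper.txt" is a hypothetical input path):
#   paper = read_text_to_json("paper.txt")
#   if is_valid_format(paper):
#       sentJson = convert_to_sentence_json(paper)
#       sentFeat = extract_sentence_features(sentJson)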