Spaces:
Sleeping
Sleeping
File size: 7,585 Bytes
b27de9c 7e0562e b27de9c 7e0562e b27de9c 7e0562e b27de9c 7e0562e b27de9c 7e0562e b27de9c e56df02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
import streamlit as st
from easynmt import EasyNMT
from nltk import word_tokenize
from simalign import SentenceAligner
import re
import penman
import amrlib
from amrlib.graph_processing.annotator import add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner
@st.cache_resource
def load_easynmt():
    """Create the EasyNMT translator configured with the 'opus-mt' model.

    The ``st.cache_resource`` decorator lets Streamlit keep and reuse the
    returned object instead of rebuilding it on every call.
    """
    translator = EasyNMT('opus-mt')
    return translator
@st.cache_resource
def load_stog_model():
    """Load the amrlib sentence-to-graph model from the 'model_stog' directory.

    The ``st.cache_resource`` decorator lets Streamlit keep and reuse the
    returned object instead of reloading it on every call.
    """
    stog_model = amrlib.load_stog_model(model_dir='model_stog')
    return stog_model
@st.cache_resource
def load_gtos_model():
    """Load the amrlib graph-to-sentence model from the 'model_gtos' directory.

    The ``st.cache_resource`` decorator lets Streamlit keep and reuse the
    returned object instead of reloading it on every call.
    """
    gtos_model = amrlib.load_gtos_model(model_dir='model_gtos')
    return gtos_model
# Find the graph node corresponding to the target word:
def getTargetWordNode(segmentTokens, aligner, alignments, target):
    """Return the label of the graph node aligned to ``target``.

    The lookup goes: position of ``target`` in ``segmentTokens`` -> first
    pair in ``alignments['mwmf']`` whose first element is that position ->
    entry of ``aligner.alignments`` at the pair's second element.

    Args:
        segmentTokens: Tokens of the source segment (a list).
        aligner: Object exposing an indexable ``alignments`` attribute whose
            entries are ``None`` for tokens without a graph alignment.
            NOTE(review): presumably an amrlib ``RBWAligner`` - confirm.
        alignments: Mapping with an ``'mwmf'`` key holding
            ``(source_index, translated_index)`` pairs.
        target: The source token to look up.

    Returns:
        ``'<first field> / <third field>'`` built from the string form of
        the aligned entry, or the sentinel ``'Error!'`` when the token, its
        word alignment or its graph alignment cannot be found.
    """
    notFound = 'Error!'

    if target not in segmentTokens:
        return notFound
    targetIndexFr = segmentTokens.index(target)

    # Only the first alignment pair for the source token is used.
    targetIndexEn = next(
        (pair[1] for pair in alignments['mwmf'] if pair[0] == targetIndexFr),
        None,
    )
    if targetIndexEn is None:
        return notFound

    alignedNode = aligner.alignments[targetIndexEn]
    if alignedNode is None:
        return notFound

    # Break the entry's string form on commas, opening parentheses and
    # quotes, then keep the non-blank fields.
    nodeConcepts = [
        field
        for field in re.split(r'''[,("']''', str(alignedNode))
        if field.strip() != ''
    ]
    if len(nodeConcepts) < 3:
        # Unexpected string form: report it like any other failed lookup
        # instead of raising IndexError.
        return notFound
    return nodeConcepts[0] + ' / ' + nodeConcepts[2]
# Extract the subgraph containing the target word, together with the full path (all ancestor nodes) leading to it:
def getTargetWordSubGraphFullPath(amrGraph, target):
    """Cut an AMR graph down to the target node, its subtree and its ancestors.

    The graph text is tokenised into parentheses, role labels (``:ARG0``,
    ``:ARG0-of`` ...) and the text between them. Tokens are scanned left to
    right: branches that close before the target is seen are discarded
    together with the role that introduced them, and once the first token
    containing ``target`` is met everything up to its matching closing
    parenthesis is kept. Scanning stops there.

    Args:
        amrGraph: PENMAN-style graph string; lines starting with ``#`` are
            skipped.
        target: Text to search for; the first token that contains it as a
            substring marks the start of the kept subgraph.

    Returns:
        The reduced graph, re-serialised through ``penman.decode`` /
        ``penman.encode`` to normalise its formatting.

    Raises:
        IndexError: if no token contains ``target``.
    """
    # --- Tokenise -----------------------------------------------------
    tokens = []
    for line in amrGraph.split('\n'):
        # startswith() also copes with blank lines, which line[0] would not.
        if line.startswith('#'):
            continue
        # The trailing space lets a role at the very end of a line match.
        for chunk in re.split(r'(:\w+\s|:\w+-\w+\s)', line + ' '):
            if chunk.strip() == '':
                continue
            tokens.extend(
                piece
                for piece in re.split(r'(\(|\))', chunk)
                if piece.strip() != ''
            )

    # --- Scan -----------------------------------------------------------
    pathTokens = []      # tokens on the path from the root to the target
    openAncestors = 0    # '(' in pathTokens that still need a ')'
    subGraphTokens = []  # tokens of the kept target subgraph
    subGraphDepth = 0    # parenthesis depth inside the target subgraph
    found = False

    for token in tokens:
        if found:
            if token == '(':
                subGraphDepth += 1
            elif token == ')':
                subGraphDepth -= 1
            subGraphTokens.append(token)
            if subGraphDepth == 0:
                # Target subgraph is complete; later tokens are irrelevant.
                break
        elif target in token.strip():
            found = True
            subGraphDepth = 1
            subGraphTokens.append(token)
            # The enclosing '(' already sits in pathTokens and will be closed
            # by the subgraph's own final ')', so it is no longer pending.
            openAncestors = max(openAncestors - 1, 0)
        elif token == '(':
            openAncestors += 1
            pathTokens.append(token)
        elif token == ')':
            # A branch without the target just closed: drop its contents,
            # its '(' and the role label that led to it.
            openAncestors = max(openAncestors - 1, 0)
            while pathTokens and pathTokens[-1] != '(':
                pathTokens.pop()
            if pathTokens:
                pathTokens.pop()
            if pathTokens:
                pathTokens.pop()
        else:
            pathTokens.append(token)

    if not found:
        raise IndexError(
            'target %r not found in the AMR graph' % (target,)
        )

    # Close the target subgraph (only needed for truncated input) and then
    # every ancestor that is still open.
    resultGraph = (
        ''.join(pathTokens)
        + ''.join(subGraphTokens)
        + ')' * (subGraphDepth + openAncestors)
    )

    # Fix the formatting:
    g = penman.decode(resultGraph)
    return penman.encode(g)
@st.cache_resource
def load_sentence_aligner():
    """Create the simalign word aligner once and let Streamlit reuse it."""
    return SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")


def main():
    """Render the Streamlit page and run the keyword-focused summary pipeline.

    On "Summarize" the French text is translated to English, parsed into an
    AMR graph, the keyword is mapped to a graph node through word alignment,
    the graph is reduced to that node's subgraph, English text is generated
    from it and translated back to French. Failures to locate the keyword
    are reported on the page with ``st.write``.
    """
    st.header('Abstract Meaning Representation based summary of French text', divider='blue')

    segmentFr = st.text_area(
        "Text to summarize:",
        "Article 2 : Occupations ou utilisations du sol soumises à des conditions particulières\n\n"
        "2) Dans les périmètres en bordure des cours d’eau définis dans les annexes sanitaires du PLU :\n\n"
        "− Seules les clôtures en grillage pourront être autorisées à condition qu'elles soient conçues de\n"
        "manière à ne pas faire obstacle au libre écoulement des eaux.",
        height=170,
    )

    ## Alternative example:
    #segmentFr = st.text_area(
    #"Text to summarize:",
    #"Article 1: Le classement interdit tout changement d'affectation ou tout mode d'occupation du sol de nature à compromettre la conservation, la protection ou la création des boisements. Dans les bois, forêts ou parcs situés sur le territoire de communes où l'établissement d'un plan d'occupation des sols a été prescrit mais où ce plan n'a pas encore été rendu public, ainsi que dans tout espace boisé classé, les coupes et abattages d'arbres sont soumis à autorisation préalable.",
    #height=170,
    #)

    targetWord = st.text_input('Keyword:', 'clôtures')
    ##targetWord = st.text_input('Keyword:', 'compromettre')

    if not st.button('Summarize'):
        return

    # Fix input formatting:
    segmentFr = segmentFr.replace('\n', ' ')

    # Translate segment into English:
    model = load_easynmt()
    segmentEn = model.translate(segmentFr, source_lang='fr', target_lang='en')

    # Get an AMR graph:
    stog = load_stog_model()
    inputGraph = stog.parse_sents([segmentEn])
    graph = inputGraph[0]

    # Get tokenized representation of segment in French:
    segmentFrTokens = word_tokenize(segmentFr, language='french')

    # Get tokenized representation of segment in English:
    penmanGraph = add_lemmas(graph, snt_key='snt')
    aligner = RBWAligner.from_penman_w_json(penmanGraph)
    segmentEnTokens = aligner.lemmas

    # Get alignments between original version and translation:
    myaligner = load_sentence_aligner()
    alignments = myaligner.get_word_aligns(segmentFrTokens, segmentEnTokens)

    # Find the node corresponding to targetWord in the graph:
    targetNode = getTargetWordNode(segmentFrTokens, aligner, alignments, targetWord)

    # Check that the node can be used; otherwise fall back to the raw keyword.
    alignmentFailed = targetNode == 'Error!'
    if alignmentFailed or targetNode not in graph:
        # The first line of the graph string is skipped so the keyword is
        # only searched in the remaining lines.
        graphBody = ''.join(graph.split('\n')[1:])
        if targetWord and targetWord in graphBody:
            targetNode = targetWord
        elif alignmentFailed:
            st.write('Error! Alignment between target word in French and its English instance not found')
            return
        else:
            st.write('Error! Cannot find keyword in the graph')
            return

    # Extract the subgraph containing the target word with the full path to it:
    targetSubGraph = getTargetWordSubGraphFullPath(graph, targetNode)

    # Generate text from the reduced AMR graph:
    gtos = load_gtos_model()
    rulesEn, _ = gtos.generate([targetSubGraph])

    # Remove list numbering such as "1. " from the text. The dot is escaped
    # so that ordinary numbers followed by a word (e.g. "10 m") are kept.
    ruleEn = re.sub(r'\d+\. ', '', rulesEn[0])

    # Translate it back to French
    rulesFr = model.translate(ruleEn, source_lang='en', target_lang='fr')
    st.write("Summary: ", rulesFr)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|