# Spaces: koptelovmax / amrdemo (Sleeping)
#
# File size: 7,585 Bytes

import streamlit as st

from easynmt import EasyNMT

from nltk import word_tokenize
from simalign import SentenceAligner

import re
import penman

import amrlib
from amrlib.graph_processing.annotator import add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner

@st.cache_resource
def load_easynmt():
    """Create the EasyNMT translator backed by the 'opus-mt' model.

    The function is decorated with ``st.cache_resource``; main() uses the
    returned object's ``translate`` method for both translation directions.
    """
    translator = EasyNMT('opus-mt')
    return translator

@st.cache_resource
def load_stog_model():
    """Load the amrlib "stog" model stored in the ``model_stog`` directory.

    The function is decorated with ``st.cache_resource``; main() calls
    ``parse_sents`` on the returned object to obtain AMR graphs.
    """
    stogModel = amrlib.load_stog_model(model_dir='model_stog')
    return stogModel

@st.cache_resource
def load_gtos_model():
    """Load the amrlib "gtos" model stored in the ``model_gtos`` directory.

    The function is decorated with ``st.cache_resource``; main() calls
    ``generate`` on the returned object to turn a graph back into text.
    """
    gtosModel = amrlib.load_gtos_model(model_dir='model_gtos')
    return gtosModel

# Find the graph node corresponding to the target word:
def getTargetWordNode(segmentTokens, aligner, alignments, target):
    """Return the ``'<first> / <third>'`` label of the node aligned to *target*.

    The lookup goes: position of *target* in ``segmentTokens`` -> first pair
    in ``alignments['mwmf']`` whose first element is that position -> the
    pair's second element, used as an index into ``aligner.alignments``.
    The string form of that alignment entry is split on commas, ``(`` and
    quotes; the first and third non-blank pieces are joined with ``' / '``.
    (Presumably the entry is a (variable, role, concept) triple, so the
    result reads like ``'c / close-01'`` -- confirm against the aligner.)

    Args:
        segmentTokens: Sequence of source-language tokens.
        aligner: Object exposing an ``alignments`` sequence (entries may be
            ``None``).
        alignments: Mapping with an ``'mwmf'`` entry holding
            ``(source_index, target_index)`` pairs.
        target: Token to look up in ``segmentTokens`` (first occurrence wins).

    Returns:
        The node label, or the sentinel string ``'Error!'`` whenever any step
        of the lookup fails (token absent, no alignment pair, index outside
        ``aligner.alignments``, ``None`` entry, or fewer than three pieces).
    """
    notFound = 'Error!'  # sentinel that main() compares against

    # Locate the target word among the source tokens:
    try:
        targetIndexFr = segmentTokens.index(target)
    except ValueError:
        return notFound

    # First aligned index on the translated side, if any:
    targetIndexEn = next(
        (pair[1] for pair in alignments['mwmf'] if pair[0] == targetIndexFr),
        None,
    )
    if targetIndexEn is None:
        return notFound

    # Guard the index so an unexpected alignment yields the sentinel instead
    # of an IndexError:
    nodeAlignments = aligner.alignments
    if not 0 <= targetIndexEn < len(nodeAlignments):
        return notFound

    nodeAlignment = nodeAlignments[targetIndexEn]
    if nodeAlignment is None:
        return notFound

    # Get a full name of the graph node:
    nodeConcepts = [
        piece
        for piece in re.split(r"[,(\"']", str(nodeAlignment))
        if piece.strip() != ''
    ]
    if len(nodeConcepts) < 3:
        return notFound
    return nodeConcepts[0] + ' / ' + nodeConcepts[2]
    
# Extract a subgraph containing the target word with the full path (all the nodes) to it:
def getTargetWordSubGraphFullPath(amrGraph, target):
    """Cut *amrGraph* down to the path leading to *target* plus its subtree.

    Steps:
      1. Lines starting with ``#`` are dropped; the rest is split into
         tokens: role labels (``:word`` or ``:word-word`` followed by
         whitespace), single parentheses, and the text in between.
      2. Tokens are walked left to right. Before the target is met, every
         parenthesised group that closes is removed again together with the
         token preceding its ``(`` (its role label), so only the still-open
         ancestors remain.
      3. From the first token containing *target* (substring test) every
         token is copied until the parenthesis enclosing it closes; the
         remaining tokens are ignored.
      4. The ancestors that are still open are closed and the text is
         round-tripped through ``penman.decode`` / ``penman.encode`` to fix
         the formatting.

    Args:
        amrGraph: Graph text, optionally with ``#`` metadata lines.
        target: Text to search for inside a token, e.g. ``'c / close-01'``.

    Returns:
        The string produced by ``penman.encode`` for the reduced graph.

    Raises:
        IndexError: NOTE(review) -- appears to happen when *target* matches no
            token, because closing the outermost group then pops from an
            empty list; main() checks membership before calling. Anything
            raised by ``penman.decode`` propagates as well.
    """
    rolePattern = re.compile(r'(:\w+\s|:\w+-\w+\s)')
    parenPattern = re.compile(r'(\(|\))')

    # Tokenize. startswith() (unlike indexing [0]) is safe on blank lines,
    # which then vanish in the whitespace filters below.
    tokens = []
    for line in amrGraph.split('\n'):
        if line.startswith('#'):
            continue
        for chunk in rolePattern.split(line + ' '):
            if chunk.strip() == '':
                continue
            tokens += [piece for piece in parenPattern.split(chunk) if piece.strip() != '']

    pathTokens = []       # tokens of the still-open ancestors, then the target subgraph
    globalDepth = 0       # parentheses opened outside of the target subgraph
    targetDepth = 0       # parentheses open while copying the target subgraph
    subGraph = ""
    insideTarget = False
    targetDone = False

    for token in tokens:
        if insideTarget:
            subGraph += token
            if token == '(':
                targetDepth += 1
            elif token == ')':
                targetDepth -= 1
                if targetDepth == 0:
                    pathTokens.append(subGraph)
                    targetDone = True
                    # Stop here: a later token that also contains `target`
                    # must not re-open the copy and append the subgraph twice.
                    break
        elif target in token.strip():
            insideTarget = True
            subGraph += token
            # The parenthesis enclosing the target was opened before this token:
            targetDepth = 1
        elif token == '(':
            globalDepth += 1
            pathTokens.append(token)
        elif token == ')':
            globalDepth -= 1
            # Discard the sibling group that just closed ...
            while pathTokens[-1] != '(':
                pathTokens.pop()
            pathTokens.pop()
            # ... and the token in front of it (its role label):
            pathTokens.pop()
        else:
            pathTokens.append(token)

    # Close the ancestors that are still open. The final ')' of the target
    # subgraph already closed the innermost one, so one fewer is needed then.
    closers = globalDepth - 1 if targetDone else globalDepth
    resultGraph = ''.join(pathTokens) + ')' * closers

    # Fix the formatting:
    g = penman.decode(resultGraph)

    return penman.encode(g)

@st.cache_resource
def load_sentence_aligner():
    """Create the simalign ``SentenceAligner`` used for word alignment.

    Decorated with ``st.cache_resource`` like the other model loaders in this
    file, so the aligner is not rebuilt on every button click.
    """
    return SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")


def main():
    """Render the page and, on 'Summarize', print a keyword-centred summary.

    Pipeline after the button is pressed:
      1. translate the French text to English (EasyNMT);
      2. parse the English text into an AMR graph (amrlib stog model);
      3. align French tokens with the graph's lemmas (simalign) and the
         lemmas with graph nodes (RBWAligner) to find the keyword's node;
      4. cut the graph down to that node and the path to it;
      5. generate English text from the reduced graph (amrlib gtos model)
         and translate it back to French.

    Failures are reported on the page with ``st.write``; nothing is returned.
    """
    st.header('Abstract Meaning Representation based summary of French text', divider='blue')

    segmentFr = st.text_area(
        "Text to summarize:",
        "Article 2 : Occupations ou utilisations du sol soumises à des conditions particulières\n\n"
        "2) Dans les périmètres en bordure des cours d’eau définis dans les annexes sanitaires du PLU :\n\n"
        "− Seules les clôtures en grillage pourront être autorisées à condition qu'elles soient conçues de\n"
        "manière à ne pas faire obstacle au libre écoulement des eaux.",
        height=170,
    )

    ## Alternative example:
    #segmentFr = st.text_area(
    #"Text to summarize:",
    #"Article 1: Le classement interdit tout changement d'affectation ou tout mode d'occupation du sol de nature à compromettre la conservation, la protection ou la création des boisements. Dans les bois, forêts ou parcs situés sur le territoire de communes où l'établissement d'un plan d'occupation des sols a été prescrit mais où ce plan n'a pas encore été rendu public, ainsi que dans tout espace boisé classé, les coupes et abattages d'arbres sont soumis à autorisation préalable.",
    #height=170,
    #)

    targetWord = st.text_input('Keyword:', 'clôtures')
    ##targetWord = st.text_input('Keyword:', 'compromettre')

    if not st.button('Summarize'):
        return

    # A blank keyword would match every token in the substring fallback
    # below, so reject it up front:
    targetWord = targetWord.strip()
    if not targetWord:
        st.write('Error! Cannot find keyword in the graph')
        return

    # Fix input formatting:
    segmentFr = segmentFr.replace('\n', ' ')

    # Translate segment into English:
    model = load_easynmt()
    segmentEn = model.translate(segmentFr, source_lang='fr', target_lang='en')

    # Get an AMR graph:
    stog = load_stog_model()
    inputGraph = stog.parse_sents([segmentEn])
    amrGraph = inputGraph[0]

    # Get tokenized representation of segment in French:
    segmentFrTokens = word_tokenize(segmentFr, language='french')

    # Get tokenized representation of segment in English:
    penmanGraph = add_lemmas(amrGraph, snt_key='snt')

    aligner = RBWAligner.from_penman_w_json(penmanGraph)
    segmentEnTokens = aligner.lemmas

    # Get alignments between original version and translation:
    myaligner = load_sentence_aligner()
    alignments = myaligner.get_word_aligns(segmentFrTokens, segmentEnTokens)

    # Find a node corresponding to targetWord in the graph:
    targetNode = getTargetWordNode(segmentFrTokens, aligner, alignments, targetWord)
    alignmentFailed = targetNode == 'Error!'

    # Check if targetNode is in the graph; otherwise fall back to the raw
    # keyword, searched in everything but the first line of the graph text
    # (presumably the '# ::snt' metadata line -- confirm).
    if alignmentFailed or targetNode not in amrGraph:
        graphBody = ''.join(amrGraph.split('\n')[1:])
        if targetWord in graphBody:
            targetNode = targetWord
        elif alignmentFailed:
            # Report the more specific cause; previously this message could
            # not be reached because 'Error!' is never part of the graph.
            st.write('Error! Alignment between target word in French and its English instance not found')
            return
        else:
            st.write('Error! Cannot find keyword in the graph')
            return

    # Extract a subgraph containing target word with full path (all the nodes) to it:
    targetSubGraph = getTargetWordSubGraphFullPath(amrGraph, targetNode)

    # Generate text from given AMR-graph:
    gtos = load_gtos_model()
    rulesEn, _ = gtos.generate([targetSubGraph])

    # Remove enumeration markers such as "1. " from the text. The dot is
    # escaped so that ordinary numbers ("10 metres") are left alone.
    summaryEn = re.sub(r'\d+\. ', '', rulesEn[0])

    # Translate it back to French
    rulesFr = model.translate(summaryEn, source_lang='en', target_lang='fr')

    st.write("Summary: ", rulesFr)
    
# Entry point: build the page when this file is run as the main module.
if __name__ == "__main__":
    main()