"""Streamlit demo: extract named entities and aggregation-type phrases
(COUNT/SUM/AVG/MIN/MAX) from a natural-language transaction query using a
spaCy pipeline plus a rule-based token Matcher."""

import subprocess
import sys

import spacy
import streamlit as st
from spacy.matcher import Matcher


@st.cache_resource
def load_model():
    """Return the `en_core_web_lg` pipeline, downloading it only if missing.

    Cached with st.cache_resource so the expensive load (and the far more
    expensive download) happens once per server process — the original code
    ran the download subprocess on every Streamlit rerun.
    """
    try:
        return spacy.load("en_core_web_lg")
    except OSError:
        # Model not installed yet. Use sys.executable so the download lands
        # in the same environment that is running this app, and check=True so
        # a failed download surfaces here rather than as a confusing load error.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_lg"],
            check=True,
        )
        return spacy.load("en_core_web_lg")


nlp = load_model()

# Initialize the token Matcher (rule-based, operates on token attributes)
# with the pipeline's shared vocab.
matcher = Matcher(nlp.vocab)

# Trigger-phrase token patterns, keyed by the aggregation label they imply.
patterns = {
    "COUNT": [
        [{"LOWER": "how"}, {"LOWER": "many"}],
        [{"LOWER": "how"}, {"LOWER": "frequent"}],
        [{"LOWER": "how"}, {"LOWER": "often"}],
        [{"LOWER": "count"}],
        [{"LOWER": "total"}, {"LOWER": "number"}],
    ],
    "SUM": [
        [{"LOWER": "how"}, {"LOWER": "much"}],
        [{"LOWER": "total"}, {"LOWER": "amount"}],
        [{"LOWER": "sum"}, {"LOWER": "of"}],
        [{"LOWER": "sum"}],
    ],
    "AVG": [
        [{"LOWER": "usually"}, {"LOWER": "paid"}],
        [{"LOWER": "usually"}, {"LOWER": "spend"}],
        [{"LOWER": "average"}],
        [{"LOWER": "avg"}],
        [{"LOWER": "per"}, {"LOWER": "month"}],
        [{"LOWER": "usual"}, {"LOWER": "amount"}],
    ],
    "MIN": [
        [{"LOWER": "minimum"}],
        [{"LOWER": "min"}],
        [{"LOWER": "lowest"}],
        [{"LOWER": "the"}, {"LOWER": "least"}],
    ],
    "MAX": [
        [{"LOWER": "maximum"}],
        [{"LOWER": "max"}],
        [{"LOWER": "highest"}],
        [{"LOWER": "the"}, {"LOWER": "most"}],
    ],
}

# Register each aggregation label with its list of token patterns.
for label, pattern in patterns.items():
    matcher.add(label, pattern)

# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("Entity Extraction")

options = [
    "How much do I spend on average on groceries?",
    "Which is the most I have spent at Nike?",
    "How much do I usually spend at McDonalds per month?",
    "How much do I usually spend at McDonalds?",
    "How much do I spend at McDonalds per month?",
    "How many transactions do I have with Parkichen in December 2024?",
    "How often do I shop at Peek & Cloppenburg?",
    "How frequent do I dine at Parkitchen?",
]

selection = st.pills(
    "Sample questions about transactions", options, selection_mode="single"
)

# Free-text query, pre-filled with the chosen sample question (if any).
user_input = st.text_area("Enter query:", value=selection if selection else "")

if user_input:
    # Run the statistical pipeline, then the rule-based matcher, on the query.
    doc = nlp(user_input)
    matches = matcher(doc)

    st.subheader("Extracted Entities and matched patterns")

    # Named entities from the NER component. NOTE(review): unsafe_allow_html
    # was removed here — these strings echo user-controlled input and contain
    # no intentional HTML, so rendering them as HTML was an injection risk.
    for ent in doc.ents:
        st.markdown(f"{ent.text} {ent.label_}")

    # Spans matched by the hand-written aggregation patterns.
    for match_id, start, end in matches:
        span = doc[start:end]
        # Resolve the match hash back to the label string ("COUNT", "SUM", ...).
        label = nlp.vocab.strings[match_id]
        st.markdown(f"{span.text} {label}")

# Placeholder for custom CSS — the style block is currently empty, so this
# renders nothing. TODO: add the intended rules or delete this call.
st.markdown("""
""", unsafe_allow_html=True)