# transaction_ner/app.py
# Streamlit demo: extract named entities and aggregation-type phrases
# (COUNT / SUM / AVG / MIN / MAX) from natural-language transaction queries.
import subprocess
import sys

import spacy
import streamlit as st
from spacy.matcher import Matcher
# Ensure the English language model is installed, then load it.
# Downloading on every startup is slow and needs network access, so only
# download when the package is missing.
if not spacy.util.is_package("en_core_web_lg"):
    # Use sys.executable so the download installs into the interpreter
    # actually running this app — a bare "python" on PATH may be a
    # different environment. check=True makes a failed download raise
    # here instead of crashing later inside spacy.load().
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg"],
        check=True,
    )

# Load the large English pipeline (NER is used below via doc.ents).
nlp = spacy.load("en_core_web_lg")

# Token-based Matcher sharing the pipeline's vocab; used to tag
# aggregation phrases such as "how many" -> COUNT.
matcher = Matcher(nlp.vocab)
# Token patterns for detecting the aggregation type implied by a query.
def _phrase_pattern(phrase):
    """Turn a space-separated phrase into a case-insensitive Matcher pattern."""
    return [{"LOWER": token} for token in phrase.split()]

# One list of trigger phrases per aggregation label.
_AGG_PHRASES = {
    "COUNT": ["how many", "how frequent", "how often", "count", "total number"],
    "SUM": ["how much", "total amount", "sum of", "sum"],
    "AVG": ["usually paid", "usually spend", "average", "avg", "per month", "usual amount"],
    "MIN": ["minimum", "min", "lowest", "the least"],
    "MAX": ["maximum", "max", "highest", "the most"],
}

# Expand every phrase into its token-pattern form, keyed by label.
patterns = {
    label: [_phrase_pattern(phrase) for phrase in phrases]
    for label, phrases in _AGG_PHRASES.items()
}
# Register each group of token patterns under its aggregation label, so a
# match's match_id resolves back to COUNT / SUM / AVG / MIN / MAX.
for agg_label, token_patterns in patterns.items():
    matcher.add(agg_label, token_patterns)
# --- Streamlit UI ---
st.title("Entity Extraction")

# Canned example queries the user can pick instead of typing.
_SAMPLE_QUERIES = [
    "How much do I spend on average on groceries?",
    "Which is the most I have spent at Nike?",
    "How much do I usually spend at McDonalds per month?",
    "How much do I usually spend at McDonalds?",
    "How much do I spend at McDonalds per month?",
    "How many transactions do I have with Parkichen in December 2024?",
    "How often do I shop at Peek & Cloppenburg?",
    "How frequent do I dine at Parkitchen?",
]

# A clicked pill pre-fills the free-text box below it.
picked = st.pills("Sample questions about transactions", _SAMPLE_QUERIES, selection_mode="single")
query = st.text_area("Enter query:", value=picked if picked else "")

if query:
    doc = nlp(query)      # run the full spaCy pipeline (incl. NER)
    found = matcher(doc)  # rule-based aggregation-phrase matches

    st.subheader("Extracted Entities and matched patterns")

    # Named entities from the statistical model.
    for entity in doc.ents:
        st.markdown(
            f"<span class='entity'>{entity.text} <span class='label'>{entity.label_}</span></span>",
            unsafe_allow_html=True,
        )

    # Spans matched by the hand-written aggregation patterns.
    for match_id, start, end in found:
        matched_span = doc[start:end]
        agg_label = nlp.vocab.strings[match_id]  # resolve hash -> label string
        st.markdown(
            f"<span class='entity'>{matched_span.text} <span class='label'>{agg_label}</span></span>",
            unsafe_allow_html=True,
        )
# Custom CSS for the entity "chips" rendered above.
_CHIP_CSS = """
<style>
.entity {
    display: inline-block;
    padding: 0.25em 0.4em;
    margin: 0 0.25em 0.25em 0;
    border-radius: 0.25rem;
    background: #e2e2e2;
    border: 1px solid #cccccc;
}
.label {
    font-size: 0.75em;
    font-weight: bold;
    color: #333333;
    margin-left: 0.5em;
}
</style>
"""
st.markdown(_CHIP_CSS, unsafe_allow_html=True)