wordrank / app.py
whuang06's picture
Update app.py
d25e34a verified
raw
history blame
5.35 kB
import requests
import spacy
import json
import time
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
# HTTP headers sent with every N-gram request: ask the server for JSON.
headers = dict(Accept="application/json")

# UI language name -> value sent as the `corpus` query parameter.
languages = dict(
    French="30",
    German="31",
    Spanish="32",
)
# Load the spaCy pipeline used for tokenization. spaCy raises OSError when the
# model package is not installed; only in that case download it and retry, so
# unrelated failures (including KeyboardInterrupt) are no longer swallowed.
# NOTE(review): this is the English model although the app targets French,
# German and Spanish text -- confirm that is intentional.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# Values of the `corpus` query parameter sent to the Google Books Ngram endpoint:
#   30 - French
#   31 - German
#   32 - Spanish
def get_relevance(text, language, scaling=5):
    """Return the scaled mean N-gram frequency of ``text``.

    Queries the Google Books Ngram JSON endpoint for the years 2000-2019 with
    smoothing disabled and averages the first time series in the response.

    Args:
        text: Word or space-separated phrase to look up.
        language: Key of the module-level ``languages`` mapping; selects the
            corpus that is queried.
        scaling: The mean frequency is multiplied by ``10 ** scaling``.

    Returns:
        The scaled mean frequency, or ``0.0`` when the response contains no
        usable time series (invalid JSON, empty result, empty series).

    Raises:
        KeyError: If ``language`` is not a key of ``languages``.
        requests.RequestException: If the request fails or times out.
    """
    query = {
        "content": text,
        "year_start": 2000,
        "year_end": 2019,
        "corpus": languages[language],
        "smoothing": 0,
    }
    # Passing the query via `params` lets requests URL-encode the phrase
    # (spaces still become '+'), so characters such as '&', '#' or '=' in the
    # text can no longer corrupt the query string. The timeout keeps one
    # stalled request from hanging the whole app.
    r = requests.get(
        "https://books.google.com/ngrams/json",
        params=query,
        headers=headers,
        timeout=10,
    )
    try:
        ngrams = r.json()[0]["timeseries"]
        avg = sum(ngrams) / len(ngrams)
    except (ValueError, LookupError, TypeError, ZeroDivisionError):
        # Invalid JSON, no match for the phrase, unexpected payload shape, or
        # an empty series: treat the phrase as having zero frequency.
        avg = 0.0
    return avg * pow(10, scaling)
# Characters stripped by remove_punc: ASCII punctuation/symbols and digits.
# The backslash is spelled "\\" because a bare "\," is an invalid escape
# sequence (a SyntaxWarning on recent Python versions). The apostrophe is
# deliberately absent, so contractions such as "l'eau" survive.
_PUNC_TABLE = str.maketrans("", "", '!()-[]{};:"\\,<>./?@#$%^&*_~1234567890')


def remove_punc(text):
    """Return ``text`` with punctuation, symbols and digits removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any character listed in ``_PUNC_TABLE``;
        all other characters (letters, whitespace, apostrophes, accented
        characters) are kept in their original order.
    """
    # One C-level pass instead of building the result with repeated `+=`.
    return text.translate(_PUNC_TABLE)
def tokenize_text_with_spacy(text):
    """Run ``text`` through the module-level spaCy pipeline.

    Returns:
        A list with the ``.text`` of every token, in document order.
    """
    return [tok.text for tok in nlp(text)]
def split_text(text):
    """Lower-case ``text``, strip punctuation/digits, and tokenize it.

    Returns:
        The list of token strings produced by ``tokenize_text_with_spacy``
        for the cleaned text.
    """
    cleaned = remove_punc(text.lower())
    return tokenize_text_with_spacy(cleaned)
def process(text, excluded=None, lang="French", scaling=5, upperbnd=float("inf"), lowerbnd=0):
    """Score every distinct token of ``text`` and return them ranked.

    Args:
        text: Raw input text.
        excluded: Iterable of (hashable) tokens to skip; ``None`` means skip
            nothing.
        lang: Key of the ``languages`` mapping, forwarded to
            ``get_relevance``. The default is ``"French"``: the previous
            default ``"fr"`` is not a key of that mapping and therefore
            always raised ``KeyError``.
        scaling: Forwarded to ``get_relevance``.
        upperbnd: Inclusive upper bound on the score of kept tokens.
        lowerbnd: Inclusive lower bound on the score of kept tokens.

    Returns:
        A list of ``[token, score]`` pairs sorted by score, highest first.

    NOTE(review): progress is reported through the module-level ``my_bar``,
    which must have been created before this function is called.
    """
    # Build the exclusion set once so each membership test is O(1).
    skipped = frozenset(excluded) if excluded is not None else frozenset()
    tokens = set(split_text(text))
    total = len(tokens)
    wordlist = []
    for position, phrase in enumerate(tokens, start=1):
        done = position / total
        my_bar.progress(done, text=f"Calculating N-grams {round(done * 100)}%")
        if phrase in skipped:
            continue
        result = get_relevance(phrase, lang, scaling)
        if lowerbnd <= result <= upperbnd:
            wordlist.append([phrase, result])
    wordlist.sort(key=lambda pair: pair[1], reverse=True)
    return wordlist
def make_clickable(val):
    """Wrap ``val`` in an HTML anchor that uses it as both href and label."""
    return f'<a href="{val}">{val}</a>'
# Page header and the two primary inputs: the text to analyse and its language.
st.title("WordRank™")
text = st.text_area(label="Enter Text:", value="Sample text to process")
lang = st.selectbox(
    label="Choose language",
    options=["French", "German", "Spanish"],
)
# Default comma-separated exclusion list shown to the user for each language.
# Each value is one string assembled from adjacent literals, so every literal
# except the last must end with ", ". Three French fragments previously lacked
# that separator and fused neighbouring words into "peuxdes", "quelledans" and
# "fautelle", so those words were never excluded.
common_words = {
    "French": "Bonjour, Merci, S'il vous plaît, Excusez-moi, Oui, Non, Merci, Au revoir, Comment ça va ?, Bien, Mal, "
              "Amour, Maison, Famille, Travail, École, Temps, Nourriture, Eau, Vin, Ville, Rue, Voiture, Train, "
              "Avion, Livre, Musique, Art, Cinéma, Sport, Chat, Chien, Ami, Fête, Vacances, Bonheur, Tristesse, Jour, "
              "Nuit, Semaine, Mois, Année, Nombre, Couleur, Joyeux, Triste, Grand, Petit, Beau, Laid, je, tu, il, de, "
              "la, et, les, pour, avec, sa, fait, français, en, une, un, dans, qui, est, au, plus, a, le, un, du, "
              "d'après, ne, pas, elle, trop, cas, jeune, était, devait, peux, "
              "des, mais, été, alors, assez, ce, cette, tout, toutes, depuis, sujet, presque, lequel, laquelle, n'y, "
              "tant, \", que, n'en, peu, cour, eu, ses, pret, prets, sur, d'une, qu'elle, quelle, "
              "dans, se, plus, son, comme, y, aussi, à, au, aux, sont, aussi, ont, vie, alors, ou, où, faut, "
              "elle, on, nous, vous, ils, elles, faire, voir, avoir, être, que, qu'il, qu'elle, j'ai, j'avais, j'étais",
    "German": "Hallo, Danke, Bitte, Entschuldigung, Ja, Nein, Auf Wiedersehen, Wie geht es dir?, Gut, Schlecht, "
              "Liebe, Haus, Familie, Arbeit, Schule, Zeit, Essen, Wasser, Wein, Stadt, Straße, Auto, Zug, Flugzeug, "
              "Buch, Musik, Kunst, Kino, Sport, Katze, Hund, Freund, Party, Urlaub, Glück, Traurigkeit, Tag, Nacht, "
              "Woche, Monat, Jahr, Zahl, Farbe, Froh, Traurig, Groß, Klein, Schön, Hässlich",
    "Spanish": "Hola, Gracias, Por favor, Disculpa, Sí, No, Adiós, ¿Cómo estás?, Bien, Mal, Amor, Casa, Familia, "
               "Trabajo, Escuela, Tiempo, Comida, Agua, Vino, Ciudad, Calle, Coche, Tren, Avión, Libro, Música, Arte, "
               "Cine, Deporte, Gato, Perro, Amigo, Fiesta, Vacaciones, Felicidad, Tristeza, Día, Noche, Semana, Mes, "
               "Año, Número, Color, Feliz, Triste, Grande, Pequeño, Bonito, Feo",
}
# Exclusion list: pre-filled with the defaults for the chosen language, then
# normalised by removing every space, lower-casing, and splitting on commas.
excluded = st.text_input(label="Common words to exclude:", value=common_words[lang])
excluded = excluded.replace(" ", "")
excluded = excluded.lower()
excluded = excluded.split(",")

# Inclusive score window; both widgets accept values from 0.0 to 1000.0.
upper_bound = st.number_input(
    label="Upper bound N-gram score", min_value=0.0, max_value=1000.0, value=10.0
)
lower_bound = st.number_input(
    label="Lower bound N-gram score", min_value=0.0, max_value=1000.0, value=1e-19
)

# UI language name -> language code used in the WordReference URL.
langMP = dict(French="fr", German="de", Spanish="es")
if st.button("Calculate"):
    # `process` reports its progress through this module-level progress bar.
    my_bar = st.progress(0, text="Calculating N-grams 0%")
    output = process(text, excluded, lang, 5, upperbnd=upper_bound, lowerbnd=lower_bound)
    df = pd.DataFrame(output, columns=["Word", "N-Gram"])

    # Horizontal bar chart of the scores, drawn without a frame.
    fig, ax = plt.subplots()
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)
    ax.barh(df["Word"], df["N-Gram"])
    ax.set_ylabel("Words")
    st.subheader("Word Relevance")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; without
    # this, each click on "Calculate" would leave another figure in memory.
    plt.close(fig)

    # One WordReference link per ranked word, opened in a new tab.
    langcode = langMP[lang]
    definitions = [
        f'<a target="_blank" href="https://www.wordreference.com/{langcode}en/{word}">{word}</a>'
        for word in df["Word"].tolist()
    ]
    st.subheader("WordReference Links")
    st.markdown("<br>".join(definitions), unsafe_allow_html=True)
    my_bar.empty()