|
|
import requests |
|
|
import spacy |
|
|
import json |
|
|
import time |
|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
# HTTP headers sent with every Google Books Ngram request (JSON responses).
headers = {'Accept': 'application/json'}

# Language name -> Google Books Ngram corpus ID used in the query URL.
languages = {"French": "30", "German": "31", "Spanish": "32"}


# Load the English spaCy pipeline, downloading the model on first run.
# spacy.load raises OSError when the model package is not installed; the
# original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_relevance(text, language, scaling=5):
    """Return the scaled average Google Books Ngram frequency of *text*.

    Parameters
    ----------
    text : str
        Word or phrase; internal spaces are joined with '+' for the URL.
    language : str
        Key into the module-level ``languages`` corpus map
        ("French", "German" or "Spanish").
    scaling : int, optional
        The average frequency is multiplied by 10**scaling so the tiny
        raw frequencies become readable numbers. Default 5.

    Returns
    -------
    float
        Scaled average frequency over 2000-2019, or 0.0 when the request
        fails, the response cannot be parsed, or the series is empty.
    """
    query = '+'.join(text.split(' '))
    link = (
        "https://books.google.com/ngrams/json"
        f"?content={query}&year_start=2000&year_end=2019"
        f"&corpus={languages[language]}&smoothing=0"
    )
    avg = 0.0
    try:
        # timeout so a stalled API call cannot hang the Streamlit app;
        # the request is inside the try so network errors also score 0.0
        # instead of crashing (originally they propagated).
        r = requests.get(link, headers=headers, timeout=10)
        ngrams = r.json()[0]['timeseries']
        if ngrams:  # guard against ZeroDivisionError on an empty series
            avg = sum(ngrams) / len(ngrams)
    except (requests.RequestException, ValueError, KeyError, IndexError):
        # best-effort lookup: unknown words / bad responses score 0.0
        avg = 0.0
    return avg * pow(10, scaling)
|
|
|
|
|
# Translation table deleting punctuation, digits and the backslash.
# (The original literal's `\,` is not a recognized escape, so Python kept
# both the backslash and the comma — that exact set is preserved here.)
_PUNC_TABLE = str.maketrans("", "", '!()-[]{};:"\\,<>./?@#$%^&*_~1234567890')


def remove_punc(text):
    """Return *text* with punctuation and digit characters removed.

    Uses a single C-level ``str.translate`` pass instead of the original
    quadratic ``result += c`` string building. Apostrophes are deliberately
    kept (French elisions like "l'ami" survive).
    """
    return text.translate(_PUNC_TABLE)
|
|
|
|
|
def tokenize_text_with_spacy(text):
    """Run *text* through the module-level spaCy pipeline and return the
    token surface strings as a list."""
    return [token.text for token in nlp(text)]
|
|
|
|
|
def split_text(text):
    """Lowercase *text*, strip punctuation/digits, and tokenize with spaCy."""
    cleaned = remove_punc(text.lower())
    return tokenize_text_with_spacy(cleaned)
|
|
|
|
|
def process(text, excluded=(), lang="French", scaling=5, upperbnd=float("inf"), lowerbnd=0):
    """Score every unique token of *text* by Google Ngram frequency.

    Parameters
    ----------
    text : str
        Raw input text; tokenized via ``split_text``.
    excluded : collection of str, optional
        Tokens to skip. The original mutable default ``[]`` is replaced
        with an immutable empty tuple (only membership tests are done).
    lang : str
        Language key understood by ``get_relevance``. The original
        default ``"fr"`` was not a valid ``languages`` key and raised
        KeyError; the default is now a valid key ("French"). The visible
        caller always passes this explicitly.
    scaling : int, optional
        Power-of-ten scaling forwarded to ``get_relevance``.
    upperbnd, lowerbnd : float, optional
        Inclusive score bounds a token must fall within to be kept.

    Returns
    -------
    list[list]
        ``[token, score]`` pairs sorted by score, highest first.

    Side effects: updates the module-level Streamlit progress bar
    ``my_bar``, which must exist before this function is called.
    """
    tokens = set(split_text(text))
    total = len(tokens)
    wordlist = []
    for i, phrase in enumerate(tokens):
        # NOTE(review): my_bar is created in the "Calculate" handler;
        # calling process() without it raises NameError.
        my_bar.progress((i + 1) / total, text=f"Calculating N-grams {round((i + 1) / total * 100)}%")
        if phrase in excluded:
            continue
        score = get_relevance(phrase, lang, scaling)
        if lowerbnd <= score <= upperbnd:
            wordlist.append([phrase, score])
    return sorted(wordlist, key=lambda pair: pair[1], reverse=True)
|
|
|
|
|
def make_clickable(val):
    """Wrap *val* in an HTML anchor whose href and link text are both the value."""
    return f'<a href="{val}">{val}</a>'
|
|
|
|
|
# ---------------------------------------------------------------------------
# Streamlit UI: page title and user inputs.
# ---------------------------------------------------------------------------

st.title("WordRank™")


# Raw text to analyse (pre-filled with placeholder content).
text = st.text_area("Enter Text:", "Sample text to process")

# Target language; options must match the keys of the `languages` corpus map.
lang = st.selectbox("Choose language", ["French", "German", "Spanish"])
|
|
|
|
|
# Default per-language "common words" exclusion lists, shown to the user as an
# editable comma-separated string. FIX: the original literal was missing the
# ", " separator at three implicit string-concatenation boundaries, silently
# fusing "peux"+"des", "quelle"+"dans" and "faut"+"elle" into junk entries
# ("peuxdes", "quelledans", "fautelle"); the separators are restored here.
common_words = {
    "French": "Bonjour, Merci, S'il vous plaît, Excusez-moi, Oui, Non, Merci, Au revoir, Comment ça va ?, Bien, Mal, "
              "Amour, Maison, Famille, Travail, École, Temps, Nourriture, Eau, Vin, Ville, Rue, Voiture, Train, "
              "Avion, Livre, Musique, Art, Cinéma, Sport, Chat, Chien, Ami, Fête, Vacances, Bonheur, Tristesse, Jour, "
              "Nuit, Semaine, Mois, Année, Nombre, Couleur, Joyeux, Triste, Grand, Petit, Beau, Laid, je, tu, il, de, "
              "la, et, les, pour, avec, sa, fait, français, en, une, un, dans, qui, est, au, plus, a, le, un, du, "
              "d'après, ne, pas, elle, trop, cas, jeune, était, devait, peux, "
              "des, mais, été, alors, assez, ce, cette, tout, toutes, depuis, sujet, presque, lequel, laquelle, n'y, "
              "tant, \", que, n'en, peu, cour, eu, ses, pret, prets, sur, d'une, qu'elle, quelle, "
              "dans, se, plus, son, comme, y, aussi, à, au, aux, sont, aussi, ont, vie, alors, ou, où, faut, "
              "elle, on, nous, vous, ils, elles, faire, voir, avoir, être, que, qu'il, qu'elle, j'ai, j'avais, j'étais",
    "German": "Hallo, Danke, Bitte, Entschuldigung, Ja, Nein, Auf Wiedersehen, Wie geht es dir?, Gut, Schlecht, "
              "Liebe, Haus, Familie, Arbeit, Schule, Zeit, Essen, Wasser, Wein, Stadt, Straße, Auto, Zug, Flugzeug, "
              "Buch, Musik, Kunst, Kino, Sport, Katze, Hund, Freund, Party, Urlaub, Glück, Traurigkeit, Tag, Nacht, "
              "Woche, Monat, Jahr, Zahl, Farbe, Froh, Traurig, Groß, Klein, Schön, Hässlich",
    "Spanish": "Hola, Gracias, Por favor, Disculpa, Sí, No, Adiós, ¿Cómo estás?, Bien, Mal, Amor, Casa, Familia, "
               "Trabajo, Escuela, Tiempo, Comida, Agua, Vino, Ciudad, Calle, Coche, Tren, Avión, Libro, Música, Arte, "
               "Cine, Deporte, Gato, Perro, Amigo, Fiesta, Vacaciones, Felicidad, Tristeza, Día, Noche, Semana, Mes, "
               "Año, Número, Color, Feliz, Triste, Grande, Pequeño, Bonito, Feo"
}
|
|
|
|
|
# Comma-separated exclusion list, pre-filled with the per-language defaults.
excluded = st.text_input("Common words to exclude:", common_words[lang])

# Normalise for matching: drop ALL spaces, lowercase, split on commas.
# NOTE(review): removing every space fuses multi-word entries
# ("s'il vous plaît" -> "s'ilvousplaît"), so they can never match the
# single-word tokens produced by split_text — confirm this is intended.
excluded = excluded.replace(" ", "").lower().split(",")


# Inclusive N-gram score bounds used to filter results in process().
upper_bound = st.number_input('Upper bound N-gram score', 0.0, 1000.0, value=10.0)

lower_bound = st.number_input('Lower bound N-gram score', 0.0, 1000.0, value=1e-19)


# Language name -> WordReference URL language code for dictionary links.
langMP = {"French": "fr", "German": "de", "Spanish": "es"}
|
|
|
|
|
# "Calculate" button: run the scoring pipeline and render the results.
if st.button("Calculate"):

    # Module-level progress bar; process() updates it on every token.
    my_bar = st.progress(0, text="Calculating N-grams 0%")


    # Score every unique token of the input text within the chosen bounds.
    # NOTE(review): scaling is hard-coded to 5 here.
    output = process(text, excluded, lang, 5, upperbnd=upper_bound, lowerbnd=lower_bound)


    # [word, score] pairs, already sorted by score descending.
    df = pd.DataFrame(output, columns=["Word", "N-Gram"])


    # Horizontal bar chart of word scores, with all axis spines hidden.
    fig, ax = plt.subplots()

    ax.spines['top'].set_visible(False)

    ax.spines['right'].set_visible(False)

    ax.spines['bottom'].set_visible(False)

    ax.spines['left'].set_visible(False)

    ax.barh(df["Word"], df["N-Gram"])


    ax.set_ylabel("Words")


    st.subheader("Word Relevance")

    st.pyplot(fig)


    # Build one WordReference dictionary link per scored word.
    # NOTE(review): links are built inline rather than via make_clickable,
    # which takes a single URL-as-text argument.
    definitions = []

    langcode = langMP[lang]

    for word in df["Word"].tolist():

        definitions.append(f'<a target="_blank" href="https://www.wordreference.com/{langcode}en/{word}">{word}</a>')


    st.subheader("WordReference Links")

    st.markdown("<br>".join(definitions), unsafe_allow_html=True)


    # Clear the progress bar once everything is rendered.
    my_bar.empty()
|
|
|
|
|
|