File size: 5,719 Bytes
7bf520c bfbd92e 7bf520c bfbd92e 7bf520c bfbd92e 7bf520c bfbd92e 7bf520c d25e34a 7bf520c d25e34a 7bf520c e2cb0f6 7bf520c bfbd92e 7bf520c bfbd92e d25e34a 7bf520c bfbd92e d25e34a bfbd92e d25e34a 7bf520c d25e34a bfbd92e d25e34a 7bf520c d25e34a 7bf520c d25e34a 7bf520c d25e34a 7bf520c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import requests
import spacy
import json
import time
import datetime
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
# Request headers sent with every Google Books Ngram query.
headers = {'Accept': 'application/json'}
# UI language name -> value sent as the `corpus` parameter of the Ngram query.
languages = {"French": "30", "German": "31", "Spanish": "32"}

# Per-session cache of N-gram scores, filled by get_relevance().
if "memo" not in st.session_state:
    st.session_state["memo"] = {}

# Load the spaCy pipeline used for tokenisation, downloading it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps unrelated failures (and KeyboardInterrupt)
    # from silently triggering a download.
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load("en_core_web_sm")
# 30 - french
# 31 - german
# 32 - spanish
def get_relevance(text, language, scaling=5):
    """Return the mean Google Books Ngram frequency of ``text``, scaled.

    The query covers 1990-2019 with smoothing disabled, against the corpus
    that the module-level ``languages`` mapping assigns to ``language``.
    Results are memoised in ``st.session_state["memo"]``, so a given
    (text, language, scaling) combination is fetched at most once per session.

    Args:
        text: Word or phrase to look up.
        language: A key of the module-level ``languages`` mapping.
        scaling: Power of ten by which the mean frequency is multiplied.

    Returns:
        The mean of the first returned time series multiplied by
        ``10 ** scaling``, or ``0.0`` when the response contains no usable
        time series.

    Raises:
        KeyError: If ``language`` is not a key of ``languages``.
        requests.RequestException: If the HTTP request fails or times out.
    """
    memo = st.session_state["memo"]
    key = f"{text}_{language}_{scaling}"
    if key in memo:
        return memo[key]

    # Passing the query through `params` lets requests URL-encode the phrase
    # (spaces still become "+"), so characters such as "&" or "#" in `text`
    # can no longer corrupt the query string.
    response = requests.get(
        "https://books.google.com/ngrams/json",
        params={
            "content": text,
            "year_start": 1990,
            "year_end": 2019,
            "corpus": languages[language],
            "smoothing": 0,
        },
        headers=headers,
        timeout=10,  # never let one stalled request hang the whole app
    )
    try:
        series = response.json()[0]['timeseries']
        avg = sum(series) / len(series)
    except (ValueError, LookupError, TypeError, ZeroDivisionError):
        # Non-JSON body, empty result list, missing key or empty series:
        # treat the phrase as having no measurable frequency.
        avg = 0.0

    score = avg * pow(10, scaling)
    memo[key] = score
    return score
# Characters deleted by remove_punc(): ASCII punctuation/symbols and digits.
# The apostrophe is not in this set, so forms such as "j'ai" are left intact.
# The backslash is written as "\\" because "\," is an invalid escape sequence.
_STRIPPED_CHARS = '!()-[]{};:"\\,<>./?@#$%^&*_~1234567890'
_STRIP_TABLE = str.maketrans("", "", _STRIPPED_CHARS)


def remove_punc(text):
    """Return ``text`` with the characters in ``_STRIPPED_CHARS`` removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the listed punctuation, symbols and
        digits; all other characters (including whitespace, apostrophes and
        accented letters) are kept in their original order.
    """
    # One C-level pass instead of building the result one character at a time.
    return text.translate(_STRIP_TABLE)
def tokenize_text_with_spacy(text):
    """Run ``text`` through the module-level ``nlp`` pipeline.

    Returns:
        A list with the ``.text`` of every token, in document order.
    """
    return [tok.text for tok in nlp(text)]
def split_text(text):
    """Lower-case ``text``, pass it through ``remove_punc`` and tokenise it.

    Returns:
        The token list produced by ``tokenize_text_with_spacy`` for the
        cleaned text.
    """
    cleaned = remove_punc(text.lower())
    return tokenize_text_with_spacy(cleaned)
def process(text, excluded=None, lang="fr", scaling=5, upperbnd=float("inf"), lowerbnd=0):
    """Score every distinct token of ``text`` and return those within bounds.

    Progress is reported through the module-level ``my_bar`` progress widget,
    which must be bound before this function is called.

    Args:
        text: Raw text to analyse.
        excluded: Iterable of tokens to skip; ``None`` means skip nothing.
        lang: Language name forwarded to ``get_relevance``.
            NOTE(review): ``get_relevance`` indexes ``languages`` with this
            value and the default ``"fr"`` is not one of its keys, so callers
            need to pass a language explicitly.
        scaling: Power-of-ten scaling forwarded to ``get_relevance``.
        upperbnd: Inclusive upper bound on the score of returned words.
        lowerbnd: Inclusive lower bound on the score of returned words.

    Returns:
        A list of ``[token, score]`` pairs sorted by score, highest first.
    """
    # A set makes each exclusion test O(1); `None` replaces the shared
    # mutable default list.
    skipped = set(excluded) if excluded is not None else set()
    # Sorting the distinct tokens makes the order of equal-score words
    # reproducible instead of depending on set iteration order.
    tokens = sorted(set(split_text(text)))
    total = len(tokens)

    wordlist = []
    for position, phrase in enumerate(tokens, start=1):
        fraction = position / total
        my_bar.progress(fraction, text=f"Calculating N-grams {round(fraction * 100)}%")
        if phrase in skipped:
            continue
        result = get_relevance(phrase, lang, scaling)
        if lowerbnd <= result <= upperbnd:
            wordlist.append([phrase, result])

    wordlist.sort(key=lambda pair: pair[1], reverse=True)
    return wordlist
def make_clickable(val):
    """Return an HTML anchor whose target and label are both ``val``."""
    return f'<a href="{val}">{val}</a>'
st.title("WordRank™")
text = st.text_area("Enter Text:", "Sample text to process")
lang = st.selectbox("Choose language", ["French", "German", "Spanish"])

# Default exclusion list per language, shown pre-filled in the text input
# below. Each value is ONE comma-separated string assembled by implicit
# literal concatenation, so every continuation line must end with ", " —
# otherwise the last word of one line fuses with the first word of the next.
common_words = {
    "French": "Bonjour, Merci, S'il vous plaît, Excusez-moi, Oui, Non, Merci, Au revoir, Comment ça va ?, Bien, Mal, "
              "Amour, Maison, Famille, Travail, École, Temps, Nourriture, Eau, Vin, Ville, Rue, Voiture, Train, "
              "Avion, Livre, Musique, Art, Cinéma, Sport, Chat, Chien, Ami, Fête, Vacances, Bonheur, Tristesse, Jour, "
              "Nuit, Semaine, Mois, Année, Nombre, Couleur, Joyeux, Triste, Grand, Petit, Beau, Laid, je, tu, il, de, "
              "la, et, les, pour, avec, sa, fait, français, en, une, un, dans, qui, est, au, plus, a, le, un, du, "
              "d'après, ne, pas, elle, trop, cas, jeune, était, devait, peux, "
              "des, mais, été, alors, assez, ce, cette, tout, toutes, depuis, sujet, presque, lequel, laquelle, n'y, "
              "tant, \", que, n'en, peu, cour, eu, ses, pret, prets, sur, d'une, qu'elle, quelle, "
              "dans, se, plus, son, comme, y, aussi, à, au, aux, sont, aussi, ont, vie, alors, ou, où, faut, "
              "elle, on, nous, vous, ils, elles, faire, voir, avoir, être, que, qu'il, qu'elle, j'ai, j'avais, j'étais",
    "German": "Hallo, Danke, Bitte, Entschuldigung, Ja, Nein, Auf Wiedersehen, Wie geht es dir?, Gut, Schlecht, "
              "Liebe, Haus, Familie, Arbeit, Schule, Zeit, Essen, Wasser, Wein, Stadt, Straße, Auto, Zug, Flugzeug, "
              "Buch, Musik, Kunst, Kino, Sport, Katze, Hund, Freund, Party, Urlaub, Glück, Traurigkeit, Tag, Nacht, "
              "Woche, Monat, Jahr, Zahl, Farbe, Froh, Traurig, Groß, Klein, Schön, Hässlich",
    "Spanish": "Hola, Gracias, Por favor, Disculpa, Sí, No, Adiós, ¿Cómo estás?, Bien, Mal, Amor, Casa, Familia, "
               "Trabajo, Escuela, Tiempo, Comida, Agua, Vino, Ciudad, Calle, Coche, Tren, Avión, Libro, Música, Arte, "
               "Cine, Deporte, Gato, Perro, Amigo, Fiesta, Vacaciones, Felicidad, Tristeza, Día, Noche, Semana, Mes, "
               "Año, Número, Color, Feliz, Triste, Grande, Pequeño, Bonito, Feo"
}

# Turn the (editable) comma-separated text into a list of lower-case entries
# with all spaces removed.
excluded = st.text_input("Common words to exclude:", common_words[lang])
excluded = excluded.replace(" ", "").lower().split(",")

# Inclusive score window applied to the scaled N-gram scores.
upper_bound = st.number_input('Upper bound N-gram score', 0.0, 1000.0, value=10.0)
lower_bound = st.number_input('Lower bound N-gram score', 0.0, 1000.0, value=1e-19)

# UI language name -> language code used in WordReference dictionary URLs.
langMP = {"French": "fr", "German": "de", "Spanish": "es"}
if st.button("Calculate"):
    # process() reports progress through the module-level name `my_bar`, so
    # it has to be bound before the call.
    my_bar = st.progress(0, text="Calculating N-grams 0%")
    try:
        output = process(text, excluded, lang, 5, upperbnd=upper_bound, lowerbnd=lower_bound)
        df = pd.DataFrame(output, columns=["Word", "N-Gram"])

        if df.empty:
            # Without this guard the chart would be created with height 0.
            st.info("No words scored within the selected N-gram bounds.")
        else:
            # Grow the chart sub-linearly with the number of bars.
            n_words = df["Word"].nunique()
            fig, ax = plt.subplots(figsize=(5, max(1, int(n_words ** 0.6))))
            for side in ("top", "right", "bottom", "left"):
                ax.spines[side].set_visible(False)
            ax.barh(df["Word"], df["N-Gram"])
            ax.set_ylabel("Words")
            st.subheader("Word Relevance")
            st.pyplot(fig)
            # Release the figure; otherwise pyplot keeps it alive and figures
            # pile up each time the script reruns.
            plt.close(fig)

            langcode = langMP[lang]
            definitions = [
                f'<a target="_blank" href="https://www.wordreference.com/{langcode}en/{word}">{word}</a>'
                for word in df["Word"].tolist()
            ]
            st.subheader("WordReference Links")
            st.markdown("<br>".join(definitions), unsafe_allow_html=True)
    finally:
        # Clear the progress bar even if scoring or rendering failed.
        my_bar.empty()
|