Upload 3 files
Browse files- app.py +121 -0
- main.py +18 -0
- requirements.txt +5 -0
app.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import spacy
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
|
| 9 |
+
# Shared HTTP headers for the Ngram API and the Google Books corpus ids
# per supported language.
headers = {'Accept': 'application/json'}
languages = {"French": "30", "German": "31", "Spanish": "32"}

# Load the English spaCy pipeline, downloading the model on first run.
# spacy.load raises OSError when the model package is missing, so catch
# that specifically instead of a bare `except:` (which would also hide
# KeyboardInterrupt and real bugs).
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load("en_core_web_sm")

# Google Books Ngram corpus ids: 30 - french, 31 - german, 32 - spanish
|
| 21 |
+
|
| 22 |
+
def get_relevance(text, language, scaling=5):
    """Return a scaled average Google Books Ngram frequency for *text*.

    Args:
        text: Word or phrase to look up (spaces are joined with '+').
        language: Key into the module-level ``languages`` corpus map
            ("French", "German" or "Spanish").
        scaling: Power of ten used to scale the tiny raw frequencies.

    Returns:
        Average ngram frequency over 2000-2019 times ``10**scaling``,
        or 0.0 when the request fails or the phrase is not in the corpus.
    """
    query = '+'.join(text.split(' '))
    link = (
        f"https://books.google.com/ngrams/json?content={query}"
        f"&year_start=2000&year_end=2019&corpus={languages[language]}&smoothing=0"
    )
    try:
        # Timeout keeps the app from hanging forever on a dead connection;
        # the original bare `except:` is narrowed to the failures this call
        # can actually produce (network errors, bad JSON, empty result).
        r = requests.get(link, headers=headers, timeout=10)
        ngrams = r.json()[0]['timeseries']
        avg = sum(ngrams) / len(ngrams)
    except (requests.RequestException, ValueError, LookupError, ZeroDivisionError):
        # Treat any lookup failure as "no relevance data".
        avg = 0.0
    return avg * pow(10, scaling)
|
| 31 |
+
|
| 32 |
+
def remove_punc(text):
    """Return *text* with punctuation, backslash and digit characters removed.

    Characters in the removal set are dropped; everything else (letters,
    accented characters, apostrophes, whitespace) is kept unchanged.
    """
    # str.translate does one C-level pass instead of the original quadratic
    # string-concatenation loop.  Note the set deliberately contains a
    # literal backslash (the original `\,` is backslash + comma).
    removal = '''!()-[]{};:"\,<>./?@#$%^&*_~1234567890'''
    return text.translate(str.maketrans('', '', removal))
|
| 38 |
+
|
| 39 |
+
def tokenize_text_with_spacy(text):
    """Run the module-level spaCy pipeline over *text* and return its token strings."""
    return [token.text for token in nlp(text)]
|
| 43 |
+
|
| 44 |
+
def split_text(text):
    """Lowercase *text*, strip punctuation/digits, and return its spaCy tokens."""
    cleaned = remove_punc(text.lower())
    return tokenize_text_with_spacy(cleaned)
|
| 48 |
+
|
| 49 |
+
def process(text, excluded=None, lang="fr", scaling=5, upperbnd=float("inf"), lowerbnd=0):
    """Score the unique tokens of *text* by ngram relevance.

    Args:
        text: Raw input text; tokenized and deduplicated via split_text.
        excluded: Collection of tokens to skip.  The original signature
            used a mutable ``[]`` default (a classic Python pitfall);
            ``None`` here is behavior-compatible for all callers.
        lang: Language key forwarded to get_relevance.
            NOTE(review): the default "fr" is not a key of the
            module-level ``languages`` map ("French"/"German"/"Spanish")
            and would raise KeyError; every visible call site passes a
            full language name — confirm the intended default.
        scaling: Forwarded frequency scaling exponent.
        upperbnd, lowerbnd: Inclusive score bounds for keeping a token.

    Returns:
        List of ``[token, score]`` pairs with 0 < score <= upperbnd and
        score >= lowerbnd, sorted by score, highest first.
    """
    if excluded is None:
        excluded = []
    tokens = set(split_text(text))
    wordlist = []

    for phrase in tokens:
        if phrase in excluded:
            continue
        result = get_relevance(phrase, lang, scaling)
        if 0 < result <= upperbnd and result >= lowerbnd:
            wordlist.append([phrase, result])

    return sorted(wordlist, key=lambda pair: pair[1], reverse=True)
|
| 61 |
+
|
| 62 |
+
def make_clickable(val):
    """Wrap *val* in an HTML anchor whose href and link text are both *val*."""
    return f'<a href="{val}">{val}</a>'
|
| 64 |
+
|
| 65 |
+
# --- Streamlit UI: input text, language choice, and exclusion list ---
st.title("Vocabulary Learning")

text = st.text_area("Enter Text:", "Sample text to process")
lang = st.selectbox("Choose language", ["French", "German", "Spanish"])

# Default "common word" exclusion lists per language.  Adjacent string
# literals are concatenated by Python, so each fragment must end with a
# comma separator — the original was missing them after "peux", "quelle"
# and "faut", silently fusing entries into tokens like "peuxdes".
common_words = {
    "French": "Bonjour, Merci, S'il vous plaît, Excusez-moi, Oui, Non, Merci, Au revoir, Comment ça va ?, Bien, Mal, "
              "Amour, Maison, Famille, Travail, École, Temps, Nourriture, Eau, Vin, Ville, Rue, Voiture, Train, "
              "Avion, Livre, Musique, Art, Cinéma, Sport, Chat, Chien, Ami, Fête, Vacances, Bonheur, Tristesse, Jour, "
              "Nuit, Semaine, Mois, Année, Nombre, Couleur, Joyeux, Triste, Grand, Petit, Beau, Laid, je, tu, il, de, "
              "la, et, les, pour, avec, sa, fait, français, en, une, un, dans, qui, est, au, plus, a, le, un, du, "
              "d'après, ne, pas, elle, trop, cas, jeune, était, devait, peux, "
              "des, mais, été, alors, assez, ce, cette, tout, toutes, depuis, sujet, presque, lequel, laquelle, n'y, "
              "tant, \", que, n'en, peu, cour, eu, ses, pret, prets, sur, d'une, qu'elle, quelle, "
              "dans, se, plus, son, comme, y, aussi, à, au, aux, sont, aussi, ont, vie, alors, ou, où, faut, "
              "elle, on, nous, vous, ils, elles, faire, voir, avoir, être, que, qu'il, qu'elle, j'ai, j'avais, j'étais",
    "German": "Hallo, Danke, Bitte, Entschuldigung, Ja, Nein, Auf Wiedersehen, Wie geht es dir?, Gut, Schlecht, "
              "Liebe, Haus, Familie, Arbeit, Schule, Zeit, Essen, Wasser, Wein, Stadt, Straße, Auto, Zug, Flugzeug, "
              "Buch, Musik, Kunst, Kino, Sport, Katze, Hund, Freund, Party, Urlaub, Glück, Traurigkeit, Tag, Nacht, "
              "Woche, Monat, Jahr, Zahl, Farbe, Froh, Traurig, Groß, Klein, Schön, Hässlich",
    "Spanish": "Hola, Gracias, Por favor, Disculpa, Sí, No, Adiós, ¿Cómo estás?, Bien, Mal, Amor, Casa, Familia, "
               "Trabajo, Escuela, Tiempo, Comida, Agua, Vino, Ciudad, Calle, Coche, Tren, Avión, Libro, Música, Arte, "
               "Cine, Deporte, Gato, Perro, Amigo, Fiesta, Vacaciones, Felicidad, Tristeza, Día, Noche, Semana, Mes, "
               "Año, Número, Color, Feliz, Triste, Grande, Pequeño, Bonito, Feo"
}

excluded = st.text_input("Common words to exclude:", common_words[lang])
# Normalize to a lowercase token list: drop spaces, then split on commas.
excluded = excluded.replace(" ", "").lower().split(",")
|
| 93 |
+
|
| 94 |
+
# Score bounds for keeping a word (widget range 0.0-1000.0).
upper_bound = st.number_input('Upper bound N-gram score', 0.0, 1000.0, value=100.0)
lower_bound = st.number_input('Lower bound N-gram score', 0.0, 1000.0, value=0.01)

# WordReference URL language codes keyed by the display name from the selectbox.
langMP = {"French": "fr", "German": "de", "Spanish": "es"}

# Score every non-excluded token of the input text (scaling fixed at 5).
output = process(text, excluded, lang, 5, upperbnd=upper_bound, lowerbnd=lower_bound)

# [word, score] pairs, already sorted by score descending.
df = pd.DataFrame(output, columns=["Word", "N-Gram"])
|
| 102 |
+
|
| 103 |
+
# Horizontal bar chart of word relevance with the plot frame hidden.
fig, ax = plt.subplots()
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.barh(df["Word"], df["N-Gram"])
ax.set_ylabel("Words")

st.subheader("Word Relevance")
st.pyplot(fig)
|
| 114 |
+
|
| 115 |
+
# Build one WordReference dictionary-lookup link per scored word.
# (Comprehension replaces the original append loop; iterating the Series
# yields the same word values as .tolist().)
langcode = langMP[lang]
definitions = [
    f'<a target="_blank" href="https://www.wordreference.com/{langcode}en/{word}">{word}</a>'
    for word in df["Word"]
]

st.subheader("WordReference Links")
st.markdown("<br>".join(definitions), unsafe_allow_html=True)
|
main.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
# This is a sample Python script.
|
| 4 |
+
|
| 5 |
+
# Press ⌃R to execute it or replace it with your code.
|
| 6 |
+
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def print_hi(name):
    """Print a greeting addressed to *name*."""
    greeting = f'Hi, {name}'
    print(greeting)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Script entry point: runs only when executed directly, not on import.
if __name__ == '__main__':
    print_hi('PyCharm')
|
| 17 |
+
|
| 18 |
+
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spacy
|
| 2 |
+
requests
|
| 3 |
+
streamlit
|
| 4 |
+
matplotlib
|
| 5 |
+
pandas
|