whuang06 committed on
Commit
7bf520c
·
verified ·
1 Parent(s): 41c199a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +121 -0
  2. main.py +18 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import spacy
3
+ import json
4
+ import time
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import matplotlib.pyplot as plt
8
+
9
# Request headers sent with every Google Books Ngram query (ask for JSON).
headers = {'Accept': 'application/json'}

# Language name -> id sent as the `corpus` query parameter of the Ngram request:
# 30 - french
# 31 - german
# 32 - spanish
languages = {"French": "30", "German": "31", "Spanish": "32"}

# Load the spaCy pipeline used for tokenization, downloading it on first use.
# spacy.load raises OSError when the model package is not installed; only that
# case should trigger a download, so unrelated failures are no longer hidden
# behind a bare `except`.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load("en_core_web_sm")
21
+
22
def get_relevance(text, language, scaling=5):
    """Return the scaled average Google Books Ngram frequency of ``text``.

    Queries the Ngram JSON endpoint for the years 2000-2019 in the corpus
    selected by ``language`` (no smoothing) and averages the time series of
    the first result.

    Args:
        text: Word or phrase to look up.
        language: Key of the module-level ``languages`` mapping.
        scaling: Power of ten the average is multiplied by.

    Returns:
        The mean frequency multiplied by ``10 ** scaling``, or ``0.0`` when
        the request fails or the response contains no usable time series.

    Raises:
        KeyError: If ``language`` is not a key of ``languages``.
    """
    # Let requests build the query string: it URL-encodes the phrase (spaces
    # become '+', matching the previous hand-built URL) instead of pasting
    # raw user text into the link.
    params = {
        "content": text,
        "year_start": 2000,
        "year_end": 2019,
        "corpus": languages[language],
        "smoothing": 0,
    }
    try:
        # The timeout keeps one stalled lookup from hanging the whole page.
        r = requests.get(
            "https://books.google.com/ngrams/json",
            params=params,
            headers=headers,
            timeout=10,
        )
        ngrams = r.json()[0]['timeseries']
        avg = sum(ngrams) / len(ngrams)
    except (requests.RequestException, ValueError, LookupError, TypeError, ZeroDivisionError):
        # Best effort: network failure, non-JSON body, empty result list,
        # missing 'timeseries' key or empty series all count as "no data".
        avg = 0.0
    return avg * pow(10, scaling)
31
+
32
# Translation table deleting the punctuation characters and ASCII digits listed
# below. The apostrophe is not in the list, so it is kept. A raw string is used
# so the backslash is a literal character rather than the start of an invalid
# "\," escape sequence.
_STRIP_TABLE = str.maketrans("", "", r'''!()-[]{};:"\,<>./?@#$%^&*_~1234567890''')


def remove_punc(text):
    """Return ``text`` with the listed punctuation and ASCII digits removed.

    Args:
        text: String to clean.

    Returns:
        A copy of ``text`` without any character from the strip list; all
        other characters (letters, whitespace, apostrophes, ...) are kept in
        their original order.
    """
    # One C-level pass instead of building the result with repeated `+=`.
    return text.translate(_STRIP_TABLE)
38
+
39
def tokenize_text_with_spacy(text):
    """Return the text of each token the module-level ``nlp`` pipeline yields for ``text``."""
    return [tok.text for tok in nlp(text)]
43
+
44
def split_text(text):
    """Lower-case ``text``, clean it with ``remove_punc`` and return its spaCy tokens."""
    lowered = text.lower()
    cleaned = remove_punc(lowered)
    return tokenize_text_with_spacy(cleaned)
48
+
49
def process(text, excluded=None, lang="French", scaling=5, upperbnd=float("inf"), lowerbnd=0):
    """Score the distinct words of ``text`` and return those within the bounds.

    Args:
        text: Raw input text; it is tokenized with ``split_text``.
        excluded: Iterable of words to skip. ``None`` means nothing is
            excluded.
        lang: Language name passed to ``get_relevance``. The default is
            "French"; the previous default "fr" was not a valid language name
            and made every lookup raise ``KeyError``.
        scaling: Power-of-ten scaling forwarded to ``get_relevance``.
        upperbnd: Largest score (inclusive) that is kept.
        lowerbnd: Smallest score (inclusive) that is kept.

    Returns:
        A list of ``[word, score]`` pairs with ``0 < score``, and
        ``lowerbnd <= score <= upperbnd``, sorted by descending score; ties
        are ordered alphabetically so the output is deterministic.
    """
    # Set membership is O(1) and avoids sharing a mutable default argument.
    skip = set(excluded) if excluded is not None else set()
    wordlist = []

    for phrase in set(split_text(text)):
        # Whitespace-only tokens are skipped up front rather than spending a
        # network request on them.
        if not phrase.strip() or phrase in skip:
            continue
        result = get_relevance(phrase, lang, scaling)
        if result > 0 and lowerbnd <= result <= upperbnd:
            wordlist.append([phrase, result])

    # Token order comes from a set, so break score ties by word.
    wordlist.sort(key=lambda item: (-item[1], item[0]))
    return wordlist
61
+
62
def make_clickable(val):
    """Return an HTML anchor whose target and label are both ``val``."""
    return f'<a href="{val}">{val}</a>'
64
+
65
# Page header followed by the two main inputs: the text to process and the
# language choice.
st.title("Vocabulary Learning")

text = st.text_area(label="Enter Text:", value="Sample text to process")
lang = st.selectbox(label="Choose language", options=["French", "German", "Spanish"])
69
+
70
# Default comma-separated exclusion list per language. Each value is one long
# string assembled from adjacent literals, so every fragment except the last
# must end with ", "; otherwise the last word of one fragment fuses with the
# first word of the next (e.g. "peux" + "des" -> "peuxdes") and neither word
# is excluded.
common_words = {
    "French": "Bonjour, Merci, S'il vous plaît, Excusez-moi, Oui, Non, Merci, Au revoir, Comment ça va ?, Bien, Mal, "
              "Amour, Maison, Famille, Travail, École, Temps, Nourriture, Eau, Vin, Ville, Rue, Voiture, Train, "
              "Avion, Livre, Musique, Art, Cinéma, Sport, Chat, Chien, Ami, Fête, Vacances, Bonheur, Tristesse, Jour, "
              "Nuit, Semaine, Mois, Année, Nombre, Couleur, Joyeux, Triste, Grand, Petit, Beau, Laid, je, tu, il, de, "
              "la, et, les, pour, avec, sa, fait, français, en, une, un, dans, qui, est, au, plus, a, le, un, du, "
              "d'après, ne, pas, elle, trop, cas, jeune, était, devait, peux, "
              "des, mais, été, alors, assez, ce, cette, tout, toutes, depuis, sujet, presque, lequel, laquelle, n'y, "
              "tant, \", que, n'en, peu, cour, eu, ses, pret, prets, sur, d'une, qu'elle, quelle, "
              "dans, se, plus, son, comme, y, aussi, à, au, aux, sont, aussi, ont, vie, alors, ou, où, faut, "
              "elle, on, nous, vous, ils, elles, faire, voir, avoir, être, que, qu'il, qu'elle, j'ai, j'avais, j'étais",
    "German": "Hallo, Danke, Bitte, Entschuldigung, Ja, Nein, Auf Wiedersehen, Wie geht es dir?, Gut, Schlecht, "
              "Liebe, Haus, Familie, Arbeit, Schule, Zeit, Essen, Wasser, Wein, Stadt, Straße, Auto, Zug, Flugzeug, "
              "Buch, Musik, Kunst, Kino, Sport, Katze, Hund, Freund, Party, Urlaub, Glück, Traurigkeit, Tag, Nacht, "
              "Woche, Monat, Jahr, Zahl, Farbe, Froh, Traurig, Groß, Klein, Schön, Hässlich",
    "Spanish": "Hola, Gracias, Por favor, Disculpa, Sí, No, Adiós, ¿Cómo estás?, Bien, Mal, Amor, Casa, Familia, "
               "Trabajo, Escuela, Tiempo, Comida, Agua, Vino, Ciudad, Calle, Coche, Tren, Avión, Libro, Música, Arte, "
               "Cine, Deporte, Gato, Perro, Amigo, Fiesta, Vacaciones, Felicidad, Tristeza, Día, Noche, Semana, Mes, "
               "Año, Número, Color, Feliz, Triste, Grande, Pequeño, Bonito, Feo",
}
90
+
91
# Editable exclusion list, pre-filled with the selected language's common
# words. Entries are comma separated; spaces are removed and case is folded
# before splitting so they compare equal to the lower-cased tokens.
excluded = st.text_input("Common words to exclude:", common_words[lang])
excluded = excluded.replace(" ", "").lower().split(",")

# Only words whose scaled N-gram score lies within these bounds are shown.
upper_bound = st.number_input('Upper bound N-gram score', 0.0, 1000.0, value=100.0)
lower_bound = st.number_input('Lower bound N-gram score', 0.0, 1000.0, value=0.01)

# Language name -> two-letter code used when building WordReference URLs.
langMP = {"French": "fr", "German": "de", "Spanish": "es"}

output = process(text, excluded, lang, 5, upperbnd=upper_bound, lowerbnd=lower_bound)

df = pd.DataFrame(output, columns=["Word", "N-Gram"])

# Horizontal bar chart of the scores with the axes frame hidden.
fig, ax = plt.subplots()
for side in ("top", "right", "bottom", "left"):
    ax.spines[side].set_visible(False)
ax.barh(df["Word"], df["N-Gram"])
ax.set_ylabel("Words")

st.subheader("Word Relevance")
st.pyplot(fig)
# The script runs again on every widget interaction; close the figure once it
# has been rendered so pyplot does not keep accumulating open figures.
plt.close(fig)
114
+
115
# Build one WordReference link (opening in a new tab) per charted word and
# render them as a single HTML block separated by line breaks.
langcode = langMP[lang]
definitions = [
    f'<a target="_blank" href="https://www.wordreference.com/{langcode}en/{word}">{word}</a>'
    for word in df["Word"].tolist()
]

st.subheader("WordReference Links")
st.markdown("<br>".join(definitions), unsafe_allow_html=True)
main.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # This is a sample Python script.
4
+
5
+ # Press ⌃R to execute it or replace it with your code.
6
+ # Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
7
+
8
+
9
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output."""
    greeting = 'Hi, {}'.format(name)
    print(greeting)


# Run the sample greeting only when the file is executed as a script.
if __name__ == '__main__':
    print_hi('PyCharm')
17
+
18
+ # See PyCharm help at https://www.jetbrains.com/help/pycharm/
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ spacy
2
+ requests
3
+ streamlit
4
+ matplotlib
5
+ pandas