Upload 3 files
Browse files- app.py +121 -0
- main.py +18 -0
- requirements.txt +5 -0
app.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import spacy
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
|
| 9 |
+
# Shared HTTP headers for the Ngram API and the Google Books corpus ids
# per supported language.
headers = {'Accept': 'application/json'}
languages = {"French": "30", "German": "31", "Spanish": "32"}

# Load the English spaCy pipeline, downloading the model on first run.
# spacy.load raises OSError when the model package is missing, so catch
# that specifically instead of a bare `except:` (which would also hide
# KeyboardInterrupt and real bugs).
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load("en_core_web_sm")

# Google Books Ngram corpus ids: 30 - french, 31 - german, 32 - spanish
|
| 21 |
+
|
| 22 |
+
def get_relevance(text, language, scaling=5):
    """Return a scaled average Google Books Ngram frequency for *text*.

    Args:
        text: Word or phrase to look up (spaces are joined with '+').
        language: Key into the module-level ``languages`` corpus map
            ("French", "German" or "Spanish").
        scaling: Power of ten used to scale the tiny raw frequencies.

    Returns:
        Average ngram frequency over 2000-2019 times ``10**scaling``,
        or 0.0 when the request fails or the phrase is not in the corpus.
    """
    query = '+'.join(text.split(' '))
    link = (
        f"https://books.google.com/ngrams/json?content={query}"
        f"&year_start=2000&year_end=2019&corpus={languages[language]}&smoothing=0"
    )
    try:
        # Timeout keeps the app from hanging forever on a dead connection;
        # the original bare `except:` is narrowed to the failures this call
        # can actually produce (network errors, bad JSON, empty result).
        r = requests.get(link, headers=headers, timeout=10)
        ngrams = r.json()[0]['timeseries']
        avg = sum(ngrams) / len(ngrams)
    except (requests.RequestException, ValueError, LookupError, ZeroDivisionError):
        # Treat any lookup failure as "no relevance data".
        avg = 0.0
    return avg * pow(10, scaling)
|
| 31 |
+
|
| 32 |
+
def remove_punc(text):
    """Return *text* with punctuation, backslash and digit characters removed.

    Characters in the removal set are dropped; everything else (letters,
    accented characters, apostrophes, whitespace) is kept unchanged.
    """
    # str.translate does one C-level pass instead of the original quadratic
    # string-concatenation loop.  Note the set deliberately contains a
    # literal backslash (the original `\,` is backslash + comma).
    removal = '''!()-[]{};:"\,<>./?@#$%^&*_~1234567890'''
    return text.translate(str.maketrans('', '', removal))
|
| 38 |
+
|
| 39 |
+
def tokenize_text_with_spacy(text):
    """Run the module-level spaCy pipeline over *text* and return its token strings."""
    return [token.text for token in nlp(text)]
|
| 43 |
+
|
| 44 |
+
def split_text(text):
    """Lowercase *text*, strip punctuation/digits, and return its spaCy tokens."""
    cleaned = remove_punc(text.lower())
    return tokenize_text_with_spacy(cleaned)
|
| 48 |
+
|
| 49 |
+
def process(text, excluded=None, lang="fr", scaling=5, upperbnd=float("inf"), lowerbnd=0):
    """Score the unique tokens of *text* by ngram relevance.

    Args:
        text: Raw input text; tokenized and deduplicated via split_text.
        excluded: Collection of tokens to skip.  The original signature
            used a mutable ``[]`` default (a classic Python pitfall);
            ``None`` here is behavior-compatible for all callers.
        lang: Language key forwarded to get_relevance.
            NOTE(review): the default "fr" is not a key of the
            module-level ``languages`` map ("French"/"German"/"Spanish")
            and would raise KeyError; every visible call site passes a
            full language name — confirm the intended default.
        scaling: Forwarded frequency scaling exponent.
        upperbnd, lowerbnd: Inclusive score bounds for keeping a token.

    Returns:
        List of ``[token, score]`` pairs with 0 < score <= upperbnd and
        score >= lowerbnd, sorted by score, highest first.
    """
    if excluded is None:
        excluded = []
    tokens = set(split_text(text))
    wordlist = []

    for phrase in tokens:
        if phrase in excluded:
            continue
        result = get_relevance(phrase, lang, scaling)
        if 0 < result <= upperbnd and result >= lowerbnd:
            wordlist.append([phrase, result])

    return sorted(wordlist, key=lambda pair: pair[1], reverse=True)
|
| 61 |
+
|
| 62 |
+
def make_clickable(val):
    """Wrap *val* in an HTML anchor whose href and link text are both *val*."""
    return f'<a href="{val}">{val}</a>'
|
| 64 |
+
|
| 65 |
+
# --- Streamlit UI: input text, language choice, and exclusion list ---
st.title("Vocabulary Learning")

text = st.text_area("Enter Text:", "Sample text to process")
lang = st.selectbox("Choose language", ["French", "German", "Spanish"])

# Default "common word" exclusion lists per language.  Adjacent string
# literals are concatenated by Python, so each fragment must end with a
# comma separator — the original was missing them after "peux", "quelle"
# and "faut", silently fusing entries into tokens like "peuxdes".
common_words = {
    "French": "Bonjour, Merci, S'il vous plaît, Excusez-moi, Oui, Non, Merci, Au revoir, Comment ça va ?, Bien, Mal, "
              "Amour, Maison, Famille, Travail, École, Temps, Nourriture, Eau, Vin, Ville, Rue, Voiture, Train, "
              "Avion, Livre, Musique, Art, Cinéma, Sport, Chat, Chien, Ami, Fête, Vacances, Bonheur, Tristesse, Jour, "
              "Nuit, Semaine, Mois, Année, Nombre, Couleur, Joyeux, Triste, Grand, Petit, Beau, Laid, je, tu, il, de, "
              "la, et, les, pour, avec, sa, fait, français, en, une, un, dans, qui, est, au, plus, a, le, un, du, "
              "d'après, ne, pas, elle, trop, cas, jeune, était, devait, peux, "
              "des, mais, été, alors, assez, ce, cette, tout, toutes, depuis, sujet, presque, lequel, laquelle, n'y, "
              "tant, \", que, n'en, peu, cour, eu, ses, pret, prets, sur, d'une, qu'elle, quelle, "
              "dans, se, plus, son, comme, y, aussi, à, au, aux, sont, aussi, ont, vie, alors, ou, où, faut, "
              "elle, on, nous, vous, ils, elles, faire, voir, avoir, être, que, qu'il, qu'elle, j'ai, j'avais, j'étais",
    "German": "Hallo, Danke, Bitte, Entschuldigung, Ja, Nein, Auf Wiedersehen, Wie geht es dir?, Gut, Schlecht, "
              "Liebe, Haus, Familie, Arbeit, Schule, Zeit, Essen, Wasser, Wein, Stadt, Straße, Auto, Zug, Flugzeug, "
              "Buch, Musik, Kunst, Kino, Sport, Katze, Hund, Freund, Party, Urlaub, Glück, Traurigkeit, Tag, Nacht, "
              "Woche, Monat, Jahr, Zahl, Farbe, Froh, Traurig, Groß, Klein, Schön, Hässlich",
    "Spanish": "Hola, Gracias, Por favor, Disculpa, Sí, No, Adiós, ¿Cómo estás?, Bien, Mal, Amor, Casa, Familia, "
               "Trabajo, Escuela, Tiempo, Comida, Agua, Vino, Ciudad, Calle, Coche, Tren, Avión, Libro, Música, Arte, "
               "Cine, Deporte, Gato, Perro, Amigo, Fiesta, Vacaciones, Felicidad, Tristeza, Día, Noche, Semana, Mes, "
               "Año, Número, Color, Feliz, Triste, Grande, Pequeño, Bonito, Feo"
}

excluded = st.text_input("Common words to exclude:", common_words[lang])
# Normalize to a lowercase token list: drop spaces, then split on commas.
excluded = excluded.replace(" ", "").lower().split(",")
|
| 93 |
+
|
| 94 |
+
# Score bounds for keeping a word (widget range 0.0-1000.0).
upper_bound = st.number_input('Upper bound N-gram score', 0.0, 1000.0, value=100.0)
lower_bound = st.number_input('Lower bound N-gram score', 0.0, 1000.0, value=0.01)

# WordReference URL language codes keyed by the display name from the selectbox.
langMP = {"French": "fr", "German": "de", "Spanish": "es"}

# Score every non-excluded token of the input text (scaling fixed at 5).
output = process(text, excluded, lang, 5, upperbnd=upper_bound, lowerbnd=lower_bound)

# [word, score] pairs, already sorted by score descending.
df = pd.DataFrame(output, columns=["Word", "N-Gram"])
|
| 102 |
+
|
| 103 |
+
# Horizontal bar chart of word relevance with the plot frame hidden.
fig, ax = plt.subplots()
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.barh(df["Word"], df["N-Gram"])
ax.set_ylabel("Words")

st.subheader("Word Relevance")
st.pyplot(fig)
|
| 114 |
+
|
| 115 |
+
# Build one WordReference dictionary-lookup link per scored word.
# (Comprehension replaces the original append loop; iterating the Series
# yields the same word values as .tolist().)
langcode = langMP[lang]
definitions = [
    f'<a target="_blank" href="https://www.wordreference.com/{langcode}en/{word}">{word}</a>'
    for word in df["Word"]
]

st.subheader("WordReference Links")
st.markdown("<br>".join(definitions), unsafe_allow_html=True)
|
main.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
# This is a sample Python script.
|
| 4 |
+
|
| 5 |
+
# Press ⌃R to execute it or replace it with your code.
|
| 6 |
+
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def print_hi(name):
    """Print a greeting addressed to *name*."""
    greeting = f'Hi, {name}'
    print(greeting)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Script entry point: runs only when executed directly, not on import.
if __name__ == '__main__':
    print_hi('PyCharm')
|
| 17 |
+
|
| 18 |
+
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spacy
|
| 2 |
+
requests
|
| 3 |
+
streamlit
|
| 4 |
+
matplotlib
|
| 5 |
+
pandas
|