Spaces:
Running
Running
| from googletrans import Translator | |
| import spacy | |
| import gradio as gr | |
| import nltk | |
| from nltk.corpus import wordnet | |
| import wikipedia | |
| import re | |
| import time | |
| import random | |
# Fetch every NLTK resource the app relies on, in a fixed order, before any
# of the NLP helpers below are defined.
for _resource in (
    'maxent_ne_chunker',           # Chunker
    'stopwords',                   # Stop Words List (Mainly Roman Languages)
    'words',                       # 200 000+ Alphabetical order list
    'punkt',                       # Tokenizer
    'verbnet',                     # For Description of Verbs
    'omw',
    'omw-1.4',                     # Multilingual Wordnet
    'wordnet',                     # For Definitions, Antonyms and Synonyms
    'shakespeare',
    'dolch',                       # Sight words
    'names',                       # People Names NER
    'gazetteers',                  # Location NER
    'opinion_lexicon',             # Sentiment words
    'averaged_perceptron_tagger',  # Parts of Speech Tagging
):
    nltk.download(_resource)

# Download and load the small English spaCy pipeline used by keep_nouns_verbs.
spacy.cli.download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm')

# Shared googletrans client used by FrontRevSentChunk.
translator = Translator()
def Sentencechunker(sentence):
    """Return the growing word-prefixes of *sentence* joined by " | ".

    The sentence is split on single spaces; the first chunk is the first
    word, the second chunk the first two words, and so on up to the whole
    sentence.
    """
    words = sentence.split(" ")
    prefixes = [" ".join(words[:end]) for end in range(1, len(words) + 1)]
    return " | ".join(prefixes)
def ReverseSentenceChunker(sentence):
    """Chunk *sentence* like Sentencechunker, but with its word order reversed.

    Words are separated on any whitespace, reversed, re-joined with single
    spaces and then handed to Sentencechunker.
    """
    words_backwards = sentence.split()[::-1]
    return Sentencechunker(" ".join(words_backwards))
def three_words_chunk(sentence, size=3):
    """Return every run of *size* consecutive words, joined by " | ".

    Args:
        sentence: Text to chunk; words are separated on any whitespace.
        size: Number of words per chunk (defaults to the original 3).

    Returns:
        The sliding-window chunks joined by " | ". When the sentence has
        fewer than *size* words there is no complete window, so the words
        that are present are returned as a single chunk instead of an
        empty string. An empty sentence yields "".
    """
    words = sentence.split()
    if len(words) < size:
        # Previously short input silently produced "", which showed up as a
        # blank result in the UI and was then sent to the translator.
        return " ".join(words)
    windows = (words[start:start + size] for start in range(len(words) - size + 1))
    return " | ".join(" ".join(window) for window in windows)
def keep_nouns_verbs(sentence):
    """Keep only the tokens spaCy tags as NOUN, VERB or PUNCT.

    The sentence is run through the module-level ``nlp`` pipeline and the
    surviving token texts are joined with single spaces.
    """
    wanted_pos = ('NOUN', 'VERB', 'PUNCT')
    return " ".join(tok.text for tok in nlp(sentence) if tok.pos_ in wanted_pos)
def unique_word_count(text="", state=None):
    """Count whitespace-separated words in *text*, most frequent first.

    Args:
        text: Text whose words are counted.
        state: Optional dict of existing counts. When given it is updated
            in place, so counts accumulate across calls.

    Returns:
        A one-element tuple holding the list of ``(word, count)`` pairs
        sorted by descending count.
    """
    tally = {} if state is None else state
    for token in text.split():
        tally[token] = tally.get(token, 0) + 1
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    # NOTE(review): the original ends with a trailing comma, so the result
    # is a 1-tuple rather than the bare list — confirm this is intended.
    return (ranked,)
def Wordchunker(word):
    """Return the list of growing prefixes of *word* (1 char, 2 chars, ...)."""
    return [word[:end] for end in range(1, len(word) + 1)]
def BatchWordChunk(sentence):
    """Spell out every word of *sentence* as its growing prefixes.

    The sentence is split on single spaces. Each word contributes one line
    that starts with a newline and lists the word's prefixes (from
    Wordchunker), each followed by a space.
    """
    lines = []
    for word in sentence.split(" "):
        spelled = "".join(piece + " " for piece in Wordchunker(word))
        lines.append("\n" + spelled)
    return "".join(lines)
# Dropdown components handed to the FrontRevSentChunk interface: the target
# language code forwarded to the translator (default "de") and the chunking
# mode to apply to the input text (default "Chunks").
langdest = gr.Dropdown(choices=["af", "de", "es", "ko", "ja", "zh-cn"], label="Choose Language", value="de")
ChunkModeDrop = gr.Dropdown(choices=["Chunks", "Reverse", "Three Word Chunks", "Spelling Chunks"], label="Choose Chunk Type", value="Chunks")
def FrontRevSentChunk (Chunkmode, Translate, Text, langdest):
    """Chunk *Text* with the selected mode and optionally append a translation.

    Args:
        Chunkmode: One of "Chunks", "Reverse", "Three Word Chunks" or
            "Spelling Chunks"; any other value produces no chunk output.
        Translate: When truthy, the chunk output is sent to the shared
            ``translator`` and the translated text is appended on a new line.
        Text: The text to chunk.
        langdest: Destination language code passed to the translator.

    Returns:
        The chunked text, followed by "\\n" and its translation if requested.
    """
    chunkers = (
        ("Chunks", Sentencechunker),
        ("Reverse", ReverseSentenceChunker),
        ("Three Word Chunks", three_words_chunk),
        ("Spelling Chunks", BatchWordChunk),
    )
    chunked = "".join(chunk(Text) for label, chunk in chunkers if Chunkmode == label)

    if not Translate:
        return chunked

    translation = translator.translate(chunked, dest=langdest)
    return chunked + "\n" + translation.text
def filter_words(words):
    """Return the members of *words* that NLTK tags as a verb, noun or adjective.

    The words are tagged with ``nltk.pos_tag`` and kept, in tagging order,
    when their tag is one of the tags listed below.
    """
    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    adjective_tags = {'JJ', 'JJR', 'JJS'}
    wanted_tags = verb_tags | noun_tags | adjective_tags

    kept = []
    for word, tag in nltk.pos_tag(words):
        if tag in wanted_tags:
            kept.append(word)
    return kept
def SepHypandSynExpansion(text):
    """List WordNet synonyms/hypernyms per token and flag tokens with none.

    Args:
        text: Text to analyse; tokenised with ``nltk.word_tokenize``.

    Returns:
        A ``(attention, expansions)`` pair of strings. ``expansions`` has one
        entry per token that WordNet knows, listing its hypernyms and
        synonyms. ``attention`` lists the tokens WordNet had nothing for,
        restricted by ``filter_words`` to verbs, nouns and adjectives.
    """
    unmatched = []
    expansions = []
    for token in nltk.word_tokenize(text):
        synonyms = []
        hypernyms = []
        for synset in wordnet.synsets(token):
            synonyms += synset.lemma_names()
            hypernyms += [hypernym.name() for hypernym in synset.hypernyms()]
        if not synonyms and not hypernyms:
            unmatched.append(token)
        else:
            expansions.append("\n" f"{token}: hypernyms={hypernyms}, synonyms={synonyms} \n")

    # Keep the tokens as a list instead of joining them into a " | " string
    # and splitting again: the round trip added a spurious "" entry, and the
    # set() it went through made the output order vary between runs.
    unique_unmatched = list(dict.fromkeys(unmatched))
    attention_words = filter_words(unique_unmatched) if unique_unmatched else []

    attention = "Words to pay special attention to: \n" + str(attention_words)
    return attention, "".join(expansions)
def WikiSearch(term):
    """Search Wikipedia for each word of *term* and return the article titles.

    Args:
        term: One or more whitespace-separated words.

    Returns:
        A list of the titles returned by ``wikipedia.search`` (up to 20 per
        word), in the order found and without duplicates. Empty input gives
        an empty list.
    """
    titles = []
    # Accumulate across words: the earlier loop overwrote its result on each
    # pass, so the searches for all but one word were thrown away.
    # split() (not split(" ")) avoids issuing searches for empty strings.
    for word in term.split():
        titles.extend(wikipedia.search(word, results=20))
    return list(dict.fromkeys(titles))
def find_string_positions(s, string):
    """Return the start indices of every non-overlapping *string* in *s*.

    Args:
        s: Text to search in.
        string: Substring to look for.

    Returns:
        Ascending list of indices where *string* begins. The search resumes
        after each match, so matches never overlap. An empty *string*
        returns an empty list.
    """
    if not string:
        # str.find("") matches at `start` without advancing it, which made
        # the loop below spin forever while growing the list.
        return []

    positions = []
    start = 0
    while True:
        position = s.find(string, start)
        if position == -1:
            return positions
        positions.append(position)
        start = position + len(string)
def splittext(string, split_positions):
    """Cut *string* at shifted positions and renumber the resulting pieces.

    Each entry of *split_positions* is moved back by 12 characters and used
    as a cut point. The first piece is discarded. Every remaining piece is
    rebuilt as its first 29 characters, a newline, and everything from
    index 30 onward (the character at index 29 is dropped), then truncated
    at its second-to-last space and prefixed with a running number starting
    at 1. The first two characters of the combined text are removed before
    returning.

    NOTE(review): the offsets 12, 29 and 30 are undocumented in the original
    — presumably tied to a fixed-width input format; confirm with callers.

    Raises:
        IndexError: If a rebuilt piece contains fewer than two spaces.
    """
    segments = []
    cursor = 0
    for raw_position in split_positions:
        cut = raw_position - 12
        segments.append(string[cursor:cut])
        cursor = cut

    first_number = 1
    print(first_number)

    rendered = []
    for number, segment in enumerate(segments[1:], start=first_number):
        rebuilt = segment[0:29] + "\n" + segment[30:]
        space_positions = find_string_positions(rebuilt, " ")
        rendered.append(str(number) + "\n" + rebuilt[:space_positions[-2]] + "\n" + "\n")
    return "".join(rendered)[2:]
def create_dictionary(word_list, word_dict=None):
    """Group the distinct words of *word_list* by their first two characters.

    Args:
        word_list: Text whose words are separated by single spaces.
        word_dict: Optional existing mapping to extend in place. A fresh
            dict is created when omitted.

    Returns:
        The mapping from two-character prefix to the list of words that
        start with it, in first-seen order.
    """
    # A `{}` default is created once and shared by every call, so earlier
    # requests leaked into later ones; build a new dict per call instead.
    if word_dict is None:
        word_dict = {}

    # dict.fromkeys de-duplicates like set() did but keeps input order, so
    # the result is the same from run to run.
    for word in dict.fromkeys(word_list.split(" ")):
        word_dict.setdefault(word[:2], []).append(word)
    return word_dict
def merge_lines(roman_file, w4w_file, full_mean_file, macaronic_file):
    """Interleave four text files line by line.

    Args:
        roman_file, w4w_file, full_mean_file, macaronic_file: Objects whose
            ``.name`` attribute is the path of a text file to read.

    Returns:
        One string in which line 1 of each file (stripped of surrounding
        whitespace) appears in argument order, then line 2 of each file,
        and so on, separated by newlines. Reading stops at the end of the
        shortest file.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(roman_file.name, "r", encoding="utf-8") as f1, \
            open(w4w_file.name, "r", encoding="utf-8") as f2, \
            open(full_mean_file.name, "r", encoding="utf-8") as f3, \
            open(macaronic_file.name, "r", encoding="utf-8") as f4:
        return "\n".join(
            "\n".join(line.strip() for line in lines)
            for lines in zip(f1, f2, f3, f4)
        )
def TTSforListeningPractice(text):
    """Placeholder: ignores *text* and returns the fixed string "not finished"."""
    placeholder_message = "not finished"
    return placeholder_message
def group_words(inlist, group_size=10, shuffle_interval=10, rotation_period=60):
    """Endlessly yield shuffled groups of words for timed repetition.

    The words of *inlist* (split on single spaces) are cut into consecutive
    groups of *group_size*. The generator shows one group at a time: it
    reshuffles and yields the current group every *shuffle_interval*
    seconds, and once *rotation_period* seconds have been accumulated it
    moves to the next group (wrapping around) and yields that group in its
    current order.

    Args:
        inlist: Space-separated words.
        group_size: Words per group (default 10).
        shuffle_interval: Seconds slept after each yield, and the amount
            added to the group timer per shuffle (default 10).
        rotation_period: Seconds a group is shown before rotating
            (default 60).

    Yields:
        The current group's words joined by single spaces.

    Raises:
        ValueError: If *group_size* is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    words = inlist.split(" ")
    word_groups = [
        words[start:start + group_size]
        for start in range(0, len(words), group_size)
    ]

    current_group_index = 0
    elapsed = 0
    while True:
        if elapsed >= rotation_period:
            current_group_index = (current_group_index + 1) % len(word_groups)
            elapsed = 0
        else:
            # The timer only ever advances in whole intervals, so every
            # non-rotating step is a shuffle step.
            random.shuffle(word_groups[current_group_index])
            elapsed += shuffle_interval
        yield " ".join(word_groups[current_group_index])
        time.sleep(shuffle_interval)
# Input and output textboxes for the "Word Grouping and Rotation" interface.
# Built from gr.Textbox directly: the gr.inputs / gr.outputs namespaces are
# deprecated wrappers around the same component.
groupinput_text = gr.Textbox(lines=2, label="Enter a list of words")
groupoutput_text = gr.Textbox(label="Grouped words")
# Application layout: one gr.Blocks app with a tab per learning tool, each
# tab wiring the helper functions defined above into Gradio interfaces.
# NOTE(review): the nesting of the sub-tabs ("Spelling Simplification",
# "Chunks", "Gradio Version", "HTML Version") under their parent tabs is
# inferred from the tab titles — confirm against the deployed app.
with gr.Blocks() as lliface:
    with gr.Tab("Welcome"):
        gr.HTML("""<h1> Spaces Test - Still Undercontruction </h1> <p> You only learn when you convert things you dont know to known --> Normally Repetition is the only reliable method for everybody </p>
<p> Knowledge is a Language but productive knowledge is find replace as well </p> <p>LingQ is good option for per word state management</p> <p> Arrows app json creator for easy knowledge graphing and spacy POS graph? </p>
<p> Vocab = Glossary + all non text wall(lists, diagrams, etc.)</p>
<p> https://huggingface.co/spaces/vumichien/whisper-speaker-diarization<br></p>
<p> In Language the goal is bigger vocab --> Knowledge equivalent = question answer pairs but to get to those you need related information pairs</p>
<p> ChatGPT Turns Learning into a read only what you dont know ask only what you dont know feedback loop --> All you have to do is keep track of what prompts you have asked in the past</p>""")
    with gr.Tab("Unique word ID"):
        gr.Interface(fn=unique_word_count, inputs="text", outputs="text", title="Wordcounter")
        gr.Interface(fn=SepHypandSynExpansion, inputs="text", outputs=["text", "text"], title="Word suggestions")
        gr.Interface(fn=WikiSearch, inputs="text", outputs="text", title="Unique word suggestions(wiki articles)")
    with gr.Tab("Automating related information linking"):
        gr.HTML("Questions - Tacking and suggesting questions to ask = new education")
    with gr.Tab("Spelling and Chunks"):
        gr.HTML("<p> Spelling is the end goal, you already know many letter orders called words so you need leverage them to remember random sequences")
        with gr.Tab("Spelling Simplification - Use a dual language list"):
            gr.Interface(fn=create_dictionary, inputs="text", outputs="text", title="Sort Text by first two letters")
        with gr.Tab("Chunks"):
            gr.Interface(fn=FrontRevSentChunk, inputs=[ChunkModeDrop, "checkbox", "text", langdest], outputs="text")
            gr.Interface(fn=keep_nouns_verbs, inputs=["text"], outputs="text", title="Noun and Verbs only (Plus punctuation)")
    with gr.Tab("Timing Practice - Repitition"):
        gr.HTML("<p>Run from it, Dread it, Repitition is inevitable - Thanos</p> <p>Next Milestone is Turning this interface handsfree</p>")
        with gr.Tab("Gradio Version"):
            gr.Interface(fn=group_words, inputs=groupinput_text, outputs=groupoutput_text, title="Word Grouping and Rotation", description="Group a list of words into sets of 10 and rotate them every 60 seconds.").queue()
        with gr.Tab("HTML Version"):
            gr.HTML("""<iframe height="1200" style="width: 100%;" scrolling="no" title="Memorisation Aid" src="https://codepen.io/kwabs22/embed/preview/GRXKQgj?default-tab=result&editable=true" frameborder="no" loading="lazy" allowtransparency="true" allowfullscreen="true">
See the Pen <a href="https://codepen.io/kwabs22/pen/GRXKQgj">
Memorisation Aid</a> by kwabs22 (<a href="https://codepen.io/kwabs22">@kwabs22</a>)
on <a href="https://codepen.io">CodePen</a>.
</iframe>""")
    with gr.Tab("Knowledge Ideas"):
        gr.HTML("""<p>Good knowledge = ability to answer questions --> find Questions you cant answer and look for hidden answer within them </p>
<p>My One Word Theory = We only use more words than needed when we have to or are bored --> Headings exist because title is not sufficient, subheadings exist because headings are not sufficient, Book Text exists because subheadings are not sufficient</p>
<p>Big Picture = Expand the Heading and the subheadings and compare them to each other</p>
<p>Application of Knowledge = App Version of the text (eg. Jupyter Notebooks) is what you create and learn first</p>
""")
    with gr.Tab("Beginner - Songs - Chorus"):
        gr.HTML("Essentially if the sounds are repeated or long notes they are easy to remember")
        gr.Interface(fn=TTSforListeningPractice, inputs="text", outputs="text", title="Placeholder - paste chorus here and use TTS or make notes to save here")
    with gr.Tab("Transcribe - RASMUS Whisper"):
        gr.HTML("""<p>If this tab doesnt work use the link below ⬇️</p> <a href="https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles">https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles</a>""")
        gr.Interface.load("spaces/RASMUS/Whisper-youtube-crosslingual-subtitles", title="Subtitles")
    with gr.Tab("Advanced - LingQ Addons ideas"):
        gr.HTML("Extra functions needed - Persitent Sentence translation, UNWFWO, POS tagging and Word Count per user of words in their account. Macaronic Text is also another way to practice only the important information")
        # Four uploads that merge_lines interleaves line by line.
        with gr.Row():
            RomanFile = gr.File(label="Paste Roman")
            W4WFile = gr.File(label="Paste Word 4 Word")
            FullMeanFile = gr.File(label="Paste Full Meaning")
            MacaronicFile = gr.File(label="Paste Macaronic Text")
        with gr.Row():
            MergeButton = gr.Button()
        with gr.Row():
            MergeOutput = gr.TextArea(label="Output")
        MergeButton.click(merge_lines, inputs=[RomanFile, W4WFile, FullMeanFile, MacaronicFile], outputs=[MergeOutput])
    with gr.Tab("Dictionary from text"):
        gr.Interface(fn=create_dictionary, inputs="text", outputs="text", title="Two Letter Dictionary")

# group_words is a generator, and Gradio streams generator output through its
# queue. Enable the queue on the app that is actually launched so the
# "Gradio Version" tab does not depend on the hosting environment turning the
# queue on.
lliface.queue().launch()