Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """app.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/10plMWPNgOBAggggGeW01XD195JH5cYlR | |
| """ | |
| import gradio as gr | |
| import csv | |
| import string | |
| import readability | |
| import pandas as pd | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| import torch | |
| import gensim | |
| import gensim.downloader as api | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from nltk.corpus import wordnet as wn | |
| from transformers import DistilBertTokenizer | |
| from nltk.corpus import stopwords | |
| from fuzzywuzzy import fuzz | |
| from fuzzywuzzy import process | |
| from transformers import pipeline | |
| import statistics | |
| import seaborn as sns | |
# Fetch the NLTK resources used further down: the CMU pronouncing dictionary,
# the stop-word lists and the Punkt tokenizer models.
nltk.download('cmudict')
nltk.download('stopwords')
nltk.download('punkt')

glove_vectors = api.load('glove-wiki-gigaword-100')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# `is_available` has to be *called*: the bare function object is always
# truthy, so the previous expression picked 'cuda' on every machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# loading model
# Raw string: in an ordinary literal "\U" starts a unicode escape, which made
# this line a SyntaxError. The double quotes that used to be embedded in the
# value were dropped because they became part of the file name itself.
# NOTE(review): this is an absolute path on one developer's machine - confirm
# where the model file lives in the deployed environment.
PATH = r'C:\Users\Robby\Desktop\automaticlit\pytorchBERTmodel'
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
model = torch.load(PATH, map_location=device)
model.eval()
model.to(device)

p = pipeline("automatic-speech-recognition")

# Plain dict of word -> embedding vector used by calculate_diversity().
# gensim < 4 exposes the vocabulary as `.wv.vocab`; gensim >= 4 returns a
# KeyedVectors object whose vocabulary is `key_to_index`. Support both.
_keyed_vectors = getattr(glove_vectors, 'wv', glove_vectors)
_vocabulary = getattr(_keyed_vectors, 'key_to_index', None)
if _vocabulary is None:
    _vocabulary = _keyed_vectors.vocab
w2v = {key: _keyed_vectors.get_vector(key) for key in _vocabulary}
def calculate_diversity(text):
    """Score how varied the wording of *text* is among similar-meaning words.

    The text is tokenized and lower-cased. Every token that is neither a stop
    word / punctuation mark nor missing from the ``w2v`` embedding table is
    compared with every other such token; tokens whose cosine similarity is
    above 0.75 form the anchor's group (the anchor always matches itself).
    The anchor's score is ``(distinct words in group / matches) ** 2``, so a
    group made of repetitions of one word scores low and a group of different
    words scores 1.

    Args:
        text: The text to analyse. ``None`` and ``""`` are accepted.

    Returns:
        A 2-tuple:

        * ``(scores, mean)`` - ``scores`` maps the token position of each
          scored word to its score, ``mean`` is the average of those scores.
        * ``(1, "More Text Required")`` when the text has fewer than two
          tokens (or is empty / ``None``).
        * ``(scores, "More Text Required")`` with an empty ``scores`` dict
          when no token could be scored (previously a ZeroDivisionError).
    """
    if not text:
        return 1, "More Text Required"

    stop_words = set(stopwords.words('english')) | set(string.punctuation)
    tokens = [token.lower() for token in word_tokenize(text)]
    if len(tokens) <= 1:
        return 1, "More Text Required"

    # Only tokens that carry meaning and have an embedding can be compared.
    comparable = [
        (idx, token)
        for idx, token in enumerate(tokens)
        if token not in stop_words and token in w2v
    ]
    if not comparable:
        return {}, "More Text Required"

    words = [token for _, token in comparable]
    vectors = [w2v[token] for token in words]

    scores = {}
    for idx, anchor in comparable:
        # One batched call per anchor instead of one sklearn call per pair.
        similarities = cosine_similarity(w2v[anchor].reshape(1, -1), vectors)[0]
        group = [anchor] + [
            word for word, similarity in zip(words, similarities)
            if similarity > .75
        ]
        if len(group) == 1:
            scores[idx] = 1
            continue
        matches = len(group) - 1
        scores[idx] = (len(set(group)) / matches) ** 2

    return scores, sum(scores.values()) / len(scores)
def dict_to_list(dictionary, max_size=10):
    """Split the values of *dictionary* into consecutive rows.

    Values are taken in the dictionary's iteration order; keys are ignored.
    A row is closed as soon as it holds ``max_size`` values, and a trailing
    partially-filled row is kept.

    Args:
        dictionary: Mapping whose values are to be grouped.
        max_size: Number of values at which a row is closed.

    Returns:
        A list of rows (lists of values); empty when the mapping is empty.
    """
    rows = [[]]
    for item in dictionary.values():
        rows[-1].append(item)
        if len(rows[-1]) == max_size:
            # Current row is full: open a fresh one for whatever follows.
            rows.append([])
    if not rows[-1]:
        # Drop the empty row left behind by an exact fit (or an empty input).
        rows.pop()
    return rows
def heatmap(scores, df):
    """Draw a seaborn heatmap of *df* titled with the average of *scores*.

    Args:
        scores: Mapping whose values are the ratios to average.
        df: Data handed to ``sns.heatmap`` unchanged.

    Returns:
        Whatever ``.set(title=...)`` returns on the object produced by
        ``sns.heatmap``.

    Raises:
        ZeroDivisionError: If no value in *scores* differs from ``-.3``.
    """
    # -.3 is excluded from the average so that "no ratio" can be told apart
    # from a genuine ratio of 0 in the visualisation.
    counted = [ratio for ratio in scores.values() if ratio != -.3]
    diversity_average = sum(counted) / len(counted)
    title = 'Word Diversity Score Heatmap (Average Score: ' + str(diversity_average) + ')'
    axes = sns.heatmap(df, cmap='gist_gray_r', vmin=-.3)
    return axes.set(title=title)
def stats(text):
    """Return the measures ``readability.getmeasures`` computes for English *text*."""
    return readability.getmeasures(text, lang='en')
def predict(text, tokenizer=tokenizer):
    """Run the loaded model on *text* and return its logits.

    The text is encoded with special tokens, truncated and padded to exactly
    315 tokens, and evaluated without gradient tracking on ``device``.

    Args:
        text: The text to score.
        tokenizer: Tokenizer used to encode *text*; defaults to the
            module-level DistilBERT tokenizer.

    Returns:
        The ``logits`` tensor of the model output. Callers in this file read
        it with ``.item()``, i.e. they expect a single value.
    """
    model.eval()
    model.to(device)

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=315,
        # `padding='max_length'` replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        return model(
            encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
            return_dict=True,
        ).logits
def _logit_to_difficulty(result):
    """Linearly rescale a raw model output to the difficulty value shown to users."""
    return -(result * 1.786 + 6.4) + 10


def reading_difficulty(excerpt):
    """Estimate the reading difficulty of *excerpt* with the loaded model.

    Texts of at most 301 tokens are scored in one pass. Longer texts are cut
    into consecutive windows of 300 tokens, each window is scored on its own
    and the predictions are averaged. A trailing window shorter than 300
    tokens is not scored.

    Args:
        excerpt: The text to score. ``None`` and ``""`` are accepted.

    Returns:
        The difficulty score as a float, or the string ``"No Text Provided"``
        when there is nothing to score.
    """
    if not excerpt:
        return "No Text Provided"

    words = tokenizer.tokenize(excerpt)
    if len(words) <= 301:
        return _logit_to_difficulty(predict(excerpt).item())

    window_size = 300
    win_preds = []
    # Every start position that still leaves room for a full window. The old
    # slice took only 299 tokens and skipped a final window that fit exactly.
    for start in range(0, len(words) - window_size + 1, window_size):
        window = words[start:start + window_size]
        # Re-assemble word pieces properly: joining with plain spaces leaves
        # "##" continuation markers in the text that is tokenized again.
        window_text = tokenizer.convert_tokens_to_string(window)
        win_preds.append(predict(window_text, tokenizer).item())

    return _logit_to_difficulty(statistics.mean(win_preds))
def calculate_stats(file_name, data_index):
    """Aggregate readability statistics over one text column of a CSV file.

    Rows that lack the requested column, whose text is shorter than 100
    characters, or for which ``stats`` raises ``ValueError`` are skipped.

    Args:
        file_name: Path of the CSV file; it is decoded with ``unicode_escape``.
        data_index: Index of the column that holds the text.

    Returns:
        A dict with ``lines`` (rows used) and ``words`` (total word count) as
        sums, and ``words_per_sentence``, ``syll_per_word``,
        ``characters_per_word`` and ``reading_difficulty`` as per-row averages.
        When no row qualifies every entry stays 0.
    """
    information = {
        'lines': 0,
        'words_per_sentence': 0,
        'words': 0,
        'syll_per_word': 0,
        'characters_per_word': 0,
        'reading_difficulty': 0,
    }

    # unicode escape only for essays
    with open(file_name, encoding='unicode_escape') as f:
        for line in csv.reader(f):
            try:
                text = line[data_index]
            except IndexError:
                # Short or blank row: the requested column does not exist.
                continue
            if len(text) < 100:
                continue
            try:
                stat = stats(text)
            except ValueError:
                continue

            sentence_info = stat['sentence info']
            information['lines'] += 1
            print(information['lines'])
            information['words_per_sentence'] += sentence_info['words_per_sentence']
            information['words'] += sentence_info['words']
            information['syll_per_word'] += sentence_info['syll_per_word']
            information['characters_per_word'] += sentence_info['characters_per_word']
            information['reading_difficulty'] += reading_difficulty(text)

    # Turn the sums into averages; skipped entirely when no row was used so
    # that an empty or unsuitable file no longer raises ZeroDivisionError.
    if information['lines']:
        for key in information:
            if key not in ('lines', 'words'):
                information[key] /= information['lines']
    return information
# Every transcript produced by transcribe(), oldest first. The function
# appends to this name, but no module-level definition of it is visible in
# this file, so calling transcribe() raised NameError.
transcription = []


def transcribe(audio):
    """Convert *audio* to text with the speech-recognition pipeline.

    Args:
        audio: Audio input passed straight to the module-level pipeline ``p``.

    Returns:
        The recognised text. It is also appended to the module-level
        ``transcription`` list.
    """
    # speech to text using pipeline
    text = p(audio)["text"]
    transcription.append(text)
    return text
def compute_score(target, actual):
    """Return ``fuzz.ratio`` of *target* and *actual* after lower-casing both."""
    return fuzz.ratio(target.lower(), actual.lower())
def phon(text):
    """Look up the pronunciation of every token of *text* in the CMU dictionary.

    Args:
        text: The text to tokenize and look up.

    Returns:
        A list with one item per token: the first pronunciation listed for
        the word, or the token itself when the dictionary has no entry.
    """
    alph = nltk.corpus.cmudict.dict()
    pronun = []
    for word in word_tokenize(text):
        # CMUdict keys are lower-case, so look up the lower-cased token;
        # otherwise capitalised words never match.
        entries = alph.get(word.lower())
        pronun.append(entries[0] if entries else word)
    return pronun
def gradio_fn(text, audio, target, actual_audio):
    """Gradio callback: analyse typed text and/or recorded speech.

    The recitation recording (*actual_audio*) takes precedence over the plain
    recording (*audio*), which takes precedence over the typed *text*.
    Difficulty is computed for the transcript when a recording is used and
    for the typed text otherwise. Diversity is computed for the typed text
    when there is any, else for the transcript.

    Args:
        text: Typed text, or ``None``.
        audio: Recording to transcribe, or ``None``.
        target: Text the user was asked to recite, or ``None``.
        actual_audio: Recording of the recitation, or ``None``.

    Returns:
        Four strings, always in this order: difficulty score, transcript (or
        ``"No Audio Provided"``), diversity score, speech score (or
        ``"No Inputs"``). All four are ``"No Inputs"`` when every argument is
        ``None``.
    """
    # Without this declaration the assignment below only created a local.
    global state

    if text is None and audio is None and target is None and actual_audio is None:
        return "No Inputs", "No Inputs", "No Inputs", "No Inputs"

    speech_output = "No Inputs"
    if actual_audio is not None:
        analysed = p(actual_audio)["text"]
        transcript_output = "Transcript: " + analysed.lower()
        # A missing target is compared as an empty string instead of crashing.
        speech_output = "Speech Score: " + str(compute_score(target or "", analysed))
    elif audio is not None:
        analysed = p(audio)["text"]
        transcript_output = "Transcript: " + analysed.lower()
    else:
        analysed = text or ""
        transcript_output = "No Audio Provided"

    diversity_source = text if text else analysed
    div = calculate_diversity(diversity_source)
    # plot() reads `state` as the text to visualise.
    state = diversity_source

    return (
        "Difficulty Score: " + str(reading_difficulty(analysed)),
        transcript_output,
        "Diversity Score: " + str(div[1]),
        speech_output,
    )
def plot():
    """Draw the word-diversity heatmap for the module-level ``state``.

    ``state`` may hold either the text to analyse (a string) or an already
    computed mapping of diversity scores; both conventions appear in this
    file.

    Returns:
        The result of ``heatmap`` for the scores, or ``None`` when there is
        nothing to plot (``state`` is empty, or the text was too short to be
        scored).
    """
    if isinstance(state, str):
        diversity = calculate_diversity(state)[0]
    else:
        diversity = state

    # calculate_diversity() returns the placeholder 1 instead of a dict when
    # the text is too short, and `state` starts out as an empty dict.
    if not isinstance(diversity, dict) or not diversity:
        return None

    print(diversity)
    df = pd.DataFrame(dict_to_list(diversity))
    return heatmap(diversity, df)
import csv

# Example inputs for the interface: column 3 of the first 101 rows of the CSV.
# Raw string: in an ordinary literal "\U" is an invalid unicode escape (a
# SyntaxError) and "\t" / "\a" would silently turn into control characters.
# NOTE(review): absolute path on one developer's machine - confirm where the
# file lives in the deployed environment.
example_data = []
with open(r'C:\Users\Robby\Desktop\automaticlit\train.csv') as f:
    for row_number, line in enumerate(csv.reader(f)):
        if row_number > 100:
            break
        example_data.append([line[3]])

# Shared between gradio_fn() and plot().
state = {}
# Input widgets, in the order gradio_fn() receives them:
# typed text, free recording, recitation target, recitation recording.
input_components = [
    gr.components.Textbox(label="Text"),
    gr.components.Audio(
        label="Speech Translation",
        source="microphone",
        type="filepath",
    ),
    gr.components.Textbox(label="Target Text to Recite"),
    gr.components.Audio(
        label="Read Text Above for Score",
        source="microphone",
        type="filepath",
    ),
]

# Build the UI and start it right away. Because launch() is chained onto the
# constructor, `interface` is bound to the value launch() returns.
interface = gr.Interface(
    fn=gradio_fn,
    inputs=input_components,
    outputs=["text", "text", "text", "text"],
    theme="huggingface",
    description="Enter text or speak into your microphone to have your text analyzed!",
    rounded=True,
    container=True,
    examples=example_data,
    examples_per_page=3,
).launch(debug=True)