Spaces:

RobPruzan
/

automaticlitassesment

Runtime error

App Files Files Community

RobPruzan committed on Aug 8, 2022

Commit

84ed9cd

1 Parent(s): 6373e5b

Delete, wrong file name

Browse files

Files changed (1) hide show

app.py.py +0 -336

app.py.py DELETED Viewed

@@ -1,336 +0,0 @@
-# -*- coding: utf-8 -*-
-"""app.ipynb
-Automatically generated by Colaboratory.
-Original file is located at
-    https://colab.research.google.com/drive/10plMWPNgOBAggggGeW01XD195JH5cYlR
-"""
-import gradio as gr
-import csv
-import string
-import readability
-import pandas as pd
-import nltk
-from nltk.tokenize import word_tokenize
-import torch
-import gensim
-import gensim.downloader as api
-from sklearn.metrics.pairwise import cosine_similarity
-from nltk.corpus import wordnet as wn
-from transformers import DistilBertTokenizer
-from nltk.corpus import stopwords
-from fuzzywuzzy import fuzz
-from fuzzywuzzy import process
-from transformers import pipeline
-import statistics
-import seaborn as sns
-# Fetch the NLTK resources used further down: the CMU pronouncing dictionary,
-# the stop-word lists and the punkt tokenizer data.
-nltk.download('cmudict')
-nltk.download('stopwords')
-nltk.download('punkt')
-glove_vectors = api.load('glove-wiki-gigaword-100')
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-# is_available has to be *called*: the bare function object is always truthy,
-# so the old expression picked 'cuda' even on CPU-only hosts.
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-#loading model
-# Raw string: in an ordinary literal '\U' starts a unicode escape, which makes
-# this Windows path a SyntaxError. The stray inner double quotes are gone too.
-PATH = r'C:\Users\Robby\Desktop\automaticlit\pytorchBERTmodel'
-# NOTE(review): torch.load unpickles the file, so only load a checkpoint that
-# comes from a trusted source.
-# map_location places the stored tensors on whichever device was selected.
-model = torch.load(PATH, map_location=device)
-model.eval()
-model.to(device)
-p = pipeline("automatic-speech-recognition")
-# Plain dict of word -> GloVe vector, used for the similarity lookups.
-# NOTE(review): `.wv` and `.vocab` look like the pre-4.0 gensim accessors;
-# confirm against the gensim version this app is pinned to.
-w2v = {key: glove_vectors.wv.get_vector(key) for key in glove_vectors.wv.vocab}
-def calculate_diversity(text):
-  """Score how varied the vocabulary of ``text`` is.
-
-  Each token that is not a stop word or punctuation mark acts as an anchor.
-  The anchor's group holds the anchor itself plus every non-stop-word token
-  in the text (the anchor's own occurrence included) whose GloVe vector has
-  a cosine similarity above 0.75 with the anchor's vector. A group of size
-  one scores 1; otherwise the score is
-  ``(distinct words in group / (group size - 1)) ** 2``.
-
-  Args:
-    text: The text to analyse; ``None`` and ``""`` are accepted.
-
-  Returns:
-    ``(scores, mean)`` where ``scores`` maps the anchor's token position to
-    its score and ``mean`` is the average of those scores, or
-    ``(1, "More Text Required")`` when the text has at most one token or no
-    token could be scored.
-  """
-  # None (an empty UI field) used to crash inside word_tokenize.
-  if not text:
-    return 1, "More Text Required"
-  stop_words = set(stopwords.words('english'))
-  stop_words.update(string.punctuation)
-  tokenized_text = [word.lower() for word in word_tokenize(text)]
-  if len(tokenized_text) <= 1:
-    return 1, "More Text Required"
-  sim_words = {}
-  for idx, anc_word in enumerate(tokenized_text):
-    if anc_word in stop_words:
-      continue
-    vocab = [anc_word]
-    for comp_word in tokenized_text:
-      try:
-        if comp_word not in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1), w2v[comp_word].reshape(1, -1)) > .75:
-          vocab.append(comp_word)
-        # Registered inside the try on purpose: the group only exists once
-        # at least one comparison got through without a missing vector.
-        sim_words[idx] = vocab
-      except KeyError:
-        # One of the two words has no GloVe vector; skip this pair.
-        continue
-  scores = {}
-  for key, value in sim_words.items():
-    if len(value) == 1:
-      scores[key] = 1
-      continue
-    t_sim = len(value) - 1
-    t_rep = (len(value) - 1) - len(set(value))
-    scores[key] = ((t_sim - t_rep) / t_sim) ** 2
-  # Nothing scorable (e.g. only stop words): report it the same way as
-  # too-short input instead of dividing by zero.
-  if not scores:
-    return 1, "More Text Required"
-  return scores, sum(scores.values()) / len(scores)
-def dict_to_list(dictionary, max_size=10):
-    """Split the values of ``dictionary`` into consecutive rows.
-
-    Values are taken in the dict's iteration order. A row is closed as soon
-    as it holds exactly ``max_size`` items; a final partial row is kept.
-
-    Returns:
-        A list of lists of values; empty when the dict is empty.
-    """
-    rows = []
-    current = []
-    for item in dictionary.values():
-        current.append(item)
-        if len(current) != max_size:
-            continue
-        rows.append(current)
-        current = []
-    if current:
-        rows.append(current)
-    return rows
-def heatmap(scores, df):
-  """Draw a seaborn heatmap of ``df`` titled with the average score.
-
-  Args:
-    scores: Mapping whose values are the diversity ratios to average.
-    df: The data passed to ``sns.heatmap``.
-
-  Returns:
-    Whatever ``.set(title=...)`` on the heatmap's return value gives back.
-  """
-  #conditional to visualize the difference between no ratio and a 0 ratio score
-  counted = [ratio for ratio in scores.values() if ratio != -.3]
-  # Guard the empty case: an empty dict, or only -.3 placeholders, used to
-  # raise ZeroDivisionError.
-  diversity_average = sum(counted) / len(counted) if counted else 0
-  return sns.heatmap(df, cmap='gist_gray_r', vmin = -.3).set(title='Word Diversity Score Heatmap (Average Score: ' + str(diversity_average) + ')')
-def stats(text):
-  """Return the measures ``readability.getmeasures`` computes for English ``text``."""
-  return readability.getmeasures(text, lang='en')
-def predict(text, tokenizer=tokenizer):
-  """Run the loaded model on ``text`` and return its logits.
-
-  Args:
-    text: The string to encode.
-    tokenizer: Tokenizer used for encoding; defaults to the module-level one.
-
-  Returns:
-    The ``logits`` attribute of the model output, computed without
-    gradient tracking.
-  """
-  model.eval()
-  model.to(device)
-  # The input is truncated or padded to exactly 315 tokens.
-  # padding='max_length' replaces the deprecated pad_to_max_length=True.
-  encoded_text = tokenizer.encode_plus(
-      text,
-      truncation=True,
-      add_special_tokens=True,
-      max_length=315,
-      padding='max_length',
-      return_attention_mask=True,
-      return_tensors='pt',
-  )
-  with torch.no_grad():
-    result = model(
-        encoded_text['input_ids'].to(device),
-        attention_mask=encoded_text['attention_mask'].to(device),
-        return_dict=True,
-    ).logits
-  return result
-def reading_difficulty(excerpt):
-  """Turn the model's prediction for ``excerpt`` into a difficulty score.
-
-  Inputs of more than 301 tokens are cut into windows that start every 300
-  tokens and hold 299 tokens each. Only windows followed by at least 301
-  more tokens are kept, so the tail of the text is not scored. The window
-  predictions are averaged. Shorter inputs are predicted in one go.
-
-  Args:
-    excerpt: The text to score; ``None`` and ``""`` are accepted.
-
-  Returns:
-    ``-(prediction * 1.786 + 6.4) + 10`` as a float, or the string
-    ``"No Text Provided"`` when there is no text.
-  """
-  # `not excerpt` also covers None, which used to fail on len(None).
-  if not excerpt:
-    return "No Text Provided"
-  words = tokenizer.tokenize(excerpt)
-  if len(words) > 301:
-    # Same windows as the old `idx % 300 == 0 and idx <= len(words) - 301`
-    # scan; at least the window starting at 0 always exists here.
-    # NOTE(review): the windows are joined from tokenizer pieces and then
-    # tokenized again inside predict — confirm that round trip is intended.
-    windows = [' '.join(words[start: start + 299]) for start in range(0, len(words) - 300, 300)]
-    result = statistics.mean(predict(window, tokenizer).item() for window in windows)
-  else:
-    result = predict(excerpt).item()
-  return -(result * 1.786 + 6.4) + 10
-def calculate_stats(file_name, data_index):
-  """Average readability measures over the long rows of a CSV file.
-
-  A row is used only when its ``data_index`` column exists, holds at least
-  100 characters and ``stats`` does not raise ``ValueError`` for it.
-
-  Args:
-    file_name: Path of the CSV file.
-    data_index: Index of the column that holds the text.
-
-  Returns:
-    A dict with ``lines`` (rows used), ``words`` (summed word count) and the
-    per-row averages ``words_per_sentence``, ``syll_per_word``,
-    ``characters_per_word`` and ``reading_difficulty``. The averages stay 0
-    when no row qualified.
-  """
-  information = {'lines':0, 'words_per_sentence':0, 'words':0, 'syll_per_word':0, 'characters_per_word':0, 'reading_difficulty':0 }
-  #unicode escape only for essays
-  with open(file_name, encoding= 'unicode_escape') as f:
-    for line in csv.reader(f):
-      try:
-        cell = line[data_index]
-      except IndexError:
-        # Blank or short row: the requested column does not exist.
-        continue
-      if len(cell) < 100:
-        continue
-      try:
-        stat = stats(cell)
-      except ValueError:
-        continue
-      sentence_info = stat['sentence info']
-      information['lines'] += 1
-      print(information['lines'])
-      information['words_per_sentence'] += sentence_info['words_per_sentence']
-      information['words'] += sentence_info['words']
-      information['syll_per_word'] += sentence_info['syll_per_word']
-      information['characters_per_word'] += sentence_info['characters_per_word']
-      information['reading_difficulty'] += reading_difficulty(cell)
-  # Without any qualifying row there is nothing to average; dividing by the
-  # zero row count used to raise ZeroDivisionError.
-  if information['lines']:
-    for i in information:
-      if i != 'lines' and i != 'words':
-        information[i] /= information['lines']
-  return information
-def transcribe(audio):
-  """Return the text the speech-recognition pipeline produces for ``audio``."""
-  #speech to text using pipeline
-  # The old body also appended to `transcription`, a name that is not defined
-  # at module level, so every call ended in NameError.
-  return p(audio)["text"]
-def compute_score(target, actual):
-  """Return ``fuzz.ratio`` of the two strings, compared case-insensitively."""
-  return fuzz.ratio(target.lower(), actual.lower())
-def phon(text):
-  """Map each token of ``text`` to its first CMU-dict pronunciation.
-
-  Args:
-    text: The text to tokenize and look up.
-
-  Returns:
-    A list with one entry per token: the first pronunciation listed for the
-    token, or the token itself when the dictionary has no entry for it.
-  """
-  alph = nltk.corpus.cmudict.dict()
-  pronun = []
-  for word in word_tokenize(text):
-    # Look the word up in lower case so capitalised words can match too.
-    # NOTE(review): assumes the cmudict keys are lower-case — confirm.
-    variants = alph.get(word.lower())
-    pronun.append(variants[0] if variants else word)
-  return pronun
-def gradio_fn(text, audio, target, actual_audio):
-  """Gradio callback: analyse typed text and/or recorded speech.
-
-  Args:
-    text: Contents of the text box, possibly ``None`` or empty.
-    audio: Recording to transcribe and analyse, or ``None``.
-    target: Text the user was asked to recite, or ``None``.
-    actual_audio: Recording of the recital, or ``None``.
-
-  Returns:
-    Four strings, always in the order difficulty, transcript, diversity,
-    speech score. Slots that do not apply carry a placeholder message.
-  """
-  if text is None and audio is None and target is None and actual_audio is None:
-    return "No Inputs", "No Inputs", "No Inputs", "No Inputs"
-  if actual_audio is not None:
-    actual = p(actual_audio)["text"]
-    # Diversity is taken from the typed text when there is one and from the
-    # transcript otherwise; an empty text box used to crash the callback.
-    div = calculate_diversity(text if text else actual)
-    speech_score = compute_score(target or "", actual)
-    return "Difficulty Score: " + str(reading_difficulty(actual)),  "Transcript: " + str(actual.lower()), "Diversity Score: " + str(div[1]), "Speech Score: " + str(speech_score)
-  if audio is not None:
-    transcript = p(audio)["text"]
-    div = calculate_diversity(text if text else transcript)
-    # NOTE(review): the old code bound div[0] to a *local* `state` here, which
-    # had no effect on the module-level `state` read by plot().
-    return "Difficulty Score: " + str(reading_difficulty(transcript)),  "Transcript: " + str(transcript.lower()), "Diversity Score: " + str(div[1]), "No Inputs"
-  text = text or ""
-  div = calculate_diversity(text)
-  # Slot order now matches the other branches: the transcript placeholder is
-  # second and the diversity score third.
-  return "Difficulty Score: " + str(reading_difficulty(text)), "No Audio Provided", "Diversity Score: " + str(div[1]), "No Inputs"
-def plot():
-  """Build the diversity heatmap from the module-level ``state``.
-
-  ``state`` is passed to ``calculate_diversity`` as the text; the first
-  element of its result is printed, reshaped with ``dict_to_list`` into a
-  DataFrame and handed to ``heatmap``.
-  """
-  # NOTE(review): `state` starts out as an empty dict at module level —
-  # confirm it is meant to hold text by the time this runs.
-  diversity = calculate_diversity(state)[0]
-  print(diversity)
-  frame = pd.DataFrame(dict_to_list(diversity))
-  return heatmap(diversity, frame)
-import csv
-from itertools import islice
-# Example inputs for the interface: column 3 of the first 101 rows of the
-# training CSV, each wrapped in its own list.
-# Raw string: in an ordinary literal '\U' is a SyntaxError, and '\a' / '\t'
-# would silently turn into control characters inside this Windows path.
-with open(r'C:\Users\Robby\Desktop\automaticlit\train.csv') as f:
-  example_data = [[line[3]] for line in islice(csv.reader(f), 101)]
-# Module-level value read by plot().
-state = {}
-# NOTE(review): `interface` ends up bound to the return value of launch(),
-# not to the gr.Interface object itself.
-interface = gr.Interface(
-    fn=gradio_fn,
-    inputs= [gr.components.Textbox(
-                 label="Text"),
-             gr.components.Audio(
-                 label="Speech Translation",
-                 source="microphone",
-                 type="filepath"),
-             gr.components.Textbox(
-                 label="Target Text to Recite"
-             ),
-             gr.components.Audio(
-                 label="Read Text Above for Score",
-                 source="microphone",
-                 type="filepath")
-             ],
-    outputs = ["text", "text", "text", "text"],
-    theme="huggingface",
-    description="Enter text or speak into your microphone to have your text analyzed!",
-    rounded=True,
-    container=True,
-    examples=example_data,
-    examples_per_page = 3
-    ).launch(debug=True)