Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pandas as pd | |
| from pytube import extract | |
| import re | |
| import string | |
| import pickle | |
| import nltk | |
| import nltk.sentiment.util | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| from keras.preprocessing.text import Tokenizer | |
| from keras.preprocessing.sequence import pad_sequences | |
| from tensorflow import keras | |
| from youtube_comment_downloader import * | |
# Fetch the NLTK data used by clean_text when the app starts.
# 'punkt' / 'punkt_tab' back nltk.word_tokenize: newer NLTK releases look up
# 'punkt_tab' instead of the pickled 'punkt' models, so both are requested to
# work with whichever NLTK version is installed. 'wordnet' backs
# WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
def getID(url):
    """Return the YouTube video ID that pytube extracts from ``url``.

    Logs a progress message to stdout before doing the extraction.
    """
    print("Getting YouTube ID...")
    video_id = extract.video_id(url)
    return video_id
# Words dropped from the cleaned output. Membership is tested on the token
# with any "_NEG" suffix removed; a frozenset makes that test O(1).
# NOTE(review): punctuation is stripped before this filter runs, so the
# entries containing an apostrophe (e.g. "don't") can never match. They are
# kept so the set stays identical to the original list.
_STOPWORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "you're", "you've", "you'll", "you'd", "your", "yours", "yourself",
    "yourselves", "he", "him", "his", "himself", "she", "she's", "her",
    "hers", "herself", "it", "it's", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "that'll", "these", "those", "am", "is", "are", "was", "were",
    "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because",
    "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how", "all", "any", "both", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "don't", "should", "should've", "now", "d", "ll", "m",
    "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't",
    "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't",
    "haven", "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn",
    "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't",
    "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn",
    "wouldn't",
])

# Substitutions applied in this exact order, each replacing its match with ''.
# Written as raw strings so the regex escapes are not parsed as (invalid)
# Python string escapes, and compiled once instead of on every call.
_NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'@',                                     # mention markers
    r'\[.*?\]',                               # [bracketed] spans
    r'https?://\S+|www\.\S+',                 # links
    r'<.*?>+',                                # <tag>-like spans
    '[%s]' % re.escape(string.punctuation),   # ASCII punctuation
    r'\n',                                    # newlines (removed, not spaced)
    r'\w*\d\w*',                              # any word containing a digit
    r'[^a-zA-Z ]+',                           # whatever is left (emoji, etc.)
))


def clean_text(text):
    """Turn one raw comment into a list of cleaned, negation-marked tokens.

    The text is lower-cased, stripped of the noise matched by
    ``_NOISE_PATTERNS``, tokenized with ``nltk.word_tokenize``, lemmatized
    twice (default part of speech, then as a verb), passed through
    ``nltk.sentiment.util.mark_negation`` and finally filtered so that only
    alphanumeric, non-stopword tokens remain.

    NOTE(review): the steps and their order are kept exactly as they were;
    presumably the saved models were trained on text preprocessed this way,
    so verify against the training code before changing any of them.

    Args:
        text: The raw comment string.

    Returns:
        A list of token strings; tokens inside a negation scope carry the
        ``_NEG`` suffix added by ``mark_negation``.
    """
    lemmatizer = WordNetLemmatizer()

    # remove symbols and emojis
    text = text.lower()
    for pattern in _NOISE_PATTERNS:
        text = pattern.sub('', text)

    # tokenize, then lemmatize as noun (default) and as verb
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [lemmatizer.lemmatize(token, 'v') for token in tokens]

    # mark negation
    tokens_neg_marked = nltk.sentiment.util.mark_negation(tokens)

    # remove stopwords, judging each token without its "_NEG" suffix
    kept = []
    for token in tokens_neg_marked:
        base = token.replace("_NEG", "")
        if base.isalnum() and base not in _STOPWORDS:
            kept.append(token)
    return kept
def getSentenceTrain():
    """Load and return the pickled training sentences.

    Reads ``Deep learning/pickles/sentences_train.pickle`` relative to the
    current working directory. The file is opened in a ``with`` block so the
    handle is closed even if unpickling fails.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # NOTE: pickle.load can execute arbitrary code from the file. This is
    # only acceptable because the path is fixed and not user-supplied.
    with open('Deep learning/pickles/sentences_train.pickle', "rb") as sentences_train_f:
        return pickle.load(sentences_train_f)
# Load the saved classifiers once at import time; vote() reads these
# module-level names. The files are opened in `with` blocks so the handles
# are closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code from the file. This is only
# acceptable because the paths are fixed and not user-supplied.
with open('Shallow machine learning/pickles/SGD_74.pickle', "rb") as SGD_74_f:
    SGD_train = pickle.load(SGD_74_f)

with open('Shallow machine learning/pickles/logreg_79.pickle', "rb") as logreg_79_f:
    logreg_train = pickle.load(logreg_79_f)

# get saved CNN model
model = keras.models.load_model("Deep learning/CNN_82")
def vote(test_point, _test):
    """Combine the three saved models into one accuracy-weighted verdict.

    Each model votes "effective" or "ineffective" and its vote counts with a
    weight equal to its accuracy figure. The CNN votes effective when its
    first output value exceeds 0.5; the other two vote effective when their
    prediction equals ``'pos'``.

    Args:
        test_point: Input handed to the ``predict`` method of the SGD and
            logistic-regression models.
        _test: Input handed to the ``predict`` method of the CNN.

    Returns:
        A ``(result, confidence)`` tuple. ``result`` is ``'effective'`` or
        ``'ineffective'``; ``confidence`` is the share of the total weight
        held by the winning side.
    """
    print("Voting on video effectivess...\n")

    # (display name, vote weight, fitted model)
    ensemble = (
        ('SGD', 0.74*100, SGD_train),
        ('Logistic Regression', 0.79*100, logreg_train),
        ('CNN', 0.82*100, model),
    )

    total_weight = 0
    effective_weights = []
    for label, weight, estimator in ensemble:
        total_weight += weight
        if label == "CNN":
            voted_effective = estimator.predict(_test)[0][0] > 0.5
        else:
            voted_effective = estimator.predict(test_point) == 'pos'
        if voted_effective:
            effective_weights.append(weight)
        verdict = "effective" if voted_effective else "ineffective"
        print(label + " voted for: " + verdict)

    effective_share = sum(effective_weights)/total_weight
    if effective_share < 0.5:
        return 'ineffective', 1 - effective_share
    return 'effective', effective_share
# Tokenizer fitted on the training sentences. Built on first use and reused
# afterwards so the pickle is not reloaded and refitted on every request.
_FITTED_TOKENIZER = None


def _get_fitted_tokenizer():
    """Return the Keras Tokenizer fitted on the training sentences (cached)."""
    global _FITTED_TOKENIZER
    if _FITTED_TOKENIZER is None:
        tokenizer = Tokenizer(num_words=5000)
        tokenizer.fit_on_texts(getSentenceTrain())
        _FITTED_TOKENIZER = tokenizer
    return _FITTED_TOKENIZER


def quantizeEffectiveness(url):
    """Classify a YouTube video as effective or ineffective from its comments.

    Downloads every comment of the video, cleans each one with
    ``clean_text``, joins all resulting tokens into a single document and
    lets ``vote`` decide on that document.

    Args:
        url: A YouTube URL that ``getID`` can extract a video ID from.

    Returns:
        The ``(result, confidence)`` tuple produced by ``vote``.

    Raises:
        ValueError: If no comments could be downloaded for the video.
    """
    # 1. Get YouTube ID (getID prints its own progress message)
    videoId = getID(url)

    # 2. Download comments
    print("Downloading comments...")
    downloader = YoutubeCommentDownloader()
    comments = list(downloader.get_comments_from_url(f'https://www.youtube.com/watch?v={videoId}'))
    if not comments:
        # Without this guard the failure surfaced later as an opaque
        # KeyError: 'text' on an empty DataFrame.
        raise ValueError(f"No comments could be downloaded for video {videoId}")

    # 3. Clean comments and gather all words of the video into one list
    print("Cleaning Comments...")
    all_words = [word for comment in comments for word in clean_text(comment['text'])]

    # 4. Build the single pre-processed document for this video
    cleaned_string = ' '.join(map(str, all_words))

    # 5. Get ML test point: a one-element Series, as the shallow models
    #    received before
    test_point = pd.Series([cleaned_string], name='cleaned_string')
    test_sentence = test_point.values

    # 6. Tokenize the data with the tokenizer fitted on the training sentences
    print("Tokenizing the data...")
    tokenizer = _get_fitted_tokenizer()

    # 7. Get DL test point
    _test = pad_sequences(tokenizer.texts_to_sequences(test_sentence), padding='post', maxlen=100)

    # 8. Vote on video effectiveness
    result, confidence = vote(test_point, _test)
    return result, confidence
# Accepted shapes (scheme and "www."/"m." prefix optional):
#   youtube.com/watch?v=ID   (v= may follow other query parameters)
#   youtu.be/ID
#   youtube.com/embed/ID
#   youtube.com/v/ID
#   youtube.com/user/NAME/u/ID
# ID is exactly 11 characters from [0-9A-Za-z_-]; anything after it must start
# with '?', '&' or '#' (extra query parameters or a fragment).
_YOUTUBE_URL_RE = re.compile(
    r"(?:https?://)?(?:www\.|m\.)?"
    r"(?:youtube\.com/watch\?(?:[^#\s]*&)?v="
    r"|youtu\.be/"
    r"|youtube\.com/embed/"
    r"|youtube\.com/v/"
    r"|youtube\.com/user/[^/\s]+/u/)"
    r"[0-9A-Za-z_-]{11}"
    r"(?:[?&#]\S*)?"
)


def is_valid_youtube_url(text):
    """Return True if ``text`` looks like a URL of a single YouTube video.

    Compared with the previous pattern this:

    * accepts URLs that carry extra parameters after the video ID
      (``&t=42s``, ``?si=...``) or before ``v=``;
    * requires one of the known YouTube URL prefixes, so a bare 11-character
      string is no longer reported as a valid URL;
    * restricts the ID to the characters ``0-9A-Za-z_-``;
    * ignores surrounding whitespace and returns False for non-string input
      instead of raising.

    Args:
        text: The user-supplied value to check.

    Returns:
        ``True`` when the whole (stripped) text matches, ``False`` otherwise.
    """
    if not isinstance(text, str):
        return False
    return _YOUTUBE_URL_RE.fullmatch(text.strip()) is not None
def greet(url):
    """Gradio handler: analyse the video behind ``url`` and describe the verdict.

    Returns a prompt to enter a valid URL when ``is_valid_youtube_url``
    rejects the input; otherwise returns a sentence stating the video ID,
    the verdict from ``quantizeEffectiveness`` and its confidence as a
    percentage rounded to two decimals.
    """
    if is_valid_youtube_url(url):
        result, confidence = quantizeEffectiveness(url)
        video_id = getID(url)
        confidence_pct = round(confidence*100,2)
        return f"The video (ID: {video_id}) is {result} with a confidence of {confidence_pct}%"
    return "Please input a valid YouTube URL"
# Build the text-in / text-out Gradio interface around greet and start it.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch()