"""Train a small binary text classifier on a headline dataset.

The script reads records with ``headline``, ``is_sarcastic`` and
``article_link`` keys from ``Sarcasm_Headlines_Dataset.json``, tokenizes and
pads the headlines, trains an Embedding -> GlobalAveragePooling1D ->
Dense(24, relu) -> Dense(1, sigmoid) model, and prints the model's
predictions for a few sample sentences.
"""
import json

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# --- Hyperparameters -------------------------------------------------------
# NOTE: despite its name, ``test_div`` is the fraction of samples used for
# TRAINING (the leading slice); the remainder is held out for validation.
test_div = 0.75
vocab_size = 10000      # shared by the Tokenizer and the Embedding layer
embedding_dim = 16
max_length = 100        # every sequence is padded/truncated to this length
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"       # token substituted for out-of-vocabulary words

# Sample sentences; not referenced anywhere else in the visible script.
sentences = [
    'Wow this AI is astonishing',
    'This is the worst AI',
    'This is the best AI',
    'I am the best AI',
    'It is very astonishing that we can train a model on any data we have',
]

# --- Load the dataset ------------------------------------------------------
with open('Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
    raw_text = f.read()

try:
    # Preferred layout: the whole file is a single JSON array of records.
    data = json.loads(raw_text)
except json.JSONDecodeError:
    # NOTE(review): some distributions of this dataset appear to be
    # newline-delimited JSON (one record per line) -- confirm which layout is
    # in use. Falling back keeps the script working for either one.
    data = [json.loads(line) for line in raw_text.splitlines() if line.strip()]

if isinstance(data, dict):
    # A one-line newline-delimited file parses as a single record.
    data = [data]

headlines = [record['headline'] for record in data]
is_sarcastic = [record['is_sarcastic'] for record in data]
# Collected for completeness; not used by the visible part of the script.
article_link = [record['article_link'] for record in data]

# --- Train / validation split ----------------------------------------------
# Sequential (unshuffled) split: the first ``test_div`` share is training data.
split_index = int(len(headlines) * test_div)
train_data = headlines[:split_index]
train_result = is_sarcastic[:split_index]
test_data = headlines[split_index:]
test_result = is_sarcastic[split_index:]

# --- Tokenize and pad ------------------------------------------------------
# Use ``vocab_size`` (not a duplicated literal) so the tokenizer can never
# emit an index outside the Embedding layer's input range.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

train_padded = pad_sequences(
    train_sequences, maxlen=max_length,
    padding=padding_type, truncating=trunc_type)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length,
    padding=padding_type, truncating=trunc_type)

# Convert everything (notably the plain-list labels) to NumPy arrays for fit().
training_padded = np.array(train_padded)
training_labels = np.array(train_result)
testing_padded = np.array(test_padded)
testing_labels = np.array(test_result)

# --- Model -----------------------------------------------------------------
model = Sequential([
    tf.keras.layers.Embedding(
        vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()

# --- Training --------------------------------------------------------------
num_epochs = 30
history = model.fit(
    training_padded, training_labels, epochs=num_epochs,
    validation_data=(testing_padded, testing_labels), verbose=2)

# --- Inference on a few unseen sentences -----------------------------------
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
    "Central Valley Coalition Suing the EPA Over Clean Air Failures",
]
# Apply exactly the same tokenization and padding used for training.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length,
                       padding=padding_type, truncating=trunc_type)
print(model.predict(padded))