| from typing import List, Tuple |
| import nltk |
| import sklearn |
| import question_categorizer as qc |
| from question_categorizer import TextClassificationModel |
| from tfidf_model import NLPModel |
| import tfidf_model |
| import transformers |
| import numpy as np |
| import pandas as pd |
| import json |
| from tqdm import tqdm |
| from collections import defaultdict |
|
|
|
|
|
|
|
|
|
|
class QuizBowlModel:
    """Ensemble quiz-bowl QA model.

    A question-categorizer network routes each question to one or more
    per-category TF-IDF retrieval models; the most confident category's
    answer wins.
    """

    def __init__(self, clear=False):
        """
        Load the categorizer and all per-category TF-IDF models.

        Do NOT load your model or resources in the guess_and_buzz() function,
        as it will increase latency severely.

        Args:
            clear: if True, build fresh (untrained) TF-IDF models instead of
                loading previously saved ones from disk.
        """
        self.categories = ['Geography', 'Religion', 'Philosophy', 'Trash',
                           'Mythology', 'Literature', 'Science',
                           'Social Science', 'History', 'Current Events',
                           'Fine Arts', 'ALL']
        # One TF-IDF model slot per category; filled by load_tfidf_models().
        self.tfidf_models = [None] * len(self.categories)
        self.qc_model = qc.TextClassificationModel.load_model("models/categorizer")

        self.load_tfidf_models(clear=clear)

    def guess_and_buzz(self, question_text: List[str]) -> List[Tuple[str, bool]]:
        """
        This function accepts a list of question strings, and returns a list of tuples containing
        strings representing the guess and corresponding booleans representing
        whether or not to buzz.

        So, guess_and_buzz(["This is a question"]) should return [("answer", False)]

        If you are using a deep learning model, try to use batched prediction instead of
        iterating using a for loop.
        """
        guesses = []
        curr_question = ""

        for question in question_text:
            # Clues accumulate: each prediction sees all text received so far,
            # mirroring how quiz-bowl questions are revealed incrementally.
            curr_question += question + "."

            confidence, answer = self.predict(curr_question)

            buzz = bool(confidence > 0.5)

            # Fixed: the original appended (buzz, answer), inverting the
            # (guess, buzz) contract documented above.
            guesses.append((answer, buzz))

        return guesses

    def load_tfidf_models(self, clear=False):
        """Populate self.tfidf_models, one model per category.

        Args:
            clear: if True, create fresh untrained NLPModel instances;
                otherwise load each saved model from models/<category>_tfidf.pkl.
        """
        print("loading tfidf models")

        for ind, category in enumerate(self.categories):
            if self.tfidf_models[ind] is None:
                if clear:
                    self.tfidf_models[ind] = NLPModel()
                else:
                    self.tfidf_models[ind] = NLPModel().load(f"models/{category}_tfidf.pkl")

        print(self.tfidf_models)

    def train(self, data):
        """Train one TF-IDF model per category and save each to disk.

        Each data point is expected to be a dict with "text", "answer" and
        "category" keys, where "category" is an iterable of category names.
        Models are unloaded (set to None) after saving to bound memory use.
        """
        training_data = [[] for _ in self.categories]

        # Bucket every (text, answer) pair into each category it belongs to.
        with tqdm(total=len(data)) as pbar:
            for data_point in data:
                text = data_point["text"]
                answer = data_point["answer"]

                for category in data_point["category"]:
                    category_ind = self.categories.index(category)
                    training_data[category_ind].append({"text": text, "answer": answer})

                pbar.update(1)

        # Train, save, then immediately unload each model so only one
        # category's model is resident at a time.
        for ind, category_data in enumerate(training_data):
            self.tfidf_models[ind].process_data(category_data)
            self.tfidf_models[ind].train_model()
            self.tfidf_models[ind].save(f"models/{self.categories[ind]}_tfidf.pkl")
            self.tfidf_models[ind] = None
            training_data[ind] = []

        print("TRAINING DATA")
        print("Training complete.")

    def predict(self, input_data, confidence_threshold=1.5):
        """Predict an answer for a single question string.

        Runs the categorizer, then queries the TF-IDF model of every
        category whose score exceeds confidence_threshold, returning the
        single most confident answer.

        Returns:
            (confidence, answer) where confidence is squashed into [0, 1)
            via tanh and answer may be None if no category was confident.
        """
        category_confidences = self.qc_model.predict(input_data)

        # NOTE(review): assumes the categorizer returns a 2-D tensor
        # (batch of 1 x categories) supporting .nonzero() — confirm.
        confident_indices = (category_confidences > confidence_threshold).nonzero()[:, 1]

        max_confidence = 0
        max_answer = None
        for category in confident_indices:
            confidence, answer = self.tfidf_models[category].predict(input_data)
            if confidence > max_confidence:
                max_confidence = confidence
                max_answer = answer

        # tanh maps the raw score into [0, 1) so guess_and_buzz can use a
        # fixed 0.5 buzz threshold.
        return (np.tanh(max_confidence), max_answer)

    def evaluate(self, input_data):
        """Return exact-match accuracy of predict() over a list of data points.

        Each data point is a dict with "text" and "answer" keys. Prints a
        rolling accuracy every 10 questions.
        """
        correct = 0
        count = 0

        with tqdm(total=len(input_data)) as pbar:
            for data_point in input_data:
                count += 1
                text = data_point["text"]
                answer = data_point["answer"]

                answer_predict = self.predict(text)[1]

                if answer == answer_predict:
                    correct += 1

                if count % 10 == 0:
                    print(f'rolling average: {float(correct) / count}')

                pbar.update(1)

        # Guard against division by zero on an empty evaluation set.
        return correct / len(input_data) if input_data else 0.0
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument('--data', type=str)       # path to training JSON
    parser.add_argument('--model', type=str)      # path to a saved model
    parser.add_argument('--predict', type=str)    # single question to answer
    parser.add_argument('--clear', action='store_const', const=True, default=False)
    parser.add_argument('--evaluate', type=str)   # path to evaluation JSON

    flags = parser.parse_args()

    print(flags.clear)

    # clear=True builds fresh untrained TF-IDF models instead of loading.
    model = QuizBowlModel(clear=flags.clear)

    if flags.data:
        # Fixed: the original iterated over the *characters* of the --data
        # path string, re-opening the file and extending data_json once per
        # character — duplicating the training set len(path) times.
        with open(flags.data, 'r') as data_file:
            data_json = json.load(data_file)

        model.train(data_json)

    if flags.model:
        # NOTE(review): QuizBowlModel defines no load() method — this call
        # raises AttributeError. Confirm the intended API (load_tfidf_models?).
        model.load(flags.model)

    if flags.predict:
        print(model.predict(flags.predict))

    if flags.evaluate:
        with open(flags.evaluate, 'r') as data_file:
            data_json = json.load(data_file)
        print(f'accuracy: {model.evaluate(data_json)}')
| |