from typing import List, Tuple
import json
from collections import defaultdict

import nltk
import sklearn
import transformers
import numpy as np
import pandas as pd
from tqdm import tqdm
from huggingface_hub import hf_hub_download

from .question_categorizer import TextClassificationModel
from .tfidf_model import NLPModel

class QuizBowlModel:
    """Two-stage quiz bowl answerer: a question categorizer picks the likely
    categories, then a per-category tf-idf model proposes the answer."""

    def __init__(self, clear=False):
        """
        Load your model(s) and whatever else you need in this function.

        Do NOT load your model or resources in the guess_and_buzz() function,
        as it will increase latency severely.
        """

        self.categories = ['Geography', 'Religion', 'Philosophy', 'Trash',
                           'Mythology', 'Literature', 'Science', 'Social Science',
                           'History', 'Current Events', 'Fine Arts']

        # One tf-idf model per category, filled in by load_tfidf_models().
        self.tfidf_models = [None for _ in range(len(self.categories))]
        self.qc_model = TextClassificationModel.load_model("models/categorizer")

        self.load_tfidf_models(clear=clear)
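    # Expected on-disk layout, as used above and in load_tfidf_models(): the
    # categorizer lives under models/categorizer, and each category has a
    # models/<Category>_tfidf.pkl that can be fetched from the Hugging Face
    # repo Backedman/TriviaAnsweringMachineREAL.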
    def guess_and_buzz(self, question_text: List[str]) -> List[Tuple[str, bool]]:
        """
        This function accepts a list of question strings, and returns a list of tuples containing
        strings representing the guess and corresponding booleans representing
        whether or not to buzz.

        So, guess_and_buzz(["This is a question"]) should return [("answer", False)]

        If you are using a deep learning model, try to use batched prediction instead of
        iterating using a for loop.
        """

        guesses = []
        curr_question = ""

        for question in question_text:
            # Accumulate the clues seen so far, so each guess uses the full
            # question text revealed up to this point.
            curr_question += question + "."

            confidence, answer = self.predict(curr_question)

            # Buzz once the calibrated confidence clears 0.5.
            buzz = confidence > 0.5

            guesses.append((answer, buzz))

        return guesses
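    # Illustrative call (a sketch only; the guesses and buzz flags shown are
    # placeholders, not real model output):
    #
    #   model = QuizBowlModel()
    #   model.guess_and_buzz(["First clue of a question.", "Second clue."])
    #   # -> [("<guess after clue 1>", False), ("<guess after clues 1-2>", True)]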
    def load_tfidf_models(self, clear=False):
        """Load one tf-idf model per category, or create empty ones if clear=True."""

        print("loading tfidf models")

        if not clear:
            for category in range(len(self.categories)):
                REPO_ID = 'Backedman/TriviaAnsweringMachineREAL'
                FILENAME = f"models/{self.categories[category]}_tfidf.pkl"

                # With local_dir='.', the file is written to
                # ./models/<Category>_tfidf.pkl, the same path loaded below.
                hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir='.')

                if self.tfidf_models[category] is None:
                    self.tfidf_models[category] = NLPModel().load(f"models/{self.categories[category]}_tfidf.pkl")
        else:
            # Fresh, untrained models (used when training from scratch).
            for category in range(len(self.categories)):
                if self.tfidf_models[category] is None:
                    self.tfidf_models[category] = NLPModel()

        print(self.tfidf_models)
    def train(self, data):
        """Bucket the training examples by category, then train and save each
        per-category tf-idf model in turn."""

        training_data = [[] for _ in range(len(self.categories))]

        with tqdm(total=len(data)) as pbar:
            for data_point in data:
                text = data_point["text"]
                answer = data_point["answer"]
                categories = data_point["category"]

                for category in categories:
                    category_ind = self.categories.index(category)
                    training_data[category_ind].append({"text": text, "answer": answer})

                pbar.update(1)

        for ind, category_data in enumerate(training_data):
            self.tfidf_models[ind].process_data(category_data)
            self.tfidf_models[ind].train_model()
            self.tfidf_models[ind].save(f"models/{self.categories[ind]}_tfidf.pkl")

            # Unload the trained model and its bucket to keep memory usage down.
            self.tfidf_models[ind] = None
            training_data[ind] = []

        print("Training complete.")
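    # train() expects records shaped like the following (a sketch inferred from
    # the keys read above; the values are placeholders):
    #
    #   {
    #       "text": "Full question text ...",
    #       "answer": "Answer_Entity",
    #       "category": ["Literature"]
    #   }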
    def predict(self, input_data, confidence_threshold=1.5):
        """Return (calibrated confidence, answer) for a single question string."""

        # Category scores from the question categorizer; assumed to be a 2-D
        # array/tensor of shape (1, num_categories).
        category_confidences = self.qc_model.predict(input_data)

        # Indices of the categories whose score clears the threshold.
        confident_indices = (category_confidences > confidence_threshold).nonzero()[:, 1]

        # Ask each confident category's tf-idf model for an answer and keep the best.
        max_confidence = 0
        max_answer = None
        max_category = 0
        for category in confident_indices:
            confidence, answer = self.tfidf_models[category].predict(input_data)

            if confidence > max_confidence:
                max_confidence = confidence
                max_answer = answer
                max_category = category

        # Squash the raw tf-idf score into [0, 1) before calibrating it.
        return (self.confidence_eq(np.tanh(max_confidence)), max_answer)
    def evaluate(self, input_data):
        """Return exact-match accuracy of predict() over a list of
        {"text": ..., "answer": ...} records."""

        correct = 0
        count = 0

        with tqdm(total=len(input_data)) as pbar:
            for data_point in input_data:
                count += 1
                text = data_point["text"]
                answer = data_point["answer"]

                answer_predict = self.predict(text)[1]

                if answer == answer_predict:
                    correct += 1

                # Report a rolling accuracy every 10 questions.
                if count % 10 == 0:
                    average = float(correct) / count
                    print(f'rolling average: {average}')

                pbar.update(1)

        accuracy = correct / len(input_data)

        return accuracy
    def confidence_eq(self, x):
        """Piecewise calibration curve mapping a tanh-squashed score in [0, 1)
        to a buzz confidence in [0, 1]."""

        if x < 0.5:
            return 0
        elif x < 0.6:
            return 2 * x - 1
        elif x < 0.7:
            return 4 * x - 2.2
        else:
            return min(1, 1.5 * (x + 0.1) ** 2 - 0.36)
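    # Sample values of the calibration curve:
    #   confidence_eq(0.40) = 0
    #   confidence_eq(0.55) = 2 * 0.55 - 1          = 0.10
    #   confidence_eq(0.65) = 4 * 0.65 - 2.2        = 0.40
    #   confidence_eq(0.80) = 1.5 * 0.9 ** 2 - 0.36 = 0.855
    # The pieces meet at x = 0.5, 0.6, and 0.7, so the curve is continuous.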
if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument('--data', type=str)
    parser.add_argument('--model', type=str)
    parser.add_argument('--predict', type=str)
    parser.add_argument('--clear', action='store_true')
    parser.add_argument('--evaluate', type=str)

    flags = parser.parse_args()

    # --clear skips downloading the pretrained tf-idf models and starts with
    # empty ones instead.
    model = QuizBowlModel(clear=flags.clear)

    if flags.data:
        with open(flags.data, 'r') as data_file:
            data_json = json.load(data_file)

        model.train(data_json)

    if flags.model:
        model.load(flags.model)

    if flags.predict:
        print(model.predict(flags.predict))

    if flags.evaluate:
        with open(flags.evaluate, 'r') as data_file:
            data_json = json.load(data_file)
        print(f'accuracy: {model.evaluate(data_json)}')
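# Example invocations (file paths and the package/module path are placeholders):
#   python -m <package>.quizbowl_model --clear --data data/train.json
#   python -m <package>.quizbowl_model --predict "Name this author of Hamlet..."
#   python -m <package>.quizbowl_model --evaluate data/dev.json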