Spaces:
Runtime error
Runtime error
Nick Canu
committed on
Commit
·
b0829c1
1
Parent(s):
394d881
add app
Browse files- .gitattributes +2 -33
- .gitignore +1 -0
- .streamlit/config.toml +6 -0
- .vscode/launch.json +16 -0
- Home.py +348 -0
- Model_Constants_Template.py +7 -0
- Model_Step_Data/slim_df.parquet.gzip +3 -0
- Model_Step_Data/vector_df.parquet.gzip +3 -0
- Persistent Objects/current_keys.gz +0 -0
- Persistent Objects/token_search.gz +0 -0
- README.md +48 -13
- Stream_to_Output/GameCleaner.py +144 -0
- Stream_to_Output/requirements.txt +6 -0
- __pycache__/Model_Constants.cpython-39.pyc +0 -0
- __pycache__/description_generator.cpython-39.pyc +0 -0
- __pycache__/title_generator.cpython-39.pyc +0 -0
- description_generator.py +120 -0
- requirements.txt +11 -0
- t5_model/config.json +60 -0
- t5_model/generation_config.json +7 -0
- t5_model/pytorch_model.bin +3 -0
- t5_model/special_tokens_map.json +107 -0
- t5_model/spiece.model +0 -0
- t5_model/tokenizer_config.json +114 -0
- title_generator.py +148 -0
.gitattributes
CHANGED
|
@@ -1,34 +1,3 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.gzip filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Model_Constants.py
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
primaryColor="#e76020"
|
| 3 |
+
backgroundColor="#FDFFFC"
|
| 4 |
+
secondaryBackgroundColor="#6E896A"
|
| 5 |
+
textColor="#0f0f0d"
|
| 6 |
+
font="monospace"
|
.vscode/launch.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
// Use IntelliSense to learn about possible attributes.
|
| 3 |
+
// Hover to view descriptions of existing attributes.
|
| 4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
| 5 |
+
"version": "0.2.0",
|
| 6 |
+
"configurations": [
|
| 7 |
+
{
|
| 8 |
+
"name": "Python: Module",
|
| 9 |
+
"type": "python",
|
| 10 |
+
"request": "launch",
|
| 11 |
+
"module": "streamlit",
|
| 12 |
+
"args": ["run", "Home.py"],
|
| 13 |
+
"justMyCode": true
|
| 14 |
+
}
|
| 15 |
+
]
|
| 16 |
+
}
|
Home.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
st.set_page_config(page_title='Auto-BG: The Game Concept Generator', layout='wide')
|
| 4 |
+
|
| 5 |
+
def application():
|
| 6 |
+
###Imports
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import re
|
| 10 |
+
import urllib
|
| 11 |
+
import pickle
|
| 12 |
+
import spacy
|
| 13 |
+
from spacy.tokens import DocBin
|
| 14 |
+
from title_generator import Title_Generator
|
| 15 |
+
import gzip
|
| 16 |
+
import io
|
| 17 |
+
from description_generator import input_manager, model_control
|
| 18 |
+
|
| 19 |
+
#UI Session Variables
|
| 20 |
+
if 'desc_iter' not in st.session_state:
|
| 21 |
+
st.session_state.desc_iter = 0
|
| 22 |
+
if 'title_iter' not in st.session_state:
|
| 23 |
+
st.session_state.title_iter = 0
|
| 24 |
+
if 'output_dict' not in st.session_state:
|
| 25 |
+
st.session_state.output_dict = {}
|
| 26 |
+
if 'inputs' not in st.session_state:
|
| 27 |
+
st.session_state.inputs = []
|
| 28 |
+
if 'cur_pair' not in st.session_state:
|
| 29 |
+
st.session_state.cur_pair = ("","Run me!")
|
| 30 |
+
if 'f_d' not in st.session_state:
|
| 31 |
+
st.session_state.f_d = None
|
| 32 |
+
if 'g_d' not in st.session_state:
|
| 33 |
+
st.session_state.g_d = None
|
| 34 |
+
if 'm_d' not in st.session_state:
|
| 35 |
+
st.session_state.m_d = None
|
| 36 |
+
if 'c_d' not in st.session_state:
|
| 37 |
+
st.session_state.c_d = None
|
| 38 |
+
if 'coop_d' not in st.session_state:
|
| 39 |
+
st.session_state.coop_d = 0
|
| 40 |
+
|
| 41 |
+
#non-ui helper functions
|
| 42 |
+
#reader code extended from https://gist.github.com/thearn/5424244 for alternate load format
|
| 43 |
+
def reader(url):
    """Download a gzip-compressed pickle from *url* and return the unpickled object.

    Extended from https://gist.github.com/thearn/5424244 (alternate load format).

    NOTE(review): pickle.loads on downloaded bytes is unsafe for untrusted
    sources; acceptable here only because the URLs point at the project's
    own repository.
    """
    # Explicit submodule import: a bare `import urllib` does not guarantee
    # that `urllib.request` is available (it often works only because some
    # other library imported it first).
    import urllib.request
    raw = io.BytesIO(urllib.request.urlopen(url).read())
    # Context manager guarantees the GzipFile is closed even if unpickling
    # raises (the original only closed it on the success path).
    with gzip.GzipFile(fileobj=raw) as f:
        return pickle.loads(f.read())
|
| 50 |
+
|
| 51 |
+
def token_expand(url):
    """Fetch the gzip-pickled spaCy DocBin payload from *url* and split the
    contained docs into the four tag-search groups.

    Returns a tuple of doc slices: (game_type, mechanic, category, family).
    NOTE(review): the slice boundaries (1:9, 9:192, 192:276, 276:3901) are
    tied to the layout of token_search.gz — confirm against GameCleaner output.
    """
    # Explicit submodule import: `import urllib` alone does not expose `.request`.
    import urllib.request
    nlp = spacy.blank("en")
    # Context managers close both the HTTP response and the GzipFile even on
    # error (the original leaked the response and only closed f on success).
    with urllib.request.urlopen(url) as url_file:
        with gzip.GzipFile(fileobj=url_file) as f:
            obj = pickle.loads(f.read())
    docs = list(DocBin().from_bytes(obj).get_docs(nlp.vocab))
    return (docs[1:9], docs[9:192], docs[192:276], docs[276:3901])
|
| 61 |
+
|
| 62 |
+
def revert_cats(gt, mec, cat, fam, coop):
    """Re-apply the one-hot column prefixes to the user's tag selections.

    Rebuilds the encoded column names (e.g. 'mechanic_Trading') so selections
    can be matched against the vector dataframe. The redacted game-name family
    tag is dropped, and the cooperative flag expands into its two encoded
    columns. Returns one flat list in game/mechanic/category/family/coop order.
    """
    encoded = []
    for prefix, values in (("game_type_", gt), ("mechanic_", mec), ("category_", cat)):
        encoded.extend(prefix + v for v in values)
    encoded.extend("family_" + v for v in fam if v != "Game: [redacted]")
    if coop == 1:
        encoded.extend(["cooperative", "mechanic_Cooperative Game"])
    return encoded
|
| 74 |
+
|
| 75 |
+
def builder(ip):
|
| 76 |
+
ks = iman.input_parser(iman.set_input(ip))
|
| 77 |
+
mctrl.prompt_formatter(ks)
|
| 78 |
+
descs = []
|
| 79 |
+
for status in np.arange(0,3):
|
| 80 |
+
desc = mctrl.call_api(status=status)
|
| 81 |
+
clean_desc = mctrl.resp_cleanup(desc)
|
| 82 |
+
inter_pair = Tgen.candidate_generator(clean_desc)
|
| 83 |
+
out = Tgen.candidate_score(inter_pair,ex_check)
|
| 84 |
+
descs.append(out)
|
| 85 |
+
st.sidebar.success("Prompt " +str(status+1)+ " generated!")
|
| 86 |
+
st.session_state.output_dict = {0:descs[0],1:descs[1],2:descs[2]}
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def title_check(next=0):
    """Advance (or reset) the title cursor and return the (title, description) pair.

    next=1 steps forward and next=-1 steps back, both wrapping around the
    candidate list; any other value resets to the first title. The chosen
    title is spliced into the current description wherever the '__'
    placeholder appears.
    """
    state = st.session_state
    n_titles = len(state.output_dict[state.desc_iter]['titles'])
    if next == 1:
        state.title_iter = (state.title_iter + 1) % n_titles
    elif next == -1:
        state.title_iter = (state.title_iter - 1) % n_titles
    else:
        state.title_iter = 0

    cur_title = state.output_dict[state.desc_iter]['titles'][state.title_iter][0]
    desc = re.sub(re.compile("__"), cur_title, state.output_dict[state.desc_iter]['text'])

    return (cur_title, desc.lstrip())
|
| 108 |
+
|
| 109 |
+
def show_title(val):
    """Refresh the displayed (title, description) pair after a cursor move."""
    st.session_state.cur_pair = title_check(next=val)

def PT_button_clicked():
    """'See Previous Title' handler."""
    show_title(-1)

def NT_button_clicked():
    """'See Next Title' handler."""
    show_title(1)

def PD_button_clicked():
    """'See Previous Description' handler: step back through the three prompts, wrapping."""
    state = st.session_state
    state.desc_iter = 2 if state.desc_iter == 0 else state.desc_iter - 1
    state.title_iter = 0
    show_title(0)

def ND_button_clicked():
    """'See Next Description' handler: step forward through the three prompts, wrapping."""
    state = st.session_state
    state.desc_iter = 0 if state.desc_iter == 2 else state.desc_iter + 1
    state.title_iter = 0
    show_title(0)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
###Variables
|
| 140 |
+
|
| 141 |
+
###Data
|
| 142 |
+
@st.cache_resource
|
| 143 |
+
def fetch_data():
|
| 144 |
+
slim_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/slim_df.parquet.gzip?raw=true')
|
| 145 |
+
search_tokens = token_expand("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/token_search.gz?raw=true")
|
| 146 |
+
vector_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/vector_df.parquet.gzip?raw=true')
|
| 147 |
+
category_keys = reader("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/current_keys.gz?raw=true")
|
| 148 |
+
coop = [1,0]
|
| 149 |
+
st.sidebar.success("Fetched Data!")
|
| 150 |
+
return slim_df, search_tokens, vector_df, category_keys, coop
|
| 151 |
+
|
| 152 |
+
slim_df, search_tokens, vector_df, category_keys, coop = fetch_data()
|
| 153 |
+
|
| 154 |
+
ex_check = ["[Ee]verquest","[Cc]ivilization [Ii][IiVv]","[Cc]ivilization(?=:)","[Cc]ivilization [Ii][Ii]",
|
| 155 |
+
"[Cc]ivilization [Ii][Ii][Ii]","[Cc]ivilization V","[Aa]ge [Oo]f [Ee]mpires [Ii][Ii2]([Ii]|\b)", "[Rr]avenloft|[Cc]astle [Rr]avenloft",
|
| 156 |
+
"[Ss]cythe(?=:|\b)","[Dd]ungeons [&Aa][ n][Dd ][ Ddr][Ddra][rg][oa][gn][os](ns|\b)",
|
| 157 |
+
"[Aa]ge [Oo]f [Ee]mpires [Ii][Ii]: [Tt]he [Aa]ge [Oo]f [Kk]ings","[Aa]ge [Oo]f [Ee]mpires 2: [Tt]he [Aa]ge [Oo]f [Kk]ings",
|
| 158 |
+
"[Aa]ge [Oo]f [Ee]mpires","Doctor Who"]
|
| 159 |
+
|
| 160 |
+
###Models
|
| 161 |
+
@st.cache_resource
|
| 162 |
+
def setup_models():
|
| 163 |
+
return Title_Generator('./t5_model', slim_df), input_manager(vector_df, slim_df, search_tokens), model_control(apikey=st.secrets.key,model_id=st.secrets.model)
|
| 164 |
+
|
| 165 |
+
Tgen, iman, mctrl = setup_models()
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
#UI
|
| 170 |
+
|
| 171 |
+
#Intro
|
| 172 |
+
st.title("""Auto-BG: The Game Concept Generator""")
|
| 173 |
+
|
| 174 |
+
with st.expander("How to use", expanded=True):
|
| 175 |
+
st.write(
|
| 176 |
+
"""
|
| 177 |
+
Discover the concept for your next favorite game!
|
| 178 |
+
|
| 179 |
+
How do you use Auto-BG?
|
| 180 |
+
|
| 181 |
+
Pick any set of tags from four selectors below: Family, Game, Mechanic, and Category.
|
| 182 |
+
If you are looking to lose together - activate the cooperative toggle.
|
| 183 |
+
|
| 184 |
+
See ? icons for detailed information on each type of tag.
|
| 185 |
+
|
| 186 |
+
Select any pre-configured demo below to see how Auto-BG works on the tag set for a popular board game.
|
| 187 |
+
"""
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
results = st.empty()
|
| 191 |
+
|
| 192 |
+
with st.expander('Demos'):
|
| 193 |
+
|
| 194 |
+
st.write("""These buttons run Auto-BG on the tag set for real games you might be familiar with,
|
| 195 |
+
choose a button and the corresponding tags automatically fill the selectors below.
|
| 196 |
+
Press run and see how Auto-BG creates an alternate concept for these hit titles!
|
| 197 |
+
""")
|
| 198 |
+
|
| 199 |
+
b1, b2, b3 = st.columns(3)
|
| 200 |
+
|
| 201 |
+
with b1:
|
| 202 |
+
SoC = st.button('Catan', use_container_width=True)
|
| 203 |
+
if SoC:
|
| 204 |
+
st.session_state.f_d = [
|
| 205 |
+
'Animals: Sheep',
|
| 206 |
+
'Components: Hexagonal Tiles',
|
| 207 |
+
'Components: Wooden pieces & boards'
|
| 208 |
+
]
|
| 209 |
+
st.session_state.g_d = ['Family Game', 'Strategy Game']
|
| 210 |
+
st.session_state.m_d = [
|
| 211 |
+
'Hexagon Grid',
|
| 212 |
+
'Network and Route Building',
|
| 213 |
+
'Random Production',
|
| 214 |
+
'Trading',
|
| 215 |
+
'Variable Set-up'
|
| 216 |
+
]
|
| 217 |
+
st.session_state.c_d = [
|
| 218 |
+
'Economic',
|
| 219 |
+
'Negotiation'
|
| 220 |
+
]
|
| 221 |
+
st.session_state.coop_d = 0
|
| 222 |
+
|
| 223 |
+
with b2:
|
| 224 |
+
TtR = st.button('Ticket to Ride', use_container_width=True)
|
| 225 |
+
if TtR:
|
| 226 |
+
st.session_state.f_d = [
|
| 227 |
+
'Components: Map (Continental / National scale)',
|
| 228 |
+
'Continents: North America',
|
| 229 |
+
'Country: USA'
|
| 230 |
+
]
|
| 231 |
+
st.session_state.g_d = ['Family Game']
|
| 232 |
+
st.session_state.m_d = [
|
| 233 |
+
'Contracts',
|
| 234 |
+
'End Game Bonuses',
|
| 235 |
+
'Network and Route Building',
|
| 236 |
+
'Push Your Luck',
|
| 237 |
+
'Set Collection'
|
| 238 |
+
]
|
| 239 |
+
st.session_state.c_d = [
|
| 240 |
+
'Trains'
|
| 241 |
+
]
|
| 242 |
+
st.session_state.coop_d = 0
|
| 243 |
+
|
| 244 |
+
with b3:
|
| 245 |
+
P = st.button('Pandemic', use_container_width=True)
|
| 246 |
+
if P:
|
| 247 |
+
st.session_state.f_d = [
|
| 248 |
+
'Components: Map (Global Scale)',
|
| 249 |
+
'Components: Multi-Use Cards',
|
| 250 |
+
'Medical: Diseases',
|
| 251 |
+
'Region: The World',
|
| 252 |
+
'Theme: Science'
|
| 253 |
+
]
|
| 254 |
+
st.session_state.g_d = ['Family Game', 'Strategy Game']
|
| 255 |
+
st.session_state.m_d = [
|
| 256 |
+
'Action Points',
|
| 257 |
+
'Point to Point Movement',
|
| 258 |
+
'Trading',
|
| 259 |
+
'Variable Player Powers'
|
| 260 |
+
]
|
| 261 |
+
st.session_state.c_d = [
|
| 262 |
+
'Medical'
|
| 263 |
+
]
|
| 264 |
+
st.session_state.coop_d = 1
|
| 265 |
+
|
| 266 |
+
#Form
|
| 267 |
+
with st.expander("Auto-BG", expanded=True):
|
| 268 |
+
|
| 269 |
+
col1, col2 = st.columns(2)
|
| 270 |
+
|
| 271 |
+
with col1:
|
| 272 |
+
Family_v = st.multiselect("Family", options=pd.Series(category_keys[4][8:]), key='Family', default=st.session_state.f_d, max_selections=6, help='Descriptive niches for groupings of games.\n Maximum of six choices.')
|
| 273 |
+
|
| 274 |
+
with col2:
|
| 275 |
+
Game_v = st.multiselect("Game", options=pd.Series(category_keys[1]), key='Game', default=st.session_state.g_d, max_selections=2, help='Top level genres - Family, Strategy, etc.\n Maximum of two choices.')
|
| 276 |
+
|
| 277 |
+
col3, col4 = st.columns(2)
|
| 278 |
+
|
| 279 |
+
with col3:
|
| 280 |
+
Category_v = st.multiselect("Category", options=pd.Series(category_keys[3]), key='Category', default=st.session_state.c_d, max_selections=3, help='Expanded genre tags.\n Maximum of three choices.')
|
| 281 |
+
|
| 282 |
+
with col4:
|
| 283 |
+
Mechanics_v = st.multiselect("Mechanics", options=pd.Series([x for x in category_keys[2] if x != "Cooperative Game"]), key='Mechanic', default=st.session_state.m_d, max_selections=5, help='Game rules!\n Maximum of five choices.')
|
| 284 |
+
|
| 285 |
+
Cooperative_v = st.checkbox('Cooperative?', value=st.session_state.coop_d, key='CoopCheck')
|
| 286 |
+
|
| 287 |
+
run = st.button("Run Model", use_container_width=True)
|
| 288 |
+
|
| 289 |
+
if run:
|
| 290 |
+
if st.session_state.inputs == revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v):
|
| 291 |
+
st.write('Inputs did not change, results currently loaded.')
|
| 292 |
+
else:
|
| 293 |
+
|
| 294 |
+
st.session_state.desc_iter = 0
|
| 295 |
+
st.session_state.title_iter = 0
|
| 296 |
+
st.session_state.output_dict = {}
|
| 297 |
+
|
| 298 |
+
if Cooperative_v == True:
|
| 299 |
+
Mechanics_v.append('Cooperative Game')
|
| 300 |
+
|
| 301 |
+
st.session_state.inputs = revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v)
|
| 302 |
+
builder(st.session_state.inputs)
|
| 303 |
+
st.session_state.cur_pair = title_check()
|
| 304 |
+
|
| 305 |
+
if st.session_state.output_dict == {}:
|
| 306 |
+
results.empty()
|
| 307 |
+
else:
|
| 308 |
+
with results.expander('Results', expanded=True):
|
| 309 |
+
|
| 310 |
+
st.write(
|
| 311 |
+
"""
|
| 312 |
+
#### Title:
|
| 313 |
+
""")
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
st.write(st.session_state.cur_pair[0])
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
t_col1, t_col2 = st.columns(2)
|
| 321 |
+
with t_col1:
|
| 322 |
+
st.button("See Previous Title", on_click=PT_button_clicked, use_container_width=True)
|
| 323 |
+
|
| 324 |
+
with t_col2:
|
| 325 |
+
st.button("See Next Title", on_click=NT_button_clicked, use_container_width=True)
|
| 326 |
+
|
| 327 |
+
st.write(
|
| 328 |
+
"""
|
| 329 |
+
#### Description:
|
| 330 |
+
""")
|
| 331 |
+
st.write(st.session_state.cur_pair[1].replace('$','\$'))
|
| 332 |
+
|
| 333 |
+
d_col1, d_col2 = st.columns(2)
|
| 334 |
+
with d_col1:
|
| 335 |
+
st.button("See Previous Description", on_click=PD_button_clicked, use_container_width=True)
|
| 336 |
+
|
| 337 |
+
with d_col2:
|
| 338 |
+
st.button("See Next Description", on_click=ND_button_clicked, use_container_width=True)
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
page_names_to_funcs = {
|
| 343 |
+
"Application": application
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
demo_name = st.sidebar.selectbox("Choose a page:", page_names_to_funcs.keys())
|
| 347 |
+
page_names_to_funcs[demo_name]()
|
| 348 |
+
|
Model_Constants_Template.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def SEND_KEY():
    """Template stub: fill in and return your personal OpenAI API key."""
    return ""
|
| 4 |
+
|
| 5 |
+
def SEND_MODEL():
    """Template stub: fill in and return your fine-tuned OpenAI model id."""
    return ""
|
Model_Step_Data/slim_df.parquet.gzip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8eb032341c8bacc24ffee96e2a1b3201a0ab6c2837567956ba1ddb9492e056dc
|
| 3 |
+
size 16243764
|
Model_Step_Data/vector_df.parquet.gzip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eaf463f341982a460862da6ee77bbed38ad92ad36c4aef10bc031828681ef83f
|
| 3 |
+
size 3803902
|
Persistent Objects/current_keys.gz
ADDED
|
Binary file (39.7 kB). View file
|
|
|
Persistent Objects/token_search.gz
ADDED
|
Binary file (144 kB). View file
|
|
|
README.md
CHANGED
|
@@ -1,13 +1,48 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[icon banner image placeholder]
|
| 2 |
+
|
| 3 |
+
# Auto-BG
|
| 4 |
+
LLM-based text generation tool for creating board game concepts (description & title)
|
| 5 |
+
|
| 6 |
+
The Auto-BG (Board Game) tool is a text generation tool for creating board game concepts. It utilizes multiple large-language models to generate board game titles and descriptions tailored from user-input tags based on BoardGameGeek.com. The models used in this project include a trained T5 sequence-to-sequence model, primarily for title generation, and a robust GPT3 model for board game description generation. The T5 model was initially presented by Raffel et al. in ["Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"](https://arxiv.org/pdf/1910.10683.pdf). The GPT3 model builds from Brown et al.'s work in ["Language Models are Few-Shot Learners"](https://arxiv.org/pdf/2005.14165.pdf).
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
## Table of Contents
|
| 10 |
+
- Features and Demo
|
| 11 |
+
- Examples
|
| 12 |
+
- Project Structure
|
| 13 |
+
- Customizing Auto-BG
|
| 14 |
+
- Citations and Licensing
|
| 15 |
+
|
| 16 |
+
## Features and Demo
|
| 17 |
+
The main features of this application include:
|
| 18 |
+
|
| 19 |
+
A user-friendly interface for Auto-BG can be found at (homepage).
|
| 20 |
+
|
| 21 |
+
## Examples
|
| 22 |
+
|
| 23 |
+
## Project Structure
|
| 24 |
+
|
| 25 |
+
## Customizing Auto-BG
|
| 26 |
+
NOTE: Auto-BG uses a fine-tuned GPT-3 Curie model that will be inaccessible without an organizational API key,
|
| 27 |
+
the below instructions are for advanced users interested in remixing Auto-BG with a new generator model.
|
| 28 |
+
|
| 29 |
+
In order to run this application, you will need the following:
|
| 30 |
+
1. An OpenAI account and API key
|
| 31 |
+
2. All libraries specified in both the primary and data processing requirements.txt files
|
| 32 |
+
3. A raw stream JSON file of BoardGameGeek data, formatted to match output from the Recommend.Games scraper
|
| 33 |
+
|
| 34 |
+
To implement a new instance of Auto-BG, follow these steps:
|
| 35 |
+
1. Clone the repository onto your local machine
|
| 36 |
+
2. Install the required packages listed in both 'requirements.txt' files using pip
|
| 37 |
+
3. Download the trained T5 model or provide a path to an alternate T5 model.
|
| 38 |
+
4. Placing the JSON data file in Stream_to_Output, run GameCleaner.py - this provides all required data files.
|
| 39 |
+
|
| 40 |
+
5. Prepare training prompts - convert all active keys to period stopped tokens in a string for each game.
|
| 41 |
+
6. Fine-tune a selected model following the instructions at: https://platform.openai.com/docs/guides/fine-tuning
|
| 42 |
+
NOTE: Auto-BG uses a Curie model with a lowered learning rate running for fewer epochs.
|
| 43 |
+
|
| 44 |
+
7. Create a Model_Constants.py file with your personal API key and model instance based on the template above.
|
| 45 |
+
8. You now have a customized instance of Auto-BG!
|
| 46 |
+
|
| 47 |
+
## Citations and Licensing
|
| 48 |
+
Auto-BG is licensed under CC BY-NC-SA 2.0, original data sourced from Recommend.Games @GitLab
|
Stream_to_Output/GameCleaner.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import re
|
| 4 |
+
import nltk
|
| 5 |
+
from nltk.corpus import stopwords
|
| 6 |
+
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
|
| 7 |
+
import spacy
|
| 8 |
+
from langdetect import detect
|
| 9 |
+
import pickle
|
| 10 |
+
import gzip
|
| 11 |
+
nltk.download('stopwords')
|
| 12 |
+
|
| 13 |
+
#function definitions
|
| 14 |
+
|
| 15 |
+
#strips values out of encoded stream lists
|
| 16 |
+
def text_col_cleaner(frame, cols, pattern):
    """Strip the encoded-value portion out of each list entry in *cols*.

    Every cell in the named columns is a list of strings; each entry is
    reduced to the first match of *pattern*, whitespace-trimmed. NaN cells
    are skipped. Mutates *frame* in place and returns it.
    """
    compiled = re.compile(pattern)

    def _clean(entries):
        # first regex match per entry, trimmed
        return [re.findall(compiled, entry)[0].strip() for entry in entries]

    for name in cols:
        frame[name] = frame[name].map(_clean, na_action='ignore')
    return frame
|
| 23 |
+
|
| 24 |
+
#converts specified columns to one-hot
|
| 25 |
+
def encode_columns(frame):
    """One-hot encode every list-valued column of *frame*.

    Each column is exploded to one row per list element, dummy-encoded with
    the column name as prefix, summed back to one row per original index,
    and concatenated alongside the originals. Returns the widened frame
    (the original list columns are kept).
    """
    for column in list(frame.columns):
        exploded = frame[column].apply(pd.Series).stack()
        dummies = pd.get_dummies(exploded, prefix=column).groupby(level=0).sum()
        frame = pd.concat([frame, dummies], axis=1)
    return frame
|
| 31 |
+
|
| 32 |
+
#custom text processor for tokenizing descriptions by Kuan Chen & Nick Canu
|
| 33 |
+
def doc_text_preprocessing(ser):
|
| 34 |
+
nlp=spacy.load("en_core_web_sm", exclude=['parser','ner','textcat'])
|
| 35 |
+
|
| 36 |
+
"""text processing steps"""
|
| 37 |
+
stop_words=set(stopwords.words('english'))
|
| 38 |
+
stop_words.update(['game','player','players','games', 'also',
|
| 39 |
+
'description','publisher'])
|
| 40 |
+
|
| 41 |
+
single_letter_replace=lambda c: re.sub("\s+\w{1}\s+|\n|-|—",'',c)
|
| 42 |
+
to_lower_func=lambda c: c.lower()
|
| 43 |
+
|
| 44 |
+
lemma_text=[preprocess_string(
|
| 45 |
+
' '.join([token.lemma_ for token in desc]
|
| 46 |
+
),[remove_stopwords,strip_numeric,strip_punctuation,strip_tags,
|
| 47 |
+
strip_multiple_whitespaces,single_letter_replace,to_lower_func]
|
| 48 |
+
) for desc in ser.apply(lambda x: nlp(x))]
|
| 49 |
+
|
| 50 |
+
tokenize_text=[[word for word in string if word not in stop_words] for string in lemma_text]
|
| 51 |
+
|
| 52 |
+
return tokenize_text
|
| 53 |
+
|
| 54 |
+
#performs english language detection on the descriptions w/langdetect then additionally drops games using non-english characters in the name
|
| 55 |
+
def lang_cleanup(frame):
|
| 56 |
+
nlp=spacy.load("en_core_web_sm")
|
| 57 |
+
frame['description']=frame['description'].fillna('no words')
|
| 58 |
+
frame = frame[frame['description']!='no words']
|
| 59 |
+
frame['cleaned_descriptions']=doc_text_preprocessing(frame['description'])
|
| 60 |
+
|
| 61 |
+
detected_lang = []
|
| 62 |
+
for word in frame.cleaned_descriptions:
|
| 63 |
+
word=', '.join(word)
|
| 64 |
+
detected_lang.append(detect(word))
|
| 65 |
+
frame['lang'] = detected_lang
|
| 66 |
+
frame = frame[frame['lang']=='en']
|
| 67 |
+
|
| 68 |
+
non_eng_title_filter = frame['name'].str.contains('[^\x00-\x7f]', flags=re.IGNORECASE)
|
| 69 |
+
return frame[~non_eng_title_filter]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
#column name stripper for creating key values
|
| 73 |
+
def column_fixer(frame, targ):
    """Return the key values for columns of *frame* prefixed with *targ*:
    the prefix is removed and surrounding double quotes stripped."""
    keys = []
    for col in frame.columns:
        if col.startswith(targ):
            keys.append(col.replace(targ, "").strip('"'))
    return keys
|
| 75 |
+
|
| 76 |
+
#creates key list for defining web app lists & nlp tokens of the same unknown input search
|
| 77 |
+
def key_collator(frame):
    """Build the two key structures consumed by the web app.

    Returns (current_keys, search_tokens):
      current_keys  -- plain string lists per tag class (plus the fixed
                       'cooperative' entry) for defining input selectors
      search_tokens -- the same values run through spaCy, for the
                       unknown-input search
    """
    nlp = spacy.load("en_core_web_sm")
    fam = column_fixer(frame, 'family_')
    gt = column_fixer(frame, 'game_type_')
    mec = column_fixer(frame, 'mechanic_')
    cat = column_fixer(frame, 'category_')

    current_keys = (['cooperative'], gt, mec, cat, fam)
    # nlp-processed mirrors of each tag class, in search order
    search_tokens = tuple([nlp(w) for w in group] for group in (gt, mec, cat, fam))

    return current_keys, search_tokens
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
#-----------

#reading in raw file & removing unranked and compilation game items
df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
df['rank'] = df['rank'].fillna(0).astype(int)
df = df[(df['rank']>0) & (df['compilation']!=1)]

#separating and cleaning the one-hot target columns
#raw string avoids the invalid escape-sequence warning the non-raw pattern produced
in_df = text_col_cleaner(frame = df[['game_type','mechanic','category','family']],
                         cols = ['game_type','mechanic','category','family'],
                         pattern = re.compile(r"([\S ]+)(?=:)"))

print('Text has been cleaned, now encoding one-hot columns')

#encoding one-hot columns and rejoining to features for output
#the dropped game_type_* columns are video-game platforms, not board game types
proc_df = encode_columns(in_df)
step = df[['name','description','cooperative']]
join_df = pd.concat([step,proc_df.drop(['game_type','mechanic','category','family',
                                        'game_type_Amiga','game_type_Arcade','game_type_Atari ST',
                                        'game_type_Commodore 64'],axis=1)],axis=1)

print('Columns encoded, now performing english language detection and cleanup')

#english language detection steps & first data save
eng_df = lang_cleanup(join_df)
eng_df = eng_df.loc[:,~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)

print('Creating vector-only dataframe & saving output')

#vector only data for operations
vector_df = eng_df.copy().drop(['name','description','cleaned_descriptions','lang'],axis=1)

eng_df.to_parquet('game_data.parquet.gzip',compression='gzip')
vector_df.to_parquet('game_vectors.parquet.gzip',compression='gzip')

print('Creating key lists')

#creating key lists - 1. string list of values by feature class for defining input selections & 2. nlp processed list for unknown input search
keys, search_toks = key_collator(vector_df)

#the context managers close the files on exit; the original also called f.close() redundantly
with gzip.open("current_keys.gz", "wb") as f:
    pickle.dump(keys, f)

with gzip.open("key_search_tokens.gz", "wb") as f:
    pickle.dump(search_toks, f)

print('File creation is complete')
|
Stream_to_Output/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gensim==4.3.1
|
| 2 |
+
langdetect==1.0.9
|
| 3 |
+
nltk==3.8.1
|
| 4 |
+
numpy==1.24.2
|
| 5 |
+
pandas==1.3.2
|
| 6 |
+
spacy==3.5.1
|
__pycache__/Model_Constants.cpython-39.pyc
ADDED
|
Binary file (457 Bytes). View file
|
|
|
__pycache__/description_generator.cpython-39.pyc
ADDED
|
Binary file (4.62 kB). View file
|
|
|
__pycache__/title_generator.cpython-39.pyc
ADDED
|
Binary file (6.8 kB). View file
|
|
|
description_generator.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import numpy as np
|
| 3 |
+
import re
|
| 4 |
+
import spacy
|
| 5 |
+
import openai
|
| 6 |
+
from operator import itemgetter
|
| 7 |
+
#user input manager class
|
| 8 |
+
class input_manager:
    """Translate user-selected feature flags into the one-hot input vector
    expected by the description model.

    Unknown flags are resolved to the nearest known key via spaCy word-vector
    similarity within the flag's feature class; inputs the language model does
    not recognize at all (similarity <= 0) are discarded.
    """

    #feature-class prefixes paired with their index into the search_tokens tuple
    _PREFIXES = (("game_type_", 0), ("mechanic_", 1), ("category_", 2), ("family_", 3))

    #initialize key dictionary from vector data frame and set community top N
    def __init__(self, key_df, slim_df, search_tokens, top_n=10):
        """
        key_df: vector dataframe whose columns define the known feature keys
        slim_df: slim game dataframe kept for downstream community lookups
        search_tokens: 4-tuple of spaCy token lists (game_type, mechanic, category, family)
        top_n: community top-N setting
        """
        self.key_df = key_df
        self.slim_df = slim_df
        self.search_tokens = search_tokens
        #zeroed template vector keyed by every known column
        self.key = dict(zip(list(key_df.columns), np.zeros(len(key_df.columns))))
        self.top_n = top_n

    #translate input text to vector
    def set_input(self, input_cats):
        """Return a copy of the template vector with the given categories set to 1.0.

        Categories not found among the known keys are matched to the most
        similar known key in their feature class.
        """
        known = list(self.key.keys())
        #separate known/unknown features
        k_flags = [cat for cat in input_cats if cat in known]
        unk_flags = [cat for cat in input_cats if cat not in known]

        #process within-feature-class similarity for each unknown input
        if len(unk_flags) > 0:
            #the spaCy model is only needed (and loaded) when unknown inputs exist;
            #the original loaded it unconditionally on every call
            nlp = spacy.load("en_core_web_md")

            outs = []
            for word in unk_flags:
                for prefix, idx in self._PREFIXES:
                    if re.match(prefix, word):
                        tok = nlp(word.split("_")[-1])
                        mtch = max([(key, key.similarity(tok)) for key in self.search_tokens[idx]],
                                   key=itemgetter(1))
                        #if no known match is found (model doesn't recognize input word),
                        #discard - other solutions performance prohibitive
                        if mtch[1] > 0:
                            outs.append(prefix + str(mtch[0]))
                        break

            #if unks are processed, rejoin nearest match to known, dropping duplicates
            k_flags = list(set(k_flags + outs))

        #preserve global key and output copy w/ input keys activated to 1
        d = self.key.copy()
        for cat in k_flags:
            d[cat] = 1.0
        return d

    def input_parser(self, in_vec):
        """Extract the activated (== 1) keys from a processed input vector."""
        return [k for k, v in in_vec.items() if v == 1]
|
| 69 |
+
|
| 70 |
+
class model_control:
    """Wrapper around the fine-tuned OpenAI completion model used to generate
    game descriptions."""

    def __init__(self, apikey, model_id):
        """
        apikey: OpenAI API key
        model_id: fine-tune job id, resolved to its fine-tuned model name
        """
        self.api_key = apikey
        openai.api_key = self.api_key

        self.prompt = None

        self.model = openai.FineTune.retrieve(id=model_id).fine_tuned_model

    def prompt_formatter(self, ks):
        """Join the activated keys into the prompt format used at fine-tune time."""
        self.prompt = ". ".join(ks) + "\n\n###\n\n"

    def call_api(self, status=0):
        """Call the completion endpoint and return the generated text.

        status selects a (temperature, presence_penalty) preset; unknown status
        values fall back to the status-0 preset (the original left temp/pres
        unbound and raised NameError for status > 2).
        """
        presets = {0: (0.5, 0.7), 1: (0.4, 0.6), 2: (0.5, 0.8)}
        temp, pres = presets.get(status, presets[0])

        answer = openai.Completion.create(
            model=self.model,
            prompt=self.prompt,
            max_tokens=512,
            temperature=temp,
            stop=["END"],
            presence_penalty=pres,
            frequency_penalty=0.5
        )
        return answer['choices'][0]['text']

    def resp_cleanup(self, text):
        """Trim an incomplete trailing sentence and drop credit phrases
        (designer/artist/publisher) from a generated description."""
        #empty responses pass straight through (the original indexed text[-1] and crashed)
        if not text:
            return text

        #drop a trailing fragment that never reached sentence-ending punctuation
        if ((text[-1] != "!") & (text[-1] != ".") & (text[-1] != "?")):
            text = " ".join([e+'.' for e in text.split('.')[0:-1] if e])

        sent = re.split(r'([.?!:])', text)
        phrases = ["[Dd]esigned by","[Dd]esigner of","[Aa]rt by","[Aa]rtist of","[Pp]ublished","[Pp]ublisher of"]

        pat = re.compile("(?:" + "|".join(phrases) + ")")
        #collapse the doubled terminators left behind by removed sentences
        fix = re.compile("(?<=[.!?])[.!?]")

        text = re.sub(fix, '', ''.join([s for s in sent if pat.search(s) is None]))

        return text
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gensim==4.3.1
|
| 2 |
+
langdetect==1.0.9
|
| 3 |
+
nltk==3.8.1
|
| 4 |
+
numpy==1.24.2
|
| 5 |
+
openai==0.27.2
|
| 6 |
+
pandas==1.3.2
|
| 7 |
+
scikit_learn==1.2.2
|
| 8 |
+
spacy==3.5.1
|
| 9 |
+
streamlit==1.20.0
|
| 10 |
+
torch==2.0.0
|
| 11 |
+
transformers==4.27.3
|
t5_model/config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "Michau/t5-base-en-generate-headline",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"T5ForConditionalGeneration"
|
| 5 |
+
],
|
| 6 |
+
"d_ff": 3072,
|
| 7 |
+
"d_kv": 64,
|
| 8 |
+
"d_model": 768,
|
| 9 |
+
"decoder_start_token_id": 0,
|
| 10 |
+
"dense_act_fn": "relu",
|
| 11 |
+
"dropout_rate": 0.1,
|
| 12 |
+
"eos_token_id": 1,
|
| 13 |
+
"feed_forward_proj": "relu",
|
| 14 |
+
"initializer_factor": 1.0,
|
| 15 |
+
"is_encoder_decoder": true,
|
| 16 |
+
"is_gated_act": false,
|
| 17 |
+
"layer_norm_epsilon": 1e-06,
|
| 18 |
+
"model_type": "t5",
|
| 19 |
+
"n_positions": 512,
|
| 20 |
+
"num_decoder_layers": 12,
|
| 21 |
+
"num_heads": 12,
|
| 22 |
+
"num_layers": 12,
|
| 23 |
+
"output_past": true,
|
| 24 |
+
"pad_token_id": 0,
|
| 25 |
+
"relative_attention_max_distance": 128,
|
| 26 |
+
"relative_attention_num_buckets": 32,
|
| 27 |
+
"task_specific_params": {
|
| 28 |
+
"summarization": {
|
| 29 |
+
"early_stopping": true,
|
| 30 |
+
"length_penalty": 2.0,
|
| 31 |
+
"max_length": 200,
|
| 32 |
+
"min_length": 30,
|
| 33 |
+
"no_repeat_ngram_size": 3,
|
| 34 |
+
"num_beams": 4,
|
| 35 |
+
"prefix": "summarize: "
|
| 36 |
+
},
|
| 37 |
+
"translation_en_to_de": {
|
| 38 |
+
"early_stopping": true,
|
| 39 |
+
"max_length": 300,
|
| 40 |
+
"num_beams": 4,
|
| 41 |
+
"prefix": "translate English to German: "
|
| 42 |
+
},
|
| 43 |
+
"translation_en_to_fr": {
|
| 44 |
+
"early_stopping": true,
|
| 45 |
+
"max_length": 300,
|
| 46 |
+
"num_beams": 4,
|
| 47 |
+
"prefix": "translate English to French: "
|
| 48 |
+
},
|
| 49 |
+
"translation_en_to_ro": {
|
| 50 |
+
"early_stopping": true,
|
| 51 |
+
"max_length": 300,
|
| 52 |
+
"num_beams": 4,
|
| 53 |
+
"prefix": "translate English to Romanian: "
|
| 54 |
+
}
|
| 55 |
+
},
|
| 56 |
+
"torch_dtype": "float32",
|
| 57 |
+
"transformers_version": "4.26.1",
|
| 58 |
+
"use_cache": true,
|
| 59 |
+
"vocab_size": 32128
|
| 60 |
+
}
|
t5_model/generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"decoder_start_token_id": 0,
|
| 4 |
+
"eos_token_id": 1,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.26.1"
|
| 7 |
+
}
|
t5_model/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3f73b04bb3e12b9bd1f02b88f98648da9c317f734a61e9805ae385c1c57671d
|
| 3 |
+
size 891702929
|
t5_model/special_tokens_map.json
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<extra_id_0>",
|
| 4 |
+
"<extra_id_1>",
|
| 5 |
+
"<extra_id_2>",
|
| 6 |
+
"<extra_id_3>",
|
| 7 |
+
"<extra_id_4>",
|
| 8 |
+
"<extra_id_5>",
|
| 9 |
+
"<extra_id_6>",
|
| 10 |
+
"<extra_id_7>",
|
| 11 |
+
"<extra_id_8>",
|
| 12 |
+
"<extra_id_9>",
|
| 13 |
+
"<extra_id_10>",
|
| 14 |
+
"<extra_id_11>",
|
| 15 |
+
"<extra_id_12>",
|
| 16 |
+
"<extra_id_13>",
|
| 17 |
+
"<extra_id_14>",
|
| 18 |
+
"<extra_id_15>",
|
| 19 |
+
"<extra_id_16>",
|
| 20 |
+
"<extra_id_17>",
|
| 21 |
+
"<extra_id_18>",
|
| 22 |
+
"<extra_id_19>",
|
| 23 |
+
"<extra_id_20>",
|
| 24 |
+
"<extra_id_21>",
|
| 25 |
+
"<extra_id_22>",
|
| 26 |
+
"<extra_id_23>",
|
| 27 |
+
"<extra_id_24>",
|
| 28 |
+
"<extra_id_25>",
|
| 29 |
+
"<extra_id_26>",
|
| 30 |
+
"<extra_id_27>",
|
| 31 |
+
"<extra_id_28>",
|
| 32 |
+
"<extra_id_29>",
|
| 33 |
+
"<extra_id_30>",
|
| 34 |
+
"<extra_id_31>",
|
| 35 |
+
"<extra_id_32>",
|
| 36 |
+
"<extra_id_33>",
|
| 37 |
+
"<extra_id_34>",
|
| 38 |
+
"<extra_id_35>",
|
| 39 |
+
"<extra_id_36>",
|
| 40 |
+
"<extra_id_37>",
|
| 41 |
+
"<extra_id_38>",
|
| 42 |
+
"<extra_id_39>",
|
| 43 |
+
"<extra_id_40>",
|
| 44 |
+
"<extra_id_41>",
|
| 45 |
+
"<extra_id_42>",
|
| 46 |
+
"<extra_id_43>",
|
| 47 |
+
"<extra_id_44>",
|
| 48 |
+
"<extra_id_45>",
|
| 49 |
+
"<extra_id_46>",
|
| 50 |
+
"<extra_id_47>",
|
| 51 |
+
"<extra_id_48>",
|
| 52 |
+
"<extra_id_49>",
|
| 53 |
+
"<extra_id_50>",
|
| 54 |
+
"<extra_id_51>",
|
| 55 |
+
"<extra_id_52>",
|
| 56 |
+
"<extra_id_53>",
|
| 57 |
+
"<extra_id_54>",
|
| 58 |
+
"<extra_id_55>",
|
| 59 |
+
"<extra_id_56>",
|
| 60 |
+
"<extra_id_57>",
|
| 61 |
+
"<extra_id_58>",
|
| 62 |
+
"<extra_id_59>",
|
| 63 |
+
"<extra_id_60>",
|
| 64 |
+
"<extra_id_61>",
|
| 65 |
+
"<extra_id_62>",
|
| 66 |
+
"<extra_id_63>",
|
| 67 |
+
"<extra_id_64>",
|
| 68 |
+
"<extra_id_65>",
|
| 69 |
+
"<extra_id_66>",
|
| 70 |
+
"<extra_id_67>",
|
| 71 |
+
"<extra_id_68>",
|
| 72 |
+
"<extra_id_69>",
|
| 73 |
+
"<extra_id_70>",
|
| 74 |
+
"<extra_id_71>",
|
| 75 |
+
"<extra_id_72>",
|
| 76 |
+
"<extra_id_73>",
|
| 77 |
+
"<extra_id_74>",
|
| 78 |
+
"<extra_id_75>",
|
| 79 |
+
"<extra_id_76>",
|
| 80 |
+
"<extra_id_77>",
|
| 81 |
+
"<extra_id_78>",
|
| 82 |
+
"<extra_id_79>",
|
| 83 |
+
"<extra_id_80>",
|
| 84 |
+
"<extra_id_81>",
|
| 85 |
+
"<extra_id_82>",
|
| 86 |
+
"<extra_id_83>",
|
| 87 |
+
"<extra_id_84>",
|
| 88 |
+
"<extra_id_85>",
|
| 89 |
+
"<extra_id_86>",
|
| 90 |
+
"<extra_id_87>",
|
| 91 |
+
"<extra_id_88>",
|
| 92 |
+
"<extra_id_89>",
|
| 93 |
+
"<extra_id_90>",
|
| 94 |
+
"<extra_id_91>",
|
| 95 |
+
"<extra_id_92>",
|
| 96 |
+
"<extra_id_93>",
|
| 97 |
+
"<extra_id_94>",
|
| 98 |
+
"<extra_id_95>",
|
| 99 |
+
"<extra_id_96>",
|
| 100 |
+
"<extra_id_97>",
|
| 101 |
+
"<extra_id_98>",
|
| 102 |
+
"<extra_id_99>"
|
| 103 |
+
],
|
| 104 |
+
"eos_token": "</s>",
|
| 105 |
+
"pad_token": "<pad>",
|
| 106 |
+
"unk_token": "<unk>"
|
| 107 |
+
}
|
t5_model/spiece.model
ADDED
|
Binary file (792 kB). View file
|
|
|
t5_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<extra_id_0>",
|
| 4 |
+
"<extra_id_1>",
|
| 5 |
+
"<extra_id_2>",
|
| 6 |
+
"<extra_id_3>",
|
| 7 |
+
"<extra_id_4>",
|
| 8 |
+
"<extra_id_5>",
|
| 9 |
+
"<extra_id_6>",
|
| 10 |
+
"<extra_id_7>",
|
| 11 |
+
"<extra_id_8>",
|
| 12 |
+
"<extra_id_9>",
|
| 13 |
+
"<extra_id_10>",
|
| 14 |
+
"<extra_id_11>",
|
| 15 |
+
"<extra_id_12>",
|
| 16 |
+
"<extra_id_13>",
|
| 17 |
+
"<extra_id_14>",
|
| 18 |
+
"<extra_id_15>",
|
| 19 |
+
"<extra_id_16>",
|
| 20 |
+
"<extra_id_17>",
|
| 21 |
+
"<extra_id_18>",
|
| 22 |
+
"<extra_id_19>",
|
| 23 |
+
"<extra_id_20>",
|
| 24 |
+
"<extra_id_21>",
|
| 25 |
+
"<extra_id_22>",
|
| 26 |
+
"<extra_id_23>",
|
| 27 |
+
"<extra_id_24>",
|
| 28 |
+
"<extra_id_25>",
|
| 29 |
+
"<extra_id_26>",
|
| 30 |
+
"<extra_id_27>",
|
| 31 |
+
"<extra_id_28>",
|
| 32 |
+
"<extra_id_29>",
|
| 33 |
+
"<extra_id_30>",
|
| 34 |
+
"<extra_id_31>",
|
| 35 |
+
"<extra_id_32>",
|
| 36 |
+
"<extra_id_33>",
|
| 37 |
+
"<extra_id_34>",
|
| 38 |
+
"<extra_id_35>",
|
| 39 |
+
"<extra_id_36>",
|
| 40 |
+
"<extra_id_37>",
|
| 41 |
+
"<extra_id_38>",
|
| 42 |
+
"<extra_id_39>",
|
| 43 |
+
"<extra_id_40>",
|
| 44 |
+
"<extra_id_41>",
|
| 45 |
+
"<extra_id_42>",
|
| 46 |
+
"<extra_id_43>",
|
| 47 |
+
"<extra_id_44>",
|
| 48 |
+
"<extra_id_45>",
|
| 49 |
+
"<extra_id_46>",
|
| 50 |
+
"<extra_id_47>",
|
| 51 |
+
"<extra_id_48>",
|
| 52 |
+
"<extra_id_49>",
|
| 53 |
+
"<extra_id_50>",
|
| 54 |
+
"<extra_id_51>",
|
| 55 |
+
"<extra_id_52>",
|
| 56 |
+
"<extra_id_53>",
|
| 57 |
+
"<extra_id_54>",
|
| 58 |
+
"<extra_id_55>",
|
| 59 |
+
"<extra_id_56>",
|
| 60 |
+
"<extra_id_57>",
|
| 61 |
+
"<extra_id_58>",
|
| 62 |
+
"<extra_id_59>",
|
| 63 |
+
"<extra_id_60>",
|
| 64 |
+
"<extra_id_61>",
|
| 65 |
+
"<extra_id_62>",
|
| 66 |
+
"<extra_id_63>",
|
| 67 |
+
"<extra_id_64>",
|
| 68 |
+
"<extra_id_65>",
|
| 69 |
+
"<extra_id_66>",
|
| 70 |
+
"<extra_id_67>",
|
| 71 |
+
"<extra_id_68>",
|
| 72 |
+
"<extra_id_69>",
|
| 73 |
+
"<extra_id_70>",
|
| 74 |
+
"<extra_id_71>",
|
| 75 |
+
"<extra_id_72>",
|
| 76 |
+
"<extra_id_73>",
|
| 77 |
+
"<extra_id_74>",
|
| 78 |
+
"<extra_id_75>",
|
| 79 |
+
"<extra_id_76>",
|
| 80 |
+
"<extra_id_77>",
|
| 81 |
+
"<extra_id_78>",
|
| 82 |
+
"<extra_id_79>",
|
| 83 |
+
"<extra_id_80>",
|
| 84 |
+
"<extra_id_81>",
|
| 85 |
+
"<extra_id_82>",
|
| 86 |
+
"<extra_id_83>",
|
| 87 |
+
"<extra_id_84>",
|
| 88 |
+
"<extra_id_85>",
|
| 89 |
+
"<extra_id_86>",
|
| 90 |
+
"<extra_id_87>",
|
| 91 |
+
"<extra_id_88>",
|
| 92 |
+
"<extra_id_89>",
|
| 93 |
+
"<extra_id_90>",
|
| 94 |
+
"<extra_id_91>",
|
| 95 |
+
"<extra_id_92>",
|
| 96 |
+
"<extra_id_93>",
|
| 97 |
+
"<extra_id_94>",
|
| 98 |
+
"<extra_id_95>",
|
| 99 |
+
"<extra_id_96>",
|
| 100 |
+
"<extra_id_97>",
|
| 101 |
+
"<extra_id_98>",
|
| 102 |
+
"<extra_id_99>"
|
| 103 |
+
],
|
| 104 |
+
"eos_token": "</s>",
|
| 105 |
+
"extra_ids": 100,
|
| 106 |
+
"model_max_length": 512,
|
| 107 |
+
"name_or_path": "Michau/t5-base-en-generate-headline",
|
| 108 |
+
"pad_token": "<pad>",
|
| 109 |
+
"sp_model_kwargs": {},
|
| 110 |
+
"special_tokens_map_file": "/root/.cache/huggingface/hub/models--Michau--t5-base-en-generate-headline/snapshots/f526532f788c45b6b6288286e5ef929fa768ef6a/special_tokens_map.json",
|
| 111 |
+
"tokenizer_class": "T5Tokenizer",
|
| 112 |
+
"truncate": true,
|
| 113 |
+
"unk_token": "<unk>"
|
| 114 |
+
}
|
title_generator.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import re
|
| 3 |
+
from nltk.corpus import stopwords
|
| 4 |
+
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
|
| 5 |
+
import spacy
|
| 6 |
+
import torch
|
| 7 |
+
from transformers import T5ForConditionalGeneration,T5Tokenizer
|
| 8 |
+
|
| 9 |
+
#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
|
| 10 |
+
#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
def doc_text_preprocessing(ser):
    """Lemmatize, strip and tokenize a pandas Series of text documents.

    Each document is lemmatized with spaCy, run through gensim's
    preprocess_string filters (stopwords, numerics, punctuation, tags,
    whitespace, single letters, lowercasing), then filtered again against
    NLTK's english stopword list.

    ser: pandas Series of raw text strings
    Returns a list of token lists, one per document.
    """
    nlp = spacy.load("en_core_web_sm", exclude=['parser','ner','textcat'])

    #re is imported at module level; the original re-imported it locally
    stop_words = set(stopwords.words('english'))

    #drop isolated single letters, newlines and dashes (raw string fixes the
    #invalid \s/\w escape sequences of the original non-raw pattern)
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()
    lemma_text = [preprocess_string(
        ' '.join([token.lemma_ for token in desc]
        ),[remove_stopwords,strip_numeric,strip_punctuation,strip_tags,
        strip_multiple_whitespaces,single_letter_replace,to_lower_func]
        ) for desc in ser.apply(lambda x: nlp(x))]

    tokenize_text = [[word for word in string if word not in stop_words] for string in lemma_text]

    return tokenize_text
|
| 28 |
+
|
| 29 |
+
class Title_Generator:
    """Generate and rank candidate titles for a board-game description.

    Titles come from a fine-tuned T5 headline model; candidates are cleaned,
    checked against existing game names, and scored by spaCy vector similarity
    to the description.
    """

    def __init__(self, path, df):
        """
        path: directory holding the saved T5 model and tokenizer
        df: game dataframe with a 'name' column, used to reject titles of existing games
        """
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.game_df = df

        #UI iteration state for cycling through generated titles
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None

    def candidate_generator(self, description):
        """Run diverse beam search; return (list of title strings, description)."""
        text = "headline: " + description

        encoding = self.tokenizer.encode_plus(text, return_tensors = "pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        candidates = []

        beam_outputs = self.model.generate(
            input_ids = input_ids,
            attention_mask = attention_masks,
            max_length = 64,
            num_beams = 16,
            num_beam_groups=4,
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping = True)

        for result in beam_outputs:
            res = self.tokenizer.decode(result).replace('<pad> ','').replace('</s>','').replace('<pad>','')
            candidates.append(res)

        return candidates, description

    def candidate_score(self,candidates,ex_check=None):
        """Clean, dedupe and score candidate titles.

        candidates: (title list, description) tuple from candidate_generator
        ex_check: optional list of regex alternatives for titles to exclude
        Returns {'text': cleaned description, 'titles': [(title, score), ...]}
        sorted by descending similarity score.
        """
        import random
        from operator import itemgetter

        #mask any candidate title (or excluded string) appearing verbatim in the description
        alts = candidates[0] + [cand.upper() for cand in candidates[0]]
        if ex_check is not None:
            pat = re.compile("((?:" + "|".join(map(re.escape, alts)) + "|" + "|".join(ex_check) + "))")
        else:
            pat = re.compile("((?:" + "|".join(map(re.escape, alts)) + "))")
        desc = re.sub(pat, "__", candidates[1])

        if re.search(re.compile(re.escape("__")), desc):
            #a title leaked into the description: regenerate from the masked text
            #and drop any candidate matching ex_check
            #(fix: the original joined ex_check unconditionally here and raised
            #TypeError whenever ex_check was None)
            reg = re.compile("(" + "|".join(ex_check) + ")") if ex_check else None
            hold = candidates[0]
            gen_desc = re.sub(re.compile(re.escape("__")), "", desc)
            candidates = self.candidate_generator(gen_desc)
            keep = [cand for cand in candidates[0] + hold if reg is None or not reg.search(cand)]
            candidates = (keep, desc)

        #backup load function, will refactor
        nlp = spacy.load("en_core_web_md")

        #check for existing games and duplicates
        #transform function from https://stackoverflow.com/questions/42165779/python-how-to-remove-duplicate-valuescase-insensitive-from-a-list-with-same-o
        def transform(L):
            S = set(L)
            return [item.title() for item in L if item.lower() not in S and not S.add(item.lower())]

        #keep only titles that do not already exist in the game dataframe
        clean_cand_step = list(set([game[0] for game in list(zip(candidates[0],[len(self.game_df[self.game_df.name.isin([x])]) for x in candidates[0]])) if game[1]==0]))
        clean_cand_step = transform(clean_cand_step)

        #normalize casing and strip edition noise
        #NOTE(review): '\b' in the non-raw edition pattern below is a literal
        #backspace character, not a word boundary - kept byte-identical to
        #preserve behavior; confirm intent before changing to a raw string
        clean_cand_step = [re.sub(re.compile("(?<=[ ])And(?=[ ])"),'and',
                            re.sub(re.compile('(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'),"",
                            re.sub(re.compile("(?<=[a-z])'S"),"'s",
                            re.sub(re.compile("(?<=[ ])Of(?=[ ])"),"of",x))))
                            for x in clean_cand_step]

        #collapse "X: X" style duplicated titles down to "X"
        clean_cand = []
        for cand in clean_cand_step:
            try:
                inter = cand.split(":")
                if inter[0].lower()==inter[1].lower():
                    clean_cand.append(inter[0])
                else:
                    clean_cand.append(cand)
            except IndexError:
                #no ":" in the title - keep it unchanged (narrowed from a bare except)
                clean_cand.append(cand)

        #text processing
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([candidates[1]]))
        sim = [nlp(title) for title in [" ".join(title) for title in token_cand]]
        doc = nlp(" ".join(token_art[0]))

        #scores cosine similarity between generated titles and body text, if the word is unknown
        #(i.e. generator knows it but spacy doesn't) it assigns a random probability to populate
        scores = [x if x !=0 else random.uniform(.3, .7) for x in [tok.similarity(doc) for tok in sim]]

        out_titles = sorted(list(zip(clean_cand,scores)),key=itemgetter(1),reverse=True)

        #strip common boilerplate phrasing from the returned description
        pat = re.compile("(?<=[!.?])(?=[^\s])")
        pat2 = re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
        pat3 = re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
        pat4 = re.compile("[Tt]he __")
        pat5 = re.compile("__ [Gg]ame")
        pat6 = re.compile("[Tt]he [Gg]ame [Oo]f __")

        desc = re.sub(pat," ",candidates[1])
        desc = re.sub(pat2,"",desc)
        desc = re.sub(pat3,"",desc)
        desc = re.sub(pat4,"__",desc)
        desc = re.sub(pat5,"__",desc)
        desc = re.sub(pat6,"__",desc)

        return {'text':desc,'titles':out_titles}
|