Yajii2 commited on
Commit
1ffcc33
·
unverified ·
1 Parent(s): 8f9c2f7

Add application file

Browse files
Files changed (6) hide show
  1. .gitignore +179 -0
  2. app.py +70 -0
  3. model/gpt_char_model.py +27 -0
  4. model/gpt_char_model_v3.pth +3 -0
  5. requirements.txt +1 -0
  6. tokenizer.py +18 -0
.gitignore ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ # End of https://www.toptal.com/developers/gitignore/api/python
177
+
178
+ data/*
179
+ !data/.keep
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+
4
+ from model.gpt_char_model import CharGPT
5
+ from tokenizer import CharTokenizer
6
+
7
+
8
def load_model(model_path="model/gpt_char_model.pth", block_size=32):
    """Load the character-level GPT and its tokenizer onto the CPU.

    Args:
        model_path: Path to the checkpoint file holding the ``state_dict``.
        block_size: Context-window length the checkpoint was trained with.

    Returns:
        (model, tokenizer, device): the model in eval mode, the character
        tokenizer, and the torch device the weights live on.
    """
    # Inference is pinned to the CPU. The original code first computed a
    # cuda-or-cpu device and then unconditionally overwrote it with "cpu";
    # that dead first assignment has been removed.
    device = torch.device("cpu")
    tokenizer = CharTokenizer()
    vocab_size = len(tokenizer.chars)
    # These hyper-parameters must match the training run that produced the
    # checkpoint, or load_state_dict() will fail on shape mismatches.
    model = CharGPT(
        vocab_size=vocab_size,
        block_size=block_size,
        n_layer=6,
        n_head=4,
        n_embd=256,
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    return model, tokenizer, device
23
+
24
+
25
@torch.no_grad()
def generate_username(seed_text="", min_length=1, max_length=16, temperature=1.0):
    """Sample a username from the character GPT, one character at a time.

    Args:
        seed_text: Optional prefix the generated name must start with.
        min_length: Minimum number of name characters required before a
            newline (the end-of-name symbol) may terminate sampling.
        max_length: Maximum number of sampling steps.
        temperature: Softmax temperature; higher values increase variety.

    Returns:
        The generated username, stripped of surrounding whitespace.
    """
    # NOTE(review): the model is reloaded from disk on every call; caching
    # it at module level would cut per-request latency considerably.
    model, tokenizer, device = load_model(model_path="model/gpt_char_model_v3.pth")
    input_ids = tokenizer.encode(seed_text)
    input_ids.insert(0, 0)  # prepend id 0 ("\n"), used as the start-of-name token
    input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
    for _ in range(max_length):
        # Crop the context to the model's trained block size.
        input_crop = input_ids[:, -model.block_size :]
        logits = model(input_crop)
        logits = logits[:, -1, :] / temperature
        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        next_char = tokenizer.decode(next_id[0].tolist())
        if next_char == "\n":
            # Bug fix: exclude the prepended start token from the length
            # check. The original compared shape[1] (which counts the BOS at
            # position 0) against min_length, so names one character shorter
            # than requested could slip through.
            if input_ids.shape[1] - 1 < min_length:
                continue  # too short: resample instead of terminating
            break
        input_ids = torch.cat((input_ids, next_id), dim=1)
    # strip() removes the leading "\n" that the start token decodes to.
    return tokenizer.decode(input_ids[0].tolist()).strip()
44
+
45
+
46
def gradio_interface(seed_text, min_length, max_length, temperature):
    """Adapter between Gradio widget values and ``generate_username``.

    Slider values arrive as floats; coerce them to the types the
    generator expects before delegating.
    """
    shortest = int(min_length)
    longest = int(max_length)
    temp = float(temperature)
    return generate_username(seed_text, shortest, longest, temp)
50
+
51
+
52
# --- Gradio UI (module-level script) --------------------------------------
# NOTE(review): indentation was lost in this rendering of the file, so the
# Row/Column nesting below is a best-effort reconstruction — verify it
# against the rendered app before relying on the exact layout.
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.Markdown("# MCID Generator")
    with gr.Row():
        # Optional prefix the generated username must start with.
        seed = gr.Textbox(label="Start token", value="")
    with gr.Row():
        with gr.Column():
            # Sampling controls, forwarded verbatim to generate_username().
            min_length = gr.Slider(1, 32, value=1, step=1, label="Minimum length")
            max_length = gr.Slider(1, 32, value=16, step=1, label="Maximum length")
            temperature = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Temperature")
        with gr.Row():
            output = gr.Textbox(label="Generated username")
            generate_btn = gr.Button("Generate")
    generate_btn.click(
        gradio_interface,
        inputs=[seed, min_length, max_length, temperature],
        outputs=output,
    )

# share=True requests a public tunnel link; inside a Hugging Face Space it
# is ignored with a warning, since Spaces already serve the app publicly.
demo.launch(share=True)
model/gpt_char_model.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
class CharGPT(nn.Module):
    """Character-level language model built on ``nn.TransformerEncoder``.

    Token embeddings plus learned positional embeddings feed a stack of
    transformer encoder layers, followed by a LayerNorm and a linear head
    projecting back to vocabulary logits.
    """

    def __init__(self, vocab_size, n_embd=128, n_head=4, n_layer=4, block_size=32):
        # vocab_size: number of distinct characters (width of the logit head).
        # n_embd: embedding/model width; n_head: attention heads per layer.
        # n_layer: encoder depth; block_size: maximum context length.
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        # NOTE(review): batch_first is left at its default (False), so the
        # encoder treats dim 0 as the sequence axis even though forward()
        # passes (batch, seq, embd), and no causal mask is supplied, so
        # attention is bidirectional. Both presumably match how the saved
        # checkpoint was trained — confirm before "fixing" either, since a
        # change here would invalidate existing weights' behavior.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=n_embd, nhead=n_head),
            num_layers=n_layer
        )
        self.ln = nn.LayerNorm(n_embd)
        self.fc = nn.Linear(n_embd, vocab_size)

        self.block_size = block_size

    def forward(self, idx):
        # idx: (B, T) tensor of character ids; T must not exceed block_size,
        # since pos_embedding only covers positions 0..block_size-1.
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)        # (B, T, n_embd)
        pos = torch.arange(T, device=idx.device)   # (T,)
        pos_emb = self.pos_embedding(pos)          # (T, n_embd), broadcast over B
        x = tok_emb + pos_emb
        x = self.transformer(x)
        x = self.ln(x)
        logits = self.fc(x)                        # (B, T, vocab_size)
        return logits
model/gpt_char_model_v3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28877d8db87eabd9a8e1a674b9b5c4cb600d0a163acdac00886b351bc96c1232
3
+ size 31757701
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ torch
tokenizer.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class CharTokenizer:
    """Bidirectional mapping between characters and integer ids.

    The default alphabet covers characters commonly used in Minecraft
    usernames: newline (id 0, doubling as the start/end symbol), ASCII
    letters, digits, and underscore.
    """

    def __init__(self, chars=None):
        if chars is None:
            chars = list(
                "\nabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
            )
        self.chars = chars
        # Forward and reverse lookup tables built from one enumeration.
        self.char2id = {}
        self.id2char = {}
        for index, symbol in enumerate(chars):
            self.char2id[symbol] = index
            self.id2char[index] = symbol

    def encode(self, text):
        """Convert a string to a list of ids, silently dropping unknown characters."""
        ids = []
        for symbol in text:
            identifier = self.char2id.get(symbol)
            if identifier is not None:
                ids.append(identifier)
        return ids

    def decode(self, ids):
        """Convert a list of ids back to a string, skipping out-of-vocabulary ids."""
        pieces = [self.id2char[i] for i in ids if i in self.id2char]
        return "".join(pieces)
+ return "".join(self.id2char[i] for i in ids if i in self.id2char)