Rafael Camargo committed on
Commit
698cdee
·
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ src/model/*.pt filter=lfs diff=lfs merge=lfs -text
2
+ src/data/dictionary_compact.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Created by https://www.toptal.com/developers/gitignore/api/windows,node,macos,linux,sublimetext
3
+ # Edit at https://www.toptal.com/developers/gitignore?templates=windows,node,macos,linux,sublimetext
4
+
5
+ ### Linux ###
6
+ *~
7
+
8
+ # temporary files which can be created if a process still has a handle open of a deleted file
9
+ .fuse_hidden*
10
+
11
+ # KDE directory preferences
12
+ .directory
13
+
14
+ # Linux trash folder which might appear on any partition or disk
15
+ .Trash-*
16
+
17
+ # .nfs files are created when an open file is removed but is still being accessed
18
+ .nfs*
19
+
20
+ ### macOS ###
21
+ # General
22
+ .DS_Store
23
+ .AppleDouble
24
+ .LSOverride
25
+
26
+ # Icon must end with two \r
27
+ Icon
28
+
29
+
30
+ # Thumbnails
31
+ ._*
32
+
33
+ # Files that might appear in the root of a volume
34
+ .DocumentRevisions-V100
35
+ .fseventsd
36
+ .Spotlight-V100
37
+ .TemporaryItems
38
+ .Trashes
39
+ .VolumeIcon.icns
40
+ .com.apple.timemachine.donotpresent
41
+
42
+ # Directories potentially created on remote AFP share
43
+ .AppleDB
44
+ .AppleDesktop
45
+ Network Trash Folder
46
+ Temporary Items
47
+ .apdisk
48
+
49
+ ### Node ###
50
+ # Logs
51
+ logs
52
+ *.log
53
+ npm-debug.log*
54
+ yarn-debug.log*
55
+ yarn-error.log*
56
+ lerna-debug.log*
57
+
58
+ # Diagnostic reports (https://nodejs.org/api/report.html)
59
+ report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
60
+
61
+ # Runtime data
62
+ pids
63
+ *.pid
64
+ *.seed
65
+ *.pid.lock
66
+
67
+ # Directory for instrumented libs generated by jscoverage/JSCover
68
+ lib-cov
69
+
70
+ # Coverage directory used by tools like istanbul
71
+ coverage
72
+ *.lcov
73
+
74
+ # nyc test coverage
75
+ .nyc_output
76
+
77
+ # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
78
+ .grunt
79
+
80
+ # Bower dependency directory (https://bower.io/)
81
+ bower_components
82
+
83
+ # node-waf configuration
84
+ .lock-wscript
85
+
86
+ # Compiled binary addons (https://nodejs.org/api/addons.html)
87
+ build/Release
88
+
89
+ # Dependency directories
90
+ node_modules/
91
+ jspm_packages/
92
+
93
+ # TypeScript v1 declaration files
94
+ typings/
95
+
96
+ # TypeScript cache
97
+ *.tsbuildinfo
98
+
99
+ # Optional npm cache directory
100
+ .npm
101
+
102
+ # Optional eslint cache
103
+ .eslintcache
104
+
105
+ # Optional stylelint cache
106
+ .stylelintcache
107
+
108
+ # Microbundle cache
109
+ .rpt2_cache/
110
+ .rts2_cache_cjs/
111
+ .rts2_cache_es/
112
+ .rts2_cache_umd/
113
+
114
+ # Optional REPL history
115
+ .node_repl_history
116
+
117
+ # Output of 'npm pack'
118
+ *.tgz
119
+
120
+ # Yarn Integrity file
121
+ .yarn-integrity
122
+
123
+ # dotenv environment variables file
124
+ .env
125
+ .env.test
126
+ .env*.local
127
+
128
+ # parcel-bundler cache (https://parceljs.org/)
129
+ .cache
130
+ .parcel-cache
131
+
132
+ # Next.js build output
133
+ .next
134
+
135
+ # Nuxt.js build / generate output
136
+ .nuxt
137
+ dist
138
+
139
+ # Storybook build outputs
140
+ .out
141
+ .storybook-out
142
+ storybook-static
143
+
144
+ # rollup.js default build output
145
+ dist/
146
+
147
+ # Gatsby files
148
+ .cache/
149
+ # Comment in the public line in if your project uses Gatsby and not Next.js
150
+ # https://nextjs.org/blog/next-9-1#public-directory-support
151
+ # public
152
+
153
+ # vuepress build output
154
+ .vuepress/dist
155
+
156
+ # Serverless directories
157
+ .serverless/
158
+
159
+ # FuseBox cache
160
+ .fusebox/
161
+
162
+ # DynamoDB Local files
163
+ .dynamodb/
164
+
165
+ # TernJS port file
166
+ .tern-port
167
+
168
+ # Stores VSCode versions used for testing VSCode extensions
169
+ .vscode-test
170
+
171
+ # Temporary folders
172
+ tmp/
173
+ temp/
174
+
175
+ ### SublimeText ###
176
+ # Cache files for Sublime Text
177
+ *.tmlanguage.cache
178
+ *.tmPreferences.cache
179
+ *.stTheme.cache
180
+
181
+ # Workspace files are user-specific
182
+ *.sublime-workspace
183
+
184
+ # Project files should be checked into the repository, unless a significant
185
+ # proportion of contributors will probably not be using Sublime Text
186
+ # *.sublime-project
187
+
188
+ # SFTP configuration file
189
+ sftp-config.json
190
+
191
+ # Package control specific files
192
+ Package Control.last-run
193
+ Package Control.ca-list
194
+ Package Control.ca-bundle
195
+ Package Control.system-ca-bundle
196
+ Package Control.cache/
197
+ Package Control.ca-certs/
198
+ Package Control.merged-ca-bundle
199
+ Package Control.user-ca-bundle
200
+ oscrypto-ca-bundle.crt
201
+ bh_unicode_properties.cache
202
+
203
+ # Sublime-github package stores a github token in this file
204
+ # https://packagecontrol.io/packages/sublime-github
205
+ GitHub.sublime-settings
206
+
207
+ ### Windows ###
208
+ # Windows thumbnail cache files
209
+ Thumbs.db
210
+ Thumbs.db:encryptable
211
+ ehthumbs.db
212
+ ehthumbs_vista.db
213
+
214
+ # Dump file
215
+ *.stackdump
216
+
217
+ # Folder config file
218
+ [Dd]esktop.ini
219
+
220
+ # Recycle Bin used on file shares
221
+ $RECYCLE.BIN/
222
+
223
+ # Windows Installer files
224
+ *.cab
225
+ *.msi
226
+ *.msix
227
+ *.msm
228
+ *.msp
229
+
230
+ # Windows shortcuts
231
+ *.lnk
232
+
233
+ # End of https://www.toptal.com/developers/gitignore/api/windows,node,macos,linux,sublimetext
README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Rellow
2
+
3
+ ## Contributing
4
+
5
+ 1. Install [Node](https://nodejs.org/en/). Download the "Recommended for Most Users" version.
6
+
7
+ 2. Clone the repo:
8
+ ``` bash
9
+ git clone git@github.com:rafaelcamargo/rellow.git
10
+ ```
11
+
12
+ 3. Go to the project directory
13
+ ``` bash
14
+ cd rellow
15
+ ```
16
+
17
+ 4. Install the project dependencies
18
+ ``` bash
19
+ npm install
20
+ ```
21
+
22
+ 5. Run the experiment
23
+ ``` bash
24
+ npm run start
25
+ ```
26
+
27
+ ## Tests
28
+
29
+ In case you have changed any project behavior, ensure that all changes are covered with automated tests:
30
+ ``` bash
31
+ npm run test
32
+ ```
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "rellow",
3
+ "version": "0.1.0",
4
+ "description": "Imaginary word generator",
5
+ "main": "src/index.js",
6
+ "scripts": {
7
+ "test": "jest",
8
+ "start": "node ./src/index"
9
+ },
10
+ "keywords": [
11
+ "ai",
12
+ "generative",
13
+ "supervised",
14
+ "learning",
15
+ "word"
16
+ ],
17
+ "author": "Rafael Camargo <hello@rafelcamargo.com>",
18
+ "license": "UNLICENSED",
19
+ "devDependencies": {
20
+ "jest": "^29.7.0"
21
+ },
22
+ "dependencies": {
23
+ "@tensorflow/tfjs-node": "^4.22.0"
24
+ }
25
+ }
src/data/dictionary_compact.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4636de7bb1026b772680a075efc093871c0362fcdbbf588fa7d2f060492b44a4
3
+ size 22458934
src/data/words.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "word": "Eloquent",
4
+ "definition": "Fluent or persuasive in speaking or writing"
5
+ },
6
+ {
7
+ "word": "Serene",
8
+ "definition": "Peaceful and untroubled"
9
+ },
10
+ {
11
+ "word": "Vivid",
12
+ "definition": "Producing powerful feelings or strong"
13
+ },
14
+ {
15
+ "word": "Ardent",
16
+ "definition": "Enthusiastic or passionate"
17
+ },
18
+ {
19
+ "word": "Fragrant",
20
+ "definition": "Having a pleasant or sweet smell"
21
+ },
22
+ {
23
+ "word": "Diligent",
24
+ "definition": "Showing care and conscientiousness in one's work or duties"
25
+ },
26
+ {
27
+ "word": "Imminent",
28
+ "definition": "About to happen"
29
+ },
30
+ {
31
+ "word": "Opaque",
32
+ "definition": "Not able to be seen through"
33
+ },
34
+ {
35
+ "word": "Subtle",
36
+ "definition": "Delicate or precise as to be difficult to perceive or analyze"
37
+ },
38
+ {
39
+ "word": "Ethereal",
40
+ "definition": "Extremely delicate and light in a way that seems not of this world"
41
+ }
42
+ ]
src/index.js ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
const tf = require('@tensorflow/tfjs-node');
const tokenService = require('./services/token');
const dictionary = require('./data/words');

// Build the training dataset: each entry pairs a token-encoded word with its
// token-encoded definition. The word sequence is zero-padded up to the length
// of its definition so both sequences line up position-by-position.
const dataset = dictionary.map(({ word, definition }) => {
  // Strip punctuation before tokenizing.
  // FIX: the previous pattern /,;\./g only matched the literal three-character
  // sequence ",;." — a character class [,;.] is what was intended.
  const encodedDefinition = tokenService.encode(definition.replace(/[,;.]/g, '').split(' '));
  const encodedWord = tokenService.encode([word]);
  return {
    // Reuse the token service's padding helper instead of padding by hand.
    word: tokenService.padEncoding(encodedWord, encodedDefinition.length),
    definition: encodedDefinition,
  };
});

// Vocabulary size for the embedding/output layers.
// TODO: derive this from the token dictionary instead of hard-coding a guess.
const dictionatySize = 200;

// Longest encoded definition — used as the fixed sequence length for the model.
const maxInputSentenceSize = dataset.reduce(
  (maxSize, { definition }) => Math.max(maxSize, definition.length),
  0
);
22
// Builds, trains and exercises a word -> definition sequence model.
// Relies on the module-level `dataset`, `dictionatySize` and
// `maxInputSentenceSize` values computed at load time.
async function run() {
  const model = tf.sequential();
  // Embedding layer for word representations
  model.add(tf.layers.embedding({inputDim: dictionatySize, outputDim: 64, inputLength: maxInputSentenceSize}));
  // LSTM layer for capturing sequence information
  model.add(tf.layers.lstm({units: 128, returnSequences: true}));
  // Dense layer to output a word for each position in the sentence
  // model.add(tf.layers.dense({units: maxInputSentenceSize, activation: 'softmax'}));
  model.add(tf.layers.dense({units: dictionatySize, activation: 'softmax'}));
  model.compile({loss: 'categoricalCrossentropy', optimizer: 'adam'});

  // Prepare input and output sequences as tensors.
  // Both sides are padded to the same fixed length so every sample has the
  // shape the embedding layer was configured with.
  const trainingWords = dataset.map(({ word }) => tokenService.padEncoding(word, maxInputSentenceSize))
  const trainingDefinitions = dataset.map(({ definition }) => tokenService.padEncoding(definition, maxInputSentenceSize))

  const tensorWords = tf.tensor2d(trainingWords); // shape: [numSamples, maxInputLength]
  // const tensorDefinitions = tf.tensor2d(trainingDefinitions); // shape: [numSamples, maxOutputLength, vocabSize]

  // One-hot encode each definition token so the targets match the per-position
  // softmax output: shape [numSamples, maxInputSentenceSize, dictionatySize].
  const tensorDefinitions = tf.tensor3d(
    trainingDefinitions.map(def => tokenService.oneHotEncode(def, dictionatySize)),
    [trainingDefinitions.length, maxInputSentenceSize, dictionatySize]
  );

  // Train the model on text sequences
  await model.fit(tensorWords, tensorDefinitions, {epochs: 100});

  // predict(model, 'Serene') // Understand why definition is not right even for a word already defined in the training dataset
  predict(model, 'Smoker');
}
51
+
52
// Generates and prints a definition for `newWord` using the trained model.
// FIX: padding positions (token id 0) are now dropped from the decoded token
// list before joining. The previous `replace(/0/g, '')` on the joined string
// would strip the digit "0" from anywhere inside legitimate words and left
// double spaces behind where padding tokens used to be.
function predict(model, newWord){
  let encodedWord = tokenService.encode([newWord]);

  // Ensure padding
  encodedWord = tokenService.padEncoding(encodedWord, maxInputSentenceSize);

  // Convert to tensor
  const wordTensor = tf.tensor2d([encodedWord]);

  // Generate prediction
  const prediction = model.predict(wordTensor);

  // Decode the predicted tokens
  const predictedTokens = prediction.argMax(2).arraySync()[0]; // Get token IDs
  console.log({ predictedTokens })
  const predictedDefinition = predictedTokens
    .map(tokenId => tokenService.decodeToken(tokenId))
    .filter(token => token !== 0) // decodeToken yields 0 for padding/unknown ids
    .join(' ');

  console.log(`Generated Definition for '${newWord}': ${predictedDefinition}`);
}
71
+
72
// Kick off training. Surface async failures explicitly instead of leaving
// the returned promise floating with an unhandled rejection.
run().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
src/services/token.js ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Public API of the token service (exported at the bottom of this file).
const _public = {};

// In-memory vocabulary. A word's token id is its index here plus one,
// which keeps id 0 free to act as the padding value.
const dictionary = [];
5
// Maps each word to its 1-based token id, registering previously unseen
// words in the vocabulary as it goes.
_public.encode = words => words.map(word => {
  const index = findWordCode(word);
  if(index === -1) {
    dictionary.push(word);
    return dictionary.length;
  }
  return index + 1;
});
14
// Translates a list of 1-based token ids back into their words.
_public.decode = codes => codes.map(code => dictionary[code - 1]);
20
// Translates a single token id into its word, falling back to 0 for ids
// that have no dictionary entry (e.g. the padding id 0).
_public.decodeToken = code => {
  const word = dictionary[code - 1];
  return word || 0;
};
24
// Expands a sequence of token ids into one-hot rows of length vocabSize.
// Ids outside [0, vocabSize) produce an all-zeros row.
_public.oneHotEncode = (sequence, vocabSize) => sequence.map(tokenId => {
  const row = Array.from({ length: vocabSize }, () => 0);
  const inRange = tokenId >= 0 && tokenId < vocabSize;
  if(inRange) {
    row[tokenId] = 1;
  }
  return row;
});
34
// Right-pads an encoding with zeros until it reaches minLength.
// Encodings already at least minLength long are returned as-is.
_public.padEncoding = (encoding, minLength) => {
  if(encoding.length >= minLength) return encoding;
  const padded = encoding.slice();
  while(padded.length < minLength) {
    padded.push(0);
  }
  return padded;
};
43
// Empties the vocabulary in place — mainly used to isolate automated tests.
_public.resetDictionary = () => {
  dictionary.splice(0, dictionary.length);
};
47
// Zero-based position of a word in the vocabulary, or -1 when absent.
const findWordCode = word => dictionary.indexOf(word);

module.exports = _public;
src/services/token.test.js ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
const tokenService = require('./token')

describe('Token Service', () => {
  afterEach(() => {
    // Each test starts from an empty vocabulary.
    tokenService.resetDictionary()
  });

  it('should encode/decode words', () => {
    expect(tokenService.encode(['glorious', 'times'])).toEqual([1, 2]);
    expect(tokenService.decode([2, 1])).toEqual(['times', 'glorious']);
  });

  // Added: a known word must keep its original code instead of getting a new one.
  it('should reuse the existing code when encoding a known word', () => {
    expect(tokenService.encode(['glorious', 'times'])).toEqual([1, 2]);
    expect(tokenService.encode(['times', 'again'])).toEqual([2, 3]);
  });

  // Added: single-token decoding, including the fallback for unknown/padding ids.
  it('should decode a single token, falling back to 0 for unknown ids', () => {
    tokenService.encode(['glorious']);
    expect(tokenService.decodeToken(1)).toEqual('glorious');
    expect(tokenService.decodeToken(0)).toEqual(0);
  });

  // Added: one-hot encoding, including out-of-range ids yielding all-zero rows.
  it('should one-hot encode a sequence, ignoring out-of-range ids', () => {
    expect(tokenService.oneHotEncode([1, 3], 4)).toEqual([[0, 1, 0, 0], [0, 0, 0, 1]]);
    expect(tokenService.oneHotEncode([9], 4)).toEqual([[0, 0, 0, 0]]);
  });

  it('should pad encoding result when lower than minimum length', () => {
    expect(tokenService.padEncoding([1,2], 5)).toEqual([1,2,0,0,0]);
    expect(tokenService.padEncoding([1,2,3,4,5,6], 6)).toEqual([1,2,3,4,5,6]);
  })
});