Rafael Camargo committed on
Commit
d62ddc8
·
1 Parent(s): 698cdee

remove javascript code and start a python project

Browse files
.gitignore CHANGED
@@ -1,6 +1,5 @@
1
-
2
- # Created by https://www.toptal.com/developers/gitignore/api/windows,node,macos,linux,sublimetext
3
- # Edit at https://www.toptal.com/developers/gitignore?templates=windows,node,macos,linux,sublimetext
4
 
5
  ### Linux ###
6
  *~
@@ -46,188 +45,180 @@ Network Trash Folder
46
  Temporary Items
47
  .apdisk
48
 
49
- ### Node ###
50
- # Logs
51
- logs
52
- *.log
53
- npm-debug.log*
54
- yarn-debug.log*
55
- yarn-error.log*
56
- lerna-debug.log*
57
-
58
- # Diagnostic reports (https://nodejs.org/api/report.html)
59
- report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
60
-
61
- # Runtime data
62
- pids
63
- *.pid
64
- *.seed
65
- *.pid.lock
66
-
67
- # Directory for instrumented libs generated by jscoverage/JSCover
68
- lib-cov
69
-
70
- # Coverage directory used by tools like istanbul
71
- coverage
72
- *.lcov
73
-
74
- # nyc test coverage
75
- .nyc_output
76
-
77
- # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
78
- .grunt
79
-
80
- # Bower dependency directory (https://bower.io/)
81
- bower_components
82
-
83
- # node-waf configuration
84
- .lock-wscript
85
 
86
- # Compiled binary addons (https://nodejs.org/api/addons.html)
87
- build/Release
 
 
 
88
 
89
- # Dependency directories
90
- node_modules/
91
- jspm_packages/
92
 
93
- # TypeScript v1 declaration files
94
- typings/
95
-
96
- # TypeScript cache
97
- *.tsbuildinfo
98
-
99
- # Optional npm cache directory
100
- .npm
101
-
102
- # Optional eslint cache
103
- .eslintcache
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- # Optional stylelint cache
106
- .stylelintcache
 
107
 
108
- # Microbundle cache
109
- .rpt2_cache/
110
- .rts2_cache_cjs/
111
- .rts2_cache_es/
112
- .rts2_cache_umd/
113
 
114
- # Optional REPL history
115
- .node_repl_history
116
 
117
- # Output of 'npm pack'
118
- *.tgz
 
 
119
 
120
- # Yarn Integrity file
121
- .yarn-integrity
122
 
123
- # dotenv environment variables file
124
- .env
125
- .env.test
126
- .env*.local
127
 
128
- # parcel-bundler cache (https://parceljs.org/)
129
- .cache
130
- .parcel-cache
131
 
132
- # Next.js build output
133
- .next
 
 
 
 
134
 
135
- # Nuxt.js build / generate output
136
- .nuxt
137
- dist
138
 
139
- # Storybook build outputs
140
- .out
141
- .storybook-out
142
- storybook-static
143
 
144
- # rollup.js default build output
145
- dist/
146
 
147
- # Gatsby files
148
- .cache/
149
- # Comment in the public line in if your project uses Gatsby and not Next.js
150
- # https://nextjs.org/blog/next-9-1#public-directory-support
151
- # public
152
-
153
- # vuepress build output
154
- .vuepress/dist
155
-
156
- # Serverless directories
157
- .serverless/
158
-
159
- # FuseBox cache
160
- .fusebox/
161
-
162
- # DynamoDB Local files
163
- .dynamodb/
164
-
165
- # TernJS port file
166
- .tern-port
167
-
168
- # Stores VSCode versions used for testing VSCode extensions
169
- .vscode-test
170
-
171
- # Temporary folders
172
- tmp/
173
- temp/
174
-
175
- ### SublimeText ###
176
- # Cache files for Sublime Text
177
- *.tmlanguage.cache
178
- *.tmPreferences.cache
179
- *.stTheme.cache
180
-
181
- # Workspace files are user-specific
182
- *.sublime-workspace
183
-
184
- # Project files should be checked into the repository, unless a significant
185
- # proportion of contributors will probably not be using Sublime Text
186
- # *.sublime-project
187
-
188
- # SFTP configuration file
189
- sftp-config.json
190
-
191
- # Package control specific files
192
- Package Control.last-run
193
- Package Control.ca-list
194
- Package Control.ca-bundle
195
- Package Control.system-ca-bundle
196
- Package Control.cache/
197
- Package Control.ca-certs/
198
- Package Control.merged-ca-bundle
199
- Package Control.user-ca-bundle
200
- oscrypto-ca-bundle.crt
201
- bh_unicode_properties.cache
202
-
203
- # Sublime-github package stores a github token in this file
204
- # https://packagecontrol.io/packages/sublime-github
205
- GitHub.sublime-settings
206
-
207
- ### Windows ###
208
- # Windows thumbnail cache files
209
- Thumbs.db
210
- Thumbs.db:encryptable
211
- ehthumbs.db
212
- ehthumbs_vista.db
213
-
214
- # Dump file
215
- *.stackdump
216
-
217
- # Folder config file
218
- [Dd]esktop.ini
219
-
220
- # Recycle Bin used on file shares
221
- $RECYCLE.BIN/
222
-
223
- # Windows Installer files
224
- *.cab
225
- *.msi
226
- *.msix
227
- *.msm
228
- *.msp
229
-
230
- # Windows shortcuts
231
- *.lnk
232
-
233
- # End of https://www.toptal.com/developers/gitignore/api/windows,node,macos,linux,sublimetext
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/macos,python,linux
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=macos,python,linux
 
3
 
4
  ### Linux ###
5
  *~
 
45
  Temporary Items
46
  .apdisk
47
 
48
+ ### macOS Patch ###
49
+ # iCloud generated files
50
+ *.icloud
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ ### Python ###
53
+ # Byte-compiled / optimized / DLL files
54
+ __pycache__/
55
+ *.py[cod]
56
+ *$py.class
57
 
58
+ # C extensions
59
+ *.so
 
60
 
61
+ # Distribution / packaging
62
+ .Python
63
+ build/
64
+ develop-eggs/
65
+ dist/
66
+ downloads/
67
+ eggs/
68
+ .eggs/
69
+ lib/
70
+ lib64/
71
+ parts/
72
+ sdist/
73
+ var/
74
+ wheels/
75
+ share/python-wheels/
76
+ *.egg-info/
77
+ .installed.cfg
78
+ *.egg
79
+ MANIFEST
80
+
81
+ # PyInstaller
82
+ # Usually these files are written by a python script from a template
83
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
84
+ *.manifest
85
+ *.spec
86
+
87
+ # Installer logs
88
+ pip-log.txt
89
+ pip-delete-this-directory.txt
90
+
91
+ # Unit test / coverage reports
92
+ htmlcov/
93
+ .tox/
94
+ .nox/
95
+ .coverage
96
+ .coverage.*
97
+ .cache
98
+ nosetests.xml
99
+ coverage.xml
100
+ *.cover
101
+ *.py,cover
102
+ .hypothesis/
103
+ .pytest_cache/
104
+ cover/
105
+
106
+ # Translations
107
+ *.mo
108
+ *.pot
109
+
110
+ # Django stuff:
111
+ *.log
112
+ local_settings.py
113
+ db.sqlite3
114
+ db.sqlite3-journal
115
+
116
+ # Flask stuff:
117
+ instance/
118
+ .webassets-cache
119
+
120
+ # Scrapy stuff:
121
+ .scrapy
122
+
123
+ # Sphinx documentation
124
+ docs/_build/
125
+
126
+ # PyBuilder
127
+ .pybuilder/
128
+ target/
129
+
130
+ # Jupyter Notebook
131
+ .ipynb_checkpoints
132
+
133
+ # IPython
134
+ profile_default/
135
+ ipython_config.py
136
+
137
+ # pyenv
138
+ # For a library or package, you might want to ignore these files since the code is
139
+ # intended to run in multiple environments; otherwise, check them in:
140
+ # .python-version
141
+
142
+ # pipenv
143
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
144
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
145
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
146
+ # install all needed dependencies.
147
+ #Pipfile.lock
148
+
149
+ # poetry
150
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
151
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
152
+ # commonly ignored for libraries.
153
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
154
+ #poetry.lock
155
+
156
+ # pdm
157
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
158
+ #pdm.lock
159
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
160
+ # in version control.
161
+ # https://pdm.fming.dev/#use-with-ide
162
+ .pdm.toml
163
+
164
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
165
+ __pypackages__/
166
+
167
+ # Celery stuff
168
+ celerybeat-schedule
169
+ celerybeat.pid
170
+
171
+ # SageMath parsed files
172
+ *.sage.py
173
+
174
+ # Environments
175
+ .env
176
+ .venv
177
+ env/
178
+ venv/
179
+ ENV/
180
+ env.bak/
181
+ venv.bak/
182
 
183
+ # Spyder project settings
184
+ .spyderproject
185
+ .spyproject
186
 
187
+ # Rope project settings
188
+ .ropeproject
 
 
 
189
 
190
+ # mkdocs documentation
191
+ /site
192
 
193
+ # mypy
194
+ .mypy_cache/
195
+ .dmypy.json
196
+ dmypy.json
197
 
198
+ # Pyre type checker
199
+ .pyre/
200
 
201
+ # pytype static type analyzer
202
+ .pytype/
 
 
203
 
204
+ # Cython debug symbols
205
+ cython_debug/
 
206
 
207
+ # PyCharm
208
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
209
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
210
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
211
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
212
+ #.idea/
213
 
214
+ ### Python Patch ###
215
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
216
+ poetry.toml
217
 
218
+ # ruff
219
+ .ruff_cache/
 
 
220
 
221
+ # LSP config files
222
+ pyrightconfig.json
223
 
224
+ # End of https://www.toptal.com/developers/gitignore/api/macos,python,linux
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Pipfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+
8
+ [dev-packages]
9
+
10
+ [requires]
11
+ python_version = "3.12"
README.md DELETED
@@ -1,32 +0,0 @@
1
- ## Rellow
2
-
3
- ## Contributing
4
-
5
- 1. Install [Node](https://nodejs.org/en/). Download the "Recommend for Most Users" version.
6
-
7
- 2. Clone the repo:
8
- ``` bash
9
- git clone git@github.com:rafaelcamargo/rellow.git
10
- ```
11
-
12
- 3. Go to the project directory
13
- ``` bash
14
- cd rellow
15
- ```
16
-
17
- 4. Install the project dependencies
18
- ``` bash
19
- npm install
20
- ```
21
-
22
- 5. Run the experiment
23
- ``` bash
24
- npm run start
25
- ```
26
-
27
- ## Tests
28
-
29
- In case you have changed any website behavior, ensure that all changes are covered with automated tests:
30
- ``` bash
31
- npm run test
32
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
package-lock.json DELETED
The diff for this file is too large to render. See raw diff
 
package.json DELETED
@@ -1,25 +0,0 @@
1
- {
2
- "name": "rellow",
3
- "version": "0.1.0",
4
- "description": "Imaginary word generator",
5
- "main": "src/index.js",
6
- "scripts": {
7
- "test": "jest",
8
- "start": "node ./src/index"
9
- },
10
- "keywords": [
11
- "ai",
12
- "generative",
13
- "supervised",
14
- "learning",
15
- "word"
16
- ],
17
- "author": "Rafael Camargo <hello@rafelcamargo.com>",
18
- "license": "UNLICENSED",
19
- "devDependencies": {
20
- "jest": "^29.7.0"
21
- },
22
- "dependencies": {
23
- "@tensorflow/tfjs-node": "^4.22.0"
24
- }
25
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data/dictionary_compact.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4636de7bb1026b772680a075efc093871c0362fcdbbf588fa7d2f060492b44a4
3
- size 22458934
 
 
 
 
src/data/words.json DELETED
@@ -1,42 +0,0 @@
1
- [
2
- {
3
- "word": "Eloquent",
4
- "definition": "Fluent or persuasive in speaking or writing"
5
- },
6
- {
7
- "word": "Serene",
8
- "definition": "Peaceful and untroubled"
9
- },
10
- {
11
- "word": "Vivid",
12
- "definition": "Producing powerful feelings or strong"
13
- },
14
- {
15
- "word": "Ardent",
16
- "definition": "Enthusiastic or passionate"
17
- },
18
- {
19
- "word": "Fragrant",
20
- "definition": "Having a pleasant or sweet smell"
21
- },
22
- {
23
- "word": "Diligent",
24
- "definition": "Showing care and conscientiousness in one's work or duties"
25
- },
26
- {
27
- "word": "Imminent",
28
- "definition": "About to happen"
29
- },
30
- {
31
- "word": "Opaque",
32
- "definition": "Not able to be seen through"
33
- },
34
- {
35
- "word": "Subtle",
36
- "definition": "Delicate or precise as to be difficult to perceive or analyze"
37
- },
38
- {
39
- "word": "Ethereal",
40
- "definition": "Extremely delicate and light in a way that seems not of this world"
41
- }
42
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/index.js DELETED
@@ -1,72 +0,0 @@
1
- const tf = require('@tensorflow/tfjs-node');
2
- const tokenService = require('./services/token');
3
- const dictionary = require('./data/words');
4
-
5
- const dataset = dictionary.map(({ word, definition }) => {
6
- const encodedDefinition = tokenService.encode(definition.replace(/,;\./g, '').split(' '));
7
- const encodedWord = tokenService.encode([word]);
8
- const necessaryPad = encodedDefinition.length - encodedWord.length;
9
- const padding = new Array(necessaryPad).fill(0);
10
- const finalWord = encodedWord.concat(padding)
11
- return {
12
- word: finalWord,
13
- definition: encodedDefinition,
14
- }
15
- })
16
- const dictionatySize = 200 // random big number. understand how to better set it
17
- const maxInputSentenceSize = dataset.reduce((maxSize, { word, definition }) => {
18
- return definition.length > maxSize ? definition.length : maxSize;
19
- }, 0);
20
-
21
-
22
- async function run() {
23
- const model = tf.sequential();
24
- // Embedding layer for word representations
25
- model.add(tf.layers.embedding({inputDim: dictionatySize, outputDim: 64, inputLength: maxInputSentenceSize}));
26
- // LSTM layer for capturing sequence information
27
- model.add(tf.layers.lstm({units: 128, returnSequences: true}));
28
- // Dense layer to output a word for each position in the sentence
29
- // model.add(tf.layers.dense({units: maxInputSentenceSize, activation: 'softmax'}));
30
- model.add(tf.layers.dense({units: dictionatySize, activation: 'softmax'}));
31
- model.compile({loss: 'categoricalCrossentropy', optimizer: 'adam'});
32
-
33
- // Prepare input and output sequences as tensors
34
- const trainingWords = dataset.map(({ word }) => tokenService.padEncoding(word, maxInputSentenceSize))
35
- const trainingDefinitions = dataset.map(({ definition }) => tokenService.padEncoding(definition, maxInputSentenceSize))
36
-
37
- const tensorWords = tf.tensor2d(trainingWords); // shape: [numSamples, maxInputLength]
38
- // const tensorDefinitions = tf.tensor2d(trainingDefinitions); // shape: [numSamples, maxOutputLength, vocabSize]
39
-
40
- const tensorDefinitions = tf.tensor3d(
41
- trainingDefinitions.map(def => tokenService.oneHotEncode(def, dictionatySize)),
42
- [trainingDefinitions.length, maxInputSentenceSize, dictionatySize]
43
- );
44
-
45
- // Train the model on text sequences
46
- await model.fit(tensorWords, tensorDefinitions, {epochs: 100});
47
-
48
- // predict(model, 'Serene') // Understand why definition is not right even for a word already defined in the training dataset
49
- predict(model, 'Smoker');
50
- }
51
-
52
- function predict(model, newWord){
53
- let encodedWord = tokenService.encode([newWord]);
54
-
55
- // Ensure padding
56
- encodedWord = tokenService.padEncoding(encodedWord, maxInputSentenceSize);
57
-
58
- // Convert to tensor
59
- const wordTensor = tf.tensor2d([encodedWord]);
60
-
61
- // Generate prediction
62
- const prediction = model.predict(wordTensor);
63
-
64
- // Decode the predicted tokens
65
- const predictedTokens = prediction.argMax(2).arraySync()[0]; // Get token IDs
66
- console.log({ predictedTokens })
67
- const predictedDefinition = predictedTokens.map(tokenId => tokenService.decodeToken(tokenId)).join(' ');
68
-
69
- console.log(`Generated Definition for '${newWord}': ${predictedDefinition.replace(/0/g, '').trim()}`);
70
- }
71
-
72
- run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/services/token.js DELETED
@@ -1,51 +0,0 @@
1
- const _public = {};
2
-
3
- const dictionary = [];
4
-
5
- _public.encode = words => {
6
- return words.map(word => {
7
- const code = findWordCode(word);
8
- if(code !== -1) return code + 1;
9
- dictionary.push(word);
10
- return dictionary.length;
11
- });
12
- };
13
-
14
- _public.decode = codes => {
15
- return codes.map(code => {
16
- return dictionary[code - 1];
17
- });
18
- };
19
-
20
- _public.decodeToken = code => {
21
- return dictionary[code - 1] || 0;
22
- }
23
-
24
- _public.oneHotEncode = (sequence, vocabSize) => {
25
- return sequence.map(tokenId => {
26
- const oneHot = new Array(vocabSize).fill(0);
27
- if (tokenId >= 0 && tokenId < vocabSize) {
28
- oneHot[tokenId] = 1;
29
- }
30
- return oneHot;
31
- });
32
- };
33
-
34
- _public.padEncoding = (encoding, minLength) => {
35
- const necessaryPadding = minLength - encoding.length;
36
- if(necessaryPadding > 0) {
37
- const padding = new Array(necessaryPadding).fill(0);
38
- return encoding.concat(padding);
39
- }
40
- return encoding;
41
- };
42
-
43
- _public.resetDictionary = () => {
44
- dictionary.length = 0;
45
- };
46
-
47
- function findWordCode(word){
48
- return dictionary.indexOf(word);
49
- }
50
-
51
- module.exports = _public;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/services/token.test.js DELETED
@@ -1,17 +0,0 @@
1
- const tokenService = require('./token')
2
-
3
- describe('Token Service', () => {
4
- afterEach(() => {
5
- tokenService.resetDictionary()
6
- });
7
-
8
- it('should encode/decode words', () => {
9
- expect(tokenService.encode(['glorious', 'times'])).toEqual([1, 2]);
10
- expect(tokenService.decode([2, 1])).toEqual(['times', 'glorious']);
11
- });
12
-
13
- it('should pad encoding result when lower than minimum length', () => {
14
- expect(tokenService.padEncoding([1,2], 5)).toEqual([1,2,0,0,0]);
15
- expect(tokenService.padEncoding([1,2,3,4,5,6], 6)).toEqual([1,2,3,4,5,6]);
16
- })
17
- });