Rafael Camargo committed on
Commit
698cdee
·
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ src/model/*.pt filter=lfs diff=lfs merge=lfs -text
2
+ src/data/dictionary_compact.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Created by https://www.toptal.com/developers/gitignore/api/windows,node,macos,linux,sublimetext
3
+ # Edit at https://www.toptal.com/developers/gitignore?templates=windows,node,macos,linux,sublimetext
4
+
5
+ ### Linux ###
6
+ *~
7
+
8
+ # temporary files which can be created if a process still has a handle open of a deleted file
9
+ .fuse_hidden*
10
+
11
+ # KDE directory preferences
12
+ .directory
13
+
14
+ # Linux trash folder which might appear on any partition or disk
15
+ .Trash-*
16
+
17
+ # .nfs files are created when an open file is removed but is still being accessed
18
+ .nfs*
19
+
20
+ ### macOS ###
21
+ # General
22
+ .DS_Store
23
+ .AppleDouble
24
+ .LSOverride
25
+
26
+ # Icon must end with two \r
27
+ Icon
28
+
29
+
30
+ # Thumbnails
31
+ ._*
32
+
33
+ # Files that might appear in the root of a volume
34
+ .DocumentRevisions-V100
35
+ .fseventsd
36
+ .Spotlight-V100
37
+ .TemporaryItems
38
+ .Trashes
39
+ .VolumeIcon.icns
40
+ .com.apple.timemachine.donotpresent
41
+
42
+ # Directories potentially created on remote AFP share
43
+ .AppleDB
44
+ .AppleDesktop
45
+ Network Trash Folder
46
+ Temporary Items
47
+ .apdisk
48
+
49
+ ### Node ###
50
+ # Logs
51
+ logs
52
+ *.log
53
+ npm-debug.log*
54
+ yarn-debug.log*
55
+ yarn-error.log*
56
+ lerna-debug.log*
57
+
58
+ # Diagnostic reports (https://nodejs.org/api/report.html)
59
+ report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
60
+
61
+ # Runtime data
62
+ pids
63
+ *.pid
64
+ *.seed
65
+ *.pid.lock
66
+
67
+ # Directory for instrumented libs generated by jscoverage/JSCover
68
+ lib-cov
69
+
70
+ # Coverage directory used by tools like istanbul
71
+ coverage
72
+ *.lcov
73
+
74
+ # nyc test coverage
75
+ .nyc_output
76
+
77
+ # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
78
+ .grunt
79
+
80
+ # Bower dependency directory (https://bower.io/)
81
+ bower_components
82
+
83
+ # node-waf configuration
84
+ .lock-wscript
85
+
86
+ # Compiled binary addons (https://nodejs.org/api/addons.html)
87
+ build/Release
88
+
89
+ # Dependency directories
90
+ node_modules/
91
+ jspm_packages/
92
+
93
+ # TypeScript v1 declaration files
94
+ typings/
95
+
96
+ # TypeScript cache
97
+ *.tsbuildinfo
98
+
99
+ # Optional npm cache directory
100
+ .npm
101
+
102
+ # Optional eslint cache
103
+ .eslintcache
104
+
105
+ # Optional stylelint cache
106
+ .stylelintcache
107
+
108
+ # Microbundle cache
109
+ .rpt2_cache/
110
+ .rts2_cache_cjs/
111
+ .rts2_cache_es/
112
+ .rts2_cache_umd/
113
+
114
+ # Optional REPL history
115
+ .node_repl_history
116
+
117
+ # Output of 'npm pack'
118
+ *.tgz
119
+
120
+ # Yarn Integrity file
121
+ .yarn-integrity
122
+
123
+ # dotenv environment variables file
124
+ .env
125
+ .env.test
126
+ .env*.local
127
+
128
+ # parcel-bundler cache (https://parceljs.org/)
129
+ .cache
130
+ .parcel-cache
131
+
132
+ # Next.js build output
133
+ .next
134
+
135
+ # Nuxt.js build / generate output
136
+ .nuxt
137
+ dist
138
+
139
+ # Storybook build outputs
140
+ .out
141
+ .storybook-out
142
+ storybook-static
143
+
144
+ # rollup.js default build output
145
+ dist/
146
+
147
+ # Gatsby files
148
+ .cache/
149
+ # Comment in the public line in if your project uses Gatsby and not Next.js
150
+ # https://nextjs.org/blog/next-9-1#public-directory-support
151
+ # public
152
+
153
+ # vuepress build output
154
+ .vuepress/dist
155
+
156
+ # Serverless directories
157
+ .serverless/
158
+
159
+ # FuseBox cache
160
+ .fusebox/
161
+
162
+ # DynamoDB Local files
163
+ .dynamodb/
164
+
165
+ # TernJS port file
166
+ .tern-port
167
+
168
+ # Stores VSCode versions used for testing VSCode extensions
169
+ .vscode-test
170
+
171
+ # Temporary folders
172
+ tmp/
173
+ temp/
174
+
175
+ ### SublimeText ###
176
+ # Cache files for Sublime Text
177
+ *.tmlanguage.cache
178
+ *.tmPreferences.cache
179
+ *.stTheme.cache
180
+
181
+ # Workspace files are user-specific
182
+ *.sublime-workspace
183
+
184
+ # Project files should be checked into the repository, unless a significant
185
+ # proportion of contributors will probably not be using Sublime Text
186
+ # *.sublime-project
187
+
188
+ # SFTP configuration file
189
+ sftp-config.json
190
+
191
+ # Package control specific files
192
+ Package Control.last-run
193
+ Package Control.ca-list
194
+ Package Control.ca-bundle
195
+ Package Control.system-ca-bundle
196
+ Package Control.cache/
197
+ Package Control.ca-certs/
198
+ Package Control.merged-ca-bundle
199
+ Package Control.user-ca-bundle
200
+ oscrypto-ca-bundle.crt
201
+ bh_unicode_properties.cache
202
+
203
+ # Sublime-github package stores a github token in this file
204
+ # https://packagecontrol.io/packages/sublime-github
205
+ GitHub.sublime-settings
206
+
207
+ ### Windows ###
208
+ # Windows thumbnail cache files
209
+ Thumbs.db
210
+ Thumbs.db:encryptable
211
+ ehthumbs.db
212
+ ehthumbs_vista.db
213
+
214
+ # Dump file
215
+ *.stackdump
216
+
217
+ # Folder config file
218
+ [Dd]esktop.ini
219
+
220
+ # Recycle Bin used on file shares
221
+ $RECYCLE.BIN/
222
+
223
+ # Windows Installer files
224
+ *.cab
225
+ *.msi
226
+ *.msix
227
+ *.msm
228
+ *.msp
229
+
230
+ # Windows shortcuts
231
+ *.lnk
232
+
233
+ # End of https://www.toptal.com/developers/gitignore/api/windows,node,macos,linux,sublimetext
README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Rellow
2
+
3
+ ## Contributing
4
+
5
+ 1. Install [Node](https://nodejs.org/en/). Download the "Recommended for Most Users" version.
6
+
7
+ 2. Clone the repo:
8
+ ``` bash
9
+ git clone git@github.com:rafaelcamargo/rellow.git
10
+ ```
11
+
12
+ 3. Go to the project directory
13
+ ``` bash
14
+ cd rellow
15
+ ```
16
+
17
+ 4. Install the project dependencies
18
+ ``` bash
19
+ npm install
20
+ ```
21
+
22
+ 5. Run the experiment
23
+ ``` bash
24
+ npm run start
25
+ ```
26
+
27
+ ## Tests
28
+
29
+ In case you have changed any project behavior, ensure that all changes are covered with automated tests:
30
+ ``` bash
31
+ npm run test
32
+ ```
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "rellow",
3
+ "version": "0.1.0",
4
+ "description": "Imaginary word generator",
5
+ "main": "src/index.js",
6
+ "scripts": {
7
+ "test": "jest",
8
+ "start": "node ./src/index"
9
+ },
10
+ "keywords": [
11
+ "ai",
12
+ "generative",
13
+ "supervised",
14
+ "learning",
15
+ "word"
16
+ ],
17
+ "author": "Rafael Camargo <hello@rafelcamargo.com>",
18
+ "license": "UNLICENSED",
19
+ "devDependencies": {
20
+ "jest": "^29.7.0"
21
+ },
22
+ "dependencies": {
23
+ "@tensorflow/tfjs-node": "^4.22.0"
24
+ }
25
+ }
src/data/dictionary_compact.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4636de7bb1026b772680a075efc093871c0362fcdbbf588fa7d2f060492b44a4
3
+ size 22458934
src/data/words.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "word": "Eloquent",
4
+ "definition": "Fluent or persuasive in speaking or writing"
5
+ },
6
+ {
7
+ "word": "Serene",
8
+ "definition": "Peaceful and untroubled"
9
+ },
10
+ {
11
+ "word": "Vivid",
12
+ "definition": "Producing powerful feelings or strong"
13
+ },
14
+ {
15
+ "word": "Ardent",
16
+ "definition": "Enthusiastic or passionate"
17
+ },
18
+ {
19
+ "word": "Fragrant",
20
+ "definition": "Having a pleasant or sweet smell"
21
+ },
22
+ {
23
+ "word": "Diligent",
24
+ "definition": "Showing care and conscientiousness in one's work or duties"
25
+ },
26
+ {
27
+ "word": "Imminent",
28
+ "definition": "About to happen"
29
+ },
30
+ {
31
+ "word": "Opaque",
32
+ "definition": "Not able to be seen through"
33
+ },
34
+ {
35
+ "word": "Subtle",
36
+ "definition": "Delicate or precise as to be difficult to perceive or analyze"
37
+ },
38
+ {
39
+ "word": "Ethereal",
40
+ "definition": "Extremely delicate and light in a way that seems not of this world"
41
+ }
42
+ ]
src/index.js ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
const tf = require('@tensorflow/tfjs-node');
const tokenService = require('./services/token');
const dictionary = require('./data/words');

// Build the training dataset: each entry pairs a token-encoded word with its
// token-encoded definition. The word sequence is zero-padded up to the length
// of its definition so both sequences line up position-by-position.
const dataset = dictionary.map(({ word, definition }) => {
  // Strip punctuation before tokenizing.
  // FIX: the previous pattern /,;\./g only matched the literal three-character
  // sequence ",;." — a character class [,;.] is what was intended.
  const encodedDefinition = tokenService.encode(definition.replace(/[,;.]/g, '').split(' '));
  const encodedWord = tokenService.encode([word]);
  return {
    // Reuse the token service's padding helper instead of padding by hand.
    word: tokenService.padEncoding(encodedWord, encodedDefinition.length),
    definition: encodedDefinition,
  };
});

// Vocabulary size for the embedding/output layers.
// TODO: derive this from the token dictionary instead of hard-coding a guess.
const dictionatySize = 200;

// Longest encoded definition — used as the fixed sequence length for the model.
const maxInputSentenceSize = dataset.reduce(
  (maxSize, { definition }) => Math.max(maxSize, definition.length),
  0
);
22
// Builds, trains and exercises a word -> definition sequence model.
// Relies on the module-level `dataset`, `dictionatySize` and
// `maxInputSentenceSize` values computed at load time.
async function run() {
  const model = tf.sequential();
  // Embedding layer for word representations
  model.add(tf.layers.embedding({inputDim: dictionatySize, outputDim: 64, inputLength: maxInputSentenceSize}));
  // LSTM layer for capturing sequence information
  model.add(tf.layers.lstm({units: 128, returnSequences: true}));
  // Dense layer to output a word for each position in the sentence
  // model.add(tf.layers.dense({units: maxInputSentenceSize, activation: 'softmax'}));
  model.add(tf.layers.dense({units: dictionatySize, activation: 'softmax'}));
  model.compile({loss: 'categoricalCrossentropy', optimizer: 'adam'});

  // Prepare input and output sequences as tensors.
  // Both sides are padded to the same fixed length so every sample has the
  // shape the embedding layer was configured with.
  const trainingWords = dataset.map(({ word }) => tokenService.padEncoding(word, maxInputSentenceSize))
  const trainingDefinitions = dataset.map(({ definition }) => tokenService.padEncoding(definition, maxInputSentenceSize))

  const tensorWords = tf.tensor2d(trainingWords); // shape: [numSamples, maxInputLength]
  // const tensorDefinitions = tf.tensor2d(trainingDefinitions); // shape: [numSamples, maxOutputLength, vocabSize]

  // One-hot encode each definition token so the targets match the per-position
  // softmax output: shape [numSamples, maxInputSentenceSize, dictionatySize].
  const tensorDefinitions = tf.tensor3d(
    trainingDefinitions.map(def => tokenService.oneHotEncode(def, dictionatySize)),
    [trainingDefinitions.length, maxInputSentenceSize, dictionatySize]
  );

  // Train the model on text sequences
  await model.fit(tensorWords, tensorDefinitions, {epochs: 100});

  // predict(model, 'Serene') // Understand why definition is not right even for a word already defined in the training dataset
  predict(model, 'Smoker');
}
51
+
52
// Generates and prints a definition for `newWord` using the trained model.
// FIX: padding positions (token id 0) are now dropped from the decoded token
// list before joining. The previous `replace(/0/g, '')` on the joined string
// would strip the digit "0" from anywhere inside legitimate words and left
// double spaces behind where padding tokens used to be.
function predict(model, newWord){
  let encodedWord = tokenService.encode([newWord]);

  // Ensure padding
  encodedWord = tokenService.padEncoding(encodedWord, maxInputSentenceSize);

  // Convert to tensor
  const wordTensor = tf.tensor2d([encodedWord]);

  // Generate prediction
  const prediction = model.predict(wordTensor);

  // Decode the predicted tokens
  const predictedTokens = prediction.argMax(2).arraySync()[0]; // Get token IDs
  console.log({ predictedTokens })
  const predictedDefinition = predictedTokens
    .map(tokenId => tokenService.decodeToken(tokenId))
    .filter(token => token !== 0) // decodeToken yields 0 for padding/unknown ids
    .join(' ');

  console.log(`Generated Definition for '${newWord}': ${predictedDefinition}`);
}
71
+
72
// Kick off training. Surface async failures explicitly instead of leaving
// the returned promise floating with an unhandled rejection.
run().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
src/services/token.js ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Public API of the token service (exported at the bottom of this file).
const _public = {};

// In-memory vocabulary. A word's token id is its index here plus one,
// which keeps id 0 free to act as the padding value.
const dictionary = [];
5
// Maps each word to its 1-based token id, registering previously unseen
// words in the vocabulary as it goes.
_public.encode = words => words.map(word => {
  const index = findWordCode(word);
  if(index === -1) {
    dictionary.push(word);
    return dictionary.length;
  }
  return index + 1;
});
14
// Translates a list of 1-based token ids back into their words.
_public.decode = codes => codes.map(code => dictionary[code - 1]);
20
// Translates a single token id into its word, falling back to 0 for ids
// that have no dictionary entry (e.g. the padding id 0).
_public.decodeToken = code => {
  const word = dictionary[code - 1];
  return word || 0;
};
24
// Expands a sequence of token ids into one-hot rows of length vocabSize.
// Ids outside [0, vocabSize) produce an all-zeros row.
_public.oneHotEncode = (sequence, vocabSize) => sequence.map(tokenId => {
  const row = Array.from({ length: vocabSize }, () => 0);
  const inRange = tokenId >= 0 && tokenId < vocabSize;
  if(inRange) {
    row[tokenId] = 1;
  }
  return row;
});
34
// Right-pads an encoding with zeros until it reaches minLength.
// Encodings already at least minLength long are returned as-is.
_public.padEncoding = (encoding, minLength) => {
  if(encoding.length >= minLength) return encoding;
  const padded = encoding.slice();
  while(padded.length < minLength) {
    padded.push(0);
  }
  return padded;
};
43
// Empties the vocabulary in place — mainly used to isolate automated tests.
_public.resetDictionary = () => {
  dictionary.splice(0, dictionary.length);
};
47
// Zero-based position of a word in the vocabulary, or -1 when absent.
const findWordCode = word => dictionary.indexOf(word);

module.exports = _public;
src/services/token.test.js ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
const tokenService = require('./token')

describe('Token Service', () => {
  afterEach(() => {
    // Each test starts from an empty vocabulary.
    tokenService.resetDictionary()
  });

  it('should encode/decode words', () => {
    expect(tokenService.encode(['glorious', 'times'])).toEqual([1, 2]);
    expect(tokenService.decode([2, 1])).toEqual(['times', 'glorious']);
  });

  // Added: a known word must keep its original code instead of getting a new one.
  it('should reuse the existing code when encoding a known word', () => {
    expect(tokenService.encode(['glorious', 'times'])).toEqual([1, 2]);
    expect(tokenService.encode(['times', 'again'])).toEqual([2, 3]);
  });

  // Added: single-token decoding, including the fallback for unknown/padding ids.
  it('should decode a single token, falling back to 0 for unknown ids', () => {
    tokenService.encode(['glorious']);
    expect(tokenService.decodeToken(1)).toEqual('glorious');
    expect(tokenService.decodeToken(0)).toEqual(0);
  });

  // Added: one-hot encoding, including out-of-range ids yielding all-zero rows.
  it('should one-hot encode a sequence, ignoring out-of-range ids', () => {
    expect(tokenService.oneHotEncode([1, 3], 4)).toEqual([[0, 1, 0, 0], [0, 0, 0, 1]]);
    expect(tokenService.oneHotEncode([9], 4)).toEqual([[0, 0, 0, 0]]);
  });

  it('should pad encoding result when lower than minimum length', () => {
    expect(tokenService.padEncoding([1,2], 5)).toEqual([1,2,0,0,0]);
    expect(tokenService.padEncoding([1,2,3,4,5,6], 6)).toEqual([1,2,3,4,5,6]);
  })
});