// See how a piece of text gets turned into tokens by the different analyzers.
// Requires that the index exists in Elasticsearch.
//
// Example:
//
//   npm run analyze-text -- -V dotcom -l en "The name of the wind"

import { Client } from '@elastic/elasticsearch'
import { Command, Option } from 'commander'
import chalk from 'chalk'
import dotenv from 'dotenv'

import { languageKeys } from '@/languages/lib/languages-server'
import { allVersions } from '@/versions/lib/all-versions'

import type { IndicesAnalyzeAnalyzeToken } from '@elastic/elasticsearch/lib/api/types'

// Load `.env` so that ELASTICSEARCH_URL can optionally be set there.
dotenv.config()

// The type of a single entry of `allVersions`, derived from the value itself
// so that no extra type import is needed.
type VersionInfo = (typeof allVersions)[keyof typeof allVersions]

// Create an object that maps the "short name" of a version to
// all information about it. E.g.
//
//   {
//     'ghes-3.5': {
//        hasNumberedReleases: true,
//        currentRelease: '3.5',
//        version: 'enterprise-server@3.5',
//        miscBaseName: 'ghes-'
//        ...
//     },
//     ...
//
// We need this later to be able to map CLI arguments to what the
// records are called when found on disk.
const shortNames: Record<string, VersionInfo> = Object.fromEntries(
  Object.values(allVersions).map((info): [string, VersionInfo] => {
    const shortName = info.hasNumberedReleases
      ? `${info.miscBaseName}${info.currentRelease}`
      : info.miscBaseName
    return [shortName, info]
  }),
)

const allVersionKeys: string[] = Object.keys(shortNames)

/** Parsed command-line options, keyed by commander's camelCased option names. */
interface Options {
  verbose?: boolean
  version?: string
  language?: string
  notLanguage?: string
  elasticsearchUrl?: string
  indexPrefix?: string
}

const program = new Command()

// Every option except `--verbose` takes a value, so each one needs a
// `<placeholder>`; without it commander treats the option as a boolean flag.
// The positional argument is variadic because `main` joins all of the
// positional arguments into a single text.
program
  .description('Analyze text into tokens')
  .option('-v, --verbose', 'Verbose outputs')
  .addOption(new Option('-V, --version <VERSION>', 'Specific version').choices(allVersionKeys))
  .addOption(
    new Option('-l, --language <LANGUAGE>', 'Which language to focus on').choices(languageKeys),
  )
  .option('--not-language <LANGUAGE>', 'Exclude a specific language')
  .option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
  .option('--index-prefix <prefix>', 'Prefix for the index name')
  .argument('<text...>', 'text to tokenize')
  .parse(process.argv)

const options: Options = program.opts()
const args: string[] = program.args

// Any error thrown by `main` is printed and turned into a non-zero exit code.
try {
  await main(options, args)
} catch (err) {
  console.error(chalk.red('Error:'), err)
  process.exit(1)
}

/**
 * Resolve the Elasticsearch URL and the index name from the CLI options,
 * connect to Elasticsearch and print how the given text is tokenized.
 *
 * @param opts - Parsed command-line options.
 * @param textArgs - Positional arguments; they are joined with a single space
 *   into the one text that gets analyzed.
 * @throws Error if no Elasticsearch URL is configured, if the URL is not
 *   valid, or if `--language` and `--not-language` are combined. Errors from
 *   the Elasticsearch client (ping, analyze) propagate as well.
 */
async function main(opts: Options, textArgs: string[]): Promise<void> {
  const texts = [textArgs.join(' ')]

  const configuredUrl = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL
  if (!configuredUrl) {
    throw new Error(
      'Must pass the elasticsearch URL option or ' +
        'set the environment variable ELASTICSEARCH_URL',
    )
  }
  let node = configuredUrl

  // Allow the user to lazily set it to `localhost:9200` for example.
  if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
    node = `http://${node}`
  }

  // Fail like every other validation in this function does, so that an
  // invalid URL yields a non-zero exit code instead of a silent return.
  try {
    const parsed = new URL(node)
    if (!parsed.hostname) throw new Error('No valid hostname')
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err)
    throw new Error(`URL for Elasticsearch not a valid URL: ${reason}`)
  }

  const { verbose, language, notLanguage } = opts

  // The notLanguage is useful if you want to, for example, index all languages
  // *except* English.
  // NOTE: in this script `--not-language` is only validated here; it is not
  // otherwise used.
  if (language && notLanguage) {
    throw new Error("Can't combine --language and --not-language")
  }

  if (verbose) {
    console.log(`Connecting to ${chalk.bold(safeUrlDisplay(node))}`)
  }

  const client = new Client({ node })
  try {
    // This will throw if it can't ping
    await client.ping()

    const versionKey = opts.version || 'dotcom'
    if (verbose) {
      console.log(`Analyzing on version ${chalk.bold(versionKey)}`)
    }
    const languageKey = opts.language || 'en'
    if (verbose) {
      console.log(`Analyzing on language ${chalk.bold(languageKey)}`)
    }

    const { indexPrefix } = opts
    const prefix = indexPrefix ? `${indexPrefix}_` : ''

    const indexName = `${prefix}github-docs-${versionKey}-${languageKey}`
    console.log(chalk.yellow(`Analyzing in ${chalk.bold(indexName)}`))
    await analyzeVersion(client, texts, indexName)
  } finally {
    // Release the client's connections whether or not the analysis succeeded.
    await client.close()
  }
}

/**
 * Return the URL as a string that is safe to print: a password is replaced
 * with `***` and a username is cut down to its first 4 characters followed
 * by `***`.
 *
 * @param url - An absolute URL; `new URL(url)` throws if it cannot be parsed.
 * @returns The serialized URL with its credentials masked.
 */
function safeUrlDisplay(url: string): string {
  const parsed = new URL(url)
  if (parsed.password) {
    parsed.password = '***'
  }
  if (parsed.username) {
    parsed.username = `${parsed.username.slice(0, 4)}***`
  }
  return parsed.toString()
}

/**
 * For every text, run it through each of the analyzers on the given index
 * and print the resulting tokens.
 *
 * The requests are made one at a time so that the printed output stays in
 * text/analyzer order.
 *
 * @param client - Elasticsearch client to send the analyze requests with.
 * @param texts - The texts to tokenize.
 * @param indexName - Name of the index whose analyzers are used.
 */
async function analyzeVersion(client: Client, texts: string[], indexName: string): Promise<void> {
  for (const text of texts) {
    console.log(`RAW TEXT: 〝${chalk.italic(text)}〞`)
    for (const analyzer of ['text_analyzer_explicit', 'text_analyzer', 'standard']) {
      console.log('ANALYZER:', chalk.bold(analyzer))
      const response = await client.indices.analyze({
        index: indexName,
        body: { analyzer, text },
      })

      const tokens: IndicesAnalyzeAnalyzeToken[] | undefined = response.tokens
      const tokenWords: string[] = tokens?.map((token) => token.token) ?? []
      console.log(tokenWords)
    }
  }
}