Upload folder using huggingface_hub

88df9e4 verified about 1 month ago

6.49 kB

	import walk from 'walk-sync'
	import path from 'path'
	import { TokenizationError, TokenKind } from 'liquidjs'
	import type { TagToken } from 'liquidjs'
	import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils'

	const __dirname = path.dirname(new URL(import.meta.url).pathname)

	const repoRoot = path.resolve(__dirname, '../../../../')
	const contentDirectory = path.resolve(__dirname, repoRoot, 'content/')
	const dataDirectory = path.resolve(__dirname, repoRoot, 'data/')

	const reusablesDirectory = path.resolve(dataDirectory, 'reusables/')

	export type FilesWithLineNumbers = {
	filePath: string
	lineNumbers: number[]
	reusableFile?: string
	}[]
	export type FilesWithSimilarity = {
	filePath: string
	similarityScore: number
	reusableFile?: string
	}[]

	export function filterFiles(files: string[]) {
	return files.filter(
	(filePath) =>
	filePath.endsWith('.md') \|\| (filePath.endsWith('.yml') && !filePath.endsWith('README.md')),
	)
	}

	export function getAllContentFilePaths() {
	const allContentFiles = filterFiles(
	walk(contentDirectory, {
	includeBasePath: true,
	directories: false,
	}),
	)

	const allDataFiles = filterFiles(
	walk(dataDirectory, {
	includeBasePath: true,
	directories: false,
	}),
	)

	return [...allContentFiles, ...allDataFiles]
	}

	// Get the string that represents the reusable in the content files
	export function getReusableLiquidString(reusablePath: string): string {
	const relativePath = path.relative(reusablesDirectory, reusablePath)
	return `reusables.${relativePath.slice(0, -3).split('/').join('.')}`
	}

	export function getIndicesOfLiquidVariable(liquidVariable: string, fileContents: string): number[] {
	const indices: number[] = []
	try {
	const tokens = getLiquidTokens(fileContents).filter(
	(token): token is TagToken => token.kind === TokenKind.Tag,
	)
	for (const token of tokens) {
	if (token.name === 'data' && token.args.trim() === liquidVariable) {
	indices.push(token.begin)
	}
	}
	} catch (err) {
	if (err instanceof TokenizationError) return []
	throw err
	}

	return indices
	}

	// Find the path to a reusable file.
	export function resolveReusablePath(reusablePath: string): string {
	// Try .md if extension is not provided
	if (!reusablePath.endsWith('.md') && !reusablePath.endsWith('.yml')) {
	reusablePath += '.md'
	}

	// Allow user to just pass the name of the file. If it's not ambiguous, we'll find it.
	const allReusableFiles = getAllReusablesFilePaths()
	const foundPaths = []
	for (const possiblePath of allReusableFiles) {
	if (possiblePath.includes(reusablePath)) {
	foundPaths.push(possiblePath)
	}
	}

	if (foundPaths.length === 0) {
	console.error(`Reusables file not found: ${reusablePath}`)
	process.exit(1)
	} else if (foundPaths.length === 1) {
	return foundPaths[0]
	} else {
	console.error(`Multiple reusables found by name: ${reusablePath}`)
	for (let i = 0; i < foundPaths.length; i++) {
	console.error(` ${i + 1}: ${getRelativeReusablesPath(foundPaths[i])}`)
	}
	console.error('Please specify which reusable by passing the full path')
	process.exit(1)
	}
	}

	let paths: string[]
	export function getAllReusablesFilePaths(): string[] {
	if (paths) return paths!
	paths = filterFiles(
	walk(reusablesDirectory, {
	includeBasePath: true,
	directories: false,
	ignore: ['/README.md', 'enterprise_deprecation/'],
	}),
	)
	return paths
	}

	export function findIndicesOfSubstringInString(substr: string, str: string): number[] {
	str = str.toLowerCase()

	const result: number[] = []

	let idx = str.indexOf(substr)

	while (idx !== -1) {
	result.push(idx)
	idx = str.indexOf(substr, idx + 1)
	}
	return result
	}

	export function findSimilarSubStringInString(substr: string, str: string) {
	// Take every sentence in the substr, lower case it, and compare it to every sentence in the str to get a similarity score
	const substrSentences = substr.split('.').map((sentence) => sentence.toLowerCase())
	const corpus = str.split('.').map((sentence) => sentence.toLowerCase())

	let similarityScore = 0

	// Find how similar every two strings are based on the words they share
	for (const substrSentence of substrSentences) {
	for (const sentence of corpus) {
	const substrTokens = substrSentence.split(' ')
	const tokens = sentence.split(' ')

	const sharedWords = substrTokens.filter((token) => tokens.includes(token))

	similarityScore += sharedWords.length / (substrTokens.length + tokens.length)
	}
	}

	// Normalize the similarity score
	return Math.round((similarityScore / substrSentences.length) * corpus.length)
	}

	export function printFindsWithLineNumbers(
	absolute: boolean,
	reusableFindings: { filePath: string; lineNumbers: number[]; reusableFile?: string }[],
	similarityFindings?: { filePath: string; similarityScore: number; reusableFile?: string }[],
	) {
	for (const { filePath, lineNumbers, reusableFile } of reusableFindings) {
	let printReusablePath = reusableFile
	let printFilePath = filePath
	if (!absolute) {
	printReusablePath = getRelativeReusablesPath(printReusablePath as string)
	printFilePath = path.relative(repoRoot, printFilePath)
	}
	if (reusableFile) {
	console.log(`\nReusable ${printReusablePath} can be used`)
	console.log(`In ${printFilePath} on:`)
	} else {
	console.log(`\nIn ${printFilePath} on:`)
	}
	for (const lineNumber of lineNumbers) {
	console.log(` Line ${lineNumber}`)
	}
	}

	if (similarityFindings?.length) {
	console.log('\nFindings using "similar" algorithm:')
	for (const { filePath, similarityScore, reusableFile } of similarityFindings) {
	let printReusablePath = reusableFile
	let printFilePath = filePath
	if (!absolute) {
	printReusablePath = getRelativeReusablesPath(printReusablePath as string)
	printFilePath = path.relative(repoRoot, printFilePath)
	}
	if (reusableFile) {
	console.log(`\nReusables ${printReusablePath} can be used`)
	console.log(`In ${printFilePath} with similarity score: ${similarityScore}`)
	} else {
	console.log(`\nIn ${printFilePath} with similarity score: ${similarityScore}`)
	}
	}
	}
	}

	export function getRelativeReusablesPath(reusablePath: string) {
	if (!reusablePath) {
	return ''
	}
	return path.relative(repoRoot, reusablePath)
	}