AbdulElahGwaith's picture
Upload folder using huggingface_hub
88df9e4 verified
import path from 'path'
import { fileURLToPath } from 'url'

import walk from 'walk-sync'
import { TokenizationError, TokenKind } from 'liquidjs'
import type { TagToken } from 'liquidjs'

import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils'
// Absolute directory of this module. `fileURLToPath` decodes percent-escapes
// (for example `%20` for a space) and handles Windows drive-letter URLs, which
// reading `new URL(...).pathname` directly does not.
const __dirname = path.dirname(fileURLToPath(import.meta.url))
// Four directory levels above this module; every other path below hangs off it.
const repoRoot = path.resolve(__dirname, '../../../../')
// `repoRoot` is already absolute, so it is the only base these need.
const contentDirectory = path.resolve(repoRoot, 'content/')
const dataDirectory = path.resolve(repoRoot, 'data/')
const reusablesDirectory = path.resolve(dataDirectory, 'reusables/')
/**
 * A list of findings, one entry per file, each carrying the line numbers
 * reported for that file.
 */
export type FilesWithLineNumbers = Array<{
  /** Path of the file the finding refers to. */
  filePath: string
  /** Line numbers reported for `filePath`. */
  lineNumbers: number[]
  /** Reusable file associated with the finding, when there is one. */
  reusableFile?: string
}>
/**
 * A list of findings, one entry per file, each carrying a similarity score
 * reported for that file.
 */
export type FilesWithSimilarity = Array<{
  /** Path of the file the finding refers to. */
  filePath: string
  /** Similarity score reported for `filePath`. */
  similarityScore: number
  /** Reusable file associated with the finding, when there is one. */
  reusableFile?: string
}>
/**
 * Keep only Markdown (`.md`) and YAML (`.yml`) paths, excluding README files.
 *
 * The README exclusion applies to every candidate. It used to be grouped with
 * the `.yml` test only, where it could never be true, so `README.md` files
 * were always kept.
 *
 * @param files Paths to filter; the input array is not modified.
 * @returns A new array with the paths that passed, in their original order.
 */
export function filterFiles(files: string[]): string[] {
  return files.filter((filePath) => {
    const hasWantedExtension = filePath.endsWith('.md') || filePath.endsWith('.yml')
    return hasWantedExtension && !filePath.endsWith('README.md')
  })
}
/**
 * Walk the content and data directories and collect every file that passes
 * `filterFiles`.
 *
 * @returns Paths found under `contentDirectory` followed by those found under
 *   `dataDirectory`; each path includes its base directory.
 */
export function getAllContentFilePaths() {
  return [contentDirectory, dataDirectory].flatMap((rootDirectory) =>
    filterFiles(
      walk(rootDirectory, {
        includeBasePath: true,
        directories: false,
      }),
    ),
  )
}
/**
 * Get the string that represents the reusable in the content files.
 *
 * The path is taken relative to `reusablesDirectory`, its file extension is
 * removed, and the remaining path segments are joined with dots behind a
 * `reusables.` prefix. For example `<reusablesDirectory>/foo/bar.md` becomes
 * `reusables.foo.bar`.
 *
 * @param reusablePath Path to a reusable file.
 * @returns The dotted reference string for that reusable.
 */
export function getReusableLiquidString(reusablePath: string): string {
  const relativePath = path.relative(reusablesDirectory, reusablePath)
  // Strip whatever extension is present rather than a fixed three characters,
  // so a `.yml` file does not leave a trailing dot behind.
  const extension = path.extname(relativePath)
  const withoutExtension = relativePath.slice(0, relativePath.length - extension.length)
  // `path.relative` returns platform separators, so split on `path.sep`.
  return `reusables.${withoutExtension.split(path.sep).join('.')}`
}
/**
 * Locate every Liquid tag named `data` whose trimmed arguments equal
 * `liquidVariable`.
 *
 * @param liquidVariable Exact argument text to look for.
 * @param fileContents Text to tokenize.
 * @returns The `begin` value of each matching tag token, in token order. An
 *   empty array is returned when tokenizing fails with a `TokenizationError`.
 * @throws Any other error raised while tokenizing is rethrown unchanged.
 */
export function getIndicesOfLiquidVariable(liquidVariable: string, fileContents: string): number[] {
  try {
    return getLiquidTokens(fileContents)
      .filter((token): token is TagToken => token.kind === TokenKind.Tag)
      .filter((tag) => tag.name === 'data' && tag.args.trim() === liquidVariable)
      .map((tag) => tag.begin)
  } catch (error) {
    // Content that cannot be tokenized is treated as having no matches.
    if (!(error instanceof TokenizationError)) throw error
    return []
  }
}
/**
 * Find the path to a reusable file.
 *
 * The argument may be a bare file name or a partial path: every known
 * reusable whose path contains it is a candidate. `.md` is appended when the
 * argument ends in neither `.md` nor `.yml`.
 *
 * @param reusablePath Name or (partial) path of the reusable to look up.
 * @returns The single matching path. When there are no matches, or more than
 *   one, an error is printed and the process exits with status 1.
 */
export function resolveReusablePath(reusablePath: string): string {
  // Default to a Markdown file when the caller left the extension off.
  const hasKnownExtension = reusablePath.endsWith('.md') || reusablePath.endsWith('.yml')
  const target = hasKnownExtension ? reusablePath : `${reusablePath}.md`

  // Substring match, so a unique file name alone is enough to identify a reusable.
  const matches = getAllReusablesFilePaths().filter((candidate) => candidate.includes(target))

  if (matches.length === 1) {
    return matches[0]
  }
  if (matches.length === 0) {
    console.error(`Reusables file not found: ${target}`)
    process.exit(1)
  } else {
    // Ambiguous: list every candidate so the caller can pass a fuller path.
    console.error(`Multiple reusables found by name: ${target}`)
    let position = 0
    for (const match of matches) {
      position += 1
      console.error(` ${position}: ${getRelativeReusablesPath(match)}`)
    }
    console.error('Please specify which reusable by passing the full path')
    process.exit(1)
  }
}
// Cached result of walking the reusables directory; unset until the first call.
let paths: string[] | undefined
/**
 * List every reusable file under `reusablesDirectory` that passes
 * `filterFiles`, skipping README files and the `enterprise_deprecation`
 * directory.
 *
 * The directory is walked on the first call only; later calls return the same
 * cached array.
 *
 * @returns Reusable file paths, each including the base directory.
 */
export function getAllReusablesFilePaths(): string[] {
  if (!paths) {
    const walked = walk(reusablesDirectory, {
      includeBasePath: true,
      directories: false,
      ignore: ['**/README.md', 'enterprise_deprecation/**'],
    })
    paths = filterFiles(walked)
  }
  return paths
}
/**
 * Find every position at which `substr` occurs in `str`, ignoring case.
 *
 * Both arguments are lower-cased before searching, so callers do not need to
 * normalize `substr` themselves. Overlapping occurrences are all reported.
 *
 * @param substr Text to search for. An empty string yields no matches.
 * @param str Text to search in.
 * @returns Ascending start indices of each occurrence within the lower-cased
 *   copy of `str`.
 */
export function findIndicesOfSubstringInString(substr: string, str: string): number[] {
  const needle = substr.toLowerCase()
  // `indexOf('', from)` clamps to the string length instead of returning -1,
  // so an empty needle would make the scan below loop forever.
  if (needle === '') return []

  const haystack = str.toLowerCase()
  const result: number[] = []
  for (
    let idx = haystack.indexOf(needle);
    idx !== -1;
    // Advance by one, not by the needle length, to keep overlapping matches.
    idx = haystack.indexOf(needle, idx + 1)
  ) {
    result.push(idx)
  }
  return result
}
/**
 * Score how much `substr` resembles `str` from the words their sentences share.
 *
 * Both texts are split on `.` into sentences and lower-cased, then each
 * sentence is split on single spaces into words. For every pairing of a
 * `substr` sentence with a `str` sentence, the number of `substr` words that
 * also appear in the `str` sentence is divided by the combined word count of
 * the pair. The sum over all pairings is divided by the number of `substr`
 * sentences, multiplied by the number of `str` sentences, and rounded.
 *
 * @param substr Text whose sentences are looked for.
 * @param str Text to compare against.
 * @returns The rounded score described above.
 */
export function findSimilarSubStringInString(substr: string, str: string) {
  const toLowerCaseSentences = (text: string) => text.split('.').map((part) => part.toLowerCase())

  const needleSentences = toLowerCaseSentences(substr)
  // Tokenize each corpus sentence once instead of once per pairing.
  const corpusWords = toLowerCaseSentences(str).map((sentence) => sentence.split(' '))

  let total = 0
  for (const needleSentence of needleSentences) {
    const needleWords = needleSentence.split(' ')
    for (const sentenceWords of corpusWords) {
      let shared = 0
      for (const word of needleWords) {
        if (sentenceWords.includes(word)) shared += 1
      }
      total += shared / (needleWords.length + sentenceWords.length)
    }
  }

  // Normalize the similarity score
  return Math.round((total / needleSentences.length) * corpusWords.length)
}
/**
 * Print findings to the console.
 *
 * Each entry of `reusableFindings` is printed with its file and one line per
 * line number. When `similarityFindings` is non-empty, a second section lists
 * each of its entries with its similarity score. Entries that carry a
 * `reusableFile` also get a line naming that reusable.
 *
 * @param absolute When true, paths are printed as given; otherwise they are
 *   printed relative to `repoRoot`.
 * @param reusableFindings Findings reported with line numbers.
 * @param similarityFindings Optional findings reported with a similarity score.
 */
export function printFindsWithLineNumbers(
  absolute: boolean,
  reusableFindings: { filePath: string; lineNumbers: number[]; reusableFile?: string }[],
  similarityFindings?: { filePath: string; similarityScore: number; reusableFile?: string }[],
) {
  // Path formatting shared by both sections of the report.
  const toPrintable = (filePath: string, reusableFile?: string) => {
    if (absolute) {
      return { shownReusable: reusableFile, shownFile: filePath }
    }
    return {
      // A missing reusable comes back as '' from the helper.
      shownReusable: getRelativeReusablesPath(reusableFile as string),
      shownFile: path.relative(repoRoot, filePath),
    }
  }

  for (const finding of reusableFindings) {
    const { shownReusable, shownFile } = toPrintable(finding.filePath, finding.reusableFile)
    if (finding.reusableFile) {
      console.log(`\nReusable ${shownReusable} can be used`)
      console.log(`In ${shownFile} on:`)
    } else {
      console.log(`\nIn ${shownFile} on:`)
    }
    for (const lineNumber of finding.lineNumbers) {
      console.log(` Line ${lineNumber}`)
    }
  }

  // Nothing further to print without similarity results.
  if (!similarityFindings?.length) {
    return
  }

  console.log('\nFindings using "similar" algorithm:')
  for (const finding of similarityFindings) {
    const { shownReusable, shownFile } = toPrintable(finding.filePath, finding.reusableFile)
    if (finding.reusableFile) {
      console.log(`\nReusables ${shownReusable} can be used`)
      console.log(`In ${shownFile} with similarity score: ${finding.similarityScore}`)
    } else {
      console.log(`\nIn ${shownFile} with similarity score: ${finding.similarityScore}`)
    }
  }
}
/**
 * Express a reusable's path relative to `repoRoot`.
 *
 * @param reusablePath Path to convert.
 * @returns The path relative to `repoRoot`, or an empty string when
 *   `reusablePath` is falsy.
 */
export function getRelativeReusablesPath(reusablePath: string) {
  return reusablePath ? path.relative(repoRoot, reusablePath) : ''
}