Upload folder using huggingface_hub

88df9e4 verified about 1 month ago

12.8 kB

	import { fileURLToPath } from 'url'
	import { Command } from 'commander'
	import fs from 'fs'
	import yaml from 'js-yaml'
	import path from 'path'
	import ora from 'ora'
	import { execSync } from 'child_process'
	import { callModelsApi } from '@/ai-tools/lib/call-models-api'
	import dotenv from 'dotenv'
	import readFrontmatter from '@/frame/lib/read-frontmatter'
	import { schema } from '@/frame/lib/frontmatter'
	dotenv.config({ quiet: true })

	// ES modules have no built-in __dirname, so derive it from this module's URL.
	const __dirname = path.dirname(fileURLToPath(import.meta.url))
	// Directory holding one `<type>.md` prompt per editor type plus the shared template.
	const promptDir = path.join(__dirname, '../prompts')
	const promptTemplatePath = path.join(promptDir, 'prompt-template.yml')

	// Make sure a GitHub token is available before anything else runs.
	if (!process.env.GITHUB_TOKEN) {
	  // Try to find a token via the CLI before giving up.
	  let token = ''
	  try {
	    // Trim the output so the value stored in the environment is the bare token
	    // without the trailing newline the command prints.
	    token = execSync('gh auth token').toString().trim()
	  } catch {
	    // execSync throws when `gh` is not installed or exits non-zero (for example
	    // when not logged in). Fall through so the guidance below is shown instead
	    // of an uncaught exception.
	  }
	  if (token.startsWith('gh')) {
	    process.env.GITHUB_TOKEN = token
	  } else {
	    console.warn(`🔑 A token is needed to run this script. Please do one of the following and try again:

	1. Add a GITHUB_TOKEN to a local .env file.
	2. Install https://cli.github.com and authenticate via 'gh auth login'.
	`)
	    process.exit(1)
	  }
	}

	/**
	 * Lists the editor types that can be requested on the command line.
	 *
	 * Every entry in the prompts directory whose name ends in `.md` counts as one
	 * editor type, named after the file without that extension. When the directory
	 * cannot be read, a warning is printed and an empty list is returned.
	 */
	const getAvailableEditorTypes = (): string[] => {
	  let directoryEntries: string[]
	  try {
	    directoryEntries = fs.readdirSync(promptDir)
	  } catch {
	    console.warn('Could not read prompts directory, using empty editor types')
	    return []
	  }

	  return directoryEntries
	    .filter((entryName) => entryName.endsWith('.md'))
	    .map((entryName) => path.basename(entryName, '.md'))
	}

	// Discovered once at module load; reused for the CLI option help text and for
	// validating the prompt types requested on the command line.
	const editorTypes = getAvailableEditorTypes()

	/**
	 * Reports whether `target` is `rootDir` itself or is located beneath it.
	 *
	 * A plain `target.startsWith(rootDir)` would also accept sibling paths that merely
	 * share a prefix (root `/a/docs`, target `/a/docs-old`), so the decision is made
	 * on the relative path between the two instead.
	 */
	const isPathInsideRoot = (rootDir: string, target: string): boolean => {
	  const relative = path.relative(rootDir, target)
	  if (relative === '') {
	    return true
	  }
	  // An absolute result means the two paths share no common root.
	  return relative !== '..' && !relative.startsWith(`..${path.sep}`) && !path.isAbsolute(relative)
	}

	/**
	 * Recursively collects the real paths of all `.md` files under `dir`.
	 *
	 * @param dir - Directory to scan; resolved to its real path first.
	 * @param rootDir - Real path of the directory the scan must stay inside.
	 * @param depth - Current recursion depth (callers normally omit this).
	 * @param maxDepth - Directories deeper than this are not scanned.
	 * @param visited - Real paths of directories already scanned, to prevent loops.
	 * @returns Real paths of the markdown files found. Directories that cannot be
	 *   resolved or read, or that lie outside `rootDir`, contribute nothing.
	 */
	const findMarkdownFiles = (
	  dir: string,
	  rootDir: string,
	  depth: number = 0,
	  maxDepth: number = 20,
	  visited: Set<string> = new Set(),
	): string[] => {
	  let realDir: string
	  try {
	    realDir = fs.realpathSync(dir)
	  } catch {
	    // If we can't resolve the real path, skip this directory
	    return []
	  }
	  // Prevent escaping the root directory
	  if (!isPathInsideRoot(rootDir, realDir)) {
	    return []
	  }
	  // Prevent symlink loops
	  if (visited.has(realDir)) {
	    return []
	  }
	  visited.add(realDir)
	  // Prevent excessive depth
	  if (depth > maxDepth) {
	    return []
	  }

	  let entries: fs.Dirent[]
	  try {
	    entries = fs.readdirSync(realDir, { withFileTypes: true })
	  } catch {
	    // If we can't read the directory, skip it
	    return []
	  }

	  const markdownFiles: string[] = []
	  for (const entry of entries) {
	    let realFullPath: string
	    try {
	      realFullPath = fs.realpathSync(path.join(realDir, entry.name))
	    } catch {
	      continue
	    }
	    // Prevent escaping the root directory for individual entries as well
	    if (!isPathInsideRoot(rootDir, realFullPath)) {
	      continue
	    }
	    // NOTE(review): the Dirent checks below describe the entry itself, so entries
	    // that are symlinks appear to be skipped rather than followed — confirm intended.
	    if (entry.isDirectory()) {
	      const nested = findMarkdownFiles(realFullPath, rootDir, depth + 1, maxDepth, visited)
	      // Append one by one; spreading a very large array into push() can exhaust the stack.
	      for (const nestedFile of nested) {
	        markdownFiles.push(nestedFile)
	      }
	    } else if (entry.isFile() && entry.name.endsWith('.md')) {
	      markdownFiles.push(realFullPath)
	    }
	  }
	  return markdownFiles
	}

	/** Builds the comma-separated list of known editor types used in the CLI option help text. */
	const refinementDescriptions = (): string => editorTypes.join(', ')

	/** Options object handed to the command's action callback after argument parsing. */
	interface CliOptions {
	  /** Paths given with `-f, --files`; the only required option. */
	  files: string[]
	  /** Prompt types given with `-p, --prompt`. */
	  prompt?: string[]
	  /** Prompt types given with the deprecated `-r, --refine` option. */
	  refine?: string[]
	  /** Set by `-w, --write`: write results back to the files instead of printing them. */
	  write?: boolean
	  /** Set by `-v, --verbose`. */
	  verbose?: boolean
	}

	// Command-line definition: every requested prompt type is run against every
	// requested file (directories are expanded to the markdown files inside them).
	const program = new Command()

	program
	  .name('ai-tools')
	  .description('AI-powered content tools for editing and analysis')
	  .option('-v, --verbose', 'Enable verbose output')
	  .option(
	    '-w, --write',
	    'Write changes back to the original files (default: output to console only)',
	  )
	  .option('-p, --prompt <type...>', `Specify one or more prompt type: ${refinementDescriptions()}`)
	  .option(
	    '-r, --refine <type...>',
	    `(Deprecated: use --prompt) Specify one or more prompt type: ${refinementDescriptions()}`,
	  )
	  .requiredOption(
	    '-f, --files <files...>',
	    'One or more content file paths in the content directory',
	  )
	  .action((options: CliOptions) => {
	    // Caught values are `unknown`; narrow before reading a message from them.
	    const describeError = (err: unknown): string =>
	      err instanceof Error ? err.message : String(err)

	    // parse() below is synchronous, so the async work is started here without being
	    // awaited; the trailing catch keeps a failure from becoming an unhandled rejection.
	    ;(async () => {
	      const spinner = ora('Starting AI review...').start()

	      const files = options.files
	      // Handle both --prompt and --refine options for backwards compatibility
	      const prompts = options.prompt ?? options.refine
	      const writeMode = options.write ?? false
	      const verbose = options.verbose ?? false

	      if (!prompts || prompts.length === 0) {
	        spinner.fail('No prompt type specified. Use --prompt or --refine with one or more types.')
	        process.exitCode = 1
	        return
	      }

	      // Validate that all requested editor types exist
	      const availableEditors = editorTypes
	      for (const editor of prompts) {
	        if (!availableEditors.includes(editor)) {
	          spinner.fail(
	            `Unknown prompt type: ${editor}. Available types: ${availableEditors.join(', ')}`,
	          )
	          process.exitCode = 1
	          return
	        }
	      }

	      if (verbose) {
	        console.log(`Processing ${files.length} files with prompts: ${prompts.join(', ')}`)
	      }

	      for (const file of files) {
	        const filePath = path.resolve(process.cwd(), file)
	        // start() rather than a bare text assignment: the spinner may have been
	        // stopped while handling the previous target.
	        spinner.start(`Checking file: ${file}`)

	        if (!fs.existsSync(filePath)) {
	          spinner.fail(`File not found: ${filePath}`)
	          process.exitCode = 1
	          continue
	        }

	        // Expand the target into concrete files once, instead of once per prompt.
	        let filesToProcess: string[] = []
	        try {
	          if (fs.statSync(filePath).isDirectory()) {
	            // Find all markdown files in the directory recursively.
	            // Use process.cwd() as the root directory for safety.
	            const rootDir = fs.realpathSync(process.cwd())
	            filesToProcess = findMarkdownFiles(filePath, rootDir)

	            if (filesToProcess.length === 0) {
	              spinner.warn(`No markdown files found in directory: ${file}`)
	              continue
	            }

	            spinner.text = `Found ${filesToProcess.length} markdown files in ${file}`
	          } else {
	            filesToProcess = [filePath]
	          }
	        } catch (err) {
	          const targetName = path.relative(process.cwd(), filePath)
	          spinner.fail(`Error processing ${targetName}: ${describeError(err)}`)
	          process.exitCode = 1
	          continue
	        }

	        for (const editorType of prompts) {
	          for (const fileToProcess of filesToProcess) {
	            const relativePath = path.relative(process.cwd(), fileToProcess)
	            // The spinner is stopped before each file's output is printed, so it has
	            // to be restarted for every file to stay visible.
	            spinner.start(`Processing: ${relativePath}`)
	            try {
	              // Read inside the loop so a later prompt sees what an earlier one wrote.
	              const content = fs.readFileSync(fileToProcess, 'utf8')
	              const answer = await callEditor(editorType, content, writeMode, verbose)
	              spinner.stop()

	              if (writeMode) {
	                // Never replace a file's contents with an empty response.
	                if (answer.trim() === '') {
	                  throw new Error('AI returned an empty response; file left unchanged')
	                }
	                if (editorType === 'intro') {
	                  // For frontmatter addition/modification, merge properties instead of overwriting entire file
	                  const updatedContent = mergeFrontmatterProperties(fileToProcess, answer)
	                  fs.writeFileSync(fileToProcess, updatedContent, 'utf8')
	                  console.log(`✅ Added frontmatter properties to: ${relativePath}`)
	                } else {
	                  // For other editor types, write the full result back to the original file
	                  fs.writeFileSync(fileToProcess, answer, 'utf8')
	                  console.log(`✅ Updated: ${relativePath}`)
	                }
	              } else {
	                // Just output to console
	                if (filesToProcess.length > 1) {
	                  console.log(`\n=== ${relativePath} ===`)
	                }
	                console.log(answer)
	              }
	            } catch (err) {
	              // One failing file must not abort the rest of the run.
	              spinner.fail(`Error processing ${relativePath}: ${describeError(err)}`)
	              process.exitCode = 1
	            } finally {
	              spinner.stop()
	            }
	          }
	        }
	      }

	      spinner.stop()

	      // Exit with appropriate code based on whether any errors occurred
	      if (process.exitCode) {
	        process.exit(process.exitCode)
	      }
	    })().catch((err: unknown) => {
	      console.error(`Unexpected error: ${describeError(err)}`)
	      process.exit(1)
	    })
	  })

	program.parse(process.argv)

	// Handle graceful shutdown: print a notice for the received signal, then exit with status 0.
	for (const [signalName, notice] of [
	  ['SIGINT', '\n\n🛑 Process interrupted by user'],
	  ['SIGTERM', '\n\n🛑 Process terminated'],
	] as const) {
	  process.on(signalName, () => {
	    console.log(notice)
	    process.exit(0)
	  })
	}

	/** One entry of the prompt template's `messages` array. */
	interface PromptMessage {
	  /** Role label attached to the message. */
	  role: string
	  /** Message text; template placeholders are substituted into it before sending. */
	  content: string
	}

	/**
	 * Shape expected from the parsed prompt template file. The whole object is
	 * handed to `callModelsApi` once the message contents have been filled in.
	 */
	interface PromptData {
	  /** Messages whose contents receive the prompt text and the input. */
	  messages: PromptMessage[]
	  // Optional settings; this file only carries them through unchanged.
	  model?: string
	  max_tokens?: number
	  temperature?: number
	}

	/**
	 * Merges AI-generated frontmatter properties into a file's existing frontmatter.
	 *
	 * Nothing is written here; the caller decides what to do with the returned text.
	 *
	 * @param filePath - File whose current frontmatter and body are read from disk.
	 * @param newPropertiesYaml - YAML mapping of properties to add or replace,
	 *   optionally wrapped in a ```yaml code fence.
	 * @returns The complete file text with the merged frontmatter.
	 * @throws Error when the existing frontmatter cannot be parsed, the file has no
	 *   body content, or the response is not a usable YAML mapping.
	 */
	function mergeFrontmatterProperties(filePath: string, newPropertiesYaml: string): string {
	  interface FrontmatterProperties {
	    intro?: string
	    [key: string]: unknown
	  }

	  const content = fs.readFileSync(filePath, 'utf8')
	  const parsed = readFrontmatter(content)

	  if (parsed.errors && parsed.errors.length > 0) {
	    throw new Error(
	      `Failed to parse frontmatter: ${parsed.errors.map((e) => e.message).join(', ')}`,
	    )
	  }

	  if (!parsed.content) {
	    throw new Error('Failed to parse content from file')
	  }

	  try {
	    // Clean up the AI response - remove markdown code blocks if present
	    const cleanedYaml = newPropertiesYaml
	      .trim()
	      .replace(/^```ya?ml\s*\n/i, '')
	      .replace(/\n```\s*$/i, '')
	      .trim()

	    // NOTE(review): this parses model output; assumes a js-yaml version whose
	    // load() does not construct arbitrary JS types by default — confirm.
	    const loaded: unknown = yaml.load(cleanedYaml)
	    // An empty response, a bare scalar or a list cannot be merged as properties.
	    if (loaded === null || typeof loaded !== 'object' || Array.isArray(loaded)) {
	      throw new Error('AI response is not a YAML mapping of frontmatter properties')
	    }
	    const newProperties = loaded as FrontmatterProperties

	    // Security: Validate against prototype pollution using the official frontmatter schema
	    const allowedKeys = new Set(Object.keys(schema.properties))

	    const sanitizedProperties = Object.fromEntries(
	      Object.entries(newProperties).filter(([key]) => {
	        if (allowedKeys.has(key)) {
	          return true
	        }
	        console.warn(`Filtered out potentially unsafe frontmatter key: ${key}`)
	        return false
	      }),
	    )

	    // Merge new properties with existing frontmatter
	    const mergedData: FrontmatterProperties = { ...parsed.data, ...sanitizedProperties }

	    let result = readFrontmatter.stringify(parsed.content, mergedData)

	    // Post-process to ensure a newly supplied intro is wrapped in single quotes.
	    // Only the sanitized value is used, so a filtered-out key can never be written.
	    const newIntro = sanitizedProperties.intro
	    if (newIntro) {
	      // Escape single quotes by doubling them
	      const quotedIntroLine = `intro: '${String(newIntro).replace(/'/g, "''")}'`

	      // Restrict the rewrite to the leading frontmatter block so an `intro:` line in
	      // the article body is never touched. Assumes the stringified frontmatter is
	      // delimited by `---` lines; if it is not, the whole text is searched instead.
	      const closingDelimiter = result.startsWith('---') ? result.indexOf('\n---', 3) : -1
	      const frontmatterEnd = closingDelimiter === -1 ? result.length : closingDelimiter

	      // Matches a single-line intro in any quote style. Differences that matter:
	      //  - `[ \t]*` allows zero or more spaces and never consumes the line break;
	      //  - `(?![>|])` leaves block scalars alone (their value is on following lines);
	      //  - a quote that is opened must also be closed on the same line.
	      const introLine = /^intro:[ \t]*(?![>|])(['"`]?)([^'"`\n\r]+)\1[ \t]*$/m

	      // A replacer function is used because a replacement string would expand
	      // `$&`, `$1`, `$$` and similar sequences occurring in the intro text.
	      result =
	        result.slice(0, frontmatterEnd).replace(introLine, () => quotedIntroLine) +
	        result.slice(frontmatterEnd)
	    }
	    return result
	  } catch (error) {
	    console.error('Failed to parse AI response as YAML:')
	    console.error('Raw AI response:', JSON.stringify(newPropertiesYaml))
	    throw new Error(`Failed to parse new frontmatter properties: ${error}`)
	  }
	}

	/**
	 * Builds the prompt for one editor type and sends it to the models API.
	 *
	 * The shared template is loaded from disk on every call, each message has the
	 * editor's markdown prompt and the input text substituted into it, and the
	 * write-mode conditional sections are resolved before the request is made.
	 *
	 * @param editorType - Name of the prompt file (without `.md`) in the prompts directory.
	 * @param content - Text substituted for the `{{input}}` placeholder.
	 * @param writeMode - Selects which side of the IF/ELSE write-mode sections is kept.
	 * @param verbose - Passed through to `callModelsApi`.
	 * @returns Whatever `callModelsApi` resolves to for the assembled prompt.
	 * @throws Error when the prompt file is missing or the template has no messages array.
	 */
	async function callEditor(
	  editorType: string,
	  content: string,
	  writeMode: boolean,
	  verbose = false,
	): Promise<string> {
	  const markdownPromptPath = path.join(promptDir, `${editorType}.md`)

	  if (!fs.existsSync(markdownPromptPath)) {
	    throw new Error(`Prompt file not found: ${markdownPromptPath}`)
	  }

	  const markdownPrompt = fs.readFileSync(markdownPromptPath, 'utf8')

	  // An empty template file parses to undefined, hence the widened type.
	  const prompt = yaml.load(fs.readFileSync(promptTemplatePath, 'utf8')) as
	    | PromptData
	    | null
	    | undefined

	  // Validate the prompt template has required properties
	  if (!prompt || !Array.isArray(prompt.messages)) {
	    throw new Error('Invalid prompt template: missing or invalid messages array')
	  }

	  for (const msg of prompt.messages) {
	    msg.content = msg.content
	      // Replacer functions insert the text verbatim; a replacement *string* would
	      // expand `$&`, `$1`, `$$` and similar sequences found in the prompt or input.
	      .replace('{{markdownPrompt}}', () => markdownPrompt)
	      .replace('{{input}}', () => content)
	      // Turn the write-mode markers into REMOVE_START/REMOVE_END pairs around
	      // whichever branch does not apply to the current mode.
	      .replace(/<!-- IF_WRITE_MODE -->/g, writeMode ? '' : '<!-- REMOVE_START -->')
	      .replace(/<!-- ELSE_WRITE_MODE -->/g, writeMode ? '<!-- REMOVE_START -->' : '')
	      .replace(/<!-- END_WRITE_MODE -->/g, writeMode ? '' : '<!-- REMOVE_END -->')
	      // Remove sections marked for removal
	      .replace(/<!-- REMOVE_START -->[\s\S]*?<!-- REMOVE_END -->/g, '')
	  }

	  return callModelsApi(prompt, verbose)
	}