Upload folder using huggingface_hub

88df9e4 verified about 1 month ago

6.95 kB

	import fs from 'fs'
	import yaml from 'js-yaml'
	import { cuss } from 'cuss'
	import { cuss as cussPt } from 'cuss/pt'
	import { cuss as cussFr } from 'cuss/fr'
	import { cuss as cussEs } from 'cuss/es'
	// Module-level cache for the language guesser; stays `null` until the
	// first call to getLanguageInstance().
	let language: any = null

	/**
	 * Returns the shared `Language` instance from
	 * `@horizon-rs/language-guesser`, creating it on first use.
	 *
	 * The package is loaded with a dynamic `import()` so it is only pulled
	 * in once a caller actually needs a language guess.
	 */
	async function getLanguageInstance() {
		if (language) return language

		const guesserModule = await import('@horizon-rs/language-guesser')
		language = new guesserModule.Language()
		return language
	}

	/**
	 * Ordered table of the heuristics that `analyzeComment` runs against a
	 * comment. For every entry whose `validator` returns a truthy value,
	 * `name` is recorded as a signal and `reduction` is subtracted from the
	 * comment's rating (which starts at 1.0).
	 *
	 * Validators receive the comment text and the comment's language code;
	 * most of them only look at the text. A validator may be async.
	 *
	 * Exported for the debugging CLI script.
	 */
	export const SIGNAL_RATINGS = [
		{
			reduction: 1.0,
			name: 'email-only',
			validator: (text: string) => isEmailOnly(text),
		},
		{
			reduction: 0.2,
			name: 'contains-email',
			validator: (text: string) => isContainingEmail(text),
		},
		{
			reduction: 1.0,
			name: 'url-only',
			validator: (text: string) => isURL(text),
		},
		{
			reduction: 1.0,
			name: 'numbers-only',
			validator: (text: string) => isNumbersOnly(text),
		},
		{
			reduction: 0.1,
			name: 'all-uppercase',
			validator: (text: string) => isAllUppercase(text),
		},
		{
			reduction: 0.5,
			name: 'single-word',
			validator: (text: string) => isSingleWord(text),
		},
		{
			reduction: 0.2,
			name: 'too-short',
			validator: (text: string) => isTooShort(text),
		},
		{
			reduction: 0.2,
			name: 'not-language',
			// The only validator that has to wait on the language guesser.
			validator: async (text: string, lang: string) => await isNotLanguage(text, lang),
		},
		{
			reduction: 0.3,
			name: 'cuss-words-likely',
			validator: (text: string, lang: string) => isLikelyCussWords(text, lang),
		},
		{
			reduction: 0.1,
			name: 'cuss-words-maybe',
			validator: (text: string, lang: string) => isMaybeCussWords(text, lang),
		},
		{
			reduction: 0.2,
			name: 'mostly-emoji',
			validator: (text: string) => isMostlyEmoji(text),
		},
		{
			reduction: 1.0,
			name: 'spammy-words',
			validator: (text: string) => isSpammyWordList(text),
		},
	]

	/**
	 * Guesses which language a comment is written in.
	 *
	 * @param comment - Raw comment text.
	 * @returns The `alpha2` code of the guesser's best guess, or `undefined`
	 *   when the comment is empty/whitespace-only, when the guesser has no
	 *   best guess, or when the guess carries no `alpha2` value.
	 *
	 * `@horizon-rs/language-guesser` is based on tri-grams and can produce
	 * false positives on short input. For example, it thinks that
	 * 'Thamk you ❤️🙏' is Haitian and that 'I wanne robux 1000' is Polish,
	 * because there are not enough clues in such short text. Treat the
	 * result as a clue rather than a fact.
	 */
	export async function getGuessedLanguage(comment: string) {
		if (!comment || !comment.trim()) {
			return
		}

		const lang = await getLanguageInstance()
		const bestGuess = lang.guessBest(comment.trim(), [])
		if (!bestGuess) return // Can happen if the text is just whitespace

		return bestGuess.alpha2 || undefined
	}

	/**
	 * Runs every entry of `SIGNAL_RATINGS` against a comment, in order, and
	 * scores the comment.
	 *
	 * The rating starts at 1.0. Each validator that fires has its name
	 * appended to `signals` and its `reduction` subtracted from the rating.
	 * Evaluation stops as soon as the rating is at or below zero, so later
	 * signals are not evaluated for comments that are already fully
	 * discounted.
	 *
	 * @param text - The comment text to analyze.
	 * @param commentLanguage - Language code the comment is expected to be
	 *   in; defaults to `'en'`.
	 * @returns The names of the signals that fired and the final rating.
	 */
	export async function analyzeComment(text: string, commentLanguage = 'en') {
		const signals: string[] = []
		let rating = 1.0

		for (const signal of SIGNAL_RATINGS) {
			const fired = await signal.validator(text, commentLanguage)
			if (fired) {
				signals.push(signal.name)
				rating -= signal.reduction
			}
			// Nothing left to subtract from; skip the remaining validators.
			if (rating <= 0) break
		}

		return { signals, rating }
	}

	/**
	 * Heuristic check for a comment that is nothing but an email address.
	 *
	 * The text qualifies when it contains exactly one `@`, has no whitespace
	 * once trimmed, and does not contain `://` (which would make it look
	 * like a URL with credentials instead).
	 *
	 * Note: this is a loose heuristic, not address validation. Any single
	 * whitespace-free token with one `@` qualifies, including a bare `@`.
	 *
	 * @param text - The comment text (or a single word of it).
	 * @returns `true` when the text looks like a lone email address,
	 *   otherwise `false`.
	 */
	function isEmailOnly(text: string): boolean {
		if (!text.includes('@')) return false
		if (/\s/.test(text.trim())) return false
		if (text.includes('://')) return false

		// Splitting on '@' yields exactly two parts when there is one '@'.
		return text.split('@').length === 2
	}

	/**
	 * Checks whether a comment mentions an email address somewhere in it
	 * without being just an email address.
	 *
	 * @param text - The comment text.
	 * @returns `true` when at least one whitespace-separated token passes
	 *   `isEmailOnly` and the comment as a whole does not.
	 */
	function isContainingEmail(text: string) {
		if (!text.includes('@') || isEmailOnly(text)) {
			return false
		}

		// Split on whitespace rather than with splitWords(): the word
		// segmenter would break `foo@example.com` into
		// ['foo', 'example.com'].
		for (const token of text.split(/\s+/g)) {
			if (isEmailOnly(token)) return true
		}
		return false
	}

	/**
	 * Checks whether a comment consists of nothing but a URL.
	 *
	 * The trimmed text must contain no space characters and must be
	 * parseable by `URL.canParse`.
	 *
	 * @param text - The comment text.
	 * @returns `true` when the whole comment parses as a URL, otherwise
	 *   `false`.
	 */
	function isURL(text: string): boolean {
		const candidate = text.trim()
		if (candidate.includes(' ')) return false
		return URL.canParse(candidate)
	}

	/**
	 * Checks whether a comment is made up solely of digits, ignoring any
	 * whitespace between them.
	 *
	 * @param text - The comment text.
	 * @returns `true` when at least one digit and nothing but digits remain
	 *   after all whitespace is removed.
	 */
	function isNumbersOnly(text: string) {
		const withoutWhitespace = text.replace(/\s/g, '')
		const digitsOnly = /^\d+$/
		return digitsOnly.test(withoutWhitespace)
	}

	/**
	 * Checks whether a comment is written entirely in upper case.
	 *
	 * @param text - The comment text.
	 * @returns `true` when the text contains at least one ASCII capital
	 *   letter and upper-casing it changes nothing.
	 */
	function isAllUppercase(text: string) {
		// Without any A-Z letter (e.g. digits or punctuation only) the
		// comment is not considered to be "shouting".
		if (!/[A-Z]/.test(text)) {
			return false
		}
		return text.toUpperCase() === text
	}

	/**
	 * Checks whether a comment has too few words to be useful.
	 *
	 * Words are counted by splitting the trimmed text on runs of
	 * whitespace; an empty comment therefore counts as one (empty) word.
	 *
	 * @param text - The comment text.
	 * @returns `true` when the comment has three words or fewer, otherwise
	 *   `false`.
	 */
	function isTooShort(text: string): boolean {
		const wordCount = text.trim().split(/\s+/).length
		return wordCount <= 3
	}

	/**
	 * Checks whether a comment is a single whitespace-free token.
	 *
	 * Punctuation does not separate words here, so
	 * `this-has-no-whitespace` and `snap/hooks/install` both count as a
	 * single word.
	 *
	 * @param text - The comment text.
	 * @returns `true` when the trimmed text contains no whitespace.
	 */
	function isSingleWord(text: string) {
		const pieces = text.trim().split(/\s+/)
		return pieces.length === 1
	}

	/**
	 * Checks whether a comment appears to be written in a language other
	 * than the expected one.
	 *
	 * `@horizon-rs/language-guesser` is based on tri-grams and can lead to
	 * false positives on short text. For example, it thinks that
	 * 'Thamk you ❤️🙏' is Haitian and that 'I wanne robux 1000' is Polish,
	 * because there are not enough clues to guess from. Since this is only
	 * a signal and not a hard blocker, the result is more of a clue than a
	 * fact.
	 *
	 * @param text - The comment text.
	 * @param language_ - Language code the comment is expected to be in.
	 * @returns `true` when the guesser has no best guess at all, or when
	 *   its guess is neither `language_` nor English.
	 */
	async function isNotLanguage(text: string, language_: string) {
		const guesser = await getLanguageInstance()
		const guess = guesser.guessBest(text.trim(), [])

		// Can happen if the text is just whitespace.
		if (!guess) {
			return true
		}

		// We don't want to reduce the score for English comments. English
		// comments, when evaluated by language, are always valid.
		return guess.alpha2 !== language_ && guess.alpha2 !== 'en'
	}

	/**
	 * Checks whether more than a quarter of a comment's non-whitespace
	 * characters are emoji.
	 *
	 * Two details matter for correctness:
	 * - The Unicode `Emoji` property also covers the ASCII digits `0-9`,
	 *   `#` and `*` (they are keycap bases). Those are excluded so that
	 *   ordinary text with numbers is not counted as emoji.
	 * - Both sides of the ratio are counted in code points. The regex runs
	 *   in Unicode mode and so matches whole code points, while
	 *   `String.prototype.length` counts UTF-16 code units and would count
	 *   most emoji twice in the denominator.
	 *
	 * @param text - The comment text.
	 * @returns `true` when the emoji share of the non-whitespace characters
	 *   is above 0.25; `false` otherwise, including for empty text.
	 */
	function isMostlyEmoji(text: string): boolean {
		const compact = text.replace(/\s/g, '')
		// Negative lookahead drops the ASCII members of \p{Emoji}.
		const emojiRegex = /(?![\x00-\x7F])\p{Emoji}/gu
		const emojiMatches = compact.match(emojiRegex)
		if (!emojiMatches) return false

		// Spreading a string iterates by code point, not by UTF-16 unit.
		const codePointCount = [...compact].length
		const emojiRatio = emojiMatches.length / codePointCount
		return emojiRatio > 0.25
	}

	/**
	 * Picks the `cuss` word list for a language.
	 *
	 * @param lang - Language code of the comment.
	 * @returns The Portuguese, French or Spanish list for `'pt'`, `'fr'`
	 *   and `'es'` respectively; the default `cuss` list for anything else.
	 */
	function getCussWords(lang: string) {
		if (lang === 'pt') return cussPt
		if (lang === 'fr') return cussFr
		if (lang === 'es') return cussEs
		return cuss
	}

	/**
	 * Checks whether a comment contains a word that the language's cuss
	 * list rates at exactly `rating`.
	 *
	 * @param text - The comment text.
	 * @param language_ - Language code used to pick the cuss list.
	 * @param rating - The list rating to look for; defaults to 2.
	 * @returns `true` when at least one lower-cased word of the comment has
	 *   a truthy list entry equal to `rating`.
	 */
	function isLikelyCussWords(text: string, language_: string, rating = 2) {
		const ratings = getCussWords(language_)
		return splitWords(text).some((rawWord) => {
			const wordRating = ratings[rawWord.toLowerCase()]
			// A missing (or zero) entry never counts, whatever `rating` is.
			return Boolean(wordRating) && wordRating === rating
		})
	}

	/**
	 * Same check as `isLikelyCussWords`, but looks for words with a cuss
	 * list rating of 1 instead of the default 2.
	 *
	 * @param text - The comment text.
	 * @param language_ - Language code used to pick the cuss list.
	 * @returns `true` when the comment contains a word rated 1.
	 */
	function isMaybeCussWords(text: string, language_: string) {
		const maybeRating = 1
		return isLikelyCussWords(text, language_, maybeRating)
	}

	// One word-granularity segmenter for the whole module, created with an
	// empty locale list.
	const segmenter = new Intl.Segmenter([], { granularity: 'word' })

	/**
	 * Splits text into words using `Intl.Segmenter`.
	 *
	 * @param text - The text to split.
	 * @returns The word-like segments in order; whitespace and punctuation
	 *   segments are dropped.
	 */
	function splitWords(text: string) {
		const words: string[] = []
		for (const piece of segmenter.segment(text)) {
			if (piece.isWordLike) {
				words.push(piece.segment)
			}
		}
		return words
	}

	// Load the survey word list once, when the module is first imported.
	// The path is relative, so it is resolved against the process's current
	// working directory. The YAML content is trusted to have the shape
	// `{ words: string[] }`; it is not validated here.
	const surveyYaml = yaml.load(
		fs.readFileSync('data/survey-words.yml', 'utf8'),
	) as { words: string[] }

	// Lower-cased copy of the list, for case-insensitive matching.
	const surveyWords: string[] = []
	for (const word of surveyYaml.words) {
		surveyWords.push(word.toLowerCase())
	}

	/**
	 * Checks whether a comment contains any word from the survey word list.
	 *
	 * The comment is lower-cased and split on runs of whitespace or on
	 * literal backslash-n sequences (the two characters `\` and `n`, as
	 * found in text whose newlines were escaped). Each resulting piece is
	 * compared against the list as a whole word.
	 *
	 * @param text - The comment text.
	 * @returns `true` when at least one piece exactly matches a survey word.
	 */
	function isSpammyWordList(text: string) {
		// The capturing group also puts the separators into the result;
		// they are harmless extra candidates for the lookup below.
		const words = text.toLowerCase().split(/(\s+|\\n+)/g)
		// Currently, we're intentionally not checking for
		// survey words that are substrings of a comment word.
		return words.some((word) => surveyWords.includes(word))
	}