AbdulElahGwaith's picture
Upload folder using huggingface_hub
88df9e4 verified
import fs from 'fs'
import yaml from 'js-yaml'
import { cuss } from 'cuss'
import { cuss as cussPt } from 'cuss/pt'
import { cuss as cussFr } from 'cuss/fr'
import { cuss as cussEs } from 'cuss/es'
// Module-level cache for the language guesser. It stays `null` until the
// first caller asks for it, so the package is only imported on demand.
let language: any = null

// Returns the shared `Language` instance from
// `@horizon-rs/language-guesser`, importing the package and constructing
// the instance the first time it is needed.
async function getLanguageInstance() {
  if (language) return language
  const guesserModule = await import('@horizon-rs/language-guesser')
  language = new guesserModule.Language()
  return language
}
// Table of quality signals. Each entry names a signal, says how much it
// subtracts from a comment's rating when it fires, and supplies the
// validator that decides whether it fires.
//
// `analyzeComment` walks this list in order and stops as soon as the
// rating reaches zero, so the order of the entries matters.
//
// Exported for the debugging CLI script
export const SIGNAL_RATINGS = [
  // Comments that consist of a single non-prose token.
  {
    name: 'email-only',
    reduction: 1.0,
    validator: (text: string) => isEmailOnly(text),
  },
  {
    name: 'contains-email',
    reduction: 0.2,
    validator: (text: string) => isContainingEmail(text),
  },
  {
    name: 'url-only',
    reduction: 1.0,
    validator: (text: string) => isURL(text),
  },
  {
    name: 'numbers-only',
    reduction: 1.0,
    validator: (text: string) => isNumbersOnly(text),
  },
  // Shape-of-the-text heuristics.
  {
    name: 'all-uppercase',
    reduction: 0.1,
    validator: (text: string) => isAllUppercase(text),
  },
  {
    name: 'single-word',
    reduction: 0.5,
    validator: (text: string) => isSingleWord(text),
  },
  {
    name: 'too-short',
    reduction: 0.2,
    validator: (text: string) => isTooShort(text),
  },
  // Language-dependent heuristics; these also receive the comment language.
  {
    name: 'not-language',
    reduction: 0.2,
    validator: async (text: string, lang: string) => isNotLanguage(text, lang),
  },
  {
    name: 'cuss-words-likely',
    reduction: 0.3,
    validator: (text: string, lang: string) => isLikelyCussWords(text, lang),
  },
  {
    name: 'cuss-words-maybe',
    reduction: 0.1,
    validator: (text: string, lang: string) => isMaybeCussWords(text, lang),
  },
  // Content heuristics.
  {
    name: 'mostly-emoji',
    reduction: 0.2,
    validator: (text: string) => isMostlyEmoji(text),
  },
  {
    name: 'spammy-words',
    reduction: 1.0,
    validator: (text: string) => isSpammyWordList(text),
  },
]
// Guesses the language of a comment and returns its two-letter code
// (`alpha2`), or `undefined` when the comment is empty/blank or the
// guesser has no answer.
export async function getGuessedLanguage(comment: string) {
  const trimmed = comment ? comment.trim() : ''
  if (!trimmed) {
    return
  }

  const guesser = await getLanguageInstance()
  const bestGuess = guesser.guessBest(trimmed, [])
  // Can happen if the text is just whitespace
  if (!bestGuess) return

  // @horizon-rs/language-guesser is based on tri-grams and can produce
  // false positives. For example, it thinks that 'Thamk you ❤️🙏' is
  // Haitian, and that 'I wanne robux 1000' is Polish. Those texts are
  // short, so there are not enough clues to guess the language, even
  // though a human would read them as misspelled English.
  // Since the result is only used as a signal and not as a hard blocker,
  // treat it as a clue rather than a fact.
  return bestGuess.alpha2 || undefined
}
// Runs every entry of SIGNAL_RATINGS against the comment, in order.
// Starts from a rating of 1.0 and subtracts the `reduction` of each signal
// whose validator fires. Evaluation stops once the rating is zero or less.
//
// Returns the names of the signals that fired and the resulting rating.
// The rating can end up below zero, since the last reduction is applied
// in full.
export async function analyzeComment(text: string, commentLanguage = 'en') {
  const signals: string[] = []
  let rating = 1.0

  for (const signal of SIGNAL_RATINGS) {
    const triggered = await signal.validator(text, commentLanguage)
    if (!triggered) continue

    signals.push(signal.name)
    rating -= signal.reduction
    // The rating only changes when a signal fires, so this is the only
    // place where it can cross the threshold.
    if (rating <= 0) break
  }

  return { signals, rating }
}
// Reports whether the whole comment looks like a lone email address.
//
// The check is deliberately loose: the text must contain exactly one `@`,
// must have no whitespace apart from leading/trailing padding, and must
// not contain `://` (which would make it look like a URL instead).
//
// Always returns a boolean, so callers can rely on strict comparisons.
function isEmailOnly(text: string): boolean {
  if (!text.includes('@') || text.includes('://')) return false
  if (/\s/.test(text.trim())) return false
  // Splitting on `@` yields exactly two parts when there is a single `@`.
  return text.split('@').length === 2
}
// Reports whether a comment mentions an email address somewhere in a
// longer text. A comment that is nothing but an email address is handled
// by isEmailOnly() and is not counted here.
function isContainingEmail(text: string) {
  if (!text.includes('@') || isEmailOnly(text)) {
    return false
  }
  // Don't use splitWords() here because `foo@example.com` would be
  // split up into ['foo', 'example.com'].
  const chunks = text.split(/\s+/g)
  for (const chunk of chunks) {
    if (isEmailOnly(chunk)) return true
  }
  return false
}
// Reports whether the whole comment is a single parseable URL.
//
// Leading/trailing whitespace is ignored. Any whitespace inside the text
// disqualifies it: the URL parser silently strips tabs and newlines, so
// without this check a URL followed by a newline and more text would
// still be treated as URL-only.
//
// Always returns a boolean.
function isURL(text: string): boolean {
  const candidate = text.trim()
  if (/\s/.test(candidate)) return false
  return URL.canParse(candidate)
}
// Reports whether the comment consists of digits only, ignoring any
// whitespace. An empty or all-whitespace comment does not count.
function isNumbersOnly(text: string) {
  const withoutWhitespace = text.replace(/\s/g, '')
  const digitsOnly = /^\d+$/
  return digitsOnly.test(withoutWhitespace)
}
// Reports whether the comment is written entirely in capitals: it must
// contain at least one ASCII capital letter (A-Z) and be unchanged by
// upper-casing.
function isAllUppercase(text: string) {
  const hasAsciiCapital = /[A-Z]/.test(text)
  if (!hasAsciiCapital) return false
  return text.toUpperCase() === text
}
// Reports whether the comment has too few whitespace-separated words to
// be useful. Comments of up to three words count as too short; an empty
// comment counts as too short as well.
//
// Always returns a boolean.
function isTooShort(text: string): boolean {
  const maxShortWords = 3
  const wordCount = text.trim().split(/\s+/).length
  return wordCount <= maxShortWords
}
// Reports whether the comment is a single whitespace-free token once the
// surrounding whitespace is trimmed away.
// E.g. `this-has-no-whitespace` or `snap/hooks/install`
function isSingleWord(text: string) {
  const hasInnerWhitespace = /\s/.test(text.trim())
  return !hasInnerWhitespace
}
// Reports whether the comment appears to be written in a language other
// than the expected one (`language_`). A comment guessed to be English is
// never flagged, and a comment for which no guess can be made is flagged.
async function isNotLanguage(text: string, language_: string) {
  const guesser = await getLanguageInstance()
  const bestGuess = guesser.guessBest(text.trim(), [])
  // Can happen if the text is just whitespace
  if (!bestGuess) return true

  // @horizon-rs/language-guesser is based on tri-grams and can produce
  // false positives. For example, it thinks that 'Thamk you ❤️🙏' is
  // Haitian, and that 'I wanne robux 1000' is Polish. Those texts are
  // short, so there are not enough clues to guess the language, even
  // though a human would read them as misspelled English.
  // Since this is only a signal and not a hard blocker, treat the guess
  // as a clue rather than a fact.

  // We don't want to reduce the score for English comments. English
  // comments, when evaluated by language, are always valid.
  const guessedCode = bestGuess.alpha2
  const matchesExpected = guessedCode === language_
  const isEnglish = guessedCode === 'en'
  return !(matchesExpected || isEnglish)
}
// Reports whether more than a quarter of the comment (whitespace removed)
// is made up of emoji.
//
// The ratio is the number of emoji code points divided by the length of
// the text in UTF-16 code units, so the 0.25 threshold is relative to that
// measure.
function isMostlyEmoji(text: string): boolean {
  const compact = text.replace(/\s/g, '')
  // The Unicode `Emoji` property also covers the ASCII digits, `#` and `*`,
  // because they are the bases of keycap sequences such as `1️⃣`. Counting
  // them on their own would flag plain numbers (phone numbers, error
  // codes, ...) as emoji. The lookahead therefore skips those characters
  // unless they are followed by the keycap mark U+20E3, optionally
  // preceded by the variation selector U+FE0F.
  const emojiRegex = /(?![0-9#*](?!\uFE0F?\u20E3))\p{Emoji}/gu
  const emojiMatches = compact.match(emojiRegex)
  if (!emojiMatches) return false
  const emojiRatio = emojiMatches.length / compact.length
  return emojiRatio > 0.25
}
// Picks the cuss-word table for a language code. Portuguese, French and
// Spanish have their own tables; every other code falls back to the
// default `cuss` table.
function getCussWords(lang: string) {
  if (lang === 'pt') return cussPt
  if (lang === 'fr') return cussFr
  if (lang === 'es') return cussEs
  return cuss
}
// Reports whether any word of the comment is listed in the cuss-word table
// for `language_` with exactly the given `rating`. The default rating of 2
// is what this file treats as "likely"; isMaybeCussWords() passes 1.
// Words are compared in lower case.
function isLikelyCussWords(text: string, language_: string, rating = 2) {
  const cussWords = getCussWords(language_)
  return splitWords(text).some((rawWord) => {
    const score = cussWords[rawWord.toLowerCase()]
    // A missing (or zero) entry never counts, whatever `rating` is.
    return Boolean(score) && score === rating
  })
}
// Same check as isLikelyCussWords(), but for words whose rating is 1,
// which this file treats as "maybe" a cuss word.
function isMaybeCussWords(text: string, language_: string) {
  const maybeRating = 1
  return isLikelyCussWords(text, language_, maybeRating)
}
// Shared word segmenter, created once for the module. The empty locale
// list means the runtime's default locale is used.
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

// Splits text into its word-like segments, dropping whitespace and
// punctuation segments.
function splitWords(text: string) {
  const words: string[] = []
  for (const piece of segmenter.segment(text)) {
    if (piece.isWordLike) {
      words.push(piece.segment)
    }
  }
  return words
}
// Loaded once at module load: the list of spammy survey words, read from
// `data/survey-words.yml` (resolved against the current working directory).
// The file is expected to have a top-level `words` list of strings.
const surveyYaml = yaml.load(
  fs.readFileSync('data/survey-words.yml', 'utf8'),
) as { words: string[] }

// Lower-cased copy of the list, for case-insensitive comparison.
const surveyWords = surveyYaml.words.map((entry: string) => entry.toLowerCase())
// Reports whether any token of the lower-cased comment is exactly one of
// the survey words. Tokens are separated by runs of whitespace or by a
// literal backslash followed by one or more `n` characters.
function isSpammyWordList(text: string) {
  const tokens = text.toLowerCase().split(/(\s+|\\n+)/g)
  // Currently, we're intentionally not checking for
  // survey words that are substrings of a comment word.
  for (const token of tokens) {
    if (surveyWords.includes(token)) return true
  }
  return false
}