| | import fs from 'fs' |
| | import yaml from 'js-yaml' |
| | import { cuss } from 'cuss' |
| | import { cuss as cussPt } from 'cuss/pt' |
| | import { cuss as cussFr } from 'cuss/fr' |
| | import { cuss as cussEs } from 'cuss/es' |
// Module-level cache for the language guesser; stays null until first use.
let language: any = null

/**
 * Returns the shared language-guesser instance, creating it on first call.
 *
 * The guesser package is loaded with a dynamic import, so it is only
 * loaded once a caller actually asks for language detection.
 *
 * @returns The cached `Language` instance.
 */
async function getLanguageInstance() {
  if (language) return language

  const guesserModule = await import('@horizon-rs/language-guesser')
  language = new guesserModule.Language()
  return language
}
| |
|
| | |
/**
 * Ordered table of comment-quality signals.
 *
 * Each entry has:
 * - `reduction`: the amount subtracted from a comment's rating when the
 *   signal fires,
 * - `name`: the identifier reported for the signal,
 * - `validator`: a predicate (sync or async) that receives the comment text
 *   and, for language-aware checks, the comment's language code.
 */
export const SIGNAL_RATINGS = [
  // Signals based on what the comment consists of.
  {
    reduction: 1.0,
    name: 'email-only',
    validator: (text: string) => isEmailOnly(text),
  },
  {
    reduction: 0.2,
    name: 'contains-email',
    validator: (text: string) => isContainingEmail(text),
  },
  {
    reduction: 1.0,
    name: 'url-only',
    validator: (text: string) => isURL(text),
  },
  {
    reduction: 1.0,
    name: 'numbers-only',
    validator: (text: string) => isNumbersOnly(text),
  },
  // Signals based on the shape of the comment.
  {
    reduction: 0.1,
    name: 'all-uppercase',
    validator: (text: string) => isAllUppercase(text),
  },
  {
    reduction: 0.5,
    name: 'single-word',
    validator: (text: string) => isSingleWord(text),
  },
  {
    reduction: 0.2,
    name: 'too-short',
    validator: (text: string) => isTooShort(text),
  },
  // Language-aware signals; these also receive the comment's language code.
  {
    reduction: 0.2,
    name: 'not-language',
    // isNotLanguage is async, so this validator yields a promise.
    validator: (text: string, locale: string) => isNotLanguage(text, locale),
  },
  {
    reduction: 0.3,
    name: 'cuss-words-likely',
    validator: (text: string, locale: string) => isLikelyCussWords(text, locale),
  },
  {
    reduction: 0.1,
    name: 'cuss-words-maybe',
    validator: (text: string, locale: string) => isMaybeCussWords(text, locale),
  },
  // Remaining content signals.
  {
    reduction: 0.2,
    name: 'mostly-emoji',
    validator: (text: string) => isMostlyEmoji(text),
  },
  {
    reduction: 1.0,
    name: 'spammy-words',
    validator: (text: string) => isSpammyWordList(text),
  },
]
| |
|
/**
 * Guesses the language a comment is written in.
 *
 * @param comment - Raw comment text.
 * @returns The `alpha2` code of the guesser's best guess, or `undefined`
 *   when the comment is empty/whitespace-only, when no guess is produced,
 *   or when the guess has no `alpha2` value.
 */
export async function getGuessedLanguage(comment: string) {
  const trimmed = comment ? comment.trim() : ''
  if (!trimmed) {
    return
  }

  const guesser = await getLanguageInstance()
  const guess = guesser.guessBest(trimmed, [])

  return guess ? guess.alpha2 || undefined : undefined
}
| |
|
/**
 * Runs every entry of SIGNAL_RATINGS against a comment and scores it.
 *
 * The rating starts at 1.0 and each signal that fires subtracts its
 * `reduction`. Evaluation stops as soon as the rating is zero or below, so
 * later signals are not evaluated (and not reported) once that happens.
 *
 * @param text - The comment text to analyze.
 * @param commentLanguage - Language code passed to the language-aware
 *   validators; defaults to `'en'`.
 * @returns The names of the signals that fired, in table order, and the
 *   resulting rating.
 */
export async function analyzeComment(text: string, commentLanguage = 'en') {
  const signals = []
  let rating = 1.0

  for (const entry of SIGNAL_RATINGS) {
    const { validator, name, reduction } = entry
    const fired = await validator(text, commentLanguage)
    if (!fired) continue

    signals.push(name)
    rating -= reduction
    // The rating only changes when a signal fires, so this is the only
    // place the stop condition can become true.
    if (rating <= 0) break
  }

  return { signals, rating }
}
| |
|
/**
 * Loose check for "the whole comment is just an email address".
 *
 * The text qualifies when it contains exactly one `@`, has no whitespace
 * once leading/trailing whitespace is trimmed, and does not contain `://`
 * (so URLs with credentials are not mistaken for emails).
 *
 * @param text - The comment text (or a single token of it).
 * @returns `true` when the text looks like a lone email address, otherwise
 *   `false` (always a boolean, never `undefined`).
 */
function isEmailOnly(text: string): boolean {
  if (!text.includes('@')) return false
  if (/\s/.test(text.trim())) return false
  if (text.includes('://')) return false

  // Splitting on '@' yields exactly two parts when there is a single '@'.
  return text.split('@').length === 2
}
| |
|
/**
 * Checks whether a comment includes an email address among other content.
 *
 * A comment that is nothing but an email address is excluded here; that
 * case is covered by `isEmailOnly`.
 *
 * @param text - The comment text.
 * @returns `true` when at least one whitespace-separated token of the text
 *   passes `isEmailOnly` and the text as a whole does not.
 */
function isContainingEmail(text: string) {
  const hasAtSign = text.includes('@')
  if (!hasAtSign || isEmailOnly(text)) {
    return false
  }

  const tokens = text.split(/\s+/g)
  return tokens.some((token) => isEmailOnly(token))
}
| |
|
/**
 * Checks whether the comment is nothing but a single URL.
 *
 * @param text - The comment text.
 * @returns `true` when the trimmed text contains no whitespace at all and
 *   parses as an absolute URL, otherwise `false` (always a boolean).
 */
function isURL(text: string): boolean {
  const candidate = text.trim()

  // Reject any inner whitespace, not just the space character: the URL
  // parser silently strips ASCII tabs and newlines, so a check for ' '
  // alone would accept input such as "https://example.com\nbroken".
  if (/\s/.test(candidate)) return false

  return URL.canParse(candidate)
}
| |
|
/**
 * Checks whether the comment consists solely of digits, ignoring whitespace.
 *
 * @param text - The comment text.
 * @returns `true` when at least one digit remains after all whitespace is
 *   removed and nothing but digits remains.
 */
function isNumbersOnly(text: string) {
  const withoutWhitespace = text.replace(/\s/g, '')
  const digitsOnly = /^\d+$/
  return digitsOnly.test(withoutWhitespace)
}
| |
|
/**
 * Checks whether the comment is written entirely in upper case.
 *
 * @param text - The comment text.
 * @returns `true` when the text contains at least one ASCII capital letter
 *   and upper-casing it changes nothing.
 */
function isAllUppercase(text: string) {
  // Without an A-Z letter (e.g. digits or punctuation only) there is
  // nothing that counts as upper case.
  if (!/[A-Z]/.test(text)) {
    return false
  }
  return text.toUpperCase() === text
}
| |
|
/**
 * Checks whether the comment has three or fewer whitespace-separated words.
 *
 * Note that an empty or whitespace-only comment splits into a single empty
 * token and therefore also counts as too short.
 *
 * @param text - The comment text.
 * @returns `true` when the trimmed text has at most three tokens, otherwise
 *   `false` (always a boolean, never `undefined`).
 */
function isTooShort(text: string): boolean {
  const wordCount = text.trim().split(/\s+/).length
  return wordCount <= 3
}
| |
|
/**
 * Checks whether the comment is a single whitespace-free token.
 *
 * An empty or whitespace-only comment also yields exactly one (empty)
 * token and therefore returns `true`.
 *
 * @param text - The comment text.
 * @returns `true` when splitting the trimmed text on whitespace gives
 *   exactly one token.
 */
function isSingleWord(text: string) {
  const tokenCount = text.trim().split(/\s+/).length
  return tokenCount === 1
}
| |
|
/**
 * Checks whether a comment appears to be written in an unexpected language.
 *
 * @param text - The comment text.
 * @param language_ - The language code the comment is expected to be in.
 * @returns `true` when the guesser produces no guess at all, or when its
 *   best guess is neither `language_` nor `'en'`.
 */
async function isNotLanguage(text: string, language_: string) {
  const guesser = await getLanguageInstance()
  const guess = guesser.guessBest(text.trim(), [])
  if (!guess) {
    return true
  }

  // English is accepted regardless of the expected language.
  // NOTE(review): presumably because English comments are common on
  // translated pages — confirm the intent.
  const isExpected = guess.alpha2 === language_
  const isEnglish = guess.alpha2 === 'en'
  return !(isExpected || isEnglish)
}
| |
|
/**
 * Checks whether more than a quarter of the comment is emoji.
 *
 * Whitespace is ignored. The ratio is (number of emoji matches) divided by
 * the length of the whitespace-free text in UTF-16 code units, so an emoji
 * outside the Basic Multilingual Plane counts once in the numerator and
 * twice in the denominator.
 *
 * @param text - The comment text.
 * @returns `true` when the emoji ratio is strictly greater than 0.25.
 */
function isMostlyEmoji(text: string): boolean {
  const compact = text.replace(/\s/g, '')

  // `\p{Emoji}` on its own also matches the ASCII characters 0-9, '#' and
  // '*', because they are the base of keycap emoji; that made plain numbers
  // look like emoji. Count those characters only when they are part of a
  // real keycap sequence (base, optional VS16, combining keycap), and
  // otherwise count any other Emoji code point.
  const emojiRegex = /[#*0-9]\uFE0F?\u20E3|(?![#*0-9])\p{Emoji}/gu
  const emojiMatches = compact.match(emojiRegex)
  if (!emojiMatches) return false

  const emojiRatio = emojiMatches.length / compact.length
  return emojiRatio > 0.25
}
| |
|
/**
 * Picks the cuss-word table for a language.
 *
 * @param lang - Language code; `'pt'`, `'fr'` and `'es'` have dedicated
 *   tables.
 * @returns The table for the given language, or the default `cuss` table
 *   for any other value.
 */
function getCussWords(lang: string) {
  if (lang === 'pt') return cussPt
  if (lang === 'fr') return cussFr
  if (lang === 'es') return cussEs
  return cuss
}
| |
|
/**
 * Checks whether any word of the comment is listed in the cuss-word table
 * with exactly the given rating.
 *
 * Words are extracted with `splitWords` and lower-cased before lookup.
 *
 * @param text - The comment text.
 * @param language_ - Language code used to select the cuss-word table.
 * @param rating - The table rating a word must have to count; defaults to 2.
 * @returns `true` when at least one word has a truthy table entry equal to
 *   `rating`.
 */
function isLikelyCussWords(text: string, language_: string, rating = 2) {
  const ratingsByWord = getCussWords(language_)

  return splitWords(text).some((rawWord) => {
    const entry = ratingsByWord[rawWord.toLowerCase()]
    // A missing or zero entry never counts, whatever `rating` is.
    return Boolean(entry) && entry === rating
  })
}
| |
|
/**
 * Variant of `isLikelyCussWords` that looks for words rated 1 instead of
 * the default 2.
 *
 * @param text - The comment text.
 * @param language_ - Language code used to select the cuss-word table.
 * @returns `true` when at least one word of the text has rating 1.
 */
function isMaybeCussWords(text: string, language_: string) {
  const maybeRating = 1
  return isLikelyCussWords(text, language_, maybeRating)
}
| |
|
// Shared word segmenter; created once because it is reused for every call.
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

/**
 * Splits text into its word-like segments.
 *
 * Segments the segmenter does not mark as word-like (whitespace,
 * punctuation) are dropped.
 *
 * @param text - The text to split.
 * @returns The word-like segments in their original order and casing.
 */
function splitWords(text: string) {
  const words: string[] = []
  for (const { segment, isWordLike } of segmenter.segment(text)) {
    if (isWordLike) {
      words.push(segment)
    }
  }
  return words
}
| |
|
// Load the survey word list once, at module load time. The path is
// relative to the process's current working directory.
const surveyYamlText = fs.readFileSync('data/survey-words.yml', 'utf8')
const surveyYaml = yaml.load(surveyYamlText) as { words: string[] }

// Lower-cased copy of the list, for case-insensitive matching.
const surveyWords = surveyYaml.words.map((entry: string) => entry.toLowerCase())
| |
|
/**
 * Checks whether the comment contains any entry of the survey word list.
 *
 * The text is lower-cased and split on runs of whitespace and on a literal
 * backslash followed by one or more `n` characters; each resulting piece
 * is compared against the list as a whole token.
 *
 * @param text - The comment text.
 * @returns `true` when at least one piece exactly equals a listed word.
 */
function isSpammyWordList(text: string) {
  const pieces = text.toLowerCase().split(/(\s+|\\n+)/g)
  for (const piece of pieces) {
    if (surveyWords.includes(piece)) {
      return true
    }
  }
  return false
}
| |
|