import fs from 'fs'
import yaml from 'js-yaml'
import { cuss } from 'cuss'
import { cuss as cussPt } from 'cuss/pt'
import { cuss as cussFr } from 'cuss/fr'
import { cuss as cussEs } from 'cuss/es'
// Module-level cache for the language guesser; stays null until the first
// call to getLanguageInstance().
let language: any = null

/**
 * Returns the shared `Language` instance from
 * `@horizon-rs/language-guesser`.
 *
 * The package is imported dynamically and the instance constructed the
 * first time this is called; later calls reuse the cached instance.
 */
async function getLanguageInstance() {
  if (language) return language

  const guesserModule = await import('@horizon-rs/language-guesser')
  language = new guesserModule.Language()
  return language
}

/**
 * Table of quality signals evaluated against a comment.
 *
 * Each entry has:
 * - `name`: the label pushed onto the `signals` list by `analyzeComment`
 *   when the signal fires.
 * - `reduction`: how much is subtracted from the comment's rating (which
 *   starts at 1.0 in `analyzeComment`) when the signal fires.
 * - `validator`: called with the comment text and the comment language; a
 *   truthy (possibly awaited) result means the signal fires.
 *
 * Order matters: `analyzeComment` walks this array from top to bottom and
 * stops as soon as the rating is 0 or lower, so entries listed after a
 * signal that zeroes the rating are not evaluated.
 *
 * Exported for the debugging CLI script.
 */
export const SIGNAL_RATINGS = [
  {
    reduction: 1.0,
    name: 'email-only',
    validator: (comment: string) => isEmailOnly(comment),
  },
  {
    reduction: 0.2,
    name: 'contains-email',
    validator: (comment: string) => isContainingEmail(comment),
  },
  {
    reduction: 1.0,
    name: 'url-only',
    validator: (comment: string) => isURL(comment),
  },
  {
    reduction: 1.0,
    name: 'numbers-only',
    validator: (comment: string) => isNumbersOnly(comment),
  },
  {
    reduction: 0.1,
    name: 'all-uppercase',
    validator: (comment: string) => isAllUppercase(comment),
  },
  {
    reduction: 0.5,
    name: 'single-word',
    validator: (comment: string) => isSingleWord(comment),
  },
  {
    reduction: 0.2,
    name: 'too-short',
    validator: (comment: string) => isTooShort(comment),
  },
  {
    // The only async validator; it needs the lazily loaded language guesser.
    reduction: 0.2,
    name: 'not-language',
    validator: async (comment: string, commentLanguage: string) =>
      await isNotLanguage(comment, commentLanguage),
  },
  {
    reduction: 0.3,
    name: 'cuss-words-likely',
    validator: (comment: string, commentLanguage: string) =>
      isLikelyCussWords(comment, commentLanguage),
  },
  {
    reduction: 0.1,
    name: 'cuss-words-maybe',
    validator: (comment: string, commentLanguage: string) =>
      isMaybeCussWords(comment, commentLanguage),
  },
  {
    reduction: 0.2,
    name: 'mostly-emoji',
    validator: (comment: string) => isMostlyEmoji(comment),
  },
  {
    reduction: 1.0,
    name: 'spammy-words',
    validator: (comment: string) => isSpammyWordList(comment),
  },
]

/**
 * Guesses the language a comment is written in.
 *
 * Resolves to the `alpha2` value of the guesser's best match, or to
 * `undefined` when the comment is empty or whitespace-only, when the
 * guesser has no best match, or when the match has a falsy `alpha2`.
 *
 * @param comment - The raw comment text.
 */
export async function getGuessedLanguage(comment: string) {
  const trimmed = comment ? comment.trim() : ''
  if (!trimmed) return

  const guesser = await getLanguageInstance()
  const bestGuess = guesser.guessBest(trimmed, [])
  // Can happen if the text is just whitespace.
  if (!bestGuess) return

  // @horizon-rs/language-guesser is based on tri-grams and can lead
  // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is
  // Haitian! And that 'I wanne robux 1000' is Polish!
  // That happens because such texts are short and don't carry enough clues
  // to tell what language they are. A human might recognize them as
  // attempts at English despite the spelling, so treat the result as a
  // clue rather than a fact.
  return bestGuess.alpha2 || undefined
}

/**
 * Rates a comment by running it through every entry of `SIGNAL_RATINGS`.
 *
 * The rating starts at 1.0. Each signal whose validator resolves to a truthy
 * value has its name recorded and its `reduction` subtracted from the
 * rating. Evaluation stops early once the rating is 0 or lower.
 *
 * @param text - The comment text to analyze.
 * @param commentLanguage - Language code passed to the validators
 *   (defaults to `'en'`).
 * @returns The names of the signals that fired, in evaluation order, and the
 *   resulting rating. The rating is not clamped, so it can end up below 0.
 */
export async function analyzeComment(text: string, commentLanguage = 'en') {
  let rating = 1.0
  const signals = []

  for (const signal of SIGNAL_RATINGS) {
    // Validators may be sync or async; awaiting handles both.
    const fired = await signal.validator(text, commentLanguage)
    if (fired) {
      signals.push(signal.name)
      rating -= signal.reduction
    }
    // Nothing left to subtract from; skip the remaining signals.
    if (rating <= 0) break
  }

  return { signals, rating }
}

/**
 * Reports whether the whole text looks like a single, lone email address.
 *
 * The check is deliberately loose: the text must contain exactly one `@`,
 * no whitespace once trimmed, and no `://` (which would suggest a URL with
 * credentials rather than an email). The parts on either side of the `@`
 * are not validated.
 *
 * @param text - The comment text (or a single token of it).
 * @returns `true` when the text passes the checks above, otherwise `false`.
 */
function isEmailOnly(text: string): boolean {
  if (!text.includes('@') || text.includes('://')) return false
  if (/\s/.test(text.trim())) return false

  // Exactly one '@' splits the text into exactly two pieces.
  return text.split('@').length === 2
}

/**
 * Reports whether the text contains an email address among other content.
 *
 * Text that is nothing but an email address is excluded here; that case is
 * covered by `isEmailOnly`.
 *
 * @param text - The comment text.
 */
function isContainingEmail(text: string) {
  const worthScanning = text.includes('@') && !isEmailOnly(text)
  if (!worthScanning) return false

  // Split on whitespace rather than with splitWords(), because
  // `foo@example.com` would be split up into ['foo', 'example.com'].
  const tokens = text.split(/\s+/g)
  return tokens.some((token) => isEmailOnly(token))
}

/**
 * Reports whether the whole text is a single URL and nothing else.
 *
 * Any whitespace inside the trimmed text disqualifies it. Checking for all
 * whitespace (not just a space character) matters because the URL parser
 * strips tabs and newlines before parsing, so `"https://x.com\nthanks"`
 * would otherwise be accepted as one URL.
 *
 * @param text - The comment text.
 * @returns `true` when the trimmed text has no whitespace and parses as an
 *   absolute URL, otherwise `false`.
 */
function isURL(text: string): boolean {
  const trimmed = text.trim()
  if (/\s/.test(trimmed)) return false
  return URL.canParse(trimmed)
}

/**
 * Reports whether the text, ignoring all whitespace, consists solely of the
 * digits 0-9. Empty or whitespace-only text is not considered numbers-only.
 *
 * @param text - The comment text.
 */
function isNumbersOnly(text: string) {
  const compact = text.split(/\s/).join('')
  if (compact === '') return false
  // Any single non-digit character disqualifies the text.
  return !/\D/.test(compact)
}

/**
 * Reports whether the text is written entirely in uppercase.
 *
 * The text must be unchanged by `toUpperCase()` and contain at least one
 * ASCII capital letter, so strings with no letters at all (e.g. `"123"`)
 * do not count.
 *
 * @param text - The comment text.
 */
function isAllUppercase(text: string) {
  if (text !== text.toUpperCase()) return false
  return /[A-Z]/.test(text)
}

/**
 * Reports whether the text is made up of three or fewer
 * whitespace-separated chunks.
 *
 * Empty text counts as too short, because splitting an empty string still
 * yields one (empty) chunk.
 *
 * @param text - The comment text.
 * @returns `true` when the trimmed text has at most three chunks, otherwise
 *   `false`.
 */
function isTooShort(text: string): boolean {
  const chunks = text.trim().split(/\s+/)
  return chunks.length <= 3
}

/**
 * Reports whether the text, once trimmed, contains no whitespace at all.
 *
 * That makes things like `this-has-no-whitespace` or `snap/hooks/install`
 * count as a single word. Empty text counts as a single word too.
 *
 * @param text - The comment text.
 */
function isSingleWord(text: string) {
  const hasInnerWhitespace = /\s/.test(text.trim())
  return !hasInnerWhitespace
}

/**
 * Reports whether the text appears to be written in a language other than
 * the expected one.
 *
 * English is always accepted: a best guess of `'en'` never fires this
 * signal, whatever the expected language is. When the guesser returns no
 * best guess at all, the signal fires.
 *
 * @param text - The comment text.
 * @param language_ - The language code the comment is expected to be in.
 */
async function isNotLanguage(text: string, language_: string) {
  const guesser = await getLanguageInstance()
  const bestGuess = guesser.guessBest(text.trim(), [])
  // Can happen if the text is just whitespace.
  if (!bestGuess) {
    return true
  }

  // @horizon-rs/language-guesser is based on tri-grams and can lead
  // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is
  // Haitian! And that 'I wanne robux 1000' is Polish!
  // Such texts are short and don't carry enough clues, even though a human
  // would read them as attempts at English. Since this is only a signal
  // and not a hard blocker, the result is more of a clue than a fact.

  // We don't want to reduce the score for English comments. English
  // comments, when evaluated by language, are always valid.
  const guessed = bestGuess.alpha2
  return !(guessed === language_ || guessed === 'en')
}

/**
 * Reports whether more than a quarter of the text (ignoring whitespace) is
 * emoji.
 *
 * The Unicode `Emoji` property also covers the ASCII digits, `#` and `*`,
 * because they are the bases of keycap emoji such as `1️⃣`. Counting them
 * unconditionally would flag plain text full of numbers as emoji, so those
 * characters only count when they are part of an actual keycap sequence
 * (base, optional U+FE0F, then U+20E3).
 *
 * The ratio divides the number of emoji matches (by code point) by the
 * string's `length` (UTF-16 code units), so an emoji outside the Basic
 * Multilingual Plane contributes 1 to the numerator and 2 to the
 * denominator.
 *
 * @param text - The comment text.
 * @returns `true` when the emoji ratio exceeds 0.25, otherwise `false`
 *   (including for empty or whitespace-only text).
 */
function isMostlyEmoji(text: string): boolean {
  const compact = text.replace(/\s/g, '')
  // Keycap sequences first; otherwise any Emoji code point that is not a
  // bare ASCII digit, '#' or '*'.
  const emojiRegex = /[0-9#*]\uFE0F?\u20E3|(?![0-9#*])\p{Emoji}/gu
  const emojiMatches = compact.match(emojiRegex)
  if (!emojiMatches) return false
  const emojiRatio = emojiMatches.length / compact.length
  return emojiRatio > 0.25
}

/**
 * Picks the cuss-word table for a language code.
 *
 * Dedicated tables exist for `'pt'`, `'fr'` and `'es'`; every other value
 * falls back to the default `cuss` table.
 *
 * @param lang - The language code of the comment.
 */
function getCussWords(lang: string) {
  if (lang === 'pt') return cussPt
  if (lang === 'fr') return cussFr
  if (lang === 'es') return cussEs
  return cuss
}

/**
 * Reports whether any word of the text is listed in the language's cuss
 * table with exactly the given rating.
 *
 * Words are extracted with `splitWords` and lowercased before lookup.
 *
 * @param text - The comment text.
 * @param language_ - Language code used to pick the cuss table.
 * @param rating - The table rating to look for (defaults to 2).
 */
function isLikelyCussWords(text: string, language_: string, rating = 2) {
  const ratingsByWord = getCussWords(language_)
  return splitWords(text).some((rawWord) => {
    const score = ratingsByWord[rawWord.toLowerCase()]
    // A missing (or otherwise falsy) entry never matches.
    return Boolean(score) && score === rating
  })
}

/**
 * Same check as `isLikelyCussWords`, but looks for words rated 1 in the
 * cuss table instead of the default rating of 2.
 *
 * @param text - The comment text.
 * @param language_ - Language code used to pick the cuss table.
 */
function isMaybeCussWords(text: string, language_: string) {
  const maybeRating = 1
  return isLikelyCussWords(text, language_, maybeRating)
}

// One shared word segmenter; the empty locale list means the runtime's
// default locale is used.
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

/**
 * Splits text into its word-like segments, dropping the segments in between
 * (whitespace, punctuation and the like).
 *
 * @param text - The text to split.
 * @returns The word-like segments, in their original order and casing.
 */
function splitWords(text: string) {
  const words: string[] = []
  for (const { segment, isWordLike } of segmenter.segment(text)) {
    if (isWordLike) words.push(segment)
  }
  return words
}

// The survey word list is read and parsed once, when this module is loaded.
// The relative path is resolved against the process's working directory.
const surveyYaml = yaml.load(fs.readFileSync('data/survey-words.yml', 'utf8')) as {
  words: string[]
}
// Lowercased up front so lookups can compare against lowercased comment text.
const surveyWords = Array.from(surveyYaml.words, (word: string) => word.toLowerCase())

/**
 * Reports whether any token of the text is one of the survey words.
 *
 * The text is lowercased and split on runs of whitespace and on literal
 * backslash-n sequences. Tokens must equal a survey word exactly.
 *
 * @param text - The comment text.
 */
function isSpammyWordList(text: string) {
  const tokens = text.toLowerCase().split(/(\s+|\\n+)/g)
  // Currently, we're intentionally not checking for
  // survey words that are substrings of a comment word.
  for (const token of tokens) {
    if (surveyWords.includes(token)) return true
  }
  return false
}