package semantic

// stopwords is the set of high-frequency English function words that
// isStopword reports as noise. removeStopwords and
// removeStopwordsContextAware use it to strip low-signal tokens before
// lexical matching.
//
// Keys are looked up verbatim, and every entry is lower case. Words such
// as "in", "on", "not" and "or" are not listed here; they are tracked
// separately in semanticStopwords.
var stopwords = map[string]bool{
	// Articles.
	"the": true, "a": true, "an": true,

	// Forms of "to be".
	"is": true, "are": true, "was": true, "were": true,
	"be": true, "been": true, "being": true,

	// Auxiliary verbs.
	"have": true, "has": true, "had": true,
	"do": true, "does": true, "did": true,

	// Modal verbs.
	"will": true, "would": true, "could": true, "should": true,
	"may": true, "might": true, "shall": true, "can": true,

	// Prepositions.
	"to": true, "of": true, "for": true, "with": true,
	"at": true, "by": true, "from": true, "as": true,
	"into": true, "through": true, "about": true, "above": true,
	"after": true, "before": true, "between": true, "under": true,

	// Conjunctions and correlatives.
	"and": true, "but": true, "nor": true, "so": true, "yet": true,
	"both": true, "either": true, "neither": true,

	// Demonstratives.
	"this": true, "that": true, "these": true, "those": true,

	// Personal and possessive pronouns.
	"it": true, "its": true,
	"i": true, "me": true, "my": true,
	"we": true, "our": true,
	"you": true, "your": true,
	"he": true, "his": true,
	"she": true, "her": true,
	"they": true, "their": true,
}

// semanticStopwords is the set of short words that are usually treated
// as stopwords but that carry real meaning in UI text, for example the
// "in" of "sign in" or the "not" of "do not".
//
// None of these words appears in stopwords, so removeStopwords never
// drops them. removeStopwordsContextAware consults this set through
// isSemanticStopword when deciding which tokens to keep.
var semanticStopwords = map[string]bool{
	// Particles of phrasal verbs: "sign in", "log in", "log out",
	// "sign out", "log on", "log off", "sign up", "look up".
	"in": true, "out": true, "on": true, "off": true, "up": true,

	// Negation: "do not", "not now".
	"not": true, "no": true,

	// Disjunction in UI labels.
	"or": true,

	// Acceptance button.
	"ok": true,
}

// isStopword reports whether token is marked in the stopwords set.
// The lookup is an exact string match: no case folding or trimming is
// applied here, so callers are expected to pass normalized tokens.
func isStopword(token string) bool {
	if flagged, ok := stopwords[token]; ok {
		return flagged
	}
	return false
}

// isSemanticStopword reports whether token is marked in the
// semanticStopwords set, i.e. it is a word that is usually low-value
// but can carry meaning in UI text. The lookup is an exact string match.
func isSemanticStopword(token string) bool {
	flagged, ok := semanticStopwords[token]
	if !ok {
		return false
	}
	return flagged
}

// removeStopwords returns the tokens for which isStopword is false, in
// their original order.
//
// When no token survives (every token is a stopword, or tokens is
// empty), the input slice itself is returned unchanged so that matching
// never runs on zero signal. Otherwise the result is a newly allocated
// slice and tokens is left unmodified.
func removeStopwords(tokens []string) []string {
	kept := make([]string, 0, len(tokens))
	for i := range tokens {
		if isStopword(tokens[i]) {
			continue
		}
		kept = append(kept, tokens[i])
	}
	if len(kept) > 0 {
		return kept
	}
	// Nothing but stopwords: hand back the caller's slice as-is.
	return tokens
}

// removeStopwordsContextAware filters tokens like removeStopwords, but
// also takes the opposite side of the comparison (otherTokens) and known
// synonym phrases into account. Token order is preserved.
//
// A token is kept when any of the following holds:
//   - it is in neither stopwords nor semanticStopwords;
//   - it lies inside a window of two or three consecutive tokens whose
//     space-joined text is a key of synonymIndex (e.g. "sign" + "in");
//   - it is a semantic stopword that also occurs in otherTokens;
//   - it is a semantic stopword that is not in stopwords.
//
// Every other token is dropped. If that would drop everything (which
// includes the empty-input case), tokens itself is returned unchanged.
//
// NOTE(review): the last rule keeps every semantic stopword that is not
// also a hard stopword, so the otherTokens rule only matters for a word
// listed in both sets. Confirm that this is the intended behaviour.
func removeStopwordsContextAware(tokens []string, otherTokens []string) []string {
	// Index the opposite side for constant-time membership checks.
	inOther := make(map[string]bool, len(otherTokens))
	for _, tok := range otherTokens {
		inOther[tok] = true
	}

	// Record every position covered by a 2- or 3-token window that
	// matches a synonymIndex key.
	inPhrase := make(map[int]bool)
	for width := 2; width <= 3 && width <= len(tokens); width++ {
		for start := 0; start+width <= len(tokens); start++ {
			phrase := tokens[start]
			for _, next := range tokens[start+1 : start+width] {
				phrase += " " + next
			}
			if _, known := synonymIndex[phrase]; !known {
				continue
			}
			for pos := start; pos < start+width; pos++ {
				inPhrase[pos] = true
			}
		}
	}

	kept := make([]string, 0, len(tokens))
	for pos, tok := range tokens {
		hard, soft := isStopword(tok), isSemanticStopword(tok)

		plainWord := !hard && !soft        // ordinary content word
		sharedSoft := soft && inOther[tok] // semantic stopword seen on the other side
		softOnly := soft && !hard          // semantic stopword absent from the hard list

		if plainWord || inPhrase[pos] || sharedSoft || softOnly {
			kept = append(kept, tok)
		}
	}

	if len(kept) > 0 {
		return kept
	}
	// Everything was filtered out: fall back to the caller's tokens.
	return tokens
}