Spaces:

AUXteam
/

WitNote

Paused

App Files Files Community

WitNote / internal /semantic /stopwords.go

AUXteam

Upload folder using huggingface_hub

6a7089a verified 18 days ago

raw

history blame contribute delete

4.27 kB

	package semantic

	// stopwords holds common English function words that contribute little
	// semantic meaning. Tokens found here are excluded from lexical matching
	// so that the remaining tokens give a better signal-to-noise ratio.
	//
	// All keys are lowercase and every value is true, so a plain index
	// expression (stopwords[w]) doubles as a membership test. Words such as
	// "in", "on", "or" and "not" are deliberately absent; they are listed in
	// semanticStopwords instead.
	var stopwords = map[string]bool{
		// Articles.
		"the": true, "a": true, "an": true,

		// Forms of "to be".
		"is": true, "are": true, "was": true, "were": true,
		"be": true, "been": true, "being": true,

		// Forms of "to have" and "to do".
		"have": true, "has": true, "had": true,
		"do": true, "does": true, "did": true,

		// Modal verbs.
		"will": true, "would": true, "could": true, "should": true,
		"may": true, "might": true, "shall": true, "can": true,

		// Prepositions.
		"to": true, "of": true, "for": true, "with": true,
		"at": true, "by": true, "from": true, "as": true,
		"into": true, "through": true, "about": true, "above": true,
		"after": true, "before": true, "between": true, "under": true,

		// Conjunctions and correlatives.
		"and": true, "but": true, "nor": true, "so": true,
		"yet": true, "both": true, "either": true, "neither": true,

		// Demonstratives.
		"this": true, "that": true, "these": true, "those": true,

		// Personal and possessive pronouns.
		"it": true, "its": true, "i": true, "me": true, "my": true,
		"we": true, "our": true, "you": true, "your": true,
		"he": true, "she": true, "his": true, "her": true,
		"they": true, "their": true,
	}

	// semanticStopwords lists short words that are ordinarily treated as
	// stopwords but that can carry real signal in UI text (for example the
	// "in" of "sign in" or the "not" of "do not").
	//
	// None of these words is a key of stopwords, so isStopword reports false
	// for each of them and removeStopwords leaves them in place.
	// removeStopwordsContextAware consults this set through
	// isSemanticStopword.
	//
	// NOTE(review): this set was originally described as being removed only
	// when the word is missing from the other side's token set. As written,
	// removeStopwordsContextAware keeps every token that is not in stopwords,
	// which includes every entry here — confirm which behavior is intended.
	var semanticStopwords = map[string]bool{
		// Particles that complete common action labels: "sign in" / "log in",
		// "sign up" / "look up", "log out" / "sign out", "log on", "log off".
		"in": true, "up": true, "out": true, "on": true, "off": true,

		// Negation changes the meaning of a label ("do not", "not now").
		"not": true, "no": true,

		// Disjunction in UI labels, and the acceptance button caption.
		"or": true, "ok": true,
	}

	// isStopword reports whether token is listed in stopwords. The lookup is
	// an exact string match, so a token that differs from a key only by
	// letter case (e.g. "The") is not recognized.
	func isStopword(token string) bool {
		listed, present := stopwords[token]
		return present && listed
	}

	// isSemanticStopword reports whether token is listed in semanticStopwords,
	// the set of semi-stopwords: words that are usually low-value but can
	// carry meaning in a UI context. The lookup is an exact string match.
	func isSemanticStopword(token string) bool {
		listed, present := semanticStopwords[token]
		return present && listed
	}

	// removeStopwords returns the tokens for which isStopword is false, in
	// their original order, as a newly allocated slice.
	//
	// When every token is a stopword (or tokens is empty) the input slice
	// itself is returned instead, so that a non-empty input never collapses
	// into an empty, zero-signal token list.
	func removeStopwords(tokens []string) []string {
		kept := make([]string, 0, len(tokens))
		for idx := range tokens {
			if isStopword(tokens[idx]) {
				continue
			}
			kept = append(kept, tokens[idx])
		}
		if len(kept) > 0 {
			return kept
		}
		// Nothing survived: fall back to the unfiltered input.
		return tokens
	}

	// removeStopwordsContextAware filters tokens while taking the tokens on
	// the other side of the comparison (otherTokens) into account.
	//
	// The token at position i is kept when at least one of these holds:
	//  1. isStopword(token) is false;
	//  2. position i lies inside a run of two or three adjacent tokens whose
	//     space-joined text is a key of synonymIndex (e.g. "sign" + "in"
	//     when "sign in" is a key);
	//  3. isSemanticStopword(token) is true and the token also occurs in
	//     otherTokens.
	//
	// Rule 3 can only change the outcome for a word that is both a stopword
	// and a semantic stopword; a word that is only a semantic stopword is
	// already kept by rule 1.
	//
	// Kept tokens are returned in their original order in a new slice. If no
	// token would be kept, the input slice itself is returned so that a
	// non-empty input never becomes an empty token list.
	func removeStopwordsContextAware(tokens []string, otherTokens []string) []string {
		// Membership set for the other side's tokens.
		inOther := make(map[string]bool, len(otherTokens))
		for _, tok := range otherTokens {
			inOther[tok] = true
		}

		// protected[i] is set when tokens[i] belongs to a bigram or trigram
		// that is a key of synonymIndex.
		protected := make(map[int]bool)
		for start := 0; start+1 < len(tokens); start++ {
			phrase := tokens[start] + " " + tokens[start+1]
			if _, known := synonymIndex[phrase]; known {
				protected[start] = true
				protected[start+1] = true
			}
			if start+2 >= len(tokens) {
				continue
			}
			// Extend the bigram by one token to form the trigram.
			phrase += " " + tokens[start+2]
			if _, known := synonymIndex[phrase]; known {
				protected[start] = true
				protected[start+1] = true
				protected[start+2] = true
			}
		}

		kept := make([]string, 0, len(tokens))
		for pos, tok := range tokens {
			keep := !isStopword(tok) ||
				protected[pos] ||
				(isSemanticStopword(tok) && inOther[tok])
			if keep {
				kept = append(kept, tok)
			}
		}

		if len(kept) > 0 {
			return kept
		}
		// Everything was filtered out: fall back to the unfiltered input.
		return tokens
	}