ataeff committed on
Commit
889bc84
·
verified ·
1 Parent(s): 9268534

Update haze/subjectivity.py

Browse files
Files changed (1) hide show
  1. haze/subjectivity.py +23 -1
haze/subjectivity.py CHANGED
@@ -52,6 +52,23 @@ STOP_WORDS = frozenset({
52
  'just', 'only', 'also', 'very', 'too', 'any', 'some', 'all', 'no',
53
  })
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  def tokenize_words(text: str) -> List[str]:
57
  """
@@ -215,8 +232,13 @@ class Subjectivity:
215
  self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
216
 
217
  # Find most common trigrams as "gravity centers"
 
218
  trigram_counts = Counter(self.corpus_trigrams)
219
- self.identity.gravity_centers = [t for t, _ in trigram_counts.most_common(50)]
 
 
 
 
220
 
221
  def _build_identity_patterns(self) -> None:
222
  """Build identity patterns from bootstrap text."""
 
52
  'just', 'only', 'also', 'very', 'too', 'any', 'some', 'all', 'no',
53
  })
54
 
55
+ # Blacklist of mundane/generic phrases that shouldn't dominate gravity centers
56
+ # These are phrases that appear frequently in the corpus but don't contribute to identity
57
+ # Format: frozenset of 3-tuples of lowercased words
58
+ MUNDANE_TRIGRAMS = frozenset({
59
+ # Location phrases - too generic, not identity-defining
60
+ ('the', 'living', 'room'),
61
+ ('in', 'the', 'living'),
62
+ ('to', 'the', 'living'),
63
+ ('the', 'storage', 'room'),
64
+ ('in', 'the', 'storage'),
65
+ ('to', 'the', 'storage'),
66
+ })
67
+
68
+ # Configuration for gravity center selection
69
+ GRAVITY_CENTER_POOL_SIZE = 100 # Initial pool of top trigrams to filter from
70
+ GRAVITY_CENTER_FINAL_SIZE = 50 # Final number of gravity centers to keep
71
+
72
 
73
  def tokenize_words(text: str) -> List[str]:
74
  """
 
232
  self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
233
 
234
  # Find most common trigrams as "gravity centers"
235
+ # Filter out mundane/generic phrases that don't contribute to identity
236
  trigram_counts = Counter(self.corpus_trigrams)
237
+ filtered_trigrams = [
238
+ t for t, _ in trigram_counts.most_common(GRAVITY_CENTER_POOL_SIZE)
239
+ if t not in MUNDANE_TRIGRAMS
240
+ ]
241
+ self.identity.gravity_centers = filtered_trigrams[:GRAVITY_CENTER_FINAL_SIZE]
242
 
243
  def _build_identity_patterns(self) -> None:
244
  """Build identity patterns from bootstrap text."""