Update haze/subjectivity.py
Browse files- haze/subjectivity.py +23 -1
haze/subjectivity.py
CHANGED
|
@@ -52,6 +52,23 @@ STOP_WORDS = frozenset({
|
|
| 52 |
'just', 'only', 'also', 'very', 'too', 'any', 'some', 'all', 'no',
|
| 53 |
})
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def tokenize_words(text: str) -> List[str]:
|
| 57 |
"""
|
|
@@ -215,8 +232,13 @@ class Subjectivity:
|
|
| 215 |
self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
|
| 216 |
|
| 217 |
# Find most common trigrams as "gravity centers"
|
|
|
|
| 218 |
trigram_counts = Counter(self.corpus_trigrams)
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
def _build_identity_patterns(self) -> None:
|
| 222 |
"""Build identity patterns from bootstrap text."""
|
|
|
|
| 52 |
'just', 'only', 'also', 'very', 'too', 'any', 'some', 'all', 'no',
|
| 53 |
})
|
| 54 |
|
| 55 |
+
# Blacklist of mundane/generic phrases that shouldn't dominate gravity centers
|
| 56 |
+
# These are phrases that appear frequently in corpus but don't contribute to identity
|
| 57 |
+
# Format: frozenset of 3-tuples of lowercased words
|
| 58 |
+
MUNDANE_TRIGRAMS = frozenset({
|
| 59 |
+
# Location phrases - too generic, not identity-defining
|
| 60 |
+
('the', 'living', 'room'),
|
| 61 |
+
('in', 'the', 'living'),
|
| 62 |
+
('to', 'the', 'living'),
|
| 63 |
+
('the', 'storage', 'room'),
|
| 64 |
+
('in', 'the', 'storage'),
|
| 65 |
+
('to', 'the', 'storage'),
|
| 66 |
+
})
|
| 67 |
+
|
| 68 |
+
# Configuration for gravity center selection
|
| 69 |
+
GRAVITY_CENTER_POOL_SIZE = 100 # Initial pool of top trigrams to filter from
|
| 70 |
+
GRAVITY_CENTER_FINAL_SIZE = 50 # Final number of gravity centers to keep
|
| 71 |
+
|
| 72 |
|
| 73 |
def tokenize_words(text: str) -> List[str]:
|
| 74 |
"""
|
|
|
|
| 232 |
self.corpus_trigrams.append((words[i], words[i+1], words[i+2]))
|
| 233 |
|
| 234 |
# Find most common trigrams as "gravity centers"
|
| 235 |
+
# Drop mundane/generic phrases (MUNDANE_TRIGRAMS) from the top GRAVITY_CENTER_POOL_SIZE trigrams, then keep the first GRAVITY_CENTER_FINAL_SIZE
|
| 236 |
trigram_counts = Counter(self.corpus_trigrams)
|
| 237 |
+
filtered_trigrams = [
|
| 238 |
+
t for t, _ in trigram_counts.most_common(GRAVITY_CENTER_POOL_SIZE)
|
| 239 |
+
if t not in MUNDANE_TRIGRAMS
|
| 240 |
+
]
|
| 241 |
+
self.identity.gravity_centers = filtered_trigrams[:GRAVITY_CENTER_FINAL_SIZE]
|
| 242 |
|
| 243 |
def _build_identity_patterns(self) -> None:
|
| 244 |
"""Build identity patterns from bootstrap text."""
|