Commit: Upload folder using huggingface_hub

Files changed:
- README.md (+93, -0)
- config.json (+12, -0)
- umap_model.pkl (+3, -0)
README.md — ADDED (@@ -0,0 +1,93 @@)

# Vibes Chat Clustering Model

This model is trained on WhatsApp chat data from "The vibez" group.

## Model Architecture

- **Embeddings**: sentence-transformers/all-MiniLM-L6-v2 (384 dimensions)
- **UMAP**: Reduces to 15 dimensions with cosine metric
- **Clustering**: Fresh HDBSCAN on each inference (min_cluster_size=2 recommended)
- **Topics**: Fresh c-TF-IDF vocabulary extraction on each inference

## Usage

```python
import pickle
import json
from sentence_transformers import SentenceTransformer
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Load model and config
with open('umap_model.pkl', 'rb') as f:
    umap_model = pickle.load(f)

with open('config.json') as f:
    config = json.load(f)

# Load BERT model
bert_model = SentenceTransformer(config['bert_model'])

# Your texts to cluster
texts = ["your", "messages", "here"]

# 1. Embed with BERT
embeddings = bert_model.encode(texts)

# 2. Transform with UMAP
reduced = umap_model.transform(embeddings)

# 3. Cluster with HDBSCAN (fresh clustering)
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=config['recommended_min_cluster_size'],
    metric='euclidean',
    cluster_selection_method='eom'
)
labels = clusterer.fit_predict(reduced)

# 4. Extract topics with c-TF-IDF (fresh vocabulary)
# Group texts by cluster
cluster_docs = {}
for text, label in zip(texts, labels):
    if label != -1:
        cluster_docs.setdefault(label, []).append(text)

# Concatenate per cluster
cluster_texts = [" ".join(cluster_docs[cid]) for cid in sorted(cluster_docs.keys())]

# Vectorize
vectorizer = CountVectorizer(
    stop_words="english",
    min_df=max(1, int(len(cluster_texts) * 0.05)),
    max_df=config['max_df'],
    ngram_range=tuple(config['ngram_range'])
)
tf = vectorizer.fit_transform(cluster_texts)

# c-TF-IDF
n_clusters = len(cluster_texts)
df = np.array((tf > 0).sum(axis=0)).flatten()
idf = np.log(n_clusters / (1 + df))
ctfidf = tf.multiply(idf).toarray()

# Get top words
words = vectorizer.get_feature_names_out()
for i, cid in enumerate(sorted(cluster_docs.keys())):
    top_indices = ctfidf[i].argsort()[-10:][::-1]
    top_words = [words[j] for j in top_indices]
    print(f"Topic {cid}: {', '.join(top_words)}")
```

## Training Data

Trained on historical WhatsApp chat bursts (excluding the last 2 months).

## Key Insight

This model uses **UMAP-only training**:

- UMAP projection is frozen (trained once)
- HDBSCAN clustering is fresh each inference
- c-TF-IDF vocabulary is fresh each inference

This allows the model to adapt to new vocabulary and topics while maintaining a consistent embedding space.
config.json — ADDED (@@ -0,0 +1,12 @@)

{
  "bert_model": "all-MiniLM-L6-v2",
  "umap_n_components": 15,
  "umap_metric": "cosine",
  "recommended_min_cluster_size": 2,
  "min_df_percent": 0.05,
  "max_df": 0.85,
  "ngram_range": [
    1,
    2
  ]
}
umap_model.pkl — ADDED (@@ -0,0 +1,3 @@) — git-LFS pointer

version https://git-lfs.github.com/spec/v1
oid sha256:67713963c33d3b1e1c3a8887c5d9e1e8e4c5b8cdc93dbe430147021965cc1672
size 727443