romybeaute committed on
Commit
f67c8fe
·
1 Parent(s): abad87b

Add test suite for JOSS submission

Browse files
Files changed (4) hide show
  1. tests/conftest.py +86 -0
  2. tests/test_imports.py +74 -0
  3. tests/test_utils.py +356 -0
  4. tests/tests.md +14 -0
tests/conftest.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pytest configuration and shared fixtures for MOSAIC-app tests.
3
+ """
4
+
5
+ import pytest
6
+ import pandas as pd
7
+ import numpy as np
8
+ import tempfile
9
+ import os
10
+
11
+
12
@pytest.fixture
def sample_dataframe():
    """Five phenomenological reports with ids and experimental conditions."""
    reports = [
        "I saw vivid geometric patterns and colors.",
        "There was a feeling of floating outside my body.",
        "Time seemed to slow down completely.",
        "I experienced a deep sense of peace and calm.",
        "The music created visual patterns in my mind.",
    ]
    columns = {
        "id": [1, 2, 3, 4, 5],
        "text": reports,
        "condition": ["HS", "HS", "DL", "DL", "HS"],
    }
    return pd.DataFrame(columns)
26
+
27
+
28
@pytest.fixture
def sample_csv_file(sample_dataframe):
    """Yield the path of a temporary CSV holding the sample data; delete it afterwards."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as handle:
        csv_path = handle.name
        sample_dataframe.to_csv(handle, index=False)

    yield csv_path

    # Remove the temporary file once the consuming test has finished.
    if os.path.exists(csv_path):
        os.unlink(csv_path)
40
+
41
+
42
@pytest.fixture
def sample_embeddings():
    """Deterministic float32 embedding matrix: 5 documents x 384 dimensions."""
    n_docs, n_dims = 5, 384
    np.random.seed(42)  # fixed seed so tests see identical vectors every run
    vectors = np.random.randn(n_docs, n_dims)
    return vectors.astype(np.float32)
47
+
48
+
49
@pytest.fixture
def sample_documents():
    """Five short phenomenological reports used as raw test documents."""
    docs = [
        "I saw vivid geometric patterns and colors.",
        "There was a feeling of floating outside my body.",
        "Time seemed to slow down completely.",
        "I experienced a deep sense of peace and calm.",
        "The music created visual patterns in my mind.",
    ]
    return docs
59
+
60
+
61
@pytest.fixture
def sample_config():
    """A representative BERTopic configuration dictionary."""
    # Assemble each parameter group separately, then compose the config.
    umap_params = {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0}
    hdbscan_params = {"min_cluster_size": 5, "min_samples": 3}
    vectorizer_params = {"ngram_range": [1, 2], "stop_words": "english"}
    bt_params = {"top_n_words": 10, "nr_topics": "auto"}
    return {
        "embedding_model": "all-MiniLM-L6-v2",
        "umap_params": umap_params,
        "hdbscan_params": hdbscan_params,
        "vectorizer_params": vectorizer_params,
        "use_vectorizer": True,
        "bt_params": bt_params,
        "granularity": "sentences",
        "min_words": 2,
    }
tests/test_imports.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests to verify that all required packages can be imported.
3
+ This catches missing dependencies early.
4
+ """
5
+
6
+ import pytest
7
+
8
+
9
+ class TestImports:
10
+ """Test that all required packages are importable."""
11
+
12
+ def test_import_pandas(self):
13
+ """Test pandas import."""
14
+ import pandas
15
+ assert pandas is not None
16
+
17
+ def test_import_numpy(self):
18
+ """Test numpy import."""
19
+ import numpy
20
+ assert numpy is not None
21
+
22
+ def test_import_streamlit(self):
23
+ """Test streamlit import."""
24
+ import streamlit
25
+ assert streamlit is not None
26
+
27
+ def test_import_bertopic(self):
28
+ """Test BERTopic import."""
29
+ import bertopic
30
+ assert bertopic is not None
31
+
32
+ def test_import_sentence_transformers(self):
33
+ """Test sentence-transformers import."""
34
+ import sentence_transformers
35
+ assert sentence_transformers is not None
36
+
37
+ def test_import_umap(self):
38
+ """Test UMAP import."""
39
+ import umap
40
+ assert umap is not None
41
+
42
+ def test_import_hdbscan(self):
43
+ """Test HDBSCAN import."""
44
+ import hdbscan
45
+ assert hdbscan is not None
46
+
47
+ def test_import_sklearn(self):
48
+ """Test scikit-learn import."""
49
+ import sklearn
50
+ assert sklearn is not None
51
+
52
+ def test_import_nltk(self):
53
+ """Test NLTK import."""
54
+ import nltk
55
+ assert nltk is not None
56
+
57
+ def test_import_datamapplot(self):
58
+ """Test datamapplot import."""
59
+ import datamapplot
60
+ assert datamapplot is not None
61
+
62
+ def test_import_matplotlib(self):
63
+ """Test matplotlib import."""
64
+ import matplotlib
65
+ assert matplotlib is not None
66
+
67
+ def test_import_huggingface_hub(self):
68
+ """Test huggingface_hub import."""
69
+ import huggingface_hub
70
+ assert huggingface_hub is not None
71
+
72
+
73
if __name__ == "__main__":
    # Allow running this module directly (python tests/test_imports.py)
    # instead of invoking pytest from the command line.
    pytest.main([__file__, "-v"])
tests/test_utils.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for utility functions in MOSAIC-app.
3
+ These tests verify core functionality without requiring Streamlit or heavy ML models.
4
+ """
5
+
6
+ import pytest
7
+ import pandas as pd
8
+ import numpy as np
9
+ import tempfile
10
+ import os
11
+
12
+
13
+ # =====================================================================
14
+ # Test helper functions
15
+ # =====================================================================
16
+
17
+ def _slugify(s: str) -> str:
18
+ """Convert string to safe folder name (copied from app.py for testing)."""
19
+ import re
20
+ s = s.strip()
21
+ s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
22
+ return s or "DATASET"
23
+
24
+
25
+ def _pick_text_column(df: pd.DataFrame) -> str | None:
26
+ """Return the first matching preferred text column name if present."""
27
+ ACCEPTABLE_TEXT_COLUMNS = [
28
+ "reflection_answer_english",
29
+ "reflection_answer",
30
+ "text",
31
+ "report",
32
+ ]
33
+ for col in ACCEPTABLE_TEXT_COLUMNS:
34
+ if col in df.columns:
35
+ return col
36
+ return None
37
+
38
+
39
+ def _list_text_columns(df: pd.DataFrame) -> list[str]:
40
+ """Return all columns."""
41
+ return list(df.columns)
42
+
43
+
44
+ def _clean_label(x: str) -> str:
45
+ """Clean LLM-generated label (copied from app.py for testing)."""
46
+ import re
47
+ x = (x or "").strip()
48
+ lines = x.splitlines()
49
+ x = lines[0].strip() if lines else ""
50
+ x = x.strip(' "\'`')
51
+ x = re.sub(r"[.:\-–—]+$", "", x).strip()
52
+ x = re.sub(r"[^\w\s]", "", x).strip()
53
+ # Remove "Experience of" but keep "Experience" alone for separate handling
54
+ x = re.sub(
55
+ r"^(Experiential(?:\s+Phenomenon)?|Experience of|Subjective Experience of|Phenomenon of)\s+",
56
+ "",
57
+ x,
58
+ flags=re.IGNORECASE,
59
+ )
60
+ x = re.sub(
61
+ r"\s+(experience|experiences|phenomenon|state|states)$",
62
+ "",
63
+ x,
64
+ flags=re.IGNORECASE,
65
+ )
66
+ x = x.strip()
67
+ return x or "Unlabelled"
68
+
69
+
70
+ # =====================================================================
71
+ # Tests for _slugify
72
+ # =====================================================================
73
+
74
class TestSlugify:
    """Tests for the _slugify function."""

    def test_basic_string(self):
        """Plain alphanumeric names pass through unchanged."""
        assert _slugify("MOSAIC") == "MOSAIC"
        assert _slugify("dataset") == "dataset"

    def test_spaces_replaced(self):
        """Spaces are replaced with underscores."""
        assert _slugify("my dataset") == "my_dataset"
        # Fixed: this assertion previously duplicated the one above verbatim;
        # it now exercises a run of spaces, which collapses to a single "_".
        assert _slugify("my   dataset") == "my_dataset"

    def test_special_characters_replaced(self):
        """Disallowed characters are replaced with underscores."""
        assert _slugify("dataset@2024!") == "dataset_2024_"
        assert _slugify("data/set") == "data_set"

    def test_empty_string(self):
        """Empty or whitespace-only input falls back to 'DATASET'."""
        assert _slugify("") == "DATASET"
        assert _slugify(" ") == "DATASET"

    def test_whitespace_stripped(self):
        """Leading/trailing whitespace is stripped before slugification."""
        assert _slugify(" dataset ") == "dataset"

    def test_allowed_characters_preserved(self):
        """Dots, hyphens, and underscores survive untouched."""
        assert _slugify("data-set_v1.0") == "data-set_v1.0"
104
+
105
+
106
+ # =====================================================================
107
+ # Tests for _pick_text_column
108
+ # =====================================================================
109
+
110
class TestPickTextColumn:
    """Tests for the _pick_text_column function."""

    def test_reflection_answer_english(self):
        """'reflection_answer_english' wins over any other candidate."""
        frame = pd.DataFrame({
            "id": [1, 2],
            "reflection_answer_english": ["text1", "text2"],
            "text": ["other1", "other2"],
        })
        assert _pick_text_column(frame) == "reflection_answer_english"

    def test_reflection_answer(self):
        """'reflection_answer' is chosen when the English variant is absent."""
        frame = pd.DataFrame({
            "id": [1, 2],
            "reflection_answer": ["text1", "text2"],
        })
        assert _pick_text_column(frame) == "reflection_answer"

    def test_text_column(self):
        """A plain 'text' column is recognised."""
        frame = pd.DataFrame({
            "id": [1, 2],
            "text": ["text1", "text2"],
        })
        assert _pick_text_column(frame) == "text"

    def test_report_column(self):
        """A 'report' column is recognised."""
        frame = pd.DataFrame({
            "id": [1, 2],
            "report": ["text1", "text2"],
        })
        assert _pick_text_column(frame) == "report"

    def test_no_matching_column(self):
        """None is returned when no acceptable column exists."""
        frame = pd.DataFrame({
            "id": [1, 2],
            "description": ["text1", "text2"],
        })
        assert _pick_text_column(frame) is None

    def test_empty_dataframe(self):
        """An empty frame has no text column."""
        assert _pick_text_column(pd.DataFrame()) is None
158
+
159
+
160
+ # =====================================================================
161
+ # Tests for _list_text_columns
162
+ # =====================================================================
163
+
164
class TestListTextColumns:
    """Tests for the _list_text_columns function."""

    def test_returns_all_columns(self):
        """Every column is reported, preserving DataFrame order."""
        frame = pd.DataFrame({
            "id": [1, 2],
            "text": ["a", "b"],
            "category": ["x", "y"],
        })
        assert _list_text_columns(frame) == ["id", "text", "category"]

    def test_empty_dataframe(self):
        """An empty frame yields an empty column list."""
        assert _list_text_columns(pd.DataFrame()) == []
181
+
182
+
183
+ # =====================================================================
184
+ # Tests for _clean_label
185
+ # =====================================================================
186
+
187
class TestCleanLabel:
    """Tests for the _clean_label function."""

    def test_basic_label(self):
        """A clean label is returned untouched."""
        assert _clean_label("Visual Patterns") == "Visual Patterns"

    def test_strips_whitespace(self):
        """Surrounding whitespace is removed."""
        assert _clean_label(" Visual Patterns ") == "Visual Patterns"

    def test_removes_quotes(self):
        """Wrapping double or single quotes are stripped."""
        for quoted in ('"Visual Patterns"', "'Visual Patterns'"):
            assert _clean_label(quoted) == "Visual Patterns"

    def test_removes_trailing_punctuation(self):
        """Trailing dots, colons, and dashes are stripped."""
        for raw in ("Visual Patterns.", "Visual Patterns:", "Visual Patterns—"):
            assert _clean_label(raw) == "Visual Patterns"

    def test_removes_experience_prefix(self):
        """Boilerplate 'Experience of ...' style prefixes are dropped."""
        expectations = {
            "Experience of Visual Patterns": "Visual Patterns",
            "Subjective Experience of Colors": "Colors",
            "Phenomenon of Light": "Light",
        }
        for raw, expected in expectations.items():
            assert _clean_label(raw) == expected

    def test_removes_experience_suffix(self):
        """Boilerplate '... experience/phenomenon' suffixes are dropped."""
        assert _clean_label("Visual Pattern experience") == "Visual Pattern"
        assert _clean_label("Color phenomenon") == "Color"

    def test_takes_first_line_only(self):
        """Everything after the first line is discarded."""
        assert _clean_label("Visual Patterns\nSome explanation") == "Visual Patterns"

    def test_empty_returns_unlabelled(self):
        """Empty or whitespace-only input maps to 'Unlabelled'."""
        for raw in ("", " "):
            assert _clean_label(raw) == "Unlabelled"

    def test_none_returns_unlabelled(self):
        """None input maps to 'Unlabelled'."""
        assert _clean_label(None) == "Unlabelled"
232
+
233
+
234
+ # =====================================================================
235
+ # Tests for CSV handling
236
+ # =====================================================================
237
+
238
+ class TestCSVHandling:
239
+ """Tests for CSV loading and processing."""
240
+
241
+ def test_load_csv_with_text_column(self):
242
+ """Test loading a CSV with a recognized text column."""
243
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
244
+ f.write("id,text\n")
245
+ f.write("1,This is a test report.\n")
246
+ f.write("2,Another report here.\n")
247
+ temp_path = f.name
248
+
249
+ try:
250
+ df = pd.read_csv(temp_path)
251
+ assert len(df) == 2
252
+ assert _pick_text_column(df) == "text"
253
+ finally:
254
+ os.unlink(temp_path)
255
+
256
+ def test_load_csv_filters_empty_rows(self):
257
+ """Test that empty text rows are handled."""
258
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
259
+ f.write("id,text\n")
260
+ f.write("1,Valid text\n")
261
+ f.write("2,\n")
262
+ f.write("3,Another valid text\n")
263
+ temp_path = f.name
264
+
265
+ try:
266
+ df = pd.read_csv(temp_path)
267
+ df = df[df["text"].notna() & (df["text"].str.strip() != "")]
268
+ assert len(df) == 2
269
+ finally:
270
+ os.unlink(temp_path)
271
+
272
+
273
+ # =====================================================================
274
+ # Tests for sentence tokenization
275
+ # =====================================================================
276
+
277
+ class TestSentenceTokenization:
278
+ """Tests for sentence splitting functionality."""
279
+
280
+ def test_sentence_splitting(self):
281
+ """Test basic sentence splitting."""
282
+ import nltk
283
+ try:
284
+ nltk.data.find('tokenizers/punkt')
285
+ except LookupError:
286
+ nltk.download('punkt', quiet=True)
287
+
288
+ text = "This is sentence one. This is sentence two. And a third."
289
+ sentences = nltk.sent_tokenize(text)
290
+ assert len(sentences) == 3
291
+
292
+ def test_single_sentence(self):
293
+ """Test with single sentence."""
294
+ import nltk
295
+ try:
296
+ nltk.data.find('tokenizers/punkt')
297
+ except LookupError:
298
+ nltk.download('punkt', quiet=True)
299
+
300
+ text = "Just one sentence here."
301
+ sentences = nltk.sent_tokenize(text)
302
+ assert len(sentences) == 1
303
+
304
+ def test_min_words_filter(self):
305
+ """Test filtering sentences by minimum word count."""
306
+ sentences = [
307
+ "This is a long sentence with many words.",
308
+ "Short one.",
309
+ "Another longer sentence here.",
310
+ "Hi."
311
+ ]
312
+ min_words = 3
313
+ filtered = [s for s in sentences if len(s.split()) >= min_words]
314
+ assert len(filtered) == 2
315
+
316
+
317
+ # =====================================================================
318
+ # Tests for data validation
319
+ # =====================================================================
320
+
321
class TestDataValidation:
    """Sanity checks on embeddings and configuration hashing."""

    def test_embedding_shape_validation(self):
        """Row count of the embedding matrix must equal the document count."""
        docs = ["doc1", "doc2", "doc3"]
        embeddings = np.random.randn(3, 384)  # one 384-dim row per document
        n_rows = embeddings.shape[0]
        assert n_rows == len(docs)

    def test_embedding_dtype(self):
        """Embeddings can be coerced to float32."""
        raw = np.random.randn(5, 384)
        coerced = np.asarray(raw, dtype=np.float32)
        assert coerced.dtype == np.float32

    def test_config_hash_consistency(self):
        """Key insertion order must not affect the canonical serialisation."""
        import json

        first = {"param_a": 1, "param_b": "value"}
        second = {"param_b": "value", "param_a": 1}  # same content, reordered
        canonical = [json.dumps(cfg, sort_keys=True) for cfg in (first, second)]
        assert canonical[0] == canonical[1]
349
+
350
+
351
+ # =====================================================================
352
+ # Run tests
353
+ # =====================================================================
354
+
355
if __name__ == "__main__":
    # Allow running this module directly (python tests/test_utils.py)
    # instead of invoking pytest from the command line.
    pytest.main([__file__, "-v"])
tests/tests.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
### How to run the tests

```bash
# Install pytest if not already installed
pip install pytest

# Run all tests
pytest tests/ -v

# Run a specific test file
pytest tests/test_utils.py -v

# Run with coverage (optional)
pip install pytest-cov
pytest tests/ -v --cov=.
```