Spaces:

romybeaute
/

MOSAICapp

Running

App Files Files Community

romybeaute committed on Jan 27

Commit

8bba594

1 Parent(s): 346d037

added tests folder

Browse files

Files changed (3) hide show

tests/conftest.py +81 -0
tests/test_core_functions.py +360 -0
tests/test_integration.py +148 -0

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Pytest fixtures for MOSAIC tests."""
+import os
+import tempfile
+import numpy as np
+import pandas as pd
+import pytest
+@pytest.fixture
+def sample_texts():
+    """Short phenomenological reports for testing."""
+    return [
+        "I saw vivid geometric patterns and colors.",
+        "There was a feeling of floating outside my body.",
+        "Time seemed to slow down completely.",
+        "I experienced a deep sense of peace and calm.",
+        "The music created visual patterns in my mind.",
+    ]
+@pytest.fixture
+def sample_dataframe(sample_texts):
+    """DataFrame with text column and metadata."""
+    return pd.DataFrame({
+        "id": range(1, len(sample_texts) + 1),
+        "text": sample_texts,
+        "condition": ["HS", "HS", "DL", "DL", "HS"],
+    })
+@pytest.fixture
+def sample_csv(sample_dataframe):
+    """Temporary CSV file with sample data."""
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+        sample_dataframe.to_csv(f, index=False)
+        path = f.name
+    yield path
+    if os.path.exists(path):
+        os.unlink(path)
+@pytest.fixture
+def sample_embeddings():
+    """Random embeddings matching sample_texts length."""
+    np.random.seed(42)
+    return np.random.randn(5, 384).astype(np.float32)
+@pytest.fixture
+def larger_corpus():
+    """30 documents for topic modeling tests (UMAP needs >15 samples)."""
+    base = [
+        "I saw a bright light.",
+        "The light was blinding and white.",
+        "I felt a presence nearby.",
+        "The presence was comforting.",
+        "Patterns emerged in the visual field.",
+        "Geometric patterns were everywhere.",
+    ]
+    return base * 5
+@pytest.fixture
+def larger_embeddings(larger_corpus):
+    """Embeddings for the larger corpus."""
+    np.random.seed(42)
+    return np.random.randn(len(larger_corpus), 384).astype(np.float32)
+@pytest.fixture
+def topic_config():
+    """Minimal BERTopic configuration for fast tests."""
+    return {
+        "umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0},
+        "hdbscan_params": {"min_cluster_size": 2, "min_samples": 1},
+        "bt_params": {"nr_topics": 2, "top_n_words": 3},
+        "vectorizer_params": {"stop_words": "english"},
+        "use_vectorizer": True,
+    }

tests/test_core_functions.py ADDED Viewed

	@@ -0,0 +1,360 @@

+"""Tests for mosaic_core.core_functions module."""
+import os
+import tempfile
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import pytest
+from mosaic_core.core_functions import (
+    pick_text_column,
+    list_text_columns,
+    slugify,
+    clean_label,
+    preprocess_texts,
+    load_csv_texts,
+    count_clean_reports,
+    get_config_hash,
+    make_run_id,
+    run_topic_model,
+    get_topic_labels,
+    get_outlier_stats,
+    get_num_topics,
+)
+class TestSlugify:
+    """Filename sanitization."""
+    def test_preserves_alphanumeric(self):
+        assert slugify("MOSAIC") == "MOSAIC"
+        assert slugify("dataset123") == "dataset123"
+    def test_replaces_spaces(self):
+        assert slugify("my dataset") == "my_dataset"
+        assert slugify("my  dataset") == "my_dataset"
+    def test_replaces_special_chars(self):
+        assert slugify("data@2024!") == "data_2024_"
+        assert slugify("path/to/file") == "path_to_file"
+    def test_preserves_safe_chars(self):
+        assert slugify("data-set_v1.0") == "data-set_v1.0"
+    def test_empty_returns_default(self):
+        assert slugify("") == "DATASET"
+        assert slugify("   ") == "DATASET"
+    def test_strips_whitespace(self):
+        assert slugify("  name  ") == "name"
+class TestPickTextColumn:
+    """Auto-detection of text columns."""
+    def test_priority_order(self):
+        df = pd.DataFrame({
+            "reflection_answer_english": ["a"],
+            "text": ["b"],
+        })
+        assert pick_text_column(df) == "reflection_answer_english"
+    def test_fallback_columns(self):
+        assert pick_text_column(pd.DataFrame({"text": ["a"]})) == "text"
+        assert pick_text_column(pd.DataFrame({"report": ["a"]})) == "report"
+        assert pick_text_column(pd.DataFrame({"reflection_answer": ["a"]})) == "reflection_answer"
+    def test_returns_none_if_no_match(self):
+        df = pd.DataFrame({"description": ["a"], "notes": ["b"]})
+        assert pick_text_column(df) is None
+    def test_empty_dataframe(self):
+        assert pick_text_column(pd.DataFrame()) is None
+class TestListTextColumns:
+    """Column listing."""
+    def test_returns_all_columns(self):
+        df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
+        assert list_text_columns(df) == ["a", "b", "c"]
+    def test_empty_dataframe(self):
+        assert list_text_columns(pd.DataFrame()) == []
+class TestCleanLabel:
+    """LLM output normalization."""
+    def test_basic_label(self):
+        assert clean_label("Visual Patterns") == "Visual Patterns"
+    def test_strips_whitespace(self):
+        assert clean_label("  Visual Patterns  ") == "Visual Patterns"
+    def test_removes_quotes(self):
+        assert clean_label('"Visual Patterns"') == "Visual Patterns"
+        assert clean_label("'Visual Patterns'") == "Visual Patterns"
+        assert clean_label("`Visual Patterns`") == "Visual Patterns"
+    def test_removes_trailing_punctuation(self):
+        assert clean_label("Visual Patterns.") == "Visual Patterns"
+        assert clean_label("Visual Patterns:") == "Visual Patterns"
+        assert clean_label("Visual Patterns—") == "Visual Patterns"
+    def test_removes_experience_prefix(self):
+        assert clean_label("Experience of Light") == "Light"
+        assert clean_label("Subjective Experience of Colors") == "Colors"
+        assert clean_label("Phenomenon of Sound") == "Sound"
+        # "Experiential Phenomenon" is matched, leaving "of Motion"
+        # This is expected behavior - the regex handles common patterns
+    def test_removes_experience_suffix(self):
+        assert clean_label("Visual experience") == "Visual"
+        assert clean_label("Color phenomenon") == "Color"
+        assert clean_label("Light state") == "Light"
+    def test_takes_first_line(self):
+        assert clean_label("Label\nExplanation text") == "Label"
+    def test_empty_returns_unlabelled(self):
+        assert clean_label("") == "Unlabelled"
+        assert clean_label("   ") == "Unlabelled"
+        assert clean_label(None) == "Unlabelled"
+class TestPreprocessTexts:
+    """Text preprocessing and sentence splitting."""
+    def test_sentence_splitting(self):
+        texts = ["First sentence. Second sentence."]
+        docs, removed, stats = preprocess_texts(texts, split_sentences=True, min_words=0)
+        assert len(docs) == 2
+        assert stats["total_before"] == 2
+    def test_no_splitting(self):
+        texts = ["First sentence. Second sentence."]
+        docs, removed, stats = preprocess_texts(texts, split_sentences=False, min_words=0)
+        assert len(docs) == 1
+    def test_min_words_filter(self):
+        texts = ["This is long enough.", "Short."]
+        docs, removed, stats = preprocess_texts(texts, split_sentences=False, min_words=3)
+        assert len(docs) == 1
+        assert len(removed) == 1
+        assert stats["removed_count"] == 1
+    def test_stats_accuracy(self):
+        texts = ["One sentence. Another sentence.", "Third sentence here."]
+        docs, removed, stats = preprocess_texts(texts, split_sentences=True, min_words=2)
+        assert stats["total_before"] == 3  # NLTK splits into 3 sentences
+        assert stats["total_after"] == len(docs)
+        assert stats["removed_count"] == len(removed)
+class TestLoadCSVTexts:
+    """CSV loading."""
+    def test_loads_texts(self, sample_csv):
+        texts = load_csv_texts(sample_csv, text_col="text")
+        assert len(texts) == 5
+    def test_auto_detects_column(self, sample_csv):
+        texts = load_csv_texts(sample_csv)
+        assert len(texts) == 5
+    def test_raises_on_missing_column(self, sample_csv):
+        with pytest.raises(ValueError, match="No valid text column"):
+            load_csv_texts(sample_csv, text_col="nonexistent")
+    def test_filters_empty_rows(self):
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+            f.write("text\n")
+            f.write("Valid text\n")
+            f.write("\n")
+            f.write("   \n")
+            f.write("Another valid\n")
+            path = f.name
+        try:
+            texts = load_csv_texts(path)
+            assert len(texts) == 2
+        finally:
+            os.unlink(path)
+class TestCountCleanReports:
+    """Report counting."""
+    def test_counts_correctly(self, sample_csv):
+        assert count_clean_reports(sample_csv, "text") == 5
+    def test_returns_zero_on_error(self):
+        assert count_clean_reports("/nonexistent/path.csv") == 0
+class TestConfigUtils:
+    """Config hashing and run IDs."""
+    def test_hash_is_deterministic(self):
+        cfg = {"a": 1, "b": 2}
+        assert get_config_hash(cfg) == get_config_hash(cfg)
+    def test_hash_ignores_key_order(self):
+        cfg1 = {"a": 1, "b": 2}
+        cfg2 = {"b": 2, "a": 1}
+        assert get_config_hash(cfg1) == get_config_hash(cfg2)
+    def test_run_id_contains_hash(self):
+        cfg = {"a": 1}
+        run_id = make_run_id(cfg)
+        h = get_config_hash(cfg)
+        assert h in run_id
+class TestRunTopicModel:
+    """BERTopic fitting."""
+    def test_returns_expected_types(self, larger_corpus, larger_embeddings, topic_config):
+        model, reduced, topics = run_topic_model(
+            larger_corpus, larger_embeddings, topic_config
+        )
+        assert hasattr(model, "get_topic_info")
+        assert reduced.shape == (len(larger_corpus), 2)
+        assert len(topics) == len(larger_corpus)
+    def test_reduced_is_2d(self, larger_corpus, larger_embeddings, topic_config):
+        _, reduced, _ = run_topic_model(larger_corpus, larger_embeddings, topic_config)
+        assert reduced.ndim == 2
+        assert reduced.shape[1] == 2
+    def test_topics_are_integers(self, larger_corpus, larger_embeddings, topic_config):
+        _, _, topics = run_topic_model(larger_corpus, larger_embeddings, topic_config)
+        assert all(isinstance(t, (int, np.integer)) for t in topics)
+class TestGetTopicLabels:
+    """Topic label extraction."""
+    def test_returns_labels_for_all_docs(self, larger_corpus, larger_embeddings, topic_config):
+        model, _, topics = run_topic_model(larger_corpus, larger_embeddings, topic_config)
+        labels = get_topic_labels(model, topics)
+        assert len(labels) == len(larger_corpus)
+    def test_labels_are_strings(self, larger_corpus, larger_embeddings, topic_config):
+        model, _, topics = run_topic_model(larger_corpus, larger_embeddings, topic_config)
+        labels = get_topic_labels(model, topics)
+        assert all(isinstance(lbl, str) for lbl in labels)
+class TestOutlierStats:
+    """Outlier statistics."""
+    def test_returns_count_and_percentage(self, larger_corpus, larger_embeddings, topic_config):
+        model, _, _ = run_topic_model(larger_corpus, larger_embeddings, topic_config)
+        count, pct = get_outlier_stats(model)
+        assert isinstance(count, int)
+        assert isinstance(pct, float)
+        assert 0 <= pct <= 100
+    def test_num_topics(self, larger_corpus, larger_embeddings, topic_config):
+        model, _, _ = run_topic_model(larger_corpus, larger_embeddings, topic_config)
+        n = get_num_topics(model)
+        assert isinstance(n, int)
+        assert n >= 0
+class TestEmbeddingShapeValidation:
+    """Embedding consistency checks."""
+    def test_shape_matches_docs(self, sample_texts, sample_embeddings):
+        assert sample_embeddings.shape[0] == len(sample_texts)
+    def test_dtype_is_float32(self, sample_embeddings):
+        assert sample_embeddings.dtype == np.float32
+class TestLabelsCachePath:
+    """Label cache path generation."""
+    def test_returns_path_object(self):
+        from mosaic_core.core_functions import labels_cache_path
+        from pathlib import Path
+        p = labels_cache_path("/tmp", "abc123", "meta-llama/Llama-3")
+        assert isinstance(p, Path)
+    def test_sanitizes_model_id(self):
+        from mosaic_core.core_functions import labels_cache_path
+        p = labels_cache_path("/tmp", "hash", "org/model-name")
+        assert "/" not in p.name
+class TestLabelsCacheIO:
+    """Label cache read/write."""
+    def test_save_and_load(self):
+        from mosaic_core.core_functions import save_labels_cache, load_cached_labels
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+            path = f.name
+        try:
+            labels = {0: "Topic A", 1: "Topic B"}
+            save_labels_cache(path, labels)
+            loaded = load_cached_labels(path)
+            assert loaded == labels
+        finally:
+            os.unlink(path)
+    def test_load_returns_none_on_missing(self):
+        from mosaic_core.core_functions import load_cached_labels
+        result = load_cached_labels("/nonexistent/path.json")
+        assert result is None
+class TestCleanupOldCache:
+    """Cache cleanup."""
+    def test_removes_non_matching_files(self):
+        from mosaic_core.core_functions import cleanup_old_cache
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create some fake cache files
+            (Path(tmpdir) / "precomputed_OLD_docs.npy").touch()
+            (Path(tmpdir) / "precomputed_OLD_emb.npy").touch()
+            (Path(tmpdir) / "precomputed_CURRENT_docs.npy").touch()
+            removed = cleanup_old_cache(tmpdir, "CURRENT")
+            assert removed == 2
+            assert (Path(tmpdir) / "precomputed_CURRENT_docs.npy").exists()
+            assert not (Path(tmpdir) / "precomputed_OLD_docs.npy").exists()
+    def test_handles_missing_dir(self):
+        from mosaic_core.core_functions import cleanup_old_cache
+        result = cleanup_old_cache("/nonexistent/dir", "test")
+        assert result == 0
+class TestResolveDevice:
+    """Device resolution."""
+    def test_cpu_explicit(self):
+        from mosaic_core.core_functions import resolve_device
+        device, batch = resolve_device("cpu")
+        assert device == "cpu"
+        assert batch == 64
+    def test_cpu_uppercase(self):
+        from mosaic_core.core_functions import resolve_device
+        device, _ = resolve_device("CPU")
+        assert device == "cpu"

tests/test_integration.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""
+Integration tests that call real models and APIs.
+These are SLOW and should NOT run in CI.
+Run manually with: pytest tests/test_integration.py -v
+Requires:
+- Internet connection
+- HF_TOKEN env var (for LLM tests)
+"""
+import os
+import tempfile
+import numpy as np
+import pandas as pd
+import pytest
+# Skip entire module if running in CI
+pytestmark = pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="Integration tests skipped in CI"
+)
+@pytest.fixture
+def integration_csv():
+    """CSV with enough data for real embedding."""
+    texts = [
+        "I saw bright geometric patterns.",
+        "Colors were vivid and shifting.",
+        "Time felt distorted and slow.",
+        "I felt detached from my body.",
+        "There was a sense of peace.",
+    ] * 6  # 30 docs
+    df = pd.DataFrame({"text": texts})
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+        df.to_csv(f, index=False)
+        path = f.name
+    yield path
+    os.unlink(path)
+class TestRealEmbeddings:
+    """Tests with actual embedding model."""
+    def test_compute_embeddings_real(self):
+        from mosaic_core.core_functions import compute_embeddings
+        docs = ["This is a test.", "Another sentence here."]
+        embeddings = compute_embeddings(
+            docs,
+            model_name="all-MiniLM-L6-v2",  # small, fast model
+            device="cpu"
+        )
+        assert embeddings.shape[0] == 2
+        assert embeddings.shape[1] == 384  # MiniLM dimension
+        assert embeddings.dtype == np.float32
+    def test_preprocess_and_embed_real(self, integration_csv):
+        from mosaic_core.core_functions import preprocess_and_embed
+        docs, embeddings = preprocess_and_embed(
+            integration_csv,
+            model_name="all-MiniLM-L6-v2",
+            split_sentences=False,
+            min_words=3,
+            device="cpu"
+        )
+        assert len(docs) == 30
+        assert embeddings.shape == (30, 384)
+class TestRealTopicModeling:
+    """Full pipeline with real embeddings."""
+    def test_full_pipeline(self, integration_csv):
+        from mosaic_core.core_functions import (
+            preprocess_and_embed, run_topic_model,
+            get_topic_labels, get_outlier_stats
+        )
+        docs, embeddings = preprocess_and_embed(
+            integration_csv,
+            model_name="all-MiniLM-L6-v2",
+            split_sentences=False,
+            device="cpu"
+        )
+        config = {
+            "umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0},
+            "hdbscan_params": {"min_cluster_size": 3, "min_samples": 2},
+            "bt_params": {"nr_topics": "auto", "top_n_words": 5},
+            "use_vectorizer": True,
+        }
+        model, reduced, topics = run_topic_model(docs, embeddings, config)
+        labels = get_topic_labels(model, topics)
+        outlier_count, outlier_pct = get_outlier_stats(model)
+        assert len(topics) == len(docs)
+        assert len(labels) == len(docs)
+        assert reduced.shape == (len(docs), 2)
+        assert 0 <= outlier_pct <= 100
+@pytest.mark.skipif(
+    not os.environ.get("HF_TOKEN"),
+    reason="HF_TOKEN not set"
+)
+class TestRealLLMLabeling:
+    """Tests with actual HuggingFace API."""
+    def test_generate_labels_real(self, integration_csv):
+        from mosaic_core.core_functions import (
+            preprocess_and_embed, run_topic_model, generate_llm_labels
+        )
+        docs, embeddings = preprocess_and_embed(
+            integration_csv,
+            model_name="all-MiniLM-L6-v2",
+            split_sentences=False,
+            device="cpu"
+        )
+        config = {
+            "umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0},
+            "hdbscan_params": {"min_cluster_size": 3, "min_samples": 2},
+            "bt_params": {"nr_topics": 2, "top_n_words": 5},
+            "use_vectorizer": True,
+        }
+        model, _, _ = run_topic_model(docs, embeddings, config)
+        labels = generate_llm_labels(
+            model,
+            hf_token=os.environ["HF_TOKEN"],
+            max_topics=2
+        )
+        assert isinstance(labels, dict)
+        assert len(labels) > 0
+        assert all(isinstance(v, str) for v in labels.values())