""" Integration tests that call real models and APIs. These are SLOW and should NOT run in CI. Run manually with: pytest tests/test_integration.py -v Requires: - Internet connection - HF_TOKEN env var (for LLM tests) """ import os import tempfile import numpy as np import pandas as pd import pytest # Skip entire module if running in CI pytestmark = pytest.mark.skipif( os.environ.get("CI") == "true", reason="Integration tests skipped in CI" ) @pytest.fixture def integration_csv(): """CSV with enough data for real embedding.""" texts = [ "I saw bright geometric patterns.", "Colors were vivid and shifting.", "Time felt distorted and slow.", "I felt detached from my body.", "There was a sense of peace.", ] * 6 # 30 docs df = pd.DataFrame({"text": texts}) with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: df.to_csv(f, index=False) path = f.name yield path os.unlink(path) class TestRealEmbeddings: """Tests with actual embedding model.""" def test_compute_embeddings_real(self): from mosaic_core.core_functions import compute_embeddings docs = ["This is a test.", "Another sentence here."] embeddings = compute_embeddings( docs, model_name="all-MiniLM-L6-v2", # small, fast model device="cpu" ) assert embeddings.shape[0] == 2 assert embeddings.shape[1] == 384 # MiniLM dimension assert embeddings.dtype == np.float32 def test_preprocess_and_embed_real(self, integration_csv): from mosaic_core.core_functions import preprocess_and_embed docs, embeddings = preprocess_and_embed( integration_csv, model_name="all-MiniLM-L6-v2", split_sentences=False, min_words=3, device="cpu" ) assert len(docs) == 30 assert embeddings.shape == (30, 384) class TestRealTopicModeling: """Full pipeline with real embeddings.""" def test_full_pipeline(self, integration_csv): from mosaic_core.core_functions import ( preprocess_and_embed, run_topic_model, get_topic_labels, get_outlier_stats ) docs, embeddings = preprocess_and_embed( integration_csv, model_name="all-MiniLM-L6-v2", split_sentences=False, device="cpu" ) config = { "umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0}, "hdbscan_params": {"min_cluster_size": 3, "min_samples": 2}, "bt_params": {"nr_topics": "auto", "top_n_words": 5}, "use_vectorizer": True, } model, reduced, topics = run_topic_model(docs, embeddings, config) labels = get_topic_labels(model, topics) outlier_count, outlier_pct = get_outlier_stats(model) assert len(topics) == len(docs) assert len(labels) == len(docs) assert reduced.shape == (len(docs), 2) assert 0 <= outlier_pct <= 100 @pytest.mark.skipif( not os.environ.get("HF_TOKEN"), reason="HF_TOKEN not set" ) class TestRealLLMLabeling: """Tests with actual HuggingFace API.""" def test_generate_labels_real(self, integration_csv): from mosaic_core.core_functions import ( preprocess_and_embed, run_topic_model, generate_llm_labels ) docs, embeddings = preprocess_and_embed( integration_csv, model_name="all-MiniLM-L6-v2", split_sentences=False, device="cpu" ) config = { "umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0}, "hdbscan_params": {"min_cluster_size": 3, "min_samples": 2}, "bt_params": {"nr_topics": 2, "top_n_words": 5}, "use_vectorizer": True, } model, _, _ = run_topic_model(docs, embeddings, config) labels = generate_llm_labels( model, hf_token=os.environ["HF_TOKEN"], max_topics=2 ) assert isinstance(labels, dict) assert len(labels) > 0 assert all(isinstance(v, str) for v in labels.values())