Spaces:
Running
Running
File size: 4,406 Bytes
8bba594 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
"""
Integration tests that call real models and APIs.
These are SLOW and should NOT run in CI.
Run manually with: pytest tests/test_integration.py -v
Requires:
- Internet connection
- HF_TOKEN env var (for LLM tests)
"""
import os
import tempfile
import numpy as np
import pandas as pd
import pytest
# These tests are slow and call real models/APIs, so every test in this
# module is skipped when the CI environment variable is exactly "true".
_running_in_ci = os.environ.get("CI") == "true"
pytestmark = pytest.mark.skipif(
    _running_in_ci,
    reason="Integration tests skipped in CI",
)
@pytest.fixture
def integration_csv():
    """Yield the path of a temporary CSV with enough rows for real embedding.

    The file has a single ``text`` column holding five short sentences
    repeated six times (30 rows). It is created with ``delete=False`` so it
    still exists after the handle is closed and can be reopened by path; the
    fixture removes it once the test that used it has finished.

    Yields:
        str: Filesystem path of the CSV file.
    """
    texts = [
        "I saw bright geometric patterns.",
        "Colors were vivid and shifting.",
        "Time felt distorted and slow.",
        "I felt detached from my body.",
        "There was a sense of peace.",
    ] * 6  # 30 docs
    df = pd.DataFrame({"text": texts})

    # pandas expects a file object handed to to_csv to be opened with
    # newline="" (otherwise Windows writes "\r\r\n" line endings). The
    # encoding is pinned so the file does not depend on the locale default.
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".csv",
        delete=False,
        newline="",
        encoding="utf-8",
    ) as f:
        df.to_csv(f, index=False)
        path = f.name

    try:
        yield path
    finally:
        # delete=False means nothing else will clean this file up.
        os.unlink(path)
class TestRealEmbeddings:
    """Run the embedding helpers against an actual embedding model."""

    def test_compute_embeddings_real(self):
        """Embed two sentences on CPU and check row count, width and dtype."""
        from mosaic_core.core_functions import compute_embeddings

        sentences = ["This is a test.", "Another sentence here."]
        embed_options = {
            "model_name": "all-MiniLM-L6-v2",  # small, fast model
            "device": "cpu",
        }
        vectors = compute_embeddings(sentences, **embed_options)

        row_count = vectors.shape[0]
        assert row_count == 2
        width = vectors.shape[1]
        assert width == 384  # MiniLM dimension
        assert vectors.dtype == np.float32

    def test_preprocess_and_embed_real(self, integration_csv):
        """Load the 30-row fixture CSV and check one embedding per document."""
        from mosaic_core.core_functions import preprocess_and_embed

        embed_options = {
            "model_name": "all-MiniLM-L6-v2",
            "split_sentences": False,
            "min_words": 3,
            "device": "cpu",
        }
        docs, embeddings = preprocess_and_embed(integration_csv, **embed_options)

        assert len(docs) == 30
        expected_shape = (30, 384)
        assert embeddings.shape == expected_shape
class TestRealTopicModeling:
    """End-to-end topic-modeling run on real embeddings."""

    def test_full_pipeline(self, integration_csv):
        """Embed the fixture CSV, fit the topic model, and check output sizes."""
        from mosaic_core.core_functions import (
            get_outlier_stats,
            get_topic_labels,
            preprocess_and_embed,
            run_topic_model,
        )

        docs, embeddings = preprocess_and_embed(
            integration_csv,
            model_name="all-MiniLM-L6-v2",
            split_sentences=False,
            device="cpu",
        )

        umap_settings = {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0}
        hdbscan_settings = {"min_cluster_size": 3, "min_samples": 2}
        bertopic_settings = {"nr_topics": "auto", "top_n_words": 5}
        config = {
            "umap_params": umap_settings,
            "hdbscan_params": hdbscan_settings,
            "bt_params": bertopic_settings,
            "use_vectorizer": True,
        }

        model, reduced, topics = run_topic_model(docs, embeddings, config)
        labels = get_topic_labels(model, topics)
        _outlier_count, outlier_pct = get_outlier_stats(model)

        doc_total = len(docs)
        # One topic assignment and one label are expected per input document.
        assert len(topics) == doc_total
        assert len(labels) == doc_total
        # Second axis matches the n_components=2 requested in umap_params.
        assert reduced.shape == (doc_total, 2)
        assert 0 <= outlier_pct <= 100
@pytest.mark.skipif(not os.environ.get("HF_TOKEN"), reason="HF_TOKEN not set")
class TestRealLLMLabeling:
    """LLM labeling against the actual HuggingFace API (needs HF_TOKEN)."""

    def test_generate_labels_real(self, integration_csv):
        """Fit a small topic model, then request LLM labels for its topics."""
        from mosaic_core.core_functions import (
            generate_llm_labels,
            preprocess_and_embed,
            run_topic_model,
        )

        docs, embeddings = preprocess_and_embed(
            integration_csv,
            model_name="all-MiniLM-L6-v2",
            split_sentences=False,
            device="cpu",
        )

        umap_settings = {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0}
        hdbscan_settings = {"min_cluster_size": 3, "min_samples": 2}
        bertopic_settings = {"nr_topics": 2, "top_n_words": 5}
        config = {
            "umap_params": umap_settings,
            "hdbscan_params": hdbscan_settings,
            "bt_params": bertopic_settings,
            "use_vectorizer": True,
        }

        # Only the fitted model is needed for labeling.
        model, _, _ = run_topic_model(docs, embeddings, config)

        token = os.environ["HF_TOKEN"]
        labels = generate_llm_labels(model, hf_token=token, max_topics=2)

        assert isinstance(labels, dict)
        assert len(labels) > 0
        for label in labels.values():
            assert isinstance(label, str)