# MOSAICapp/tests/test_integration.py
# (origin: commit 8bba594 "added tests folder", author: romybeaute)
"""
Integration tests that call real models and APIs.
These are SLOW and should NOT run in CI.
Run manually with: pytest tests/test_integration.py -v
Requires:
- Internet connection
- HF_TOKEN env var (for LLM tests)
"""
import os
import tempfile
import numpy as np
import pandas as pd
import pytest
# Skip the entire module when running in CI (most CI systems export CI="true"):
# these tests download real models and hit external APIs, so they are local-only.
pytestmark = pytest.mark.skipif(
    os.environ.get("CI") == "true",
    reason="Integration tests skipped in CI"
)
@pytest.fixture
def integration_csv():
    """Write a temporary CSV of 30 short reports, yield its path, then delete it."""
    base_texts = [
        "I saw bright geometric patterns.",
        "Colors were vivid and shifting.",
        "Time felt distorted and slow.",
        "I felt detached from my body.",
        "There was a sense of peace.",
    ]
    # Repeat the five sentences to get 30 documents — enough for a real embedding run.
    frame = pd.DataFrame({"text": base_texts * 6})
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as handle:
        frame.to_csv(handle, index=False)
        csv_path = handle.name
    yield csv_path
    # Teardown: remove the temp file once the test using the fixture finishes.
    os.unlink(csv_path)
class TestRealEmbeddings:
    """Exercise the actual sentence-transformer embedding model (slow, needs network)."""

    def test_compute_embeddings_real(self):
        from mosaic_core.core_functions import compute_embeddings

        sentences = ["This is a test.", "Another sentence here."]
        # all-MiniLM-L6-v2 is a small, fast model with 384-dimensional output.
        vectors = compute_embeddings(
            sentences,
            model_name="all-MiniLM-L6-v2",
            device="cpu",
        )

        n_docs, dim = vectors.shape[0], vectors.shape[1]
        assert n_docs == 2
        assert dim == 384  # MiniLM embedding dimension
        assert vectors.dtype == np.float32

    def test_preprocess_and_embed_real(self, integration_csv):
        from mosaic_core.core_functions import preprocess_and_embed

        documents, vectors = preprocess_and_embed(
            integration_csv,
            model_name="all-MiniLM-L6-v2",
            split_sentences=False,
            min_words=3,
            device="cpu",
        )

        # The fixture writes 30 rows; none should be dropped at min_words=3.
        assert len(documents) == 30
        assert vectors.shape == (30, 384)
class TestRealTopicModeling:
    """End-to-end topic-model run over real embeddings (slow, needs network)."""

    def test_full_pipeline(self, integration_csv):
        from mosaic_core.core_functions import (
            preprocess_and_embed, run_topic_model,
            get_topic_labels, get_outlier_stats
        )

        documents, vectors = preprocess_and_embed(
            integration_csv,
            model_name="all-MiniLM-L6-v2",
            split_sentences=False,
            device="cpu",
        )

        # Tiny UMAP/HDBSCAN settings so that 30 documents can still form clusters.
        pipeline_config = {
            "umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0},
            "hdbscan_params": {"min_cluster_size": 3, "min_samples": 2},
            "bt_params": {"nr_topics": "auto", "top_n_words": 5},
            "use_vectorizer": True,
        }
        model, reduced, topics = run_topic_model(documents, vectors, pipeline_config)
        labels = get_topic_labels(model, topics)
        _, outlier_pct = get_outlier_stats(model)

        n = len(documents)
        assert len(topics) == n
        assert len(labels) == n
        assert reduced.shape == (n, 2)  # reduced to 2-D per n_components above
        assert 0 <= outlier_pct <= 100
@pytest.mark.skipif(
    not os.environ.get("HF_TOKEN"),
    reason="HF_TOKEN not set"
)
class TestRealLLMLabeling:
    """Label topics via the real HuggingFace API (requires HF_TOKEN and network)."""

    def test_generate_labels_real(self, integration_csv):
        from mosaic_core.core_functions import (
            preprocess_and_embed, run_topic_model, generate_llm_labels
        )

        documents, vectors = preprocess_and_embed(
            integration_csv,
            model_name="all-MiniLM-L6-v2",
            split_sentences=False,
            device="cpu",
        )

        # Force exactly 2 topics so the (capped) LLM call stays small and cheap.
        pipeline_config = {
            "umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0},
            "hdbscan_params": {"min_cluster_size": 3, "min_samples": 2},
            "bt_params": {"nr_topics": 2, "top_n_words": 5},
            "use_vectorizer": True,
        }
        model, _, _ = run_topic_model(documents, vectors, pipeline_config)

        labels = generate_llm_labels(
            model,
            hf_token=os.environ["HF_TOKEN"],
            max_topics=2,
        )

        # Expect a non-empty mapping of topic -> human-readable string label.
        assert isinstance(labels, dict)
        assert len(labels) > 0
        assert all(isinstance(v, str) for v in labels.values())