# MOSAICapp/tests/conftest.py
# Last change 8c495c2 (romybeaute): modify tests to use the dummy CSV
"""Pytest fixtures for MOSAIC tests using local dummy dataset."""
import os
import pytest
import pandas as pd
import numpy as np
from pathlib import Path
@pytest.fixture
def sample_csv():
    """Provide the location of ``dummy_dataset.csv`` stored beside this file.

    Returns:
        str: Path to the CSV file, as a plain string.

    The requesting test is failed through ``pytest.fail`` when the file
    does not exist.
    """
    # Swap this module's file name for the dataset's name to get a sibling path
    dataset_path = Path(__file__).with_name("dummy_dataset.csv")
    if not dataset_path.exists():
        pytest.fail(f"Test data file not found at: {dataset_path}")
    return str(dataset_path)
@pytest.fixture
def sample_dataframe(sample_csv):
    """Read the dummy CSV into a DataFrame that exposes a ``'text'`` column.

    Args:
        sample_csv: Path to the CSV file, as supplied by the ``sample_csv``
            fixture.

    Returns:
        pandas.DataFrame: The loaded data. If no ``'text'`` column is
        present, the ``'report'`` column is renamed to ``'text'``; if that
        is missing too, the first column is renamed instead.
    """
    frame = pd.read_csv(sample_csv)
    if 'text' in frame.columns:
        return frame
    # Prefer 'report'; otherwise assume the first column holds the text
    source_column = 'report' if 'report' in frame.columns else frame.columns[0]
    return frame.rename(columns={source_column: 'text'})
@pytest.fixture
def sample_texts(sample_dataframe):
    """Extract the ``'text'`` column of the sample DataFrame as a list.

    Args:
        sample_dataframe: DataFrame supplied by the ``sample_dataframe``
            fixture.

    Returns:
        list: The values of the ``'text'`` column, in row order.
    """
    text_column = sample_dataframe['text']
    return text_column.tolist()
@pytest.fixture
def sample_embeddings(sample_texts):
    """Build a reproducible random embedding matrix, one row per text.

    Args:
        sample_texts: List of texts supplied by the ``sample_texts`` fixture;
            only its length is used.

    Returns:
        numpy.ndarray: ``float32`` array of shape ``(len(sample_texts), 384)``
        drawn from a standard normal distribution.

    Note:
        This reseeds NumPy's global random state with 42 on every use.
    """
    np.random.seed(42)
    # One 384-dimensional vector per text
    matrix_shape = (len(sample_texts), 384)
    return np.random.standard_normal(matrix_shape).astype(np.float32)
@pytest.fixture
def larger_corpus(sample_texts):
    """Expose ``sample_texts`` under the ``larger_corpus`` name.

    The dummy dataset is presumed large enough to serve as the larger
    corpus, so the same list is handed back unchanged.

    Args:
        sample_texts: List of texts supplied by the ``sample_texts`` fixture.

    Returns:
        list: The very same object received as ``sample_texts``.
    """
    return sample_texts
@pytest.fixture
def larger_embeddings(sample_embeddings):
    """Expose ``sample_embeddings`` under the ``larger_embeddings`` name.

    Args:
        sample_embeddings: Array supplied by the ``sample_embeddings``
            fixture.

    Returns:
        The very same object received as ``sample_embeddings``.
    """
    return sample_embeddings
@pytest.fixture
def topic_config():
    """Return a small BERTopic configuration intended to keep tests fast.

    Returns:
        dict: Parameter groups keyed by ``"umap_params"``,
        ``"hdbscan_params"``, ``"bt_params"`` and ``"vectorizer_params"``,
        plus the boolean flag ``"use_vectorizer"``.
    """
    umap_settings = {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0}
    hdbscan_settings = {"min_cluster_size": 2, "min_samples": 1}
    bertopic_settings = {"nr_topics": 2, "top_n_words": 3}
    vectorizer_settings = {"stop_words": "english"}
    return {
        "umap_params": umap_settings,
        "hdbscan_params": hdbscan_settings,
        "bt_params": bertopic_settings,
        "vectorizer_params": vectorizer_settings,
        "use_vectorizer": True,
    }