File size: 2,258 Bytes
8c495c2
8bba594
 
 
8c495c2
 
 
8bba594
 
8c495c2
 
 
 
 
 
 
 
 
 
8bba594
 
8c495c2
 
 
 
 
 
 
 
 
 
 
 
 
8bba594
 
8c495c2
 
 
8bba594
 
8c495c2
 
8bba594
8c495c2
 
8bba594
 
8c495c2
 
 
 
 
 
8bba594
 
8c495c2
 
 
8bba594
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""Pytest fixtures for MOSAIC tests using local dummy dataset."""

import os
import pytest
import pandas as pd
import numpy as np
from pathlib import Path

@pytest.fixture
def sample_csv():
    """Return the path (as ``str``) to dummy_dataset.csv living next to this conftest."""
    # The dataset is checked in alongside this conftest.py, so resolve it
    # relative to this file rather than the pytest invocation directory.
    csv_path = Path(__file__).parent / "dummy_dataset.csv"

    # Abort the dependent test with a clear message if the fixture data is missing.
    if not csv_path.exists():
        pytest.fail(f"Test data file not found at: {csv_path}")

    return str(csv_path)

@pytest.fixture
def sample_dataframe(sample_csv):
    """Load the dummy CSV and guarantee the text column is named 'text'.

    Handles datasets that call the column 'report' instead of 'text';
    if neither name exists, the first column is assumed to hold the text.
    """
    df = pd.read_csv(sample_csv)

    # Already normalized — nothing to do.
    if 'text' in df.columns:
        return df

    # Pick the column to promote: prefer 'report', else fall back to column 0.
    source_col = 'report' if 'report' in df.columns else df.columns[0]
    return df.rename(columns={source_col: 'text'})

@pytest.fixture
def sample_texts(sample_dataframe):
    """Return the normalized 'text' column as a plain Python list."""
    text_column = sample_dataframe['text']
    return text_column.tolist()

@pytest.fixture
def sample_embeddings(sample_texts):
    """Generate deterministic random embeddings: one 384-dim float32 row per text.

    Uses a locally-scoped ``numpy.random.Generator`` seeded with 42 instead of
    ``np.random.seed``, so this fixture no longer mutates NumPy's *global* RNG
    state — the old approach could silently change randomness observed by
    unrelated tests that run afterwards.
    """
    rng = np.random.default_rng(42)
    # Shape (n_samples, 384), float32 — matches a typical sentence-embedding width.
    return rng.standard_normal((len(sample_texts), 384), dtype=np.float32)

@pytest.fixture
def larger_corpus(sample_texts):
    """Alias for ``sample_texts``: the dummy dataset is big enough to
    double as the 'larger' corpus, so it is handed back unchanged."""
    return sample_texts

@pytest.fixture
def larger_embeddings(sample_embeddings):
    """Alias for ``sample_embeddings``, kept in lockstep with ``larger_corpus``."""
    return sample_embeddings

@pytest.fixture
def topic_config():
    """Minimal BERTopic configuration tuned for the tiny test dataset.

    Parameters are deliberately small (2 components, 2 topics, 3 words)
    so model-fitting tests stay fast.
    """
    umap_params = {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0}
    hdbscan_params = {"min_cluster_size": 2, "min_samples": 1}
    bertopic_params = {"nr_topics": 2, "top_n_words": 3}
    vectorizer_params = {"stop_words": "english"}

    return {
        "umap_params": umap_params,
        "hdbscan_params": hdbscan_params,
        "bt_params": bertopic_params,
        "vectorizer_params": vectorizer_params,
        "use_vectorizer": True,
    }