Spaces:
Running
Running
File size: 2,258 Bytes
8c495c2 8bba594 8c495c2 8bba594 8c495c2 8bba594 8c495c2 8bba594 8c495c2 8bba594 8c495c2 8bba594 8c495c2 8bba594 8c495c2 8bba594 8c495c2 8bba594 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | """Pytest fixtures for MOSAIC tests using local dummy dataset."""
import os
import pytest
import pandas as pd
import numpy as np
from pathlib import Path
@pytest.fixture
def sample_csv():
"""Returns the path to the dummy_dataset.csv file located in the same directory."""
# Get the directory where this conftest.py file resides
current_dir = Path(__file__).parent
file_path = current_dir / "dummy_dataset.csv"
if not file_path.exists():
pytest.fail(f"Test data file not found at: {file_path}")
return str(file_path)
@pytest.fixture
def sample_dataframe(sample_csv):
"""Loads the CSV into a DataFrame and normalizes column names."""
df = pd.read_csv(sample_csv)
# Normalize text column name for tests (handle 'report' vs 'text')
if 'text' not in df.columns:
if 'report' in df.columns:
df = df.rename(columns={'report': 'text'})
else:
# Fallback: assume first column is text if neither exists
df = df.rename(columns={df.columns[0]: 'text'})
return df
@pytest.fixture
def sample_texts(sample_dataframe):
"""Returns the list of text reports from the dataframe."""
return sample_dataframe['text'].tolist()
@pytest.fixture
def sample_embeddings(sample_texts):
"""Generates random embeddings matching the exact length of the CSV data."""
np.random.seed(42)
# Generate (n_samples, 384) matrix
return np.random.randn(len(sample_texts), 384).astype(np.float32)
@pytest.fixture
def larger_corpus(sample_texts):
"""
Alias for sample_texts.
Since the dummy dataset is sufficiently large, we reuse it.
"""
return sample_texts
@pytest.fixture
def larger_embeddings(sample_embeddings):
"""Alias for sample_embeddings matching the larger corpus."""
return sample_embeddings
@pytest.fixture
def topic_config():
"""Minimal BERTopic configuration for fast tests."""
return {
"umap_params": {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0},
"hdbscan_params": {"min_cluster_size": 2, "min_samples": 1},
"bt_params": {"nr_topics": 2, "top_n_words": 3},
"vectorizer_params": {"stop_words": "english"},
"use_vectorizer": True,
} |