Spaces:
Running
Running
| """Pytest fixtures for MOSAIC tests using local dummy dataset.""" | |
| import os | |
| import pytest | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
def sample_csv():
    """Locate ``dummy_dataset.csv`` next to this module and return its path.

    Returns:
        The dataset path as a plain string.

    Fails the running test through ``pytest.fail`` when the file is absent.
    """
    # The dataset is expected to sit beside this file, so swap the file name
    # component of this module's own path.
    dataset_path = Path(__file__).with_name("dummy_dataset.csv")
    if not dataset_path.exists():
        pytest.fail(f"Test data file not found at: {dataset_path}")
    return str(dataset_path)
def sample_dataframe(sample_csv):
    """Read the CSV and make sure the result exposes a ``text`` column.

    Args:
        sample_csv: Path (or other source accepted by ``pd.read_csv``) of the
            dataset to load.

    Returns:
        The loaded DataFrame. If it has no ``text`` column, the ``report``
        column is renamed to ``text``; if that is missing too, the first
        column is renamed instead.
    """
    frame = pd.read_csv(sample_csv)
    available = frame.columns

    # Nothing to normalize when the expected column is already present.
    if 'text' in available:
        return frame

    # Prefer the 'report' column; otherwise fall back to the first column.
    source_column = 'report' if 'report' in available else available[0]
    return frame.rename(columns={source_column: 'text'})
def sample_texts(sample_dataframe):
    """Extract the ``text`` column of the dataframe as a Python list.

    Args:
        sample_dataframe: DataFrame that contains a ``text`` column.

    Returns:
        The values of the ``text`` column, in row order.
    """
    text_column = sample_dataframe['text']
    return text_column.tolist()
def sample_embeddings(sample_texts):
    """Build a reproducible random embedding matrix, one row per text.

    Args:
        sample_texts: Sized collection of texts; only its length is used.

    Returns:
        A float32 array of shape ``(len(sample_texts), 384)``.

    Note:
        This reseeds NumPy's global random generator with 42 on every call.
    """
    row_count = len(sample_texts)
    # Fixed seed so every call yields the same matrix for a given length.
    np.random.seed(42)
    gaussian = np.random.randn(row_count, 384)
    return gaussian.astype(np.float32)
def larger_corpus(sample_texts):
    """Return ``sample_texts`` unchanged, under the "larger corpus" name.

    The dummy dataset is considered large enough to double as the larger
    corpus, so no separate data is built: the very same object is handed back.

    Args:
        sample_texts: The texts to expose as the larger corpus.

    Returns:
        The ``sample_texts`` object itself (not a copy).
    """
    corpus = sample_texts
    return corpus
def larger_embeddings(sample_embeddings):
    """Return ``sample_embeddings`` unchanged, as the larger-corpus embeddings.

    Args:
        sample_embeddings: The embeddings to expose under this name.

    Returns:
        The ``sample_embeddings`` object itself (not a copy).
    """
    embeddings = sample_embeddings
    return embeddings
def topic_config():
    """Assemble a minimal BERTopic configuration intended for fast tests.

    Returns:
        A new dict on every call with the keys ``umap_params``,
        ``hdbscan_params``, ``bt_params``, ``vectorizer_params`` and
        ``use_vectorizer``.
    """
    umap_settings = {"n_neighbors": 5, "n_components": 2, "min_dist": 0.0}
    hdbscan_settings = {"min_cluster_size": 2, "min_samples": 1}
    bertopic_settings = {"nr_topics": 2, "top_n_words": 3}
    vectorizer_settings = {"stop_words": "english"}

    config = {
        "umap_params": umap_settings,
        "hdbscan_params": hdbscan_settings,
        "bt_params": bertopic_settings,
        "vectorizer_params": vectorizer_settings,
        "use_vectorizer": True,
    }
    return config