akhil-vaidya committed
Commit 59e348f · verified · 1 Parent(s): 5487a6c

Upload 21 files

app/README.md ADDED
@@ -0,0 +1,85 @@
# QualiVec Streamlit Demo

This Streamlit application provides an interactive demonstration of the QualiVec library for qualitative content analysis using LLM embeddings.

## Features

- **Interactive Data Upload**: Upload your own CSV files for reference and labeled data
- **Model Configuration**: Choose from different pre-trained embedding models
- **Threshold Optimization**: Automatically find the optimal similarity threshold
- **Real-time Classification**: See classification results as they happen
- **Comprehensive Evaluation**: View detailed performance metrics and visualizations
- **Bootstrap Analysis**: Get confidence intervals for robust evaluation
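
The threshold-based matching behind these features can be pictured with a small sketch (plain Python, illustrative only: `classify`, the toy 2-dimensional vectors, and the `Other` fallback are hypothetical stand-ins, not the QualiVec API):

```python
from math import sqrt

def cosine(a, b):
    # Cosine similarity between two equal-length vectors
    dot = sum(x * y for x, y in zip(a, b))
    na = sqrt(sum(x * x for x in a))
    nb = sqrt(sum(x * x for x in b))
    return dot / (na * nb)

def classify(embedding, reference_vectors, threshold=0.7):
    # Pick the reference class with the highest cosine similarity;
    # fall back to "Other" when no class clears the threshold.
    best_class, best_score = "Other", threshold
    for label, ref in reference_vectors.items():
        score = cosine(embedding, ref)
        if score >= best_score:
            best_class, best_score = label, score
    return best_class, best_score

refs = {"Positive": [0.9, 0.1], "Negative": [0.1, 0.9]}
print(classify([0.8, 0.2], refs))
```

The real library works the same way in spirit, but with high-dimensional embeddings produced by the configured model and a threshold found by the optimizer.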

## How to Run

### Option 1: Local Installation

1. **Install Dependencies**:
   ```bash
   pip install -e .
   ```

2. **Run the App**:
   ```bash
   cd app
   uv run run_demo.py
   ```

3. **Access the App**:
   Open your browser and navigate to `http://localhost:8501`

### Option 2: Docker

1. **Build the Docker Image**:
   ```bash
   docker build -t qualivec .
   ```

2. **Run the Docker Container**:
   ```bash
   docker run --rm -p 8501:8501 qualivec
   ```

3. **Access the App**:
   Open your browser and navigate to `http://localhost:8501`

> **Note**: The Docker option provides a containerized environment with all dependencies pre-installed, making it easier to run the application without setting up a local Python environment.

## Data Format Requirements

### Reference Data (CSV)
Your reference data should contain:
- `tag`: The class/category label
- `sentence`: The example text for that category

Example:
```csv
tag,sentence
Positive,This is absolutely fantastic!
Negative,This is terrible and disappointing
Neutral,This is okay I guess
```

### Labeled Data (CSV)
Your labeled data should contain:
- `sentence`: The text to be classified
- `Label`: The true class/category (for evaluation)

Example:
```csv
sentence,Label
I love this product so much!,Positive
Not very good quality,Negative
Average product nothing special,Neutral
```
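
A quick way to pre-check these column requirements before uploading is a few lines of Python (standard library only; `check_columns` is a hypothetical helper for illustration, not part of QualiVec):

```python
import csv
import io

# Required header columns per file type, mirroring the formats above
REQUIRED = {"reference": {"tag", "sentence"}, "labeled": {"sentence", "Label"}}

def check_columns(csv_text, kind):
    """Return a sorted list of required columns missing from the CSV header."""
    header = next(csv.reader(io.StringIO(csv_text)))
    return sorted(REQUIRED[kind] - set(header))

ref_csv = "tag,sentence\nPositive,This is absolutely fantastic!\n"
print(check_columns(ref_csv, "reference"))          # → [] (valid)
print(check_columns("text,Label\nx,y\n", "labeled"))  # → ['sentence'] (missing)
```

The app performs an equivalent check after upload and reports any missing columns with `st.error`.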

## Navigation

The app is organized into 5 main sections:

1. **🏠 Home**: Overview and introduction to QualiVec
2. **📊 Data Upload**: Upload your reference and labeled data files
3. **🔧 Configuration**: Set up embedding models and parameters
4. **🎯 Classification**: Run the classification and optimization process
5. **📈 Results**: View detailed results and download outputs
app/app.py ADDED
@@ -0,0 +1,916 @@
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tempfile
import os
import sys
from io import StringIO
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add the parent directory to sys.path to import the module
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.qualivec.data import DataLoader
from src.qualivec.embedding import EmbeddingModel
from src.qualivec.matching import SemanticMatcher
from src.qualivec.classification import Classifier
from src.qualivec.evaluation import Evaluator
from src.qualivec.optimization import ThresholdOptimizer

# Set page config
st.set_page_config(
    page_title="QualiVec Demo",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better styling
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #2E4057;
        text-align: center;
        margin-bottom: 2rem;
    }
    .section-header {
        font-size: 1.5rem;
        font-weight: bold;
        color: #048A81;
        margin-top: 2rem;
        margin-bottom: 1rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    .success-message {
        background-color: #d4edda;
        color: #155724;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
    .warning-message {
        background-color: #fff3cd;
        color: #856404;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)

def main():
    st.markdown('<div class="main-header">🔍 QualiVec Demo</div>', unsafe_allow_html=True)
    st.markdown("""
    <div style="text-align: center; margin-bottom: 2rem;">
        <p style="font-size: 1.2rem; color: #666;">
            Qualitative Content Analysis with LLM Embeddings
        </p>
    </div>
    """, unsafe_allow_html=True)

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.selectbox(
        "Choose a page",
        ["🏠 Home", "📊 Data Upload", "🔧 Configuration", "🎯 Classification", "📈 Results"]
    )

    # Initialize session state
    if 'classifier' not in st.session_state:
        st.session_state.classifier = None
    if 'reference_data' not in st.session_state:
        st.session_state.reference_data = None
    if 'labeled_data' not in st.session_state:
        st.session_state.labeled_data = None
    if 'optimization_results' not in st.session_state:
        st.session_state.optimization_results = None
    if 'evaluation_results' not in st.session_state:
        st.session_state.evaluation_results = None
    # Initialized alongside evaluation_results so the attribute access
    # in show_results_page never raises on a fresh session
    if 'bootstrap_results' not in st.session_state:
        st.session_state.bootstrap_results = None

    # Route to different pages
    if page == "🏠 Home":
        show_home_page()
    elif page == "📊 Data Upload":
        show_data_upload_page()
    elif page == "🔧 Configuration":
        show_configuration_page()
    elif page == "🎯 Classification":
        show_classification_page()
    elif page == "📈 Results":
        show_results_page()
def show_home_page():
    st.markdown('<div class="section-header">Welcome to QualiVec</div>', unsafe_allow_html=True)

    col1, col2, col3 = st.columns([1, 2, 1])

    with col2:
        st.markdown("""
        ### What is QualiVec?

        QualiVec is a Python library that uses Large Language Model (LLM) embeddings for qualitative content analysis. It helps researchers and analysts classify text data by comparing it against reference examples.

        ### Key Features:
        - **Semantic Matching**: Uses advanced embedding models to find semantic similarity
        - **Threshold Optimization**: Automatically finds the best similarity threshold
        - **Comprehensive Evaluation**: Provides detailed metrics and visualizations
        - **Bootstrap Analysis**: Confidence intervals for robust evaluation

        ### How It Works:
        1. **Upload Data**: Provide reference examples and data to classify
        2. **Configure**: Set up embedding models and parameters
        3. **Optimize**: Find the best threshold for classification
        4. **Classify**: Apply the model to your data
        5. **Evaluate**: Get detailed performance metrics

        ### Getting Started:
        Use the sidebar to navigate through the demo. Start with **Data Upload** to begin your analysis.
        """)

    # Add sample data info
    st.markdown('<div class="section-header">Sample Data Format</div>', unsafe_allow_html=True)

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("**Reference Data Format:**")
        sample_ref = pd.DataFrame({
            'tag': ['Positive', 'Negative', 'Neutral'],
            'sentence': ['This is great!', 'This is terrible', 'This is okay']
        })
        st.dataframe(sample_ref, use_container_width=True)

    with col2:
        st.markdown("**Labeled Data Format:**")
        sample_labeled = pd.DataFrame({
            'sentence': ['I love this product', 'Not very good', 'Average quality'],
            'Label': ['Positive', 'Negative', 'Neutral']
        })
        st.dataframe(sample_labeled, use_container_width=True)
def show_data_upload_page():
    st.markdown('<div class="section-header">Data Upload</div>', unsafe_allow_html=True)

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("### Reference Data")
        st.markdown("Upload a CSV file containing reference examples with columns: `tag` (class) and `sentence` (example text)")

        reference_file = st.file_uploader(
            "Choose reference data file",
            type=['csv'],
            key='reference_file'
        )

        if reference_file is not None:
            try:
                reference_df = pd.read_csv(reference_file)
                st.success("Reference data loaded successfully!")
                st.dataframe(reference_df.head(), use_container_width=True)

                # Validate columns
                required_cols = ['tag', 'sentence']
                missing_cols = [col for col in required_cols if col not in reference_df.columns]

                if missing_cols:
                    st.error(f"Missing required columns: {missing_cols}")
                else:
                    # Prepare reference data
                    reference_df = reference_df.rename(columns={
                        'tag': 'class',
                        'sentence': 'matching_node'
                    })
                    st.session_state.reference_data = reference_df

                    # Show statistics
                    st.markdown("**Data Statistics:**")
                    st.write(f"- Total examples: {len(reference_df)}")
                    st.write(f"- Unique classes: {reference_df['class'].nunique()}")
                    st.write("- Class distribution:")
                    st.write(reference_df['class'].value_counts())

            except Exception as e:
                st.error(f"Error loading reference data: {str(e)}")

    with col2:
        st.markdown("### Labeled Data")
        st.markdown("Upload a CSV file containing data to classify with columns: `sentence` (text) and `Label` (true class)")

        labeled_file = st.file_uploader(
            "Choose labeled data file",
            type=['csv'],
            key='labeled_file'
        )

        if labeled_file is not None:
            try:
                labeled_df = pd.read_csv(labeled_file)
                st.success("Labeled data loaded successfully!")
                st.dataframe(labeled_df.head(), use_container_width=True)

                # Validate columns
                required_cols = ['sentence', 'Label']
                missing_cols = [col for col in required_cols if col not in labeled_df.columns]

                if missing_cols:
                    st.error(f"Missing required columns: {missing_cols}")
                else:
                    # Prepare labeled data
                    labeled_df = labeled_df.rename(columns={'Label': 'label'})
                    labeled_df['label'] = labeled_df['label'].replace('0', 'Other')
                    st.session_state.labeled_data = labeled_df

                    # Show statistics
                    st.markdown("**Data Statistics:**")
                    st.write(f"- Total samples: {len(labeled_df)}")
                    st.write(f"- Unique labels: {labeled_df['label'].nunique()}")
                    st.write("- Label distribution:")
                    st.write(labeled_df['label'].value_counts())

            except Exception as e:
                st.error(f"Error loading labeled data: {str(e)}")

    # Show data compatibility check
    if st.session_state.reference_data is not None and st.session_state.labeled_data is not None:
        st.markdown('<div class="section-header">Data Compatibility Check</div>', unsafe_allow_html=True)

        ref_classes = set(st.session_state.reference_data['class'].unique())
        labeled_classes = set(st.session_state.labeled_data['label'].unique())

        # Check for unknown classes
        unknown_classes = labeled_classes - ref_classes

        if unknown_classes:
            st.warning(f"Warning: Labels in labeled data not found in reference data: {unknown_classes}")
        else:
            st.success("✅ Data compatibility check passed!")

        # Show class overlap
        st.markdown("**Class Overlap Analysis:**")
        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric("Reference Classes", len(ref_classes))
        with col2:
            st.metric("Labeled Classes", len(labeled_classes))
        with col3:
            st.metric("Common Classes", len(ref_classes.intersection(labeled_classes)))
def show_configuration_page():
    st.markdown('<div class="section-header">Model Configuration</div>', unsafe_allow_html=True)

    # Check if data is loaded
    if st.session_state.reference_data is None or st.session_state.labeled_data is None:
        st.warning("Please upload both reference and labeled data first.")
        return

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("### Embedding Model")

        # Model type selection
        model_type = st.selectbox(
            "Choose model type",
            ["HuggingFace", "Gemini"],
            help="Select the type of embedding model to use"
        )

        # Model selection based on type
        if model_type == "HuggingFace":
            model_options = [
                "sentence-transformers/all-MiniLM-L6-v2",
                "sentence-transformers/all-mpnet-base-v2",
                "sentence-transformers/distilbert-base-nli-mean-tokens"
            ]

            selected_model = st.selectbox(
                "Choose HuggingFace model",
                model_options,
                help="Select the pre-trained HuggingFace model for generating embeddings"
            )
        else:  # Gemini
            gemini_models = [
                "gemini-embedding-001",
                "text-embedding-004"
            ]

            selected_model = st.selectbox(
                "Choose Gemini model",
                gemini_models,
                help="Select the Gemini embedding model for generating embeddings"
            )

            # Calculate total texts to process
            total_texts = 0
            if st.session_state.reference_data is not None:
                total_texts += len(st.session_state.reference_data)
            if st.session_state.labeled_data is not None:
                total_texts += len(st.session_state.labeled_data)

            st.warning(
                f"⚠️ **Gemini API Rate Limits (Free Tier)**\n\n"
                f"- 1,500 requests per day\n"
                f"- Each batch of 100 texts = 1 request\n"
                f"- Your current dataset: ~{total_texts} texts\n"
                f"- Estimated requests needed: ~{(total_texts // 100) + 1}\n\n"
                f"If you exceed quota, consider:\n"
                f"1. Using a smaller dataset\n"
                f"2. Switching to HuggingFace models (no limits)\n"
                f"3. Upgrading to a paid API plan"
            )

            st.info("💡 Note: Using Gemini embeddings requires the GOOGLE_API_KEY environment variable to be set.")

        st.markdown("### Initial Threshold")
        initial_threshold = st.slider(
            "Initial similarity threshold",
            min_value=0.0,
            max_value=1.0,
            value=0.7,
            step=0.05,
            help="Cosine similarity threshold for classification"
        )

    with col2:
        st.markdown("### Optimization Parameters")

        optimize_threshold = st.checkbox(
            "Enable threshold optimization",
            value=True,
            help="Automatically find the best threshold"
        )

        if optimize_threshold:
            col2_1, col2_2 = st.columns(2)

            with col2_1:
                start_threshold = st.slider(
                    "Start threshold",
                    min_value=0.0,
                    max_value=1.0,
                    value=0.5,
                    step=0.05
                )

                end_threshold = st.slider(
                    "End threshold",
                    min_value=0.0,
                    max_value=1.0,
                    value=0.9,
                    step=0.05
                )

            with col2_2:
                step_size = st.slider(
                    "Step size",
                    min_value=0.005,
                    max_value=0.05,
                    value=0.01,
                    step=0.005
                )

                optimization_metric = st.selectbox(
                    "Optimization metric",
                    ["f1_macro", "accuracy", "precision_macro", "recall_macro"]
                )

    # Load models button
    if st.button("Initialize Models", type="primary"):
        with st.spinner("Loading models... This may take a few minutes."):
            try:
                # Initialize classifier
                classifier = Classifier(verbose=False)

                # Determine model type parameter
                model_type_param = "gemini" if model_type == "Gemini" else "huggingface"

                classifier.load_models(
                    model_name=selected_model,
                    model_type=model_type_param,
                    threshold=initial_threshold
                )

                # Prepare reference vectors
                with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp_ref:
                    tmp_ref_path = tmp_ref.name
                    st.session_state.reference_data.to_csv(tmp_ref_path, index=False)

                try:
                    reference_data = classifier.prepare_reference_vectors(
                        reference_path=tmp_ref_path,
                        class_column='class',
                        node_column='matching_node'
                    )
                finally:
                    # Ensure the temporary file is deleted even if an error occurs
                    try:
                        os.unlink(tmp_ref_path)
                    except (OSError, PermissionError):
                        pass  # File might already be deleted or locked

                st.session_state.classifier = classifier
                st.session_state.reference_vectors = reference_data
                st.session_state.config = {
                    'model_type': model_type,
                    'model_name': selected_model,
                    'initial_threshold': initial_threshold,
                    'optimize_threshold': optimize_threshold,
                    'start_threshold': start_threshold if optimize_threshold else None,
                    'end_threshold': end_threshold if optimize_threshold else None,
                    'step_size': step_size if optimize_threshold else None,
                    'optimization_metric': optimization_metric if optimize_threshold else None
                }

                st.success("✅ Models initialized successfully!")

            except Exception as e:
                st.error(f"Error initializing models: {str(e)}")

    # Show current configuration
    if st.session_state.classifier is not None:
        st.markdown('<div class="section-header">Current Configuration</div>', unsafe_allow_html=True)

        config = st.session_state.config

        col1, col2, col3 = st.columns(3)

        with col1:
            st.markdown("**Model Settings:**")
            st.write(f"- Model type: {config['model_type']}")
            st.write(f"- Model: {config['model_name']}")
            st.write(f"- Initial threshold: {config['initial_threshold']}")

        with col2:
            st.markdown("**Optimization:**")
            st.write(f"- Enabled: {config['optimize_threshold']}")
            if config['optimize_threshold']:
                st.write(f"- Range: {config['start_threshold']:.2f} - {config['end_threshold']:.2f}")
                st.write(f"- Step: {config['step_size']:.3f}")

        with col3:
            st.markdown("**Data:**")
            st.write(f"- Reference examples: {len(st.session_state.reference_data)}")
            st.write(f"- Labeled samples: {len(st.session_state.labeled_data)}")
def show_classification_page():
    st.markdown('<div class="section-header">Classification & Optimization</div>', unsafe_allow_html=True)

    # Check if models are loaded
    if st.session_state.classifier is None:
        st.warning("Please configure and initialize models first.")
        return

    # Run classification
    if st.button("Run Classification", type="primary"):
        with st.spinner("Running classification and optimization..."):
            try:
                # Save labeled data to a temporary file
                with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp_labeled:
                    tmp_labeled_path = tmp_labeled.name
                    st.session_state.labeled_data.to_csv(tmp_labeled_path, index=False)

                try:
                    # Run optimization if enabled
                    if st.session_state.config['optimize_threshold']:
                        optimization_results = st.session_state.classifier.evaluate_classification(
                            labeled_path=tmp_labeled_path,
                            reference_data=st.session_state.reference_vectors,
                            sentence_column='sentence',
                            label_column='label',
                            optimize_threshold=True,
                            start=st.session_state.config['start_threshold'],
                            end=st.session_state.config['end_threshold'],
                            step=st.session_state.config['step_size']
                        )

                        st.session_state.optimization_results = optimization_results
                        optimal_threshold = optimization_results["optimal_threshold"]

                        # Update classifier with optimal threshold
                        st.session_state.classifier.matcher = SemanticMatcher(
                            threshold=optimal_threshold,
                            verbose=False
                        )

                        st.success(f"✅ Optimization completed! Optimal threshold: {optimal_threshold:.4f}")

                    else:
                        optimal_threshold = st.session_state.config['initial_threshold']

                    # Run evaluation
                    embedding_model = st.session_state.classifier.embedding_model
                    data_loader = DataLoader(verbose=False)
                    full_df = data_loader.load_labeled_data(tmp_labeled_path, label_column='label')

                    # Generate embeddings
                    full_embeddings = embedding_model.embed_dataframe(full_df, text_column='sentence')

                    # Classify
                    match_results = st.session_state.classifier.matcher.match(
                        full_embeddings,
                        st.session_state.reference_vectors
                    )
                    predicted_labels = match_results["predicted_class"].tolist()
                    true_labels = full_df['label'].tolist()

                    # Evaluate
                    evaluator = Evaluator(verbose=False)
                    eval_results = evaluator.evaluate(
                        true_labels=true_labels,
                        predicted_labels=predicted_labels,
                        class_names=list(set(true_labels) | set(predicted_labels))
                    )

                    # Bootstrap evaluation
                    bootstrap_results = evaluator.bootstrap_evaluate(
                        true_labels=true_labels,
                        predicted_labels=predicted_labels,
                        n_iterations=100
                    )

                    st.session_state.evaluation_results = eval_results
                    st.session_state.bootstrap_results = bootstrap_results
                    st.session_state.predictions = {
                        'true_labels': true_labels,
                        'predicted_labels': predicted_labels,
                        'match_results': match_results,
                        'full_df': full_df
                    }

                finally:
                    # Ensure the temporary file is deleted
                    try:
                        os.unlink(tmp_labeled_path)
                    except (OSError, PermissionError):
                        pass  # File might already be deleted or locked

                st.success("✅ Classification completed successfully!")

            except Exception as e:
                st.error(f"Error during classification: {str(e)}")

    # Show optimization results if available
    if st.session_state.optimization_results is not None:
        st.markdown('<div class="section-header">Optimization Results</div>', unsafe_allow_html=True)

        results = st.session_state.optimization_results

        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Optimal Threshold", f"{results['optimal_threshold']:.4f}")
        with col2:
            st.metric("Accuracy", f"{results['optimal_metrics']['accuracy']:.4f}")
        with col3:
            st.metric("F1 Score", f"{results['optimal_metrics']['f1_macro']:.4f}")
        with col4:
            st.metric("Precision", f"{results['optimal_metrics']['precision_macro']:.4f}")

        # Plot optimization curve
        st.markdown("### Optimization Curve")

        opt_results = results["results_by_threshold"]

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Accuracy', 'F1 Score', 'Precision', 'Recall'),
            vertical_spacing=0.1
        )

        thresholds = opt_results["thresholds"]

        # Add traces
        fig.add_trace(go.Scatter(x=thresholds, y=opt_results["accuracy"], name="Accuracy"), row=1, col=1)
        fig.add_trace(go.Scatter(x=thresholds, y=opt_results["f1_macro"], name="F1 Score"), row=1, col=2)
        fig.add_trace(go.Scatter(x=thresholds, y=opt_results["precision_macro"], name="Precision"), row=2, col=1)
        fig.add_trace(go.Scatter(x=thresholds, y=opt_results["recall_macro"], name="Recall"), row=2, col=2)

        # Mark the optimal threshold with a dashed vertical line in each subplot
        optimal_thresh = results['optimal_threshold']

        shapes = []
        for row in range(1, 3):
            for col in range(1, 3):
                # Axis references: the first subplot uses 'x'/'y', the rest 'x2'..'x4'
                idx = (row - 1) * 2 + col
                shapes.append(
                    dict(
                        type="line",
                        x0=optimal_thresh, x1=optimal_thresh,
                        y0=0, y1=1,
                        yref=f"y{idx} domain" if idx > 1 else "y domain",
                        xref=f"x{idx}" if idx > 1 else "x",
                        line=dict(color="red", width=2, dash="dash")
                    )
                )

        fig.update_layout(
            shapes=shapes,
            title="Threshold Optimization Results",
            showlegend=False,
            height=600
        )

        st.plotly_chart(fig, use_container_width=True)
def show_results_page():
    st.markdown('<div class="section-header">Results & Evaluation</div>', unsafe_allow_html=True)

    # Check if evaluation results are available
    if st.session_state.evaluation_results is None:
        st.warning("Please run classification first to see results.")
        return

    eval_results = st.session_state.evaluation_results

    # Performance metrics
    st.markdown("### Performance Metrics")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Overall Accuracy", f"{eval_results['accuracy']:.4f}")
    with col2:
        st.metric("Macro F1 Score", f"{eval_results['f1_macro']:.4f}")
    with col3:
        st.metric("Macro Precision", f"{eval_results['precision_macro']:.4f}")
    with col4:
        st.metric("Macro Recall", f"{eval_results['recall_macro']:.4f}")

    # Class-wise metrics
    st.markdown("### Class-wise Performance")

    class_metrics_df = pd.DataFrame({
        'Class': list(eval_results['class_metrics']['precision'].keys()),
        'Precision': list(eval_results['class_metrics']['precision'].values()),
        'Recall': list(eval_results['class_metrics']['recall'].values()),
        'F1-Score': list(eval_results['class_metrics']['f1'].values()),
        'Support': list(eval_results['class_metrics']['support'].values())
    })

    st.dataframe(class_metrics_df, use_container_width=True)

    # Confusion Matrix
    st.markdown("### Confusion Matrix")

    cm = eval_results['confusion_matrix']
    class_names = eval_results['confusion_matrix_labels']

    fig = px.imshow(
        cm,
        labels=dict(x="Predicted", y="True", color="Count"),
        x=class_names,
        y=class_names,
        color_continuous_scale='Blues',
        text_auto=True,
        title="Confusion Matrix"
    )
    fig.update_layout(width=600, height=600)

    st.plotly_chart(fig, use_container_width=True)

    # Bootstrap Results
    if st.session_state.bootstrap_results is not None:
        st.markdown("### Bootstrap Confidence Intervals")

        bootstrap_results = st.session_state.bootstrap_results

        def format_ci(ci_data, level_str, level_float, label):
            # Accept both string and float keys, and both dict and list/tuple CI formats
            ci = ci_data.get(level_str, ci_data.get(level_float))
            if ci is None:
                return f"{label}: Not available"
            if isinstance(ci, dict):
                return f"{label}: [{ci['lower']:.4f}, {ci['upper']:.4f}]"
            if isinstance(ci, (list, tuple)) and len(ci) >= 2:
                return f"{label}: [{ci[0]:.4f}, {ci[1]:.4f}]"
            return f"{label}: Format not recognized"

        if 'confidence_intervals' in bootstrap_results:
            metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

            for metric in metrics:
                if metric in bootstrap_results['confidence_intervals']:
                    ci_data = bootstrap_results['confidence_intervals'][metric]
                    st.markdown(f"**{metric.replace('_', ' ').title()}:**")

                    col1, col2, col3 = st.columns(3)

                    with col1:
                        st.write(format_ci(ci_data, '0.95', 0.95, "95% CI"))
                    with col2:
                        st.write(format_ci(ci_data, '0.99', 0.99, "99% CI"))
                    with col3:
                        if 'point_estimates' in bootstrap_results and metric in bootstrap_results['point_estimates']:
                            st.write(f"Point Estimate: {bootstrap_results['point_estimates'][metric]:.4f}")
                        else:
                            st.write("Point Estimate: Not available")
        else:
            st.info("Bootstrap confidence intervals not available.")

        # Bootstrap Distribution Plot
        st.markdown("### Bootstrap Distributions")

        if 'bootstrap_distribution' in bootstrap_results:
            fig = make_subplots(
                rows=2, cols=2,
                subplot_titles=('Accuracy', 'F1 Score', 'Precision', 'Recall')
            )

            distributions = bootstrap_results['bootstrap_distribution']

            if 'accuracy' in distributions:
                fig.add_trace(go.Histogram(x=distributions['accuracy'], name="Accuracy", nbinsx=30), row=1, col=1)
            if 'f1_macro' in distributions:
                fig.add_trace(go.Histogram(x=distributions['f1_macro'], name="F1 Score", nbinsx=30), row=1, col=2)
            if 'precision_macro' in distributions:
                fig.add_trace(go.Histogram(x=distributions['precision_macro'], name="Precision", nbinsx=30), row=2, col=1)
            if 'recall_macro' in distributions:
                fig.add_trace(go.Histogram(x=distributions['recall_macro'], name="Recall", nbinsx=30), row=2, col=2)

            fig.update_layout(
                title="Bootstrap Distributions",
                showlegend=False,
                height=600
            )

            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("Bootstrap distributions not available.")

    # Sample predictions
    if 'predictions' in st.session_state:
        st.markdown("### Sample Predictions")

        predictions = st.session_state.predictions
        sample_df = predictions['full_df'].copy()
        sample_df['predicted_class'] = predictions['predicted_labels']
        sample_df['true_class'] = predictions['true_labels']
        sample_df['similarity_score'] = predictions['match_results']['similarity_score']
        sample_df['correct'] = sample_df['predicted_class'] == sample_df['true_class']

        # Filter options
        col1, col2 = st.columns(2)

        with col1:
            show_correct = st.checkbox("Show correct predictions", value=True)
        with col2:
            show_incorrect = st.checkbox("Show incorrect predictions", value=True)

        # Filter data
        if show_correct and show_incorrect:
            filtered_df = sample_df
        elif show_correct:
            filtered_df = sample_df[sample_df['correct'] == True]
        elif show_incorrect:
            filtered_df = sample_df[sample_df['correct'] == False]
        else:
            filtered_df = pd.DataFrame()

        if not filtered_df.empty:
874
+ # Sample random rows
875
+ n_samples = min(20, len(filtered_df))
876
+ sample_rows = filtered_df.sample(n=n_samples) if len(filtered_df) > n_samples else filtered_df
877
+
878
+ display_df = sample_rows[['sentence', 'true_class', 'predicted_class', 'similarity_score', 'correct']].reset_index(drop=True)
879
+
880
+ st.dataframe(display_df, use_container_width=True)
881
+ else:
882
+ st.info("No predictions to show with current filters.")
883
+
884
+ # Download results
885
+ st.markdown("### Download Results")
886
+
887
+ col1, col2 = st.columns(2)
888
+
889
+ with col1:
890
+ # Download class-wise metrics
891
+ csv_metrics = class_metrics_df.to_csv(index=False)
892
+ st.download_button(
893
+ label="Download Class Metrics",
894
+ data=csv_metrics,
895
+ file_name="class_metrics.csv",
896
+ mime="text/csv"
897
+ )
898
+
899
+ with col2:
900
+ # Download predictions
901
+ if 'predictions' in st.session_state:
902
+ predictions = st.session_state.predictions
903
+ results_df = predictions['full_df'].copy()
904
+ results_df['predicted_class'] = predictions['predicted_labels']
905
+ results_df['similarity_score'] = predictions['match_results']['similarity_score']
906
+
907
+ csv_results = results_df.to_csv(index=False)
908
+ st.download_button(
909
+ label="Download Predictions",
910
+ data=csv_results,
911
+ file_name="predictions.csv",
912
+ mime="text/csv"
913
+ )
914
+
915
+ if __name__ == "__main__":
916
+ main()
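The confidence-interval display above accepts several shapes for a CI entry (string or float keys, `{'lower': …, 'upper': …}` dicts, or `(lower, upper)` sequences) and repeats the same checks in each column. A standalone helper — hypothetical, not part of the app — shows the same normalization in one place:

```python
def format_ci(ci_data, level):
    """Return a display string for one confidence level, accepting the
    shapes the app checks for: string or float keys, dicts with
    'lower'/'upper', or (lower, upper) sequences."""
    pct = f"{level:.0%}"  # e.g. 0.95 -> "95%"
    entry = ci_data.get(str(level), ci_data.get(level))
    if entry is None:
        return f"{pct} CI: Not available"
    if isinstance(entry, dict):
        return f"{pct} CI: [{entry['lower']:.4f}, {entry['upper']:.4f}]"
    if isinstance(entry, (list, tuple)) and len(entry) >= 2:
        return f"{pct} CI: [{entry[0]:.4f}, {entry[1]:.4f}]"
    return f"{pct} CI: Format not recognized"

print(format_ci({"0.95": {"lower": 0.81, "upper": 0.90}}, 0.95))
```

Folding the format checks into one such function would let the three `st.columns` branches share it instead of duplicating the `isinstance` ladder.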
app/run_demo.bat ADDED
File without changes
app/run_demo.py ADDED
@@ -0,0 +1,38 @@
+ #!/usr/bin/env python3
+ """
+ Quick launcher script for the QualiVec Streamlit demo.
+ """
+
+ import subprocess
+ import sys
+ import os
+
+ def main():
+     """Launch the Streamlit app."""
+
+     # Get the directory of this script
+     script_dir = os.path.dirname(os.path.abspath(__file__))
+     app_path = os.path.join(script_dir, "app.py")
+
+     print("🚀 Starting QualiVec Demo...")
+     print("📍 App will be available at: http://localhost:8501")
+     print("⏹️ Press Ctrl+C to stop the app")
+     print("-" * 50)
+
+     try:
+         # Run streamlit
+         subprocess.run([
+             sys.executable, "-m", "streamlit", "run", app_path,
+             "--server.headless", "true",
+             # "--server.address=0.0.0.0",
+             "--server.port=8501",
+             "--server.enableCORS", "false",
+             "--server.enableXsrfProtection", "false"
+         ])
+     except KeyboardInterrupt:
+         print("\n🛑 App stopped by user")
+     except Exception as e:
+         print(f"❌ Error starting app: {e}")
+
+ if __name__ == "__main__":
+     main()
src/qualivec/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """QualiVec: Qualitative Content Analysis with LLM Embeddings."""
+
+ from qualivec.data import DataLoader
+ from qualivec.sampling import Sampler
+ from qualivec.embedding import EmbeddingModel
+ from qualivec.matching import SemanticMatcher
+ from qualivec.evaluation import Evaluator
+ from qualivec.optimization import ThresholdOptimizer
+ from qualivec.classification import Classifier
+
+ __version__ = "0.1.0"
+
+ def main() -> None:
+     print("QualiVec: Qualitative Content Analysis with LLM Embeddings")
+     print(f"Version: {__version__}")
src/qualivec/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.15 kB)
src/qualivec/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (969 Bytes)
src/qualivec/__pycache__/classification.cpython-312.pyc ADDED
Binary file (8.62 kB)
src/qualivec/__pycache__/data.cpython-312.pyc ADDED
Binary file (8.37 kB)
src/qualivec/__pycache__/embedding.cpython-312.pyc ADDED
Binary file (11.2 kB)
src/qualivec/__pycache__/evaluation.cpython-312.pyc ADDED
Binary file (10.2 kB)
src/qualivec/__pycache__/matching.cpython-312.pyc ADDED
Binary file (5.07 kB)
src/qualivec/__pycache__/optimization.cpython-312.pyc ADDED
Binary file (10.7 kB)
src/qualivec/__pycache__/sampling.cpython-312.pyc ADDED
Binary file (4.78 kB)
src/qualivec/classification.py ADDED
@@ -0,0 +1,216 @@
+ """Classification utilities for QualiVec."""
+
+ import pandas as pd
+ import numpy as np
+ from typing import Dict, List, Optional, Any
+
+ from qualivec.data import DataLoader
+ from qualivec.embedding import EmbeddingModel
+ from qualivec.matching import SemanticMatcher
+
+
+ class Classifier:
+     """Handles classification for QualiVec."""
+
+     def __init__(self,
+                  embedding_model: Optional[EmbeddingModel] = None,
+                  matcher: Optional[SemanticMatcher] = None,
+                  verbose: bool = True):
+         """Initialize the classifier.
+
+         Args:
+             embedding_model: Model for generating embeddings.
+             matcher: Model for semantic matching.
+             verbose: Whether to print status messages.
+         """
+         self.embedding_model = embedding_model
+         self.matcher = matcher
+         self.verbose = verbose
+         self.data_loader = DataLoader(verbose=verbose)
+
+     def load_models(self,
+                     model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+                     model_type: str = "huggingface",
+                     threshold: float = 0.7):
+         """Load embedding model and matcher.
+
+         Args:
+             model_name: Name of the model to use (HuggingFace or Gemini).
+             model_type: Type of model ('huggingface' or 'gemini').
+             threshold: Cosine similarity threshold for matching.
+         """
+         if self.verbose:
+             print(f"Loading {model_type} embedding model: {model_name}")
+
+         self.embedding_model = EmbeddingModel(
+             model_name=model_name,
+             model_type=model_type,
+             verbose=self.verbose
+         )
+         self.matcher = SemanticMatcher(threshold=threshold, verbose=self.verbose)
+
+         if self.verbose:
+             print("Models loaded successfully")
+
+     def prepare_reference_vectors(self,
+                                   reference_path: str,
+                                   class_column: str = "class",
+                                   node_column: str = "matching_node") -> Dict[str, Any]:
+         """Prepare reference vectors from a CSV file.
+
+         Args:
+             reference_path: Path to the CSV file with reference vectors.
+             class_column: Name of the column containing class labels.
+             node_column: Name of the column containing matching nodes.
+
+         Returns:
+             Dictionary with reference vector information.
+         """
+         if self.embedding_model is None:
+             raise ValueError("Embedding model not loaded. Call load_models first.")
+
+         # Load reference vectors
+         reference_df = self.data_loader.load_reference_vectors(
+             reference_path, class_column=class_column, node_column=node_column
+         )
+
+         # Generate embeddings
+         reference_data = self.embedding_model.embed_reference_vectors(
+             reference_df, class_column=class_column, node_column=node_column
+         )
+
+         if self.verbose:
+             print(f"Prepared {len(reference_data['embeddings'])} reference vectors")
+             print(f"Unique classes: {len(reference_data['class_to_idx'])}")
+
+         return reference_data
+
+     def classify(self,
+                  corpus_path: str,
+                  reference_data: Dict[str, Any],
+                  sentence_column: str = "sentence",
+                  output_path: Optional[str] = None) -> pd.DataFrame:
+         """Classify texts in a corpus using reference vectors.
+
+         Args:
+             corpus_path: Path to the CSV file with the corpus.
+             reference_data: Dictionary with reference vector information.
+             sentence_column: Name of the column containing sentences.
+             output_path: Path to save the classification results.
+
+         Returns:
+             DataFrame with classification results.
+         """
+         if self.embedding_model is None or self.matcher is None:
+             raise ValueError("Models not loaded. Call load_models first.")
+
+         # Load corpus
+         corpus_df = self.data_loader.load_corpus(corpus_path, sentence_column=sentence_column)
+
+         # Generate embeddings
+         corpus_embeddings = self.embedding_model.embed_dataframe(
+             corpus_df, text_column=sentence_column
+         )
+
+         # Classify
+         results_df = self.matcher.classify_corpus(
+             corpus_embeddings, reference_data, corpus_df
+         )
+
+         # Save results if an output path is provided
+         if output_path is not None:
+             self.data_loader.save_dataframe(results_df, output_path)
+             if self.verbose:
+                 print(f"Saved classification results to {output_path}")
+
+         return results_df
+
+     def evaluate_classification(self,
+                                 labeled_path: str,
+                                 reference_data: Dict[str, Any],
+                                 sentence_column: str = "sentence",
+                                 label_column: str = "label",
+                                 optimize_threshold: bool = False,
+                                 start: float = 0.5,
+                                 end: float = 0.9,
+                                 step: float = 0.01) -> Dict[str, Any]:
+         """Evaluate classification performance on labeled data.
+
+         Args:
+             labeled_path: Path to the CSV file with labeled data.
+             reference_data: Dictionary with reference vector information.
+             sentence_column: Name of the column containing sentences.
+             label_column: Name of the column containing true labels.
+             optimize_threshold: Whether to optimize the threshold.
+             start: Start threshold value for optimization.
+             end: End threshold value for optimization.
+             step: Threshold step size for optimization.
+
+         Returns:
+             Dictionary with evaluation results.
+         """
+         from qualivec.evaluation import Evaluator
+         from qualivec.optimization import ThresholdOptimizer
+
+         if self.embedding_model is None:
+             raise ValueError("Embedding model not loaded. Call load_models first.")
+
+         # Load labeled data
+         labeled_df = self.data_loader.load_labeled_data(labeled_path, label_column=label_column)
+
+         # Validate labels
+         valid = self.data_loader.validate_labels(
+             labeled_df,
+             pd.DataFrame({
+                 "class": reference_data["classes"]
+             }).drop_duplicates(),
+             label_column=label_column,
+             class_column="class"
+         )
+
+         if not valid and self.verbose:
+             print("Warning: Some labels in the labeled data are not in reference vectors")
+
+         # Generate embeddings
+         labeled_embeddings = self.embedding_model.embed_dataframe(
+             labeled_df, text_column=sentence_column
+         )
+
+         # True labels
+         true_labels = labeled_df[label_column].tolist()
+
+         if optimize_threshold:
+             # Optimize threshold
+             if self.verbose:
+                 print("Optimizing threshold...")
+
+             optimizer = ThresholdOptimizer(verbose=self.verbose)
+             optimization_results = optimizer.optimize(
+                 labeled_embeddings,
+                 reference_data,
+                 true_labels,
+                 start=start,
+                 end=end,
+                 step=step,
+                 metric="f1_macro"
+             )
+
+             # Update matcher with the optimal threshold
+             self.matcher = SemanticMatcher(threshold=optimization_results["optimal_threshold"],
+                                            verbose=self.verbose)
+
+             return optimization_results
+         else:
+             # Evaluate with the current threshold
+             if self.matcher is None:
+                 raise ValueError("Matcher not loaded. Call load_models first.")
+
+             # Get predictions
+             match_results = self.matcher.match(labeled_embeddings, reference_data)
+             predicted_labels = match_results["predicted_class"].tolist()
+
+             # Evaluate
+             evaluator = Evaluator(verbose=self.verbose)
+             eval_results = evaluator.bootstrap_evaluate(true_labels, predicted_labels)
+
+             return eval_results
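`Classifier` delegates the actual decision to `SemanticMatcher`, whose source is not shown in this commit view. As a rough stdlib sketch of the threshold-based matching it is expected to perform — the reference labels and vectors below are illustrative, not QualiVec's API:

```python
import math

def classify_one(vec, references, threshold=0.7):
    """Assign vec to the class of its most similar reference vector,
    or to None when the best cosine similarity falls below threshold."""
    def cosine(a, b):
        dot = sum(x * y for x, y in zip(a, b))
        norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
        return dot / norm if norm else 0.0

    best_class, best_score = None, -1.0
    for label, ref in references:
        score = cosine(vec, ref)
        if score > best_score:
            best_class, best_score = label, score
    return (best_class if best_score >= threshold else None), best_score

refs = [("positive", [1.0, 0.0]), ("negative", [0.0, 1.0])]
label, score = classify_one([0.9, 0.1], refs)
```

The threshold is the knob `evaluate_classification` sweeps when `optimize_threshold=True`: raising it trades coverage (fewer matched sentences) for precision.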
src/qualivec/data.py ADDED
@@ -0,0 +1,174 @@
+ """Data loading and validation utilities for QualiVec."""
+
+ import os
+ import pandas as pd
+ from typing import List, Optional, Dict, Any, Union, Tuple
+
+
+ class DataLoader:
+     """Handles data loading and validation for QualiVec."""
+
+     def __init__(self, verbose: bool = True):
+         """Initialize the DataLoader.
+
+         Args:
+             verbose: Whether to print status messages.
+         """
+         self.verbose = verbose
+
+     def load_corpus(self, filepath: str, sentence_column: str = "sentence") -> pd.DataFrame:
+         """Load a corpus from a CSV file.
+
+         Args:
+             filepath: Path to the CSV file.
+             sentence_column: Name of the column containing sentences.
+
+         Returns:
+             DataFrame containing the corpus.
+
+         Raises:
+             FileNotFoundError: If the file does not exist.
+             ValueError: If the sentence column is missing.
+         """
+         if not os.path.exists(filepath):
+             raise FileNotFoundError(f"File not found: {filepath}")
+
+         # Load the data
+         if self.verbose:
+             print(f"Loading corpus from {filepath}...")
+
+         df = pd.read_csv(filepath)
+
+         # Validate schema
+         if sentence_column not in df.columns:
+             raise ValueError(f"Required column '{sentence_column}' not found in the CSV file.")
+
+         # Basic validation
+         if df[sentence_column].isna().any():
+             if self.verbose:
+                 print(f"Warning: {df[sentence_column].isna().sum()} null values found in '{sentence_column}' column.")
+
+         if self.verbose:
+             print(f"Loaded {len(df)} rows from {filepath}")
+
+         return df
+
+     def load_reference_vectors(self, filepath: str, class_column: str = "class",
+                                node_column: str = "matching_node") -> pd.DataFrame:
+         """Load reference vectors from a CSV file.
+
+         Args:
+             filepath: Path to the CSV file.
+             class_column: Name of the column containing class labels.
+             node_column: Name of the column containing matching nodes.
+
+         Returns:
+             DataFrame containing the reference vectors.
+
+         Raises:
+             FileNotFoundError: If the file does not exist.
+             ValueError: If required columns are missing.
+         """
+         if not os.path.exists(filepath):
+             raise FileNotFoundError(f"File not found: {filepath}")
+
+         if self.verbose:
+             print(f"Loading reference vectors from {filepath}...")
+
+         df = pd.read_csv(filepath)
+
+         # Validate schema
+         required_columns = [class_column, node_column]
+         missing_columns = [col for col in required_columns if col not in df.columns]
+
+         if missing_columns:
+             raise ValueError(f"Required columns {missing_columns} not found in the CSV file.")
+
+         # Basic validation
+         if df[class_column].isna().any() or df[node_column].isna().any():
+             if self.verbose:
+                 print("Warning: Null values found in reference vectors.")
+
+         if self.verbose:
+             print(f"Loaded {len(df)} reference vectors from {filepath}")
+             print(f"Unique classes: {df[class_column].nunique()}")
+
+         return df
+
+     def load_labeled_data(self, filepath: str, label_column: str = "label") -> pd.DataFrame:
+         """Load manually labeled data from a CSV file.
+
+         Args:
+             filepath: Path to the CSV file.
+             label_column: Name of the column containing labels.
+
+         Returns:
+             DataFrame containing the labeled data.
+
+         Raises:
+             FileNotFoundError: If the file does not exist.
+             ValueError: If the label column is missing.
+         """
+         if not os.path.exists(filepath):
+             raise FileNotFoundError(f"File not found: {filepath}")
+
+         if self.verbose:
+             print(f"Loading labeled data from {filepath}...")
+
+         df = pd.read_csv(filepath)
+
+         # Validate schema
+         if label_column not in df.columns:
+             raise ValueError(f"Required column '{label_column}' not found in the CSV file.")
+
+         # Basic validation
+         if df[label_column].isna().any():
+             if self.verbose:
+                 print(f"Warning: {df[label_column].isna().sum()} null values found in '{label_column}' column.")
+
+         if self.verbose:
+             print(f"Loaded {len(df)} labeled samples from {filepath}")
+             print(f"Label distribution:\n{df[label_column].value_counts()}")
+
+         return df
+
+     def save_dataframe(self, df: pd.DataFrame, filepath: str) -> None:
+         """Save a DataFrame to a CSV file.
+
+         Args:
+             df: DataFrame to save.
+             filepath: Path to save the CSV file.
+         """
+         df.to_csv(filepath, index=False)
+
+         if self.verbose:
+             print(f"Saved {len(df)} rows to {filepath}")
+
+     def validate_labels(self, labeled_df: pd.DataFrame, reference_df: pd.DataFrame,
+                         label_column: str = "label", class_column: str = "class") -> bool:
+         """Validate that labels in the labeled data are a subset of those in the reference data.
+
+         Args:
+             labeled_df: DataFrame containing labeled data.
+             reference_df: DataFrame containing reference vectors.
+             label_column: Name of the column containing labels in labeled_df.
+             class_column: Name of the column containing classes in reference_df.
+
+         Returns:
+             True if validation passes, False otherwise.
+         """
+         labeled_classes = set(labeled_df[label_column].unique())
+         reference_classes = set(reference_df[class_column].unique())
+
+         unknown_classes = labeled_classes - reference_classes
+
+         if unknown_classes:
+             if self.verbose:
+                 print(f"Warning: Found {len(unknown_classes)} labels in labeled data that are not in reference vectors:")
+                 print(unknown_classes)
+             return False
+
+         if self.verbose:
+             print("Label validation passed: All labels in labeled data are in reference vectors.")
+
+         return True
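`validate_labels` boils down to a set difference between the coder's labels and the reference codebook. A minimal stdlib sketch of the same check (the example labels are made up):

```python
def unknown_labels(labeled, reference):
    """Labels present in the coded data but absent from the reference
    codebook -- the condition DataLoader.validate_labels warns about."""
    return sorted(set(labeled) - set(reference))

# e.g. a coder used "neutral", which the codebook does not define
missing = unknown_labels(["positive", "neutral"], ["positive", "negative"])
```

An empty result corresponds to `validate_labels` returning `True`; anything else triggers the warning path.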
src/qualivec/embedding.py ADDED
@@ -0,0 +1,276 @@
+ """Embedding utilities for QualiVec."""
+
+ import numpy as np
+ import pandas as pd
+ from typing import List, Dict, Any, Optional, Union
+ import torch
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModel
+ import os
+ import time
+
+
+ class EmbeddingModel:
+     """Handles text embedding for QualiVec."""
+
+     def __init__(self,
+                  model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+                  model_type: str = "huggingface",
+                  device: Optional[str] = None,
+                  cache_dir: Optional[str] = None,
+                  verbose: bool = True):
+         """Initialize the embedding model.
+
+         Args:
+             model_name: Name of the model to use (HuggingFace model or Gemini model).
+             model_type: Type of model ('huggingface' or 'gemini').
+             device: Device to use for computation ('cpu' or 'cuda'). Only for HuggingFace models.
+             cache_dir: Directory to cache models. Only for HuggingFace models.
+             verbose: Whether to print status messages.
+         """
+         self.model_name = model_name
+         self.model_type = model_type.lower()
+         self.verbose = verbose
+         self.cache_dir = cache_dir
+
+         if self.model_type not in ["huggingface", "gemini"]:
+             raise ValueError(f"model_type must be 'huggingface' or 'gemini', got '{model_type}'")
+
+         if self.model_type == "huggingface":
+             # Determine device
+             if device is None:
+                 self.device = "cuda" if torch.cuda.is_available() else "cpu"
+             else:
+                 self.device = device
+
+             if self.verbose:
+                 print(f"Using device: {self.device}")
+                 print(f"Loading HuggingFace model: {model_name}")
+
+             # Load model and tokenizer
+             self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+             self.model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir).to(self.device)
+
+             if self.verbose:
+                 print("HuggingFace model loaded successfully")
+
+         elif self.model_type == "gemini":
+             if self.verbose:
+                 print(f"Initializing Gemini model: {model_name}")
+
+             # Import Gemini client
+             try:
+                 from google import genai
+
+                 # Get API key from environment variable
+                 api_key = os.environ.get("GOOGLE_API_KEY")
+                 if not api_key:
+                     raise ValueError(
+                         "GOOGLE_API_KEY environment variable not set. "
+                         "Please set it with your Gemini API key."
+                     )
+
+                 self.genai_client = genai.Client(api_key=api_key)
+
+                 if self.verbose:
+                     print("Gemini client initialized successfully")
+                     print("⚠️ Free tier limits: 1,500 requests/day, 100 texts per batch")
+
+             except ImportError:
+                 raise ImportError("google-genai library is required for Gemini models. Install with: pip install google-genai")
+
+     def _mean_pooling(self, model_output, attention_mask):
+         """Mean pooling operation to get sentence embeddings."""
+         token_embeddings = model_output[0]
+         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+     def embed_texts(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
+         """Generate embeddings for a list of texts.
+
+         Args:
+             texts: List of texts to embed.
+             batch_size: Batch size for processing.
+
+         Returns:
+             Numpy array of embeddings.
+         """
+         if self.verbose:
+             print(f"Generating embeddings for {len(texts)} texts")
+
+         if self.model_type == "huggingface":
+             return self._embed_texts_huggingface(texts, batch_size)
+         elif self.model_type == "gemini":
+             return self._embed_texts_gemini(texts, batch_size)
+         else:
+             raise ValueError(f"Unsupported model_type: {self.model_type}")
+
+     def _embed_texts_huggingface(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
+         """Generate embeddings using a HuggingFace model.
+
+         Args:
+             texts: List of texts to embed.
+             batch_size: Batch size for processing.
+
+         Returns:
+             Numpy array of embeddings.
+         """
+         embeddings = []
+
+         # Process in batches
+         for i in tqdm(range(0, len(texts), batch_size), disable=not self.verbose):
+             batch_texts = texts[i:i + batch_size]
+
+             # Tokenize
+             encoded_input = self.tokenizer(batch_texts, padding=True, truncation=True,
+                                            max_length=512, return_tensors='pt').to(self.device)
+
+             # Get model output
+             with torch.no_grad():
+                 model_output = self.model(**encoded_input)
+
+             # Mean pooling
+             batch_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
+
+             # Normalize embeddings
+             batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
+
+             # Add to list
+             embeddings.append(batch_embeddings.cpu().numpy())
+
+         # Concatenate all batches
+         all_embeddings = np.vstack(embeddings)
+
+         if self.verbose:
+             print(f"Generated embeddings with shape: {all_embeddings.shape}")
+
+         return all_embeddings
+
+     def _embed_texts_gemini(self, texts: List[str], batch_size: int = 100) -> np.ndarray:
+         """Generate embeddings using a Gemini model with rate limiting.
+
+         Args:
+             texts: List of texts to embed.
+             batch_size: Batch size for processing (capped at 100 to respect rate limits).
+
+         Returns:
+             Numpy array of embeddings.
+         """
+         embeddings = []
+
+         # Process in batches with rate limiting
+         for i in tqdm(range(0, len(texts), batch_size), disable=not self.verbose):
+             batch_texts = texts[i:i + batch_size]
+
+             # Retry logic with exponential backoff
+             max_retries = 3
+             retry_delay = 2  # seconds
+
+             for attempt in range(max_retries):
+                 try:
+                     # Get embeddings from Gemini
+                     result = self.genai_client.models.embed_content(
+                         model=self.model_name,
+                         contents=batch_texts  # type: ignore
+                     )
+
+                     # Extract embeddings
+                     if result.embeddings:
+                         batch_embeddings = [emb.values for emb in result.embeddings]
+                         embeddings.extend(batch_embeddings)
+
+                     # Add a short delay between batches to respect rate limits
+                     # (free tier: 1,500 requests/day, i.e. up to 150,000 texts/day at 100 texts per request)
+                     if i + batch_size < len(texts):
+                         time.sleep(1)  # 1 second delay between batches
+
+                     break  # Success, exit retry loop
+
+                 except Exception as e:
+                     error_msg = str(e)
+                     if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg:
+                         if attempt < max_retries - 1:
+                             if self.verbose:
+                                 print(f"\nRate limit hit. Waiting {retry_delay} seconds before retry {attempt + 1}/{max_retries}...")
+                             time.sleep(retry_delay)
+                             retry_delay *= 2  # Exponential backoff
+                         else:
+                             raise Exception(
+                                 f"Gemini API quota exceeded. Free tier limits: 1500 requests/day.\n"
+                                 f"Error: {error_msg}\n\n"
+                                 f"Solutions:\n"
+                                 f"1. Wait and try again later (quota resets daily)\n"
+                                 f"2. Reduce the amount of data being processed\n"
+                                 f"3. Upgrade to a paid API plan\n"
+                                 f"4. Use HuggingFace models instead (no API limits)"
+                             )
+                     else:
+                         raise  # Re-raise non-quota errors
+
+         # Convert to numpy array
+         all_embeddings = np.array(embeddings)
+
+         if self.verbose:
+             print(f"Generated embeddings with shape: {all_embeddings.shape}")
+
+         return all_embeddings
+
+     def embed_dataframe(self,
+                         df: pd.DataFrame,
+                         text_column: str,
+                         batch_size: int = 32) -> np.ndarray:
+         """Generate embeddings for texts in a DataFrame column.
+
+         Args:
+             df: DataFrame containing texts.
+             text_column: Name of the column containing texts.
+             batch_size: Batch size for processing.
+
+         Returns:
+             Numpy array of embeddings.
+         """
+         if text_column not in df.columns:
+             raise ValueError(f"Column '{text_column}' not found in DataFrame.")
+
+         texts = df[text_column].fillna("").tolist()
+         return self.embed_texts(texts, batch_size)
+
+     def embed_reference_vectors(self,
+                                 df: pd.DataFrame,
+                                 class_column: str = "class",
+                                 node_column: str = "matching_node",
+                                 batch_size: int = 32) -> Dict[str, Any]:
+         """Generate embeddings for reference vectors.
+
+         Args:
+             df: DataFrame containing reference vectors.
+             class_column: Name of the column containing class labels.
+             node_column: Name of the column containing matching nodes.
+             batch_size: Batch size for processing.
+
+         Returns:
+             Dictionary with class info and embeddings.
+         """
+         required_columns = [class_column, node_column]
+         missing_columns = [col for col in required_columns if col not in df.columns]
+
+         if missing_columns:
+             raise ValueError(f"Required columns {missing_columns} not found in DataFrame.")
+
+         # Get texts and generate embeddings
+         texts = df[node_column].fillna("").tolist()
+         embeddings = self.embed_texts(texts, batch_size)
+
+         # Create result dictionary
+         result = {
+             "classes": df[class_column].tolist(),
+             "nodes": df[node_column].tolist(),
+             "embeddings": embeddings,
+             "class_to_idx": {cls: i for i, cls in enumerate(df[class_column].unique())}
+         }
+
+         if self.verbose:
+             print(f"Generated embeddings for {len(result['classes'])} reference vectors")
+             print(f"Unique classes: {len(result['class_to_idx'])}")
+
+         return result
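The Gemini branch retries on 429 / RESOURCE_EXHAUSTED errors with exponential backoff (2 s, 4 s, then give up). The pattern generalizes beyond this API; a stdlib sketch, where the `flaky` function and the injected `sleep` are demo scaffolding, not part of QualiVec:

```python
import time

def with_backoff(fn, max_retries=3, base_delay=2, sleep=time.sleep):
    """Call fn, retrying on exception with delays of base_delay,
    2*base_delay, ...; re-raise after the final failed attempt."""
    delay = base_delay
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt == max_retries - 1:
                raise
            sleep(delay)
            delay *= 2  # exponential backoff

calls = {"n": 0}
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("429 rate limit")
    return "ok"

result = with_backoff(flaky, sleep=lambda s: None)  # returns "ok" after two retried failures
```

Injecting `sleep` keeps the retry logic testable without real delays, which is why `_embed_texts_gemini`'s inline `time.sleep` calls are harder to unit-test.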
src/qualivec/evaluation.py ADDED
@@ -0,0 +1,254 @@
+ """Evaluation utilities for QualiVec."""
+
+ import numpy as np
+ import pandas as pd
+ from typing import Dict, List, Tuple, Optional, Union, Any
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from tqdm import tqdm
+
+
+ class Evaluator:
+     """Handles evaluation for QualiVec."""
+
+     def __init__(self, verbose: bool = True):
+         """Initialize the evaluator.
+
+         Args:
+             verbose: Whether to print status messages.
+         """
+         self.verbose = verbose
+
+     def evaluate(self,
+                  true_labels: List[str],
+                  predicted_labels: List[str],
+                  class_names: Optional[List[str]] = None) -> Dict[str, Any]:
+         """Evaluate predictions against true labels.
+
+         Args:
+             true_labels: List of true class labels.
+             predicted_labels: List of predicted class labels.
+             class_names: List of class names for detailed metrics.
+
+         Returns:
+             Dictionary with evaluation metrics.
+         """
+         if len(true_labels) != len(predicted_labels):
+             raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
+
+         if self.verbose:
+             print(f"Evaluating {len(true_labels)} predictions")
+
+         # Calculate metrics
+         accuracy = accuracy_score(true_labels, predicted_labels)
+
+         # If class_names not provided, use unique values from true and predicted
+         if class_names is None:
+             class_names = sorted(set(true_labels) | set(predicted_labels))
+
+         # Calculate precision, recall, F1 (macro average); zero_division=0
+         # avoids warnings when a class receives no predictions
+         precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
+             true_labels, predicted_labels, average='macro', zero_division=0
+         )
+
+         # Calculate per-class metrics
+         precision, recall, f1, support = precision_recall_fscore_support(
+             true_labels, predicted_labels, labels=class_names, average=None, zero_division=0
+         )
+
+         # Create class-wise metrics
+         class_metrics = {
+             "precision": {cls: p for cls, p in zip(class_names, precision)},
+             "recall": {cls: r for cls, r in zip(class_names, recall)},
+             "f1": {cls: f for cls, f in zip(class_names, f1)},
+             "support": {cls: s for cls, s in zip(class_names, support)}
+         }
+
+         # Create confusion matrix
+         cm = confusion_matrix(true_labels, predicted_labels, labels=class_names)
+
+         # Compile results
+         results = {
+             "accuracy": accuracy,
+             "precision_macro": precision_macro,
+             "recall_macro": recall_macro,
+             "f1_macro": f1_macro,
+             "class_metrics": class_metrics,
+             "confusion_matrix": cm,
+             "confusion_matrix_labels": class_names,
+             "n_samples": len(true_labels)
+         }
+
+         if self.verbose:
+             print(f"Accuracy: {accuracy:.4f}")
+             print(f"Precision (macro): {precision_macro:.4f}")
+             print(f"Recall (macro): {recall_macro:.4f}")
+             print(f"F1 (macro): {f1_macro:.4f}")
+
+         return results
+
+     def bootstrap_evaluate(self,
+                            true_labels: List[str],
+                            predicted_labels: List[str],
+                            n_iterations: int = 1000,
+                            confidence_levels: List[float] = [0.9, 0.95, 0.99],
+                            random_seed: Optional[int] = None) -> Dict[str, Any]:
+         """Evaluate with bootstrap confidence intervals.
+
+         Args:
+             true_labels: List of true class labels.
+             predicted_labels: List of predicted class labels.
+             n_iterations: Number of bootstrap iterations.
+             confidence_levels: Confidence levels to compute.
+             random_seed: Random seed for reproducibility.
+
+         Returns:
+             Dictionary with evaluation metrics and confidence intervals.
+         """
+         if len(true_labels) != len(predicted_labels):
+             raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
+
+         if self.verbose:
+             print(f"Running bootstrap evaluation with {n_iterations} iterations")
+
+         # Set random seed
+         if random_seed is not None:
+             np.random.seed(random_seed)
+
+         # Initialize storage for bootstrap results
+         bootstrap_metrics = {
+             "accuracy": [],
+             "precision_macro": [],
+             "recall_macro": [],
+             "f1_macro": []
+         }
+
+         # Original evaluation
+         original_results = self.evaluate(true_labels, predicted_labels)
+
+         # Run bootstrap iterations; temporarily suppress verbosity so that
+         # evaluate() does not print on every resample
+         n_samples = len(true_labels)
+         original_verbose = self.verbose
+         self.verbose = False
+
+         for _ in tqdm(range(n_iterations), disable=not original_verbose):
+             # Sample with replacement
+             indices = np.random.choice(n_samples, size=n_samples, replace=True)
+
+             # Get bootstrap sample
+             bootstrap_true = [true_labels[i] for i in indices]
+             bootstrap_pred = [predicted_labels[i] for i in indices]
+
+             # Evaluate
+             results = self.evaluate(bootstrap_true, bootstrap_pred)
+
+             # Store results
+             bootstrap_metrics["accuracy"].append(results["accuracy"])
+             bootstrap_metrics["precision_macro"].append(results["precision_macro"])
+             bootstrap_metrics["recall_macro"].append(results["recall_macro"])
+             bootstrap_metrics["f1_macro"].append(results["f1_macro"])
+
+         self.verbose = original_verbose
+
+         # Calculate confidence intervals
+         confidence_intervals = {}
+
+         for metric, values in bootstrap_metrics.items():
+             confidence_intervals[metric] = {}
+             for level in confidence_levels:
+                 lower_percentile = (1 - level) / 2 * 100
+                 upper_percentile = (1 + level) / 2 * 100
+
+                 lower = np.percentile(values, lower_percentile)
+                 upper = np.percentile(values, upper_percentile)
+
+                 confidence_intervals[metric][level] = (lower, upper)
+
+         # Combine results
+         results = {
+             "point_estimates": {
+                 "accuracy": original_results["accuracy"],
+                 "precision_macro": original_results["precision_macro"],
+                 "recall_macro": original_results["recall_macro"],
+                 "f1_macro": original_results["f1_macro"]
+             },
+             "confidence_intervals": confidence_intervals,
+             "bootstrap_distribution": bootstrap_metrics,
+             "n_iterations": n_iterations,
+             "n_samples": n_samples
+         }
+
+         if self.verbose:
+             print("Bootstrap evaluation complete")
+             print(f"Accuracy: {results['point_estimates']['accuracy']:.4f}")
+             for level in confidence_levels:
+                 lower, upper = results['confidence_intervals']['accuracy'][level]
+                 print(f"  {level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})")
+
+         return results
+
+     def plot_confusion_matrix(self,
+                               confusion_matrix: np.ndarray,
+                               class_names: List[str],
+                               figsize: Tuple[int, int] = (10, 8),
+                               title: str = "Confusion Matrix"):
+         """Plot a confusion matrix.
+
+         Args:
+             confusion_matrix: Confusion matrix as numpy array.
+             class_names: List of class names.
+             figsize: Figure size as (width, height).
+             title: Plot title.
+         """
+         plt.figure(figsize=figsize)
+
+         # Create heatmap
+         sns.heatmap(
+             confusion_matrix,
+             annot=True,
+             fmt="d",
+             cmap="Blues",
+             xticklabels=class_names,
+             yticklabels=class_names
+         )
+
+         plt.xlabel("Predicted")
+         plt.ylabel("True")
+         plt.title(title)
+         plt.tight_layout()
+         plt.show()
+
+     def plot_bootstrap_distributions(self, bootstrap_results: Dict[str, Any], figsize: Tuple[int, int] = (12, 8)):
+         """Plot bootstrap distributions for key metrics.
+
+         Args:
+             bootstrap_results: Results from bootstrap_evaluate.
+             figsize: Figure size as (width, height).
+         """
+         metrics = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
+
+         plt.figure(figsize=figsize)
+
+         for i, metric in enumerate(metrics):
+             plt.subplot(2, 2, i+1)
+
+             # Get distribution data
+             values = bootstrap_results["bootstrap_distribution"][metric]
+
+             # Plot histogram
+             sns.histplot(values, kde=True)
+
+             # Add point estimate
+             point_est = bootstrap_results["point_estimates"][metric]
+             plt.axvline(point_est, color='red', linestyle='--', label=f'Point est: {point_est:.4f}')
+
+             # Add confidence intervals
+             for level, (lower, upper) in bootstrap_results["confidence_intervals"][metric].items():
+                 plt.axvline(lower, color='green', linestyle=':',
+                             label=f'{level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})')
+                 plt.axvline(upper, color='green', linestyle=':')
+
+             plt.title(f"{metric.replace('_', ' ').title()}")
+
+             if i == 0:  # Only add legend to first plot
+                 plt.legend(loc='best')
+
+         plt.tight_layout()
+         plt.show()
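The percentile-bootstrap logic in `bootstrap_evaluate` can be sketched standalone with NumPy alone — resample indices with replacement, recompute the metric, and read off the percentile bounds. The labels below are toy data, not from the library, and only accuracy is bootstrapped:

```python
import numpy as np

# Standalone sketch of the percentile bootstrap used by
# Evaluator.bootstrap_evaluate (toy labels; accuracy only).
rng = np.random.default_rng(0)
true = np.array(["a", "a", "b", "b", "a", "b", "a", "a"])
pred = np.array(["a", "b", "b", "b", "a", "a", "a", "a"])

n = len(true)
accs = [np.mean(true[idx] == pred[idx])
        for idx in (rng.integers(0, n, size=n) for _ in range(1000))]

level = 0.95
lower = np.percentile(accs, (1 - level) / 2 * 100)   # 2.5th percentile
upper = np.percentile(accs, (1 + level) / 2 * 100)   # 97.5th percentile
point = np.mean(true == pred)                        # plain point estimate
print(f"accuracy={point:.3f}, CI=({lower:.3f}, {upper:.3f})")
```

With tiny samples like this the interval is wide, which is exactly the signal the bootstrap is meant to surface.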
src/qualivec/matching.py ADDED
@@ -0,0 +1,104 @@
+ """Semantic matching utilities for QualiVec."""
+
+ import numpy as np
+ import pandas as pd
+ from typing import Dict, Any, List, Tuple, Optional
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ class SemanticMatcher:
+     """Handles semantic matching for QualiVec."""
+
+     def __init__(self,
+                  threshold: float = 0.7,
+                  verbose: bool = True):
+         """Initialize the semantic matcher.
+
+         Args:
+             threshold: Cosine similarity threshold for matching.
+             verbose: Whether to print status messages.
+         """
+         if not 0 <= threshold <= 1:
+             raise ValueError("Threshold must be between 0 and 1.")
+
+         self.threshold = threshold
+         self.verbose = verbose
+
+     def match(self,
+               query_embeddings: np.ndarray,
+               reference_data: Dict[str, Any],
+               return_similarities: bool = False) -> pd.DataFrame:
+         """Match query embeddings against reference vectors.
+
+         Args:
+             query_embeddings: Embeddings of the query texts.
+             reference_data: Dictionary with reference vector information.
+             return_similarities: Whether to return all similarity scores.
+
+         Returns:
+             DataFrame with matching results.
+         """
+         if self.verbose:
+             print(f"Matching {len(query_embeddings)} queries against {len(reference_data['embeddings'])} reference vectors")
+             print(f"Using cosine similarity threshold: {self.threshold}")
+
+         # Calculate cosine similarity
+         similarities = cosine_similarity(query_embeddings, reference_data['embeddings'])
+
+         # Find best matches
+         best_match_indices = np.argmax(similarities, axis=1)
+         best_match_scores = np.max(similarities, axis=1)
+
+         # Apply threshold
+         matches_mask = best_match_scores >= self.threshold
+
+         # Create results
+         classes = np.array(reference_data['classes'])[best_match_indices]
+         nodes = np.array(reference_data['nodes'])[best_match_indices]
+
+         # Apply threshold (set to "Other" if below threshold)
+         classes = np.where(matches_mask, classes, "Other")
+         nodes = np.where(matches_mask, nodes, "")
+
+         # Create result DataFrame
+         results = pd.DataFrame({
+             "predicted_class": classes,
+             "matched_node": nodes,
+             "similarity_score": best_match_scores
+         })
+
+         if return_similarities:
+             results["all_similarities"] = list(similarities)
+
+         if self.verbose:
+             print(f"Matching complete: {matches_mask.sum()} matches above threshold ({matches_mask.mean():.1%})")
+             print(f"Class distribution:\n{results['predicted_class'].value_counts().head(10)}")
+
+         return results
+
+     def classify_corpus(self,
+                         corpus_embeddings: np.ndarray,
+                         reference_data: Dict[str, Any],
+                         corpus_df: pd.DataFrame) -> pd.DataFrame:
+         """Classify an entire corpus using semantic matching.
+
+         Args:
+             corpus_embeddings: Embeddings of the corpus texts.
+             reference_data: Dictionary with reference vector information.
+             corpus_df: DataFrame containing the original corpus.
+
+         Returns:
+             DataFrame with classification results.
+         """
+         # Perform matching
+         match_results = self.match(corpus_embeddings, reference_data)
+
+         # Combine with original corpus
+         result_df = pd.concat([corpus_df.reset_index(drop=True),
+                                match_results.reset_index(drop=True)], axis=1)
+
+         if self.verbose:
+             print(f"Classified {len(result_df)} documents")
+             print(f"Class distribution:\n{result_df['predicted_class'].value_counts().head(10)}")
+
+         return result_df
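The core of `match()` is a few lines of array math: cosine similarity, argmax per query, and a fallback to `"Other"` below the threshold. A self-contained sketch with toy 2-D vectors (illustrative values, not library code):

```python
import numpy as np

# Sketch of SemanticMatcher.match with plain NumPy: cosine similarity,
# best reference per query, "Other" when the best score is below threshold.
def cosine_sim(a, b):
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a @ b.T

refs = np.array([[1.0, 0.0], [0.0, 1.0]])       # toy reference vectors
classes = np.array(["pos", "neg"])              # class of each reference
queries = np.array([[0.9, 0.1], [0.5, 0.5], [0.1, 0.9]])

sims = cosine_sim(queries, refs)
best_idx = sims.argmax(axis=1)                  # best reference per query
best_score = sims.max(axis=1)
threshold = 0.8
predicted = np.where(best_score >= threshold, classes[best_idx], "Other")
print(list(predicted))
```

The middle query sits equidistant from both references (cosine ≈ 0.71), so it falls under the threshold and lands in `"Other"` — the same behavior the library applies per-document.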
src/qualivec/optimization.py ADDED
@@ -0,0 +1,263 @@
+ """Threshold optimization utilities for QualiVec."""
+
+ import numpy as np
+ import pandas as pd
+ from typing import Dict, List, Tuple, Optional, Union, Any, Callable
+ from tqdm import tqdm
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ from qualivec.matching import SemanticMatcher
+ from qualivec.evaluation import Evaluator
+
+
+ class ThresholdOptimizer:
+     """Handles threshold optimization for QualiVec."""
+
+     def __init__(self,
+                  verbose: bool = True):
+         """Initialize the threshold optimizer.
+
+         Args:
+             verbose: Whether to print status messages.
+         """
+         self.verbose = verbose
+         self.evaluator = Evaluator(verbose=False)
+
+     def optimize(self,
+                  query_embeddings: np.ndarray,
+                  reference_data: Dict[str, Any],
+                  true_labels: List[str],
+                  start: float = 0.0,
+                  end: float = 1.0,
+                  step: float = 0.01,
+                  metric: str = "f1_macro",
+                  bootstrap: bool = True,
+                  n_bootstrap: int = 100,
+                  confidence_level: float = 0.95,
+                  random_seed: Optional[int] = None) -> Dict[str, Any]:
+         """Find the optimal similarity threshold.
+
+         Args:
+             query_embeddings: Embeddings of the query texts.
+             reference_data: Dictionary with reference vector information.
+             true_labels: True class labels for evaluation.
+             start: Start threshold value.
+             end: End threshold value.
+             step: Threshold step size.
+             metric: Metric to optimize ("accuracy", "precision_macro", "recall_macro", "f1_macro").
+             bootstrap: Whether to use bootstrap evaluation.
+             n_bootstrap: Number of bootstrap iterations.
+             confidence_level: Confidence level for bootstrap.
+             random_seed: Random seed for reproducibility.
+
+         Returns:
+             Dictionary with optimization results.
+         """
+         if not 0 <= start < end <= 1:
+             raise ValueError("Threshold range must be between 0 and 1")
+
+         if metric not in ["accuracy", "precision_macro", "recall_macro", "f1_macro"]:
+             raise ValueError(f"Unsupported metric: {metric}")
+
+         if self.verbose:
+             print(f"Optimizing threshold for {metric}")
+             print(f"Threshold range: {start} to {end} (step: {step})")
+
+         # Generate threshold values
+         thresholds = np.arange(start, end + step/2, step)
+
+         # Initialize results storage
+         results = {
+             "thresholds": [],
+             "accuracy": [],
+             "precision_macro": [],
+             "recall_macro": [],
+             "f1_macro": [],
+             "class_distribution": []
+         }
+
+         if bootstrap:
+             results["confidence_intervals"] = []
+
+         # Evaluate each threshold
+         for threshold in tqdm(thresholds, disable=not self.verbose):
+             # Create matcher with current threshold
+             matcher = SemanticMatcher(threshold=threshold, verbose=False)
+
+             # Get predictions
+             match_results = matcher.match(query_embeddings, reference_data)
+             predicted_labels = match_results["predicted_class"].tolist()
+
+             # Calculate class distribution
+             class_distribution = pd.Series(predicted_labels).value_counts().to_dict()
+
+             # Evaluate
+             if bootstrap:
+                 eval_results = self.evaluator.bootstrap_evaluate(
+                     true_labels,
+                     predicted_labels,
+                     n_iterations=n_bootstrap,
+                     confidence_levels=[confidence_level],
+                     random_seed=random_seed
+                 )
+
+                 # Extract point estimates
+                 point_estimates = eval_results["point_estimates"]
+
+                 # Extract confidence intervals
+                 ci = {m: eval_results["confidence_intervals"][m][confidence_level]
+                       for m in ["accuracy", "precision_macro", "recall_macro", "f1_macro"]}
+
+                 results["confidence_intervals"].append(ci)
+             else:
+                 eval_results = self.evaluator.evaluate(true_labels, predicted_labels)
+                 point_estimates = {
+                     "accuracy": eval_results["accuracy"],
+                     "precision_macro": eval_results["precision_macro"],
+                     "recall_macro": eval_results["recall_macro"],
+                     "f1_macro": eval_results["f1_macro"]
+                 }
+
+             # Store results
+             results["thresholds"].append(threshold)
+             results["accuracy"].append(point_estimates["accuracy"])
+             results["precision_macro"].append(point_estimates["precision_macro"])
+             results["recall_macro"].append(point_estimates["recall_macro"])
+             results["f1_macro"].append(point_estimates["f1_macro"])
+             results["class_distribution"].append(class_distribution)
+
+         # Find optimal threshold
+         optimal_idx = np.argmax(results[metric])
+         optimal_threshold = results["thresholds"][optimal_idx]
+         optimal_metrics = {
+             "accuracy": results["accuracy"][optimal_idx],
+             "precision_macro": results["precision_macro"][optimal_idx],
+             "recall_macro": results["recall_macro"][optimal_idx],
+             "f1_macro": results["f1_macro"][optimal_idx]
+         }
+
+         if bootstrap:
+             optimal_ci = results["confidence_intervals"][optimal_idx]
+         else:
+             optimal_ci = None
+
+         # Compile results
+         optimization_results = {
+             "optimal_threshold": optimal_threshold,
+             "optimal_metrics": optimal_metrics,
+             "optimal_confidence_intervals": optimal_ci,
+             "results_by_threshold": results,
+             "optimized_metric": metric,
+             "n_thresholds": len(thresholds)
+         }
+
+         if self.verbose:
+             print(f"Optimal threshold: {optimal_threshold:.4f}")
+             print(f"Optimal {metric}: {optimal_metrics[metric]:.4f}")
+             if bootstrap:
+                 lower, upper = optimal_ci[metric]
+                 print(f"  {confidence_level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})")
+
+         return optimization_results
+
+     def plot_optimization_results(self,
+                                   results: Dict[str, Any],
+                                   metrics: Optional[List[str]] = None,
+                                   figsize: Tuple[int, int] = (12, 6)):
+         """Plot optimization results.
+
+         Args:
+             results: Results from optimize method.
+             metrics: List of metrics to plot.
+             figsize: Figure size as (width, height).
+         """
+         if metrics is None:
+             metrics = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
+
+         plt.figure(figsize=figsize)
+
+         # Get data
+         thresholds = results["results_by_threshold"]["thresholds"]
+
+         # Plot metrics
+         for metric in metrics:
+             values = results["results_by_threshold"][metric]
+             plt.plot(thresholds, values, label=metric.replace("_", " ").title())
+
+             # Highlight optimal threshold
+             if metric == results["optimized_metric"]:
+                 optimal_threshold = results["optimal_threshold"]
+                 optimal_value = results["optimal_metrics"][metric]
+                 plt.scatter([optimal_threshold], [optimal_value], color='red', s=100, zorder=5)
+                 plt.axvline(optimal_threshold, color='red', linestyle='--', alpha=0.5,
+                             label=f"Optimal Threshold: {optimal_threshold:.4f}")
+
+         plt.xlabel("Threshold")
+         plt.ylabel("Metric Value")
+         plt.title("Threshold Optimization Results")
+         plt.legend()
+         plt.grid(True, alpha=0.3)
+         plt.tight_layout()
+         plt.show()
+
+     def plot_class_distribution(self,
+                                 results: Dict[str, Any],
+                                 top_n: int = 10,
+                                 figsize: Tuple[int, int] = (12, 8)):
+         """Plot class distribution at different thresholds.
+
+         Args:
+             results: Results from optimize method.
+             top_n: Number of top classes to show.
+             figsize: Figure size as (width, height).
+         """
+         # Get data
+         thresholds = results["results_by_threshold"]["thresholds"]
+         distributions = results["results_by_threshold"]["class_distribution"]
+
+         # Find all classes
+         all_classes = set()
+         for dist in distributions:
+             all_classes.update(dist.keys())
+
+         # Count total occurrences to find top classes
+         total_counts = {}
+         for cls in all_classes:
+             total_counts[cls] = sum(dist.get(cls, 0) for dist in distributions)
+
+         # Get top N classes
+         top_classes = sorted(all_classes, key=lambda x: total_counts[x], reverse=True)[:top_n]
+
+         # Create data for plot
+         data = []
+         for i, threshold in enumerate(thresholds):
+             dist = distributions[i]
+             for cls in top_classes:
+                 data.append({
+                     "Threshold": threshold,
+                     "Class": cls,
+                     "Count": dist.get(cls, 0)
+                 })
+
+         # Create dataframe
+         df = pd.DataFrame(data)
+
+         # Create plot
+         plt.figure(figsize=figsize)
+
+         # Use seaborn for line plot
+         sns.lineplot(data=df, x="Threshold", y="Count", hue="Class")
+
+         # Add vertical line for optimal threshold
+         optimal_threshold = results["optimal_threshold"]
+         plt.axvline(optimal_threshold, color='red', linestyle='--', alpha=0.5,
+                     label=f"Optimal Threshold: {optimal_threshold:.4f}")
+
+         plt.title("Class Distribution by Threshold")
+         plt.xlabel("Threshold")
+         plt.ylabel("Count")
+         plt.legend(title="Class")
+         plt.grid(True, alpha=0.3)
+         plt.tight_layout()
+         plt.show()
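Stripped of the bootstrap machinery, `optimize` is a grid search: sweep a threshold grid (note the `end + step/2` trick to keep the endpoint in `np.arange`), score each threshold, keep the argmax. A toy sketch where pre-computed best-match scores and plain accuracy stand in for real embeddings and the configurable metric:

```python
import numpy as np

# Toy sketch of the threshold sweep in ThresholdOptimizer.optimize.
# best_scores/classes stand in for real matcher output.
best_scores = np.array([0.97, 0.71, 0.93])   # best similarity per query
classes = np.array(["pos", "neg", "neg"])    # class of the best match
true = np.array(["pos", "Other", "neg"])     # gold labels

step = 0.05
thresholds = np.arange(0.0, 1.0 + step / 2, step)  # inclusive of 1.0
accuracies = []
for t in thresholds:
    pred = np.where(best_scores >= t, classes, "Other")  # same rule as match()
    accuracies.append(float(np.mean(pred == true)))

optimal_idx = int(np.argmax(accuracies))     # first threshold hitting the max
print(round(float(thresholds[optimal_idx]), 2), accuracies[optimal_idx])
```

Here the second query's best match is wrong, so accuracy peaks once the threshold rises above its score (0.71) and reroutes it to `"Other"` — the trade-off the sweep is designed to expose.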
src/qualivec/sampling.py ADDED
@@ -0,0 +1,102 @@
+ """Sampling utilities for QualiVec."""
+
+ import pandas as pd
+ import numpy as np
+ from typing import Optional, Union, Literal
+
+
+ class Sampler:
+     """Handles sampling mechanisms for QualiVec."""
+
+     def __init__(self, verbose: bool = True):
+         """Initialize the Sampler.
+
+         Args:
+             verbose: Whether to print status messages.
+         """
+         self.verbose = verbose
+
+     def sample(self,
+                df: pd.DataFrame,
+                sampling_type: Literal["random", "stratified"] = "random",
+                sample_size: Union[int, float] = 0.1,
+                stratify_column: Optional[str] = None,
+                seed: Optional[int] = None,
+                label_column: str = "Label") -> pd.DataFrame:
+         """Sample data from a DataFrame.
+
+         Args:
+             df: DataFrame to sample from.
+             sampling_type: Type of sampling ("random" or "stratified").
+             sample_size: Size of the sample. If float, interpreted as a fraction.
+             stratify_column: Column to stratify by (required for stratified sampling).
+             seed: Random seed for reproducibility.
+             label_column: Name of the label column to add to the output.
+
+         Returns:
+             DataFrame containing the sampled data.
+
+         Raises:
+             ValueError: If parameters are invalid.
+         """
+         # Set random seed if provided
+         if seed is not None:
+             np.random.seed(seed)
+
+         # Calculate sample size if given as a fraction
+         if isinstance(sample_size, float):
+             if not 0 < sample_size <= 1:
+                 raise ValueError("Sample size as fraction must be between 0 and 1.")
+             n_samples = int(len(df) * sample_size)
+         else:
+             if not 0 < sample_size <= len(df):
+                 raise ValueError(f"Sample size must be between 1 and {len(df)}.")
+             n_samples = sample_size
+
+         if self.verbose:
+             print(f"Sampling {n_samples} rows ({n_samples/len(df):.1%} of data)...")
+
+         # Perform sampling
+         if sampling_type == "random":
+             sample = df.sample(n=n_samples, random_state=seed)
+
+         elif sampling_type == "stratified":
+             if stratify_column is None:
+                 raise ValueError("stratify_column must be provided for stratified sampling.")
+
+             if stratify_column not in df.columns:
+                 raise ValueError(f"Stratification column '{stratify_column}' not found in DataFrame.")
+
+             # Check for NaN values in stratification column
+             if df[stratify_column].isna().any():
+                 raise ValueError(f"NaN values found in stratification column '{stratify_column}'.")
+
+             # Calculate the proportion for each stratum
+             strata = df[stratify_column].value_counts(normalize=True)
+
+             # Sample from each stratum, then concatenate once (repeatedly
+             # concatenating into an empty DataFrame can mangle dtypes and is
+             # deprecated in recent pandas)
+             stratum_frames = []
+             for stratum, proportion in strata.items():
+                 stratum_df = df[df[stratify_column] == stratum]
+                 stratum_samples = max(1, int(n_samples * proportion))
+                 stratum_frames.append(
+                     stratum_df.sample(n=min(stratum_samples, len(stratum_df)),
+                                       random_state=seed)
+                 )
+             sample = pd.concat(stratum_frames)
+
+             if self.verbose:
+                 print(f"Stratified sampling based on '{stratify_column}':")
+                 for stratum, count in sample[stratify_column].value_counts().items():
+                     print(f"  - {stratum}: {count} samples ({count/n_samples:.1%})")
+         else:
+             raise ValueError(f"Unknown sampling type: {sampling_type}")
+
+         # Add empty label column for manual annotation
+         if label_column not in sample.columns:
+             sample[label_column] = None
+
+         if self.verbose:
+             print(f"Created sample with {len(sample)} rows.")
+
+         return sample
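For comparison, the proportional allocation that `Sampler.sample(sampling_type="stratified")` builds by hand can also be obtained from pandas' `groupby(...).sample`. A minimal sketch on toy data (column names `text`/`group` are illustrative; assumes pandas ≥ 1.1 for `GroupBy.sample`):

```python
import pandas as pd

# Toy sketch of proportional stratified sampling, mirroring
# Sampler.sample(sampling_type="stratified").
df = pd.DataFrame({
    "text": [f"doc{i}" for i in range(10)],
    "group": ["a"] * 6 + ["b"] * 4,   # 60/40 split across strata
})

# frac=0.5 per group preserves the 60/40 proportion in the sample
sample = df.groupby("group", group_keys=False).sample(frac=0.5, random_state=0)
sample["Label"] = None  # empty column for manual annotation, as in Sampler
print(sample["group"].value_counts().to_dict())
```

The hand-rolled loop in `Sampler` additionally guarantees at least one row per stratum (`max(1, ...)`), which `frac`-based sampling does not for very small strata.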