File size: 3,250 Bytes
0e61117
 
 
 
 
 
efbac81
61f8035
0e61117
efbac81
 
0e61117
 
 
 
 
 
 
61f8035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e61117
 
 
 
 
 
 
 
 
 
 
4ac1f80
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
4ac1f80
0e61117
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Filtering logic for sentence selection based on topics and creators.
"""

from typing import Any, Dict, List, Optional, Set

# Import data from config (loaded from HF datasets)
from .config import sentences, works, creators, topics, topic_names

# Data is now loaded from Hugging Face datasets in config.py
# No need to load from local files anymore

def get_filtered_sentence_ids(
    filter_topics: Optional[List[str]] = None,
    filter_creators: Optional[List[str]] = None,
) -> Set[str]:
    """
    Get the set of sentence IDs that match the given filters.

    Args:
        filter_topics: Topic codes to filter by; a work matches if it has
            any of the given topics. None or empty means no topic filter.
        filter_creators: Creator names to filter by; a work matches if it
            is by any of the given creators. None or empty means no
            creator filter.

    Returns:
        Set of sentence IDs whose work passes both active filters.
        The two filters are AND-ed together; values within a single
        filter are OR-ed.
    """
    # Fast path: no filters means every known sentence qualifies.
    if not filter_topics and not filter_creators:
        return set(sentences.keys())

    # Work IDs allowed by the topic filter: union of the works listed
    # under each selected topic. Unknown topic codes are ignored.
    if filter_topics:
        valid_work_ids: Set[str] = set()
        for topic_id in filter_topics:
            if topic_id in topics:
                valid_work_ids.update(topics[topic_id])
    else:
        # No topic filter: all works are valid so far.
        valid_work_ids = set(works.keys())

    # Work IDs allowed by the creator filter: union of the works listed
    # under each selected creator. Unknown creator names are ignored.
    if filter_creators:
        creator_work_ids: Set[str] = set()
        for creator_name in filter_creators:
            if creator_name in creators:
                creator_work_ids.update(creators[creator_name])

        # Intersect with the topic-filtered works when both filters are
        # active; otherwise the creator filter stands alone.
        if filter_topics:
            valid_work_ids &= creator_work_ids
        else:
            valid_work_ids = creator_work_ids

    # Keep only sentences whose work survived the filters. Sentence IDs
    # have the form "WORKID_sXXXX", so the work ID is the prefix before
    # the first underscore.
    return {
        sentence_id
        for sentence_id in sentences
        if sentence_id.split("_")[0] in valid_work_ids
    }


def apply_filters_to_results(
    results: List[Dict[str, Any]],
    filter_topics: Optional[List[str]] = None,
    filter_creators: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """
    Filter a list of results based on topics and creators.

    Args:
        results: List of result dictionaries with an 'id' field.
        filter_topics: List of topic codes to filter by (None or empty
            means no topic filtering).
        filter_creators: List of creator names to filter by (None or
            empty means no creator filtering).

    Returns:
        Filtered list of results with 'rank' renumbered from 1. When no
        filter is active the input list is returned unchanged; otherwise
        a new list is returned, though the surviving result dicts are
        mutated in place (their 'rank' field is overwritten).
    """
    # Fast path: nothing to filter.
    if not filter_topics and not filter_creators:
        return results

    valid_sentence_ids = get_filtered_sentence_ids(filter_topics, filter_creators)

    # Keep only results whose sentence survived the filters.
    filtered_results = [
        result for result in results if result.get("id") in valid_sentence_ids
    ]

    # Re-rank the filtered results 1..N to reflect the new ordering.
    for i, result in enumerate(filtered_results, 1):
        result["rank"] = i

    return filtered_results