File size: 3,250 Bytes
0e61117
 
 
 
 
 
efbac81
61f8035
0e61117
efbac81
 
0e61117
 
 
 
 
 
 
61f8035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e61117
 
 
 
 
 
 
 
 
 
 
4ac1f80
0e61117
 
 
 
 
 
 
 
 
 
 
 
 
4ac1f80
0e61117
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Filtering logic for sentence selection based on topics and creators.
"""

from typing import Any, Dict, List, Optional, Set

# Import data from config (loaded from HF datasets)
from .config import sentences, works, creators, topics, topic_names

# Data is now loaded from Hugging Face datasets in config.py
# No need to load from local files anymore

def get_filtered_sentence_ids(
    filter_topics: Optional[List[str]] = None,
    filter_creators: Optional[List[str]] = None,
) -> Set[str]:
    """
    Get the set of sentence IDs that match the given filters.

    Args:
        filter_topics: Topic codes to filter by; a work matches if it has
            any of the given topics. None or empty means no topic filter.
        filter_creators: Creator names to filter by; a work matches if it
            is by any of the given creators. None or empty means no
            creator filter.

    Returns:
        Set of sentence IDs whose work passes both active filters.
        The two filters are AND-ed together; values within a single
        filter are OR-ed.
    """
    # Fast path: no filters means every known sentence qualifies.
    if not filter_topics and not filter_creators:
        return set(sentences.keys())

    # Work IDs allowed by the topic filter: union of the works listed
    # under each selected topic. Unknown topic codes are ignored.
    if filter_topics:
        valid_work_ids: Set[str] = set()
        for topic_id in filter_topics:
            if topic_id in topics:
                valid_work_ids.update(topics[topic_id])
    else:
        # No topic filter: all works are valid so far.
        valid_work_ids = set(works.keys())

    # Work IDs allowed by the creator filter: union of the works listed
    # under each selected creator. Unknown creator names are ignored.
    if filter_creators:
        creator_work_ids: Set[str] = set()
        for creator_name in filter_creators:
            if creator_name in creators:
                creator_work_ids.update(creators[creator_name])

        # Intersect with the topic-filtered works when both filters are
        # active; otherwise the creator filter stands alone.
        if filter_topics:
            valid_work_ids &= creator_work_ids
        else:
            valid_work_ids = creator_work_ids

    # Keep only sentences whose work survived the filters. Sentence IDs
    # have the form "WORKID_sXXXX", so the work ID is the prefix before
    # the first underscore.
    return {
        sentence_id
        for sentence_id in sentences
        if sentence_id.split("_")[0] in valid_work_ids
    }


def apply_filters_to_results(
    results: List[Dict[str, Any]],
    filter_topics: Optional[List[str]] = None,
    filter_creators: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """
    Filter a list of results based on topics and creators.

    Args:
        results: List of result dictionaries with an 'id' field.
        filter_topics: List of topic codes to filter by (None or empty
            means no topic filtering).
        filter_creators: List of creator names to filter by (None or
            empty means no creator filtering).

    Returns:
        Filtered list of results with 'rank' renumbered from 1. When no
        filter is active the input list is returned unchanged; otherwise
        a new list is returned, though the surviving result dicts are
        mutated in place (their 'rank' field is overwritten).
    """
    # Fast path: nothing to filter.
    if not filter_topics and not filter_creators:
        return results

    valid_sentence_ids = get_filtered_sentence_ids(filter_topics, filter_creators)

    # Keep only results whose sentence survived the filters.
    filtered_results = [
        result for result in results if result.get("id") in valid_sentence_ids
    ]

    # Re-rank the filtered results 1..N to reflect the new ordering.
    for i, result in enumerate(filtered_results, 1):
        result["rank"] = i

    return filtered_results