File size: 1,576 Bytes
e9e68a0
 
 
 
 
 
 
 
 
 
 
 
 
 
148671a
 
 
 
e9e68a0
 
628deed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
from typing import Set

# Matches each run of characters other than lowercase ASCII letters, digits and "_".
_COLLECTION_SAFE_RE = re.compile(r"[^a-z0-9_]+")


def normalize_folder_key(folder_key: str) -> str:
    """Return a lowercase, underscore-separated form of *folder_key*.

    The key is stripped of surrounding whitespace and lowercased; hyphens
    and every run of characters outside ``[a-z0-9_]`` become underscores;
    consecutive underscores are collapsed into one and leading/trailing
    underscores are removed.

    Returns ``"default"`` when the key is empty/``None`` or nothing is left
    after cleaning.
    """
    if not folder_key:
        return "default"

    lowered = folder_key.strip().lower()
    underscored = _COLLECTION_SAFE_RE.sub("_", lowered.replace("-", "_"))
    collapsed = re.sub(r"_+", "_", underscored)
    trimmed = collapsed.strip("_")

    if trimmed:
        return trimmed
    return "default"


def build_collection_name(folder_key: str, prefix: str = "rag") -> str:
    """Build the collection name ``"<prefix>_<normalized folder key>"``.

    *folder_key* is passed through ``normalize_folder_key``; *prefix* is used
    as given. The result is cut to at most 63 characters, since Qdrant
    collection names should stay short and simple.
    """
    max_length = 63
    full_name = f"{prefix}_{normalize_folder_key(folder_key)}"
    return full_name[:max_length]


def extract_folder_key_from_collection_name(collection_name: str, prefix: str = "rag") -> str | None:
    """
    Extract folder_key from collection name.
    E.g., 'rag_k63' -> 'k63', 'rag_2023_2024' -> '2023_2024'
    Returns None if collection_name doesn't match the expected pattern.
    """
    if not collection_name:
        return None
    
    prefix_with_underscore = f"{prefix}_"
    if collection_name.startswith(prefix_with_underscore):
        return collection_name[len(prefix_with_underscore):]
    
    return None


def collection_matches_cohort(collection_name: str, cohort_key: str, prefix: str = "rag") -> bool:
    """
    Tell whether a collection belongs to the given cohort_key.
    E.g., collection='rag_k63', cohort_key='k63' -> True

    The folder key is taken from collection_name with
    extract_folder_key_from_collection_name and compared to cohort_key
    case-insensitively. An empty cohort_key, or a collection name that
    yields no folder key, never matches.
    """
    # Skip extraction entirely when there is no cohort to compare against.
    folder_key = (
        extract_folder_key_from_collection_name(collection_name, prefix)
        if cohort_key
        else None
    )
    if folder_key:
        return folder_key.lower() == cohort_key.lower()
    return False