File size: 1,157 Bytes
00ff675
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from sklearn.metrics.pairwise import cosine_similarity
from semantic_engine.embed import embed

# Subject label -> list of anchor phrases describing that subject.
# Each list is passed to embed() as a whole when ANCHOR_EMBEDS is built, and
# classify_unit() scores incoming text against the resulting embeddings.
# Insertion order matters: on an exact score tie the earlier subject wins.
SUBJECT_ANCHORS = {
    "DSA": ["data structures algorithms coding problems complexity"],
    "OS": ["operating system deadlock process memory scheduling"],
    "DBMS": ["database sql normalization indexing transactions"],
    "CLOUD / DEVOPS": ["cloud aws gcp azure deployment scalability"],
    "FRONTEND": ["react frontend state management ui"],
    "SYSTEM DESIGN": ["system design architecture scalability"],
    "HR": ["hr behavioral teamwork relocation goals"],
}

# Embed every subject's anchor phrases once, at import time, so that
# classify_unit() only has to embed the incoming text on each call.
ANCHOR_EMBEDS = {
    subject: embed(SUBJECT_ANCHORS[subject])
    for subject in SUBJECT_ANCHORS
}

def classify_unit(text: str):
    """Pick the subject whose anchor phrases are most similar to *text*.

    The text is embedded with ``embed`` and compared, by cosine similarity,
    against the precomputed embeddings in ``ANCHOR_EMBEDS``. A subject's
    score is the highest similarity over all of its anchors.

    Args:
        text: The unit of text to classify.

    Returns:
        A ``(subject, score)`` tuple. ``subject`` is the key of
        ``ANCHOR_EMBEDS`` with the highest score, or ``None`` when no
        subject scores strictly above 0.0. ``score`` is the winning
        similarity as a float rounded to two decimals (0.0 when there is
        no winner). On an exact tie the subject that comes first in
        ``ANCHOR_EMBEDS`` is kept.
    """
    # embed() is given a one-element batch; its first result is taken as the
    # vector for `text` (presumably one vector per input string -- confirm
    # against semantic_engine.embed).
    query_vec = embed([text])[0]

    winner, top_score = None, 0.0
    for name, anchor_vecs in ANCHOR_EMBEDS.items():
        similarities = cosine_similarity([query_vec], anchor_vecs)
        candidate = similarities.max()

        # Strict comparison: only a higher score displaces the current
        # winner, and nothing at or below the 0.0 floor ever wins.
        if candidate > top_score:
            winner, top_score = name, candidate

    return winner, round(float(top_score), 2)