from collections import defaultdict, deque
from typing import Any, List, Optional

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def predict_topic_nth_degree(
    new_article_title: str,
    new_article_embedding: List[float],
    edges: List[str],
    G: Any,
    max_depth: int = 1,
    is_weighted: bool = False,
    decay_factor: float = 1.0,
) -> Optional[str]:
    """
    Predicts topic based on neighbors up to n-degrees away.

    Args:
        max_depth: How many hops to traverse (1 = direct neighbors, 2 = neighbors of neighbors).
        decay_factor: Multiplier for distance. 1.0 = no decay.
                      0.5 means a neighbor at depth 2 has half the voting power of depth 1.
    """

    # 1. Setup BFS
    # Queue stores: (current_node_name, current_depth)
    queue = deque()

    # We maintain a visited set to avoid cycles and processing the same node twice
    visited = set()
    visited.add(new_article_title)

    # 2. Initialize BFS with the "Virtual" First Hop
    # We iterate the input list 'edges' manually because the new article isn't in G.
    for ref in edges:
        if ref in G and ref not in visited:
            visited.add(ref)
            queue.append((ref, 1))  # Depth 1

    if not queue:
        return None

    topic_scores = defaultdict(float)

    # 3. Process BFS
    while queue:
        current_node, current_depth = queue.popleft()

        # --- Score Calculation ---
        node_data = G.nodes[current_node]
        topic = node_data.get("label")

        if topic:
            # Determine base weight
            if is_weighted:
                neighbor_embedding = node_data["embedding"]
                # Calculate similarity
                base_score = cosine_similarity(
                    [new_article_embedding], [neighbor_embedding]
                )[0][0]
            else:
                base_score = 1.0

            # Apply Distance Decay
            # Formula: Score * (decay ^ (depth - 1))
            # Depth 1: Score * 1
            # Depth 2: Score * decay
            weighted_score = base_score * (decay_factor ** (current_depth - 1))

            topic_scores[topic] += weighted_score

        # --- Expand to next level if within limit ---
        if current_depth < max_depth:
            for neighbor in G.neighbors(current_node):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append((neighbor, current_depth + 1))

    # 4. Determine Winner
    if not topic_scores:
        return None

    # Rank topics by accumulated vote score and return the top label,
    # consistent with the Optional[str] return type.
    result_df = pd.DataFrame(
        list(topic_scores.items()), columns=["Class", "Score"]
    ).sort_values(by="Score", ascending=False)
    return result_df["Class"].iloc[0]