File size: 4,926 Bytes
162b166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""
Feature extraction for Quora question pairs.
"""
import distance
from fuzzywuzzy import fuzz
import numpy as np

from .preprocessing import preprocess

# Load the English stop-word list from NLTK (no pickle dependency).
# If the lookup raises LookupError (presumably because the corpus is not
# installed locally -- confirm against the NLTK docs), download the
# 'stopwords' resource quietly and repeat the same load once.
try:
    from nltk.corpus import stopwords
    STOP_WORDS = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    from nltk.corpus import stopwords
    STOP_WORDS = set(stopwords.words('english'))

# Small constant added to denominators of the ratio features in this module
# so that a zero count never produces a division by zero.
SAFE_DIV = 0.0001


def _common_words(q1: str, q2: str) -> int:
    w1 = set(word.lower().strip() for word in q1.split())
    w2 = set(word.lower().strip() for word in q2.split())
    return len(w1 & w2)


def _total_words(q1: str, q2: str) -> int:
    w1 = set(word.lower().strip() for word in q1.split())
    w2 = set(word.lower().strip() for word in q2.split())
    return len(w1) + len(w2)


def _fetch_token_features(q1: str, q2: str) -> list:
    token_features = [0.0] * 8

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    q1_words = set(w for w in q1_tokens if w not in STOP_WORDS)
    q2_words = set(w for w in q2_tokens if w not in STOP_WORDS)
    q1_stops = set(w for w in q1_tokens if w in STOP_WORDS)
    q2_stops = set(w for w in q2_tokens if w in STOP_WORDS)

    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)
    common_token_count = len(set(q1_tokens) & set(q2_tokens))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features


def _fetch_length_features(q1: str, q2: str) -> list:
    length_features = [0.0] * 3

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features

    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2

    # Guard against empty lcsubstrings (IndexError)
    strs = list(distance.lcsubstrings(q1, q2))
    if strs:
        length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
    else:
        length_features[2] = 0.0

    return length_features


def _fetch_fuzzy_features(q1: str, q2: str) -> list:
    """Return four fuzzywuzzy scores for the pair.

    The scores are, in order: QRatio, partial_ratio, token_sort_ratio and
    token_set_ratio, each called as ``scorer(q1, q2)``.
    """
    scorers = (
        fuzz.QRatio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    return [score(q1, q2) for score in scorers]


def _jaccard_similarity(q1: str, q2: str) -> float:
    """|intersection| / |union| of word sets."""
    w1 = set(word.lower().strip() for word in q1.split())
    w2 = set(word.lower().strip() for word in q2.split())
    if not w1 and not w2:
        return 0.0
    inter = len(w1 & w2)
    union = len(w1 | w2)
    return inter / union if union else 0.0


def _sentence_length_ratio(q1: str, q2: str) -> float:
    """min(word_count) / max(word_count)."""
    n1, n2 = len(q1.split()), len(q2.split())
    if max(n1, n2) == 0:
        return 0.0
    return min(n1, n2) / max(n1, n2)


def query_point_creator(
    q1: str, q2: str, vectorizer, embedding_model=None
) -> np.ndarray:
    """
    Assemble the single-row feature matrix for one question pair.

    Both questions are passed through ``preprocess`` first. The row then
    contains the hand-crafted features (character and word lengths, word
    overlap counts and ratio, token, length and fuzzy features, Jaccard
    similarity, sentence length ratio), followed by the vectorizer output
    for q1 and then for q2.

    ``vectorizer`` must be a fitted CountVectorizer or TfidfVectorizer.
    When ``embedding_model`` is given, the cosine similarity between the
    question embeddings is appended as one more hand-crafted feature.
    """
    q1, q2 = preprocess(q1), preprocess(q2)

    shared = _common_words(q1, q2)
    total = _total_words(q1, q2)

    # Basic size and overlap features.
    features = [
        len(q1),
        len(q2),
        len(q1.split()),
        len(q2.split()),
        shared,
        total,
        round(shared / (total + SAFE_DIV), 2),
    ]
    features += _fetch_token_features(q1, q2)
    features += _fetch_length_features(q1, q2)
    features += _fetch_fuzzy_features(q1, q2)
    features.append(_jaccard_similarity(q1, q2))
    features.append(_sentence_length_ratio(q1, q2))

    # Optional semantic feature; the import stays local so the embeddings
    # module is only loaded when a model is actually supplied.
    if embedding_model is not None:
        from .embeddings import embedding_cosine_similarity
        features.append(embedding_cosine_similarity(q1, q2, embedding_model))

    handcrafted = np.array(features).reshape(1, len(features))
    q1_vec = vectorizer.transform([q1]).toarray()
    q2_vec = vectorizer.transform([q2]).toarray()

    return np.hstack((handcrafted, q1_vec, q2_vec))