File size: 3,714 Bytes
d0abef8
 
 
 
 
 
0dda569
d0abef8
 
 
 
 
 
 
9ff3220
d0abef8
9ff3220
 
 
 
 
 
 
 
 
 
 
 
d0abef8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# src/explain_service/explainer.py

import re
import numpy as np
from sentence_transformers import SentenceTransformer
from google import genai
import os
# Common English function words excluded from keyword matching.
STOPWORDS = set(
    "a an the and or but if while with without for on in into by to "
    "from of is are was were be been being as it this that these those".split()
)


class Explainer:
    """Explain why a document matches a search query.

    Combines three signals:
      * keyword overlap between query and document tokens,
      * embedding cosine similarity between the query and individual
        document sentences (via a SentenceTransformer), and
      * an optional natural-language summary from the Gemini API.

    The Gemini client is optional: when GENAI_API_KEY is not set (or the
    client cannot be constructed) the LLM step degrades to a fallback
    message instead of crashing.
    """

    def __init__(self):
        # Sentence transformer used for query/sentence similarity scoring.
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

        # Load Gemini API key from environment; the LLM explanation is an
        # optional feature, so a missing key simply disables it.
        api_key = os.environ.get("GENAI_API_KEY")

        if not api_key:
            self.client = None
        else:
            try:
                self.client = genai.Client(api_key=api_key)
            except Exception:
                # FIX: was a bare `except:`, which also swallows
                # SystemExit/KeyboardInterrupt. Catch Exception only.
                self.client = None

    # ---------------------------
    # TOKENIZER
    # ---------------------------
    def tokenize(self, text: str) -> list:
        """Lowercase *text*, keep alphabetic runs, drop STOPWORDS."""
        text = text.lower()
        tokens = re.findall(r"[a-zA-Z]+", text)
        return [t for t in tokens if t not in STOPWORDS]

    # ---------------------------
    # KEYWORD OVERLAP
    # ---------------------------
    def keyword_overlap(self, query: str, doc: str):
        """Return (overlapping keywords, fraction of query tokens found in doc)."""
        q_tokens = set(self.tokenize(query))
        d_tokens = set(self.tokenize(doc))

        overlap = q_tokens.intersection(d_tokens)
        # Epsilon guards division by zero when the query is all stopwords.
        overlap_ratio = len(overlap) / (len(q_tokens) + 1e-8)

        return list(overlap), float(overlap_ratio)

    # ---------------------------
    # BEST SENTENCES MATCHING QUERY
    # ---------------------------
    def best_sentences(self, query: str, doc: str, top_k: int = 2):
        """Return the *top_k* sentences of *doc* most similar to *query*.

        Each result is a dict with "sentence" and cosine-similarity
        "score" keys. Returns [] when the document has no non-empty
        sentences.
        """
        sentences = [s.strip() for s in re.split(r"[.!?]", doc) if s.strip()]
        if not sentences:
            return []

        q_emb = self.model.encode(query, convert_to_numpy=True)
        s_embs = self.model.encode(sentences, convert_to_numpy=True)

        # L2-normalize so the dot product below is cosine similarity;
        # epsilons avoid division by zero for degenerate embeddings.
        q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-10)
        s_norm = s_embs / (np.linalg.norm(s_embs, axis=1, keepdims=True) + 1e-10)

        sims = (s_norm @ q_emb).tolist()
        top_ids = np.argsort(sims)[::-1][:top_k]

        return [
            {"sentence": sentences[idx], "score": float(sims[idx])}
            for idx in top_ids
        ]

    # ---------------------------
    # LLM-LEVEL EXPLANATION
    # ---------------------------
    def llm_explain(self, query, doc_text, top_sentences):
        """Generate a short natural-language relevance explanation.

        Falls back to a canned message when the Gemini client is
        disabled (missing GENAI_API_KEY or failed client construction).
        """
        # FIX: the original dereferenced self.client unconditionally and
        # raised AttributeError whenever GENAI_API_KEY was missing, even
        # though __init__ deliberately sets self.client = None then.
        if self.client is None:
            return "LLM explanation unavailable (no GENAI_API_KEY configured)."

        formatted_sentences = "\n".join(
            [f"- {s['sentence']} (score: {s['score']:.2f})" for s in top_sentences]
        )

        prompt = f"""
You are an AI assistant that explains WHY a document matches a user query.

QUERY:
{query}

DOCUMENT EXCERPT:
{doc_text[:500]}

MOST RELEVANT SENTENCES:
{formatted_sentences}

Write 2–3 natural sentences explaining WHY this document is relevant.
"""

        response = self.client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config={"temperature": 0.4}
        )

        # FIX: response.text can be None (e.g. a blocked/empty response);
        # guard so .strip() cannot raise AttributeError.
        return (response.text or "").strip()

    # ---------------------------
    # MAIN EXPLAIN FUNCTION
    # ---------------------------
    def explain(self, query: str, doc_text: str) -> dict:
        """Build the full explanation payload for (query, doc_text)."""
        keywords, overlap_ratio = self.keyword_overlap(query, doc_text)
        top_sents = self.best_sentences(query, doc_text)
        llm_summary = self.llm_explain(query, doc_text, top_sents)

        return {
            "keyword_overlap": keywords,
            "overlap_ratio": overlap_ratio,
            "top_sentences": top_sents,
            "llm_explanation": llm_summary
        }