import os
import sys
import yaml
import logging
import spacy

from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

logger = logging.getLogger("rag_retrieval")

# Lazy-load heavyweight models so they are only initialized on first use
_NLP = None
_SIM_MODEL = None

def load_spacy():
    global _NLP
    if _NLP is None:
        try:
            _NLP = spacy.load("en_core_web_sm")
        except OSError:
            logger.info("Downloading spaCy en_core_web_sm model dynamically...")
            import subprocess
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
            _NLP = spacy.load("en_core_web_sm")
    return _NLP

def load_sim_model():
    global _SIM_MODEL
    if _SIM_MODEL is None:
        _SIM_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return _SIM_MODEL

def extract_focused_query(title: str, text: str) -> str:
    """
    Builds a focused DuckDuckGo query from up to three named entities,
    padded with short noun phrases when fewer entities are found.
    """
    nlp = load_spacy()
    # Use the title alone when it is long enough to be informative; otherwise fall back to title + body, capped at 1000 chars
    target = title if (isinstance(title, str) and len(title.split()) > 4) else (str(title) + " " + str(text))[:1000]
    
    doc = nlp(target)
    
    # 1. Grab named entities (ORG, PERSON, GPE, EVENT)
    entities = [ent.text for ent in doc.ents if ent.label_ in ['ORG', 'PERSON', 'GPE', 'EVENT']]
    unique_entities = list(dict.fromkeys(entities))[:3]
    
    # 2. Pad with short noun phrases when fewer than three entities were found
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    
    query_parts = unique_entities.copy()
    if len(query_parts) < 3:
        for np_chunk in noun_phrases:
            if np_chunk not in query_parts and len(np_chunk.split()) <= 3:
                query_parts.append(np_chunk)
            if len(query_parts) >= 3:
                break
                
    focused_query = " ".join(query_parts)
    if not focused_query.strip():
        # Fallback: use the first five words of the target text
        focused_query = " ".join(target.split()[:5])
        
    return focused_query
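
# Usage sketch (hypothetical headline; the exact query depends on the spaCy
# parse and is not guaranteed):
#
#   q = extract_focused_query(
#       "NASA confirms Artemis II crew for lunar flyby",
#       "The agency announced the four astronauts at a press event.",
#   )
#   # q is a space-joined string of up to three entities / short noun phrases.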

def execute_rag(title: str, text: str):
    """
    1. Extracts Query.
    2. DuckDuckGo Search (top 5).
    3. Measure Similarity vs article body via all-MiniLM-L6-v2.
    4. Return strict evaluations.
    """
    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    try:
        with open(cfg_path, "r", encoding="utf-8") as f:
            cfg = yaml.safe_load(f) or {}
    except FileNotFoundError:
        logger.warning("Config file not found at %s; using default RAG settings.", cfg_path)
        cfg = {}
    rag_cfg = cfg.get("rag", {})
    
    top_k = rag_cfg.get("top_k", 5)
    support_thresh = rag_cfg.get("support_threshold", 0.65)
    conflict_thresh = rag_cfg.get("conflict_threshold", 0.30)
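    # Assumed shape of config/config.yaml (only these keys are read here;
    # missing keys fall back to the defaults above):
    #
    #   rag:
    #     top_k: 5
    #     support_threshold: 0.65
    #     conflict_threshold: 0.30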
    
    query = extract_focused_query(title, text)
    logger.info(f"RAG Triggered. Extracted Search Query: {query}")
    
    search_results = []
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=top_k))
        search_results = [r.get("body", r.get("title", "")) for r in results if isinstance(r, dict)]
    except Exception as e:
        logger.error(f"DDGS failure: {e}")
        return {"status": "error", "message": "Search engine failure", "data": []}, "INCONCLUSIVE"
        
    if not search_results:
        return {"status": "empty", "message": "No external context found", "data": []}, "INCONCLUSIVE"
        
    sim_model = load_sim_model()
    
    # Compare the article (title + body, capped at 2000 chars to keep encoding cheap) against each snippet
    corpus_text = (str(title) + " " + str(text))[:2000]
    embed_target = sim_model.encode([corpus_text])
    embed_search = sim_model.encode(search_results)
    
    similarities = cosine_similarity(embed_target, embed_search)[0]
    
    supports = 0
    conflicts = 0
    eval_payload = []
    
    for snippet, sim in zip(search_results, similarities):
        s_float = float(sim)
        if s_float >= support_thresh:
            supports += 1
            nature = "SUPPORTS"
        elif s_float < conflict_thresh:
            # Heuristic: very low similarity is treated as conflicting evidence
            conflicts += 1
            nature = "CONFLICTS"
        else:
            nature = "NEUTRAL"

        eval_payload.append({
            "snippet": snippet,
            "similarity": s_float,
            "nature": nature
        })
        
    # RAG verdict: require at least two supporting or two conflicting snippets
    if supports >= 2:
        verdict = "CORROBORATED"
    elif conflicts >= 2:
        verdict = "CONTRADICTED"
    else:
        verdict = "INCONCLUSIVE"
        
    final_output = {
        "status": "success",
        "query": query,
        "supports": supports,
        "conflicts": conflicts,
        "data": eval_payload
    }
    
    return final_output, verdict
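
# For reference, execute_rag returns a (payload, verdict) pair; the values
# below are illustrative placeholders, not real search output:
#
#   payload = {
#       "status": "success",
#       "query": "...",
#       "supports": 1,
#       "conflicts": 2,
#       "data": [
#           {"snippet": "...", "similarity": 0.41, "nature": "NEUTRAL"},
#       ],
#   }
#   verdict = "CONTRADICTED"  # or "CORROBORATED" / "INCONCLUSIVE"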

if __name__ == "__main__":
    t = "Eiffel Tower sold for scrap metal in surprising Paris decree."
    tx = "The mayor of Paris declared the tower will be dismantled."
    o, v = execute_rag(t, tx)
    import json
    print("Verdict:", v)
    print(json.dumps(o, indent=2))