"""RAG retrieval helper.

Builds a focused web query from an article (title + body), searches
DuckDuckGo for external context, and scores the returned snippets against
the article with a sentence-embedding similarity model to produce a
corroboration verdict.
"""

import os
import sys
import logging

import yaml
import spacy
import numpy as np
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Make the project root importable when this file is executed directly.
_PROJECT_ROOT = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

logger = logging.getLogger("rag_retrieval")

# Heavy models are lazy-loaded on first use so importing this module stays cheap.
_NLP = None
_SIM_MODEL = None


def load_spacy():
    """Return the shared spaCy pipeline, downloading en_core_web_sm on first miss."""
    global _NLP
    if _NLP is None:
        try:
            _NLP = spacy.load("en_core_web_sm")
        except OSError:
            # Model not installed locally: fetch it once, then retry the load.
            logger.info("Downloading spaCy en_core_web_sm model dynamically...")
            import subprocess
            subprocess.run(
                [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                check=True,
            )
            _NLP = spacy.load("en_core_web_sm")
    return _NLP


def load_sim_model():
    """Return the shared SentenceTransformer used for similarity scoring."""
    global _SIM_MODEL
    if _SIM_MODEL is None:
        _SIM_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return _SIM_MODEL


def extract_focused_query(title: str, text: str) -> str:
    """
    Build a short, focused DuckDuckGo query from an article.

    Takes up to 3 named entities, topped up with short noun phrases, from
    the title (or title+text when the title is weak). Falls back to the
    first five words of the target text if nothing was extracted.
    """
    nlp = load_spacy()
    # Prioritize the title if it is long enough to stand alone; otherwise
    # merge title and body, capped to keep the spaCy pass cheap.
    target = (
        title
        if (isinstance(title, str) and len(title.split()) > 4)
        else (str(title) + " " + str(text))[:1000]
    )
    doc = nlp(target)

    # 1. Grab entities (ORG, PERSON, GPE, EVENT), deduplicated in order, max 3.
    entities = [
        ent.text for ent in doc.ents
        if ent.label_ in ['ORG', 'PERSON', 'GPE', 'EVENT']
    ]
    unique_entities = list(dict.fromkeys(entities))[:3]

    # 2. Top up with short noun phrases when fewer than 3 entities were found.
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    query_parts = unique_entities.copy()
    if len(query_parts) < 3:
        for np_chunk in noun_phrases:
            if np_chunk not in query_parts and len(np_chunk.split()) <= 3:
                query_parts.append(np_chunk)
            if len(query_parts) >= 3:
                break

    focused_query = " ".join(query_parts)
    if not focused_query.strip():
        # Fallback: first 5 words of the target text.
        focused_query = " ".join(target.split()[:5])
    return focused_query


def _load_rag_config() -> dict:
    """
    Read the `rag` section of config/config.yaml.

    Returns an empty dict when the file is missing, unreadable, or empty
    (yaml.safe_load yields None for an empty file) so callers fall back to
    their built-in defaults instead of crashing — consistent with
    execute_rag's return-an-error-payload style.
    """
    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    try:
        with open(cfg_path, "r", encoding="utf-8") as f:
            cfg = yaml.safe_load(f) or {}
    except (OSError, yaml.YAMLError) as e:
        logger.warning("Could not read RAG config (%s); using defaults.", e)
        cfg = {}
    return cfg.get("rag", {})


def _search_snippets(query: str, top_k: int) -> list:
    """
    Fetch up to top_k DuckDuckGo text results and return their snippets.

    Empty snippets are dropped: an empty string would embed to a near-zero
    similarity and be miscounted as a CONFLICT by the caller.
    Exceptions from DDGS propagate to the caller.
    """
    with DDGS() as ddgs:
        results = list(ddgs.text(query, max_results=top_k))
    snippets = [
        r.get("body", r.get("title", ""))
        for r in results
        if isinstance(r, dict)
    ]
    return [s for s in snippets if s]


def execute_rag(title: str, text: str):
    """
    Run the full RAG check for an article.

    1. Extract a focused query from (title, text).
    2. Search DuckDuckGo for the top-k snippets.
    3. Score snippet similarity vs the article via all-MiniLM-L6-v2.
    4. Classify each snippet and derive an overall verdict.

    Returns (payload_dict, verdict) where verdict is one of
    "CORROBORATED", "CONTRADICTED", or "INCONCLUSIVE".
    """
    rag_cfg = _load_rag_config()
    top_k = rag_cfg.get("top_k", 5)
    support_thresh = rag_cfg.get("support_threshold", 0.65)
    conflict_thresh = rag_cfg.get("conflict_threshold", 0.30)

    query = extract_focused_query(title, text)
    logger.info("RAG Triggered. Extracted Search Query: %s", query)

    try:
        search_results = _search_snippets(query, top_k)
    except Exception as e:
        logger.error("DDGS failure: %s", e)
        return {"status": "error", "message": "Search engine failure", "data": []}, "INCONCLUSIVE"

    if not search_results:
        return {"status": "empty", "message": "No external context found", "data": []}, "INCONCLUSIVE"

    sim_model = load_sim_model()
    # Compare target text against all snippets; cap input size for memory.
    corpus_text = (str(title) + " " + str(text))[:2000]
    embed_target = sim_model.encode([corpus_text])
    embed_search = sim_model.encode(search_results)
    similarities = cosine_similarity(embed_target, embed_search)[0]

    supports = 0
    conflicts = 0
    eval_payload = []
    for snippet, sim in zip(search_results, similarities):
        s_float = float(sim)
        if s_float >= support_thresh:
            supports += 1
            nature = "SUPPORTS"
        elif s_float < conflict_thresh:
            # NOTE(review): low cosine similarity means "unrelated", not
            # necessarily "contradicts" — the CONTRADICTED verdict below may
            # over-report. Confirm this interpretation with the product spec.
            conflicts += 1
            nature = "CONFLICTS"
        else:
            nature = "NEUTRAL"
        eval_payload.append({
            "snippet": snippet,
            "similarity": s_float,
            "nature": nature,
        })

    # Verdict: require at least two agreeing signals either way.
    if supports >= 2:
        verdict = "CORROBORATED"
    elif conflicts >= 2:
        verdict = "CONTRADICTED"
    else:
        verdict = "INCONCLUSIVE"

    final_output = {
        "status": "success",
        "query": query,
        "supports": supports,
        "conflicts": conflicts,
        "data": eval_payload,
    }
    return final_output, verdict


if __name__ == "__main__":
    import json

    t = "Eiffel Tower sold for scrap metal in surprising Paris decree."
    tx = "The mayor of Paris declared the tower will be dismantled."
    o, v = execute_rag(t, tx)
    print("Verdict:", v)
    print(json.dumps(o, indent=2))