# RAG retrieval module: builds a focused web query from an article, searches
# DuckDuckGo for external context, and scores retrieved snippets against the
# article body to produce a CORROBORATED / CONTRADICTED / INCONCLUSIVE verdict.
import os
import sys
import yaml
import logging
import spacy
import numpy as np
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if str(_PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(_PROJECT_ROOT))
logger = logging.getLogger("rag_retrieval")
# Lazy-load massive models securely within the file
_NLP = None
_SIM_MODEL = None
def load_spacy():
    """Return the shared spaCy pipeline, downloading en_core_web_sm on first miss."""
    global _NLP
    if _NLP is not None:
        return _NLP
    try:
        _NLP = spacy.load("en_core_web_sm")
    except OSError:
        # Model wheel not installed: fetch it via the spaCy CLI, then retry.
        logger.info("Downloading spaCy en_core_web_sm model dynamically...")
        import subprocess
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
        )
        _NLP = spacy.load("en_core_web_sm")
    return _NLP
def load_sim_model():
    """Return the cached MiniLM sentence-embedding model, creating it on first call."""
    global _SIM_MODEL
    if _SIM_MODEL is not None:
        return _SIM_MODEL
    _SIM_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return _SIM_MODEL
def extract_focused_query(title: str, text: str) -> str:
    """
    Build a short DuckDuckGo query: up to 3 de-duplicated named entities
    (ORG / PERSON / GPE / EVENT), topped up with concise noun phrases when
    fewer than 3 entities are found.
    """
    nlp = load_spacy()
    # A sufficiently long title is analyzed alone; otherwise merge the
    # title with the body text, capped to keep the NLP pass cheap.
    if isinstance(title, str) and len(title.split()) > 4:
        target = title
    else:
        target = (str(title) + " " + str(text))[:1000]
    doc = nlp(target)

    # 1. Named entities first, order-preserving de-duplication, max 3.
    wanted_labels = ['ORG', 'PERSON', 'GPE', 'EVENT']
    ordered_unique = dict.fromkeys(
        ent.text for ent in doc.ents if ent.label_ in wanted_labels
    )
    query_parts = list(ordered_unique)[:3]

    # 2. Pad with short noun phrases (<= 3 words) until 3 parts total.
    if len(query_parts) < 3:
        for chunk in doc.noun_chunks:
            phrase = chunk.text
            if phrase not in query_parts and len(phrase.split()) <= 3:
                query_parts.append(phrase)
                if len(query_parts) >= 3:
                    break

    focused_query = " ".join(query_parts)
    if not focused_query.strip():
        # Nothing extractable: fall back to the first five words of the target.
        focused_query = " ".join(target.split()[:5])
    return focused_query
def execute_rag(title: str, text: str):
    """
    Run the retrieval-augmented fact-check pipeline.

    1. Extract a focused query from the article (entities + noun phrases).
    2. Search DuckDuckGo for the top-k external snippets.
    3. Score each snippet against the article via all-MiniLM-L6-v2 embeddings.
    4. Return ``(payload, verdict)`` where payload is a status dict with the
       per-snippet evaluations and verdict is one of
       ``CORROBORATED`` / ``CONTRADICTED`` / ``INCONCLUSIVE``.

    Raises:
        OSError / yaml.YAMLError: if config/config.yaml is missing or invalid.
    """
    # Thresholds come from config/config.yaml (rag section), with defaults
    # mirroring the original tuning.
    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    rag_cfg = cfg.get("rag", {})
    top_k = rag_cfg.get("top_k", 5)
    support_thresh = rag_cfg.get("support_threshold", 0.65)
    conflict_thresh = rag_cfg.get("conflict_threshold", 0.30)

    query = extract_focused_query(title, text)
    # Lazy %-args: skip string formatting entirely when INFO is disabled.
    logger.info("RAG Triggered. Extracted Search Query: %s", query)

    search_results = []
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=top_k))
        # FIX: drop empty/whitespace snippets (dicts missing both "body" and
        # "title"). Previously they slipped past the emptiness guard below and
        # embedded to degenerate vectors that were miscounted as conflicts.
        search_results = [
            snippet
            for snippet in (
                r.get("body", r.get("title", ""))
                for r in results
                if isinstance(r, dict)
            )
            if snippet and snippet.strip()
        ]
    except Exception as e:
        # Best-effort: a search outage must not crash the caller.
        logger.error("DDGS failure: %s", e)
        return {"status": "error", "message": "Search engine failure", "data": []}, "INCONCLUSIVE"

    if not search_results:
        return {"status": "empty", "message": "No external context found", "data": []}, "INCONCLUSIVE"

    sim_model = load_sim_model()
    # Cap the article context to bound embedding memory/latency.
    corpus_text = (str(title) + " " + str(text))[:2000]
    embed_target = sim_model.encode([corpus_text])
    embed_search = sim_model.encode(search_results)
    similarities = cosine_similarity(embed_target, embed_search)[0]

    supports = 0
    conflicts = 0
    eval_payload = []
    for snippet, sim in zip(search_results, similarities):
        s_float = float(sim)
        if s_float >= support_thresh:
            supports += 1
            nature = "SUPPORTS"
        elif s_float < conflict_thresh:
            # NOTE(review): low similarity is treated as contradiction here;
            # it may only mean the snippet is off-topic. Thresholds are
            # configurable, so the policy is left unchanged.
            conflicts += 1
            nature = "CONFLICTS"
        else:
            nature = "NEUTRAL"
        eval_payload.append({
            "snippet": snippet,
            "similarity": s_float,
            "nature": nature,
        })

    # Verdict requires at least two agreeing signals either way.
    if supports >= 2:
        verdict = "CORROBORATED"
    elif conflicts >= 2:
        verdict = "CONTRADICTED"
    else:
        verdict = "INCONCLUSIVE"

    final_output = {
        "status": "success",
        "query": query,
        "supports": supports,
        "conflicts": conflicts,
        "data": eval_payload,
    }
    return final_output, verdict
if __name__ == "__main__":
    # Smoke-test the pipeline with a deliberately dubious headline.
    import json

    demo_title = "Eiffel Tower sold for scrap metal in surprising Paris decree."
    demo_text = "The mayor of Paris declared the tower will be dismantled."
    output, verdict = execute_rag(demo_title, demo_text)
    print("Verdict:", verdict)
    print(json.dumps(output, indent=2))