# File size: 6,247 bytes (revision ac2901a)
# TruthLens – Lite (always-on CPU version)
# Retrieval + Extractive Answer + Citations (no heavy generators)
import re
import numpy as np
import pandas as pd
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# -------------------------------
# Corpus (seed docs)
# -------------------------------
# Built-in reference passages that are always part of the index. Each entry
# carries a display title, the passage text, and the URL shown in citations.
SAMPLE_DOCS = [
    dict(
        title="IPCC on Climate Change",
        text=(
            "It is unequivocal that human influence has warmed the atmosphere, ocean and land. "
            "Greenhouse gas emissions from human activities are responsible for approximately 1.1°C of warming since 1850–1900."
        ),
        url="https://example.org/ipcc",
    ),
    dict(
        title="Elections Security Myths",
        text=(
            "Independent audits and paper ballot backups reduce the risk of widespread election fraud. "
            "No credible evidence supports claims of nationwide manipulation in recent elections."
        ),
        url="https://example.org/election-security",
    ),
    dict(
        title="WHO on Vaccines & Safety",
        text=(
            "Vaccines undergo rigorous testing in clinical trials and continuous safety monitoring. "
            "Severe adverse reactions are rare and benefits outweigh risks."
        ),
        url="https://example.org/who-vaccines",
    ),
]
# -------------------------------
# Model (tiny, fast)
# -------------------------------
# Sentence-embedding checkpoint; the encoder is instantiated once at import
# time and reused by every encode call in this module.
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
EMB = SentenceTransformer(EMB_MODEL)

# Mutable module-level index. "emb" stays None until an index has been built;
# "texts", "titles" and "urls" are parallel lists with one entry per document.
INDEX = dict(emb=None, texts=[], titles=[], urls=[])
def _sent_split(text: str):
    """Break *text* into trimmed, non-empty sentences.

    A boundary is any run of whitespace that directly follows ``.``, ``!`` or
    ``?``; the punctuation mark stays attached to the sentence it ends.
    """
    pieces = re.split(r"(?<=[.!?])\s+", text.strip())
    # Trim every piece, then drop the ones that end up empty.
    return list(filter(None, map(str.strip, pieces)))
def build_index(extra=None):
    """Rebuild the global ``INDEX`` from the seed docs plus optional user texts.

    Args:
        extra: Optional iterable of user-pasted source texts. Entries that are
            falsy or blank after stripping are ignored. Each kept entry is
            titled ``"User Source N"``, where N is its 1-based position in
            *extra* (skipped slots therefore leave gaps in the numbering), and
            gets the placeholder URL ``"user://paste"``.

    Returns:
        None. ``INDEX["texts"]``, ``INDEX["titles"]``, ``INDEX["urls"]`` and
        ``INDEX["emb"]`` are replaced with the freshly built values.
    """
    texts = [doc["text"] for doc in SAMPLE_DOCS]
    titles = [doc["title"] for doc in SAMPLE_DOCS]
    urls = [doc["url"] for doc in SAMPLE_DOCS]

    # Append the user-supplied sources after the seed docs.
    for position, raw in enumerate(extra or (), start=1):
        cleaned = str(raw).strip() if raw else ""
        if not cleaned:
            continue
        texts.append(cleaned)
        titles.append(f"User Source {position}")
        urls.append("user://paste")

    # Encode BEFORE touching INDEX: if encoding raises, the previous index is
    # left intact instead of ending up with new texts and a stale matrix.
    emb = EMB.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
    INDEX.update(texts=texts, titles=titles, urls=urls, emb=emb)
def retrieve(query, k=3):
    """Rank the indexed documents by cosine similarity to *query*.

    If no embeddings exist yet, the default index is built first.

    Returns:
        A pair ``(top_idx, sims)``: the indices of the *k* most similar
        documents in descending order of similarity, and the similarity
        scores for every indexed document.
    """
    if INDEX["emb"] is None:
        build_index()
    query_vec = EMB.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    sims = cosine_similarity(query_vec, INDEX["emb"])[0]
    # Negating the scores makes argsort order them from highest to lowest.
    ranking = np.argsort(-sims)
    return ranking[:k], sims
def extractive_answer(query, doc_indices, max_sents=5, dedup_threshold=0.85):
    """Build an extractive answer to *query* out of sentences from selected docs.

    Sentences from each selected document (at most the first 10 per document)
    are scored against the query by cosine similarity. They are then taken in
    descending score order, skipping any sentence whose similarity to an
    already-picked sentence exceeds *dedup_threshold*, until *max_sents*
    sentences have been picked.

    Args:
        query: The claim or question text.
        doc_indices: Iterable of indices into ``INDEX["texts"]``; its order
            defines the citation numbers (first entry is ``[1]``).
        max_sents: Maximum number of sentences in the answer.
        dedup_threshold: Similarity above which a candidate is treated as a
            near-duplicate of an already-picked sentence and skipped.

    Returns:
        A pair ``(paragraph, citations)``. ``paragraph`` is the picked
        sentences joined by spaces, each followed by its ``[n]`` tag.
        ``citations`` has one ``"[n] title – url"`` line per entry of
        *doc_indices*, whether or not a sentence from that document was picked.
        When nothing is picked, a fixed "uncertain" message and an empty list
        are returned.
    """
    # Materialise once: the indices are iterated twice below.
    doc_indices = list(doc_indices)
    q = EMB.encode([query], normalize_embeddings=True, convert_to_numpy=True)[0]

    # Citation number of each document = 1-based position of its first
    # occurrence in doc_indices.
    cite_no = {}
    # Each candidate is (score, sentence, doc index, sentence embedding).
    candidates = []
    for rank, di in enumerate(doc_indices):
        cite_no.setdefault(di, rank + 1)
        sents = _sent_split(INDEX["texts"][di])[:10]
        if not sents:
            continue
        embs = EMB.encode(sents, normalize_embeddings=True, convert_to_numpy=True)
        scores = cosine_similarity([q], embs)[0]
        candidates.extend(
            (score, sent, di, emb) for sent, score, emb in zip(sents, scores, embs)
        )

    # Best-scoring first; the sort is stable, so ties keep document order.
    candidates.sort(key=lambda cand: -cand[0])

    picked = []
    picked_embs = []
    for _, sent, di, emb in candidates:
        if len(picked) >= max_sents:
            break
        # Reuse the embedding computed above instead of re-encoding the sentence.
        if picked_embs:
            nearest = float(np.max(cosine_similarity([emb], np.vstack(picked_embs))[0]))
            if nearest > dedup_threshold:
                continue
        picked.append((sent, di))
        picked_embs.append(emb)

    if not picked:
        return "I’m uncertain based on the provided sources.", []

    # Stitch into a paragraph with inline citation tags.
    paragraph = " ".join(f"{sent} [{cite_no[di]}]" for sent, di in picked)
    citations = [
        f"[{i+1}] {INDEX['titles'][di]} – {INDEX['urls'][di]}"
        for i, di in enumerate(doc_indices)
    ]
    return paragraph.strip(), citations
# -------------------------------
# Pipeline
# -------------------------------
def run_pipeline(claim, s1, s2, s3):
    """Run retrieval and extractive answering for one claim.

    Args:
        claim: The claim or question text.
        s1, s2, s3: Optional user-pasted source texts added to the index.

    Returns:
        A 5-tuple ``(summary, answer, citations, table, redacted)``:
        a fixed mode description, the extractive answer, the citation lines
        joined by newlines, a DataFrame with ``Source`` and ``Relevance``
        columns for the retrieved documents, and the "redacted" answer, which
        in this Lite mode is the answer unchanged. For a blank or missing
        claim, a prompt message is returned as the answer, with empty
        citations and an empty table, and no index is built.
    """
    summary = "Mode: Lite (extractive). Answers are directly quoted/condensed from retrieved sources."

    # Guard: embedding an empty query would yield an arbitrary "answer".
    if not claim or not str(claim).strip():
        prompt = "Please enter a claim or question."
        empty_table = pd.DataFrame({"Source": [], "Relevance": []})
        return summary, prompt, "", empty_table, prompt

    build_index([s1, s2, s3])
    idxs, sims = retrieve(claim, k=3)
    answer, citations = extractive_answer(claim, list(idxs), max_sents=5)

    # Simple relevance table for the retrieved documents.
    table = pd.DataFrame({
        "Source": [INDEX["titles"][i] for i in idxs],
        "Relevance": [round(float(sims[i]), 3) for i in idxs],
    })

    # Redacted output equals the answer in Lite mode (no PII model).
    redacted = answer
    return summary, answer, "\n".join(citations), table, redacted
# -------------------------------
# UI
# -------------------------------
with gr.Blocks(title="TruthLens – Misinformation-Aware RAG (Lite)") as demo:
    # Page header.
    gr.Markdown(
        "\n".join((
            "# 🧭 TruthLens – Lite",
            "Reliable, CPU-friendly: retrieves sources and builds an **extractive answer** with citations.",
        ))
    )

    # Inputs: the claim and its trigger on the left, optional sources on the right.
    with gr.Row():
        with gr.Column():
            claim = gr.Textbox(
                label="Claim or question",
                placeholder="e.g., Did humans cause climate change?",
                lines=2,
            )
            run_btn = gr.Button("Run TruthLens", variant="primary")
        with gr.Column():
            # Three identical paste boxes, created in order 1, 2, 3.
            s1, s2, s3 = (
                gr.Textbox(label=f"Optional source {n} (paste text)", lines=4)
                for n in (1, 2, 3)
            )

    # Outputs, in the same order run_pipeline returns them.
    summary = gr.Markdown()
    answer = gr.Markdown(label="Answer")
    cites = gr.Markdown(label="Citations")
    table = gr.Dataframe(label="Top sources (similarity)")
    redacted = gr.Textbox(label="PII-redacted answer", lines=3)

    run_btn.click(
        fn=run_pipeline,
        inputs=[claim, s1, s2, s3],
        outputs=[summary, answer, cites, table, redacted],
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|