AsoBozorg committed on
Commit
ac2901a
·
verified ·
1 Parent(s): 9a676d6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TruthLens – Lite (always-on CPU version)
2
+ # Retrieval + Extractive Answer + Citations (no heavy generators)
3
+
4
+ import re
5
+ import numpy as np
6
+ import pandas as pd
7
+ import gradio as gr
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ from sentence_transformers import SentenceTransformer
10
+
11
+ # -------------------------------
12
+ # Corpus (seed docs)
13
+ # -------------------------------
14
# Seed corpus that is always indexed. Each entry is a dict with three string
# fields: "title" (shown in citations and the relevance table), "text" (the
# passage that gets embedded and quoted) and "url" (shown in citations).
SAMPLE_DOCS = [
    {
        "title": "IPCC on Climate Change",
        "text": (
            "It is unequivocal that human influence has warmed the atmosphere, ocean and land. "
            "Greenhouse gas emissions from human activities are responsible for approximately 1.1°C of warming since 1850–1900."
        ),
        "url": "https://example.org/ipcc",
    },
    {
        "title": "Elections Security Myths",
        "text": (
            "Independent audits and paper ballot backups reduce the risk of widespread election fraud. "
            "No credible evidence supports claims of nationwide manipulation in recent elections."
        ),
        "url": "https://example.org/election-security",
    },
    {
        "title": "WHO on Vaccines & Safety",
        "text": (
            "Vaccines undergo rigorous testing in clinical trials and continuous safety monitoring. "
            "Severe adverse reactions are rare and benefits outweigh risks."
        ),
        "url": "https://example.org/who-vaccines",
    },
]
28
+
29
+ # -------------------------------
30
+ # Model (tiny, fast)
31
+ # -------------------------------
32
# Identifier of the sentence-embedding model handed to SentenceTransformer.
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Instantiated once at import time; the functions in this module call
# EMB.encode(...) on this shared instance.
EMB = SentenceTransformer(EMB_MODEL)

# In-memory retrieval index. "texts", "titles" and "urls" are parallel lists
# (same position == same document); "emb" holds the encoded texts and stays
# None until build_index() has run.
INDEX = {"emb": None, "texts": [], "titles": [], "urls": []}
36
+
37
+ def _sent_split(text: str):
38
+ # lightweight sentence splitter
39
+ sents = re.split(r"(?<=[.!?])\s+", text.strip())
40
+ return [s.strip() for s in sents if s.strip()]
41
+
42
def build_index(extra=None):
    """Rebuild the global ``INDEX`` from ``SAMPLE_DOCS`` plus pasted sources.

    Args:
        extra: Optional iterable of user-supplied texts. Entries that are
            falsy or blank after stripping are skipped. Kept entries are
            titled ``"User Source <n>"`` where ``n`` is the entry's 1-based
            position in ``extra`` (so skipped entries leave gaps), and get
            the URL ``"user://paste"``.

    Returns:
        None. The result is stored in ``INDEX``.
    """
    texts = [doc["text"] for doc in SAMPLE_DOCS]
    titles = [doc["title"] for doc in SAMPLE_DOCS]
    urls = [doc["url"] for doc in SAMPLE_DOCS]

    # Append the user-pasted sources after the seed documents.
    for position, raw in enumerate(extra or (), start=1):
        cleaned = str(raw).strip() if raw else ""
        if not cleaned:
            continue
        texts.append(cleaned)
        titles.append(f"User Source {position}")
        urls.append("user://paste")

    # Encode BEFORE touching INDEX: if encoding raises, the previous index
    # stays intact instead of pairing new texts with stale embeddings.
    emb = EMB.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
    INDEX.update(texts=texts, titles=titles, urls=urls, emb=emb)
55
+
56
def retrieve(query, k=3):
    """Rank the indexed documents by cosine similarity to *query*.

    Builds the default index first if ``INDEX["emb"]`` is still ``None``.

    Args:
        query: Text to embed and compare against the indexed documents.
        k: Maximum number of document indices to return. ``None`` returns
            all of them; negative values are treated as 0.

    Returns:
        A ``(top_idx, sims)`` tuple: ``top_idx`` is an array of at most ``k``
        positions into the ``INDEX`` lists, most similar first; ``sims``
        holds the similarity of every indexed document (indexable by the
        values in ``top_idx``).
    """
    if INDEX["emb"] is None:
        build_index()
    q = EMB.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    sims = cosine_similarity(q, INDEX["emb"])[0]

    # A negative k would slice from the end ("all but the last |k|") and a
    # float k would raise on slicing, so normalise it first.
    limit = None if k is None else max(int(k), 0)
    # Stable sort keeps equally-scored documents in index order.
    top_idx = np.argsort(-sims, kind="stable")[:limit]
    return top_idx, sims
63
+
64
def extractive_answer(query, doc_indices, max_sents=5):
    """Assemble an extractive answer with inline citations.

    Sentences from the selected documents are scored against *query*; the
    best-scoring ones are kept, skipping any sentence whose similarity to an
    already-kept sentence exceeds 0.85.

    Args:
        query: The claim or question to answer.
        doc_indices: Sequence of positions into ``INDEX["texts"]``, in rank
            order. A document's 1-based position in this sequence is its
            citation number.
        max_sents: Maximum number of sentences in the answer.

    Returns:
        A ``(paragraph, citations)`` tuple. ``paragraph`` is the kept
        sentences joined by spaces, each followed by its ``[n]`` tag.
        ``citations`` has one ``"[n] title – url"`` line per entry of
        ``doc_indices`` (whether or not a sentence from it was kept). If no
        sentence could be kept, a fixed "uncertain" message and an empty
        list are returned.
    """
    q = EMB.encode([query], normalize_embeddings=True, convert_to_numpy=True)[0]

    # Candidate tuples: (score, sentence, sentence_embedding, rank).
    candidates = []
    for rank, di in enumerate(doc_indices):
        # Only the first 10 sentences of each document are considered.
        sents = _sent_split(INDEX["texts"][di])[:10]
        if not sents:
            continue
        embs = EMB.encode(sents, normalize_embeddings=True, convert_to_numpy=True)
        scores = cosine_similarity([q], embs)[0]
        candidates.extend(
            (float(score), sent, emb, rank)
            for score, sent, emb in zip(scores, sents, embs)
        )

    # Best score first; list.sort is stable, so ties keep document order.
    candidates.sort(key=lambda cand: -cand[0])

    picked = []       # (sentence, rank) in answer order
    picked_embs = []  # embeddings of the picked sentences
    for _score, sent, emb, rank in candidates:
        if len(picked) >= max_sents:
            break
        # Reuse the batch embedding computed above rather than re-encoding
        # every candidate sentence one at a time.
        if picked_embs:
            nearest = float(np.max(cosine_similarity([emb], np.vstack(picked_embs))[0]))
            if nearest > 0.85:
                continue  # near-duplicate of a sentence already picked
        picked.append((sent, rank))
        picked_embs.append(emb)

    if not picked:
        return "I’m uncertain based on the provided sources.", []

    # The rank recorded with each sentence is its document's position in
    # doc_indices, so the citation number is simply rank + 1.
    paragraph = " ".join(f"{sent} [{rank + 1}]" for sent, rank in picked)
    citations = [
        f"[{i + 1}] {INDEX['titles'][di]} – {INDEX['urls'][di]}"
        for i, di in enumerate(doc_indices)
    ]
    return paragraph.strip(), citations
107
+
108
+ # -------------------------------
109
+ # Pipeline
110
+ # -------------------------------
111
def run_pipeline(claim, s1, s2, s3):
    """Run retrieval and extractive answering for one claim.

    Args:
        claim: The claim or question typed by the user.
        s1, s2, s3: Optional pasted source texts; blank ones are ignored by
            ``build_index``.

    Returns:
        A 5-tuple ``(summary, answer, citations, table, redacted)``:
        the mode description, the extractive answer, the citation lines
        joined with newlines, a DataFrame with ``Source`` and ``Relevance``
        columns for the retrieved documents, and the "redacted" answer
        (identical to ``answer`` in this Lite version).
    """
    summary = "Mode: Lite (extractive). Answers are directly quoted/condensed from retrieved sources."

    # A blank claim has no meaningful embedding; answering it would present
    # arbitrary quoted sentences as if they were relevant. Ask for input
    # instead and skip the model work entirely.
    if not claim or not str(claim).strip():
        prompt = "Please enter a claim or question."
        empty_table = pd.DataFrame({"Source": [], "Relevance": []})
        return summary, prompt, "", empty_table, prompt

    build_index([s1, s2, s3])
    idxs, sims = retrieve(claim, k=3)
    answer, citations = extractive_answer(claim, list(idxs), max_sents=5)

    # Simple relevance table for the retrieved documents.
    table = pd.DataFrame({
        "Source": [INDEX["titles"][i] for i in idxs],
        "Relevance": [round(float(sims[i]), 3) for i in idxs],
    })

    # Lite mode has no PII model, so the redacted text is the answer itself.
    redacted = answer

    return summary, answer, "\n".join(citations), table, redacted
127
+
128
+ # -------------------------------
129
+ # UI
130
+ # -------------------------------
131
# NOTE(review): the widget nesting below is inferred from the statement
# grouping; confirm it against the intended page layout.
with gr.Blocks(title="TruthLens – Misinformation-Aware RAG (Lite)") as demo:
    gr.Markdown(
        "# 🧭 TruthLens – Lite\n"
        "Reliable, CPU-friendly: retrieves sources and builds an **extractive answer** with citations."
    )

    # Inputs: the claim and its run button in one column, the three optional
    # pasted sources in the other.
    with gr.Row():
        with gr.Column():
            claim = gr.Textbox(label="Claim or question", lines=2,
                               placeholder="e.g., Did humans cause climate change?")
            run_btn = gr.Button("Run TruthLens", variant="primary")
        with gr.Column():
            s1 = gr.Textbox(label="Optional source 1 (paste text)", lines=4)
            s2 = gr.Textbox(label="Optional source 2 (paste text)", lines=4)
            s3 = gr.Textbox(label="Optional source 3 (paste text)", lines=4)

    # Outputs, in the same order as the tuple returned by run_pipeline.
    summary = gr.Markdown()
    answer = gr.Markdown(label="Answer")
    cites = gr.Markdown(label="Citations")
    table = gr.Dataframe(label="Top sources (similarity)")
    redacted = gr.Textbox(label="PII-redacted answer", lines=3)

    run_btn.click(fn=run_pipeline, inputs=[claim, s1, s2, s3],
                  outputs=[summary, answer, cites, table, redacted])

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()