"""
VDHF - Streamlit Interface
Verification-Driven Hallucination Firewall
"""

import html
import os
import sys
import time

import streamlit as st

# Add project root to path
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, BASE_DIR)

DATA_DIR = os.path.join(BASE_DIR, "data", "sample_docs")

# Sidebar explainer. Kept at module level so the markdown starts at column 0.
HOW_IT_WORKS_MD = """
1. **Retrieve** relevant evidence from documents
2. **Generate** an LLM response using context
3. **Extract** atomic factual claims
4. **Verify** each claim against evidence
5. **Firewall** blocks hallucinated responses
6. **Regenerate** using only verified evidence
"""

# Body of the "About" tab.
ABOUT_MD = """
The **Verification-Driven Hallucination Firewall (VDHF)** is a post-generation
verification system that detects and mitigates hallucinations in LLM-generated responses.

### Architecture
```
    User Query
         │
         ▼
┌─────────────────┐
│  RAG Retrieval  │ ← Sentence-BERT + ChromaDB
└────────┬────────┘
         ▼
┌─────────────────┐
│  LLM Generation │ ← Groq API / Mock
└────────┬────────┘
         ▼
┌─────────────────┐
│ Claim Extraction │ ← Rule-based decomposition
└────────┬────────┘
         ▼
┌─────────────────┐
│  Verification   │ ← Semantic Similarity + NLI
└────────┬────────┘
         ▼
┌─────────────────┐
│    Firewall     │ ← Support Ratio ≥ τ ?
└────────┬────────┘
     ┌───┴───┐
     ▼       ▼
   PASS  REGENERATE
```

### Key Parameters
| Parameter | Default | Description |
|-----------|---------|-------------|
| Similarity Threshold (θ_sim) | 0.75 | Min cosine similarity for support |
| Firewall Threshold (τ) | 0.80 | Min support ratio to pass |
| Top-K | 7 | Evidence chunks retrieved |
| Max Regenerations | 2 | Retry attempts on failure |

### Models Used
- **Embeddings**: `all-MiniLM-L6-v2` (Sentence-BERT)
- **NLI**: `microsoft/deberta-base-mnli`
- **LLM**: `llama-3.3-70b-versatile` (via Groq API)

### Knowledge Base
The system comes preloaded with 12 sample documents covering:
Python, Ancient Egypt, AI, Climate Change, Economics, Human Body,
Internet Technology, Music History, Quantum Physics, Renaissance,
Solar System, and World War II.
"""

# ─── Page Config ─────────────────────────────────────────────────────────────
st.set_page_config(
    page_title="VDHF - Hallucination Firewall",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ─── Custom CSS ──────────────────────────────────────────────────────────────
# NOTE(review): no stylesheet or HTML wrapper markup is visible in this view of
# the file; the calls and their unsafe_allow_html flags are kept as found.
# Confirm whether styling markup was lost upstream.
st.markdown(""" """, unsafe_allow_html=True)


# ─── Pipeline Init (cached) ─────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def load_pipeline():
    """Initialize the VDHF pipeline and ingest the bundled sample documents.

    The result is cached by Streamlit, so the pipeline object is built once
    and reused across reruns.

    Returns:
        The constructed ``VDHFPipeline`` with every ``.txt`` file found in
        ``DATA_DIR`` ingested (none if the directory does not exist).
    """
    # Imported lazily so the heavy import only happens on the first build.
    from core.pipeline import VDHFPipeline

    pipeline = VDHFPipeline()

    # Load sample documents. Sorted so ingestion order is reproducible;
    # os.listdir() returns entries in arbitrary order.
    if os.path.isdir(DATA_DIR):
        for filename in sorted(os.listdir(DATA_DIR)):
            if filename.endswith(".txt"):
                pipeline.ingest_file(os.path.join(DATA_DIR, filename))

    return pipeline


# ─── Helpers ─────────────────────────────────────────────────────────────────
def _apply_settings(pipeline, similarity_threshold, firewall_threshold, top_k):
    """Copy the sidebar settings onto the pipeline and its sub-components.

    Called before every pipeline use so that no tab runs with values left
    over from an earlier interaction.
    """
    pipeline.similarity_threshold = similarity_threshold
    pipeline.firewall_threshold = firewall_threshold
    pipeline.top_k = top_k
    pipeline.verifier.similarity_threshold = similarity_threshold
    pipeline.firewall.similarity_threshold = similarity_threshold
    pipeline.firewall.decision_engine.threshold = firewall_threshold
    pipeline.firewall.decision_engine.scoring_module.threshold = firewall_threshold


def _card(headline, body):
    """Return the markdown for a two-line card: a headline and a body.

    ``body`` is untrusted text (user input, uploaded documents, model output)
    and is HTML-escaped because the result is rendered with
    ``unsafe_allow_html=True``. ``headline`` is inserted as given; callers
    must escape any untrusted parts of it themselves.
    """
    return f"\n{headline}\n{html.escape(str(body))}\n"


def _render_claim(vr, score_label, with_entailment):
    """Render one verification result as a supported/unsupported card."""
    status = "✅ SUPPORTED" if vr.is_supported else "❌ UNSUPPORTED"
    details = f"{score_label}: {vr.similarity_score:.3f}"
    if with_entailment:
        details += f", entailment: {vr.entailment_label}"
    st.markdown(_card(f"{status} ({details})", vr.claim.text), unsafe_allow_html=True)


# ─── Sidebar ─────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("## ⚙️ Configuration")

    similarity_threshold = st.slider(
        "Similarity Threshold (θ_sim)",
        min_value=0.0,
        max_value=1.0,
        value=0.75,
        step=0.05,
        help="Minimum semantic similarity for a claim to be considered supported.",
    )

    firewall_threshold = st.slider(
        "Firewall Threshold (τ)",
        min_value=0.0,
        max_value=1.0,
        value=0.80,
        step=0.05,
        help="Minimum support ratio to pass the firewall.",
    )

    top_k = st.slider(
        "Top-K Evidence Chunks",
        min_value=1,
        max_value=15,
        value=7,
        help="Number of evidence chunks to retrieve.",
    )

    st.markdown("---")
    st.markdown("## 📚 Knowledge Base")

    # Custom document upload
    uploaded_file = st.file_uploader(
        "Upload a document (.txt)",
        type=["txt"],
        help="Add your own document to the knowledge base.",
    )

    st.markdown("---")
    st.markdown("## ℹ️ How It Works")
    st.markdown(HOW_IT_WORKS_MD)

# ─── Header ──────────────────────────────────────────────────────────────────
st.markdown("\n🛡️ Hallucination Firewall\n", unsafe_allow_html=True)
st.markdown(
    "\nVerification-Driven Hallucination Firewall for RAG Systems\n",
    unsafe_allow_html=True,
)

# ─── Load Pipeline ───────────────────────────────────────────────────────────
with st.spinner("🔄 Loading models and documents... (first load may take a moment)"):
    pipeline = load_pipeline()

# Handle file upload
if uploaded_file is not None:
    upload_key = f"uploaded_{uploaded_file.name}"
    # Ingest each uploaded file only once per session; the script reruns on
    # every widget interaction while the file stays attached.
    if upload_key not in st.session_state:
        try:
            # getvalue() is independent of the stream position; utf-8-sig
            # also accepts files that start with a byte-order mark.
            content = uploaded_file.getvalue().decode("utf-8-sig")
        except UnicodeDecodeError:
            st.sidebar.error(
                f"❌ Could not read {uploaded_file.name}: file is not valid UTF-8 text."
            )
        else:
            pipeline.ingest_text(content, source=uploaded_file.name)
            st.session_state[upload_key] = True
    if upload_key in st.session_state:
        st.sidebar.success(f"✅ Uploaded: {uploaded_file.name}")

# Show doc count
st.sidebar.metric("Document Chunks Loaded", pipeline.document_count)

# ─── Tabs ────────────────────────────────────────────────────────────────────
tab_query, tab_analyze, tab_about = st.tabs(["🔍 Query", "🧪 Analyze Claims", "📖 About"])

# ═══ TAB 1: Query ════════════════════════════════════════════════════════════
with tab_query:
    query = st.text_input(
        "Ask a question about the knowledge base:",
        placeholder="e.g., When was Python released and who created it?",
    )

    col_btn, col_examples = st.columns([1, 3])
    with col_btn:
        run_query = st.button("🚀 Run Query", type="primary", use_container_width=True)
    with col_examples:
        example = st.selectbox(
            "Or try an example:",
            [
                "",
                "When was Python released and who created it?",
                "What caused World War I?",
                "Tell me about artificial intelligence history.",
                "How does the human body work?",
                "What is climate change and what causes it?",
                "Tell me about the Renaissance period.",
                "How did the internet develop?",
            ],
            label_visibility="collapsed",
        )

    # Picking an example with an empty text box runs it immediately.
    if example and not query:
        query = example
        run_query = True

    if run_query and query:
        # Update pipeline thresholds
        _apply_settings(pipeline, similarity_threshold, firewall_threshold, top_k)

        with st.spinner("Processing query through the VDHF pipeline..."):
            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by wall-clock adjustments.
            start_time = time.perf_counter()
            result = pipeline.query(query, verbose=False)
            elapsed = time.perf_counter() - start_time

        # ── Status Banner ──
        banner = "✅ VERIFIED" if result.is_verified else "⚠️ PARTIALLY VERIFIED"
        st.markdown(
            f"\n{banner} — Support Ratio: {result.support_ratio:.0%} "
            f"({result.supported_claims}/{result.total_claims} claims supported)\n",
            unsafe_allow_html=True,
        )

        st.markdown("")

        # ── Metrics Row ──
        m1, m2, m3, m4 = st.columns(4)
        m1.metric("Support Ratio", f"{result.support_ratio:.0%}")
        m2.metric("Total Claims", result.total_claims)
        m3.metric("Supported", result.supported_claims)
        m4.metric("Regenerations", result.regeneration_attempts)

        st.markdown("---")

        # ── Response ──
        st.subheader("📝 Response")
        st.info(result.final_response)
        st.caption(f"⏱️ Processed in {elapsed:.2f}s")

        # ── Claims Breakdown ──
        if result.verification_results:
            st.subheader("🔬 Claims Verification")
            for vr in result.verification_results:
                _render_claim(vr, score_label="similarity", with_entailment=True)

        # ── Retrieved Evidence ──
        if result.retrieved_evidence:
            with st.expander(
                f"📄 Retrieved Evidence ({len(result.retrieved_evidence)} chunks)",
                expanded=False,
            ):
                for i, ev in enumerate(result.retrieved_evidence, 1):
                    # The source may be an uploaded file name, so escape it too.
                    source = html.escape(
                        os.path.basename(ev.metadata.get("source", "Unknown"))
                    )
                    # Truncate before escaping so no entity is cut in half.
                    snippet = f'{ev.content[:300]}{"..." if len(ev.content) > 300 else ""}'
                    st.markdown(
                        _card(
                            f"[{i}] Score: {ev.similarity_score:.3f} | Source: {source}",
                            snippet,
                        ),
                        unsafe_allow_html=True,
                    )

# ═══ TAB 2: Analyze Claims ═══════════════════════════════════════════════════
with tab_analyze:
    st.subheader("Test Custom Claims Against the Knowledge Base")
    st.markdown("Enter individual claims to verify them against the loaded documents.")

    claims_input = st.text_area(
        "Enter claims (one per line):",
        placeholder=(
            "Python was created by Guido van Rossum.\n"
            "Python was released in 2005.\n"
            "Python is a compiled language."
        ),
        height=150,
    )

    if st.button("🔍 Verify Claims", type="primary"):
        if claims_input.strip():
            from core.claim_extractor import Claim

            lines = [line.strip() for line in claims_input.splitlines() if line.strip()]

            # Use the current sidebar settings here as well; otherwise the
            # verifier keeps whatever threshold the last query left behind.
            _apply_settings(pipeline, similarity_threshold, firewall_threshold, top_k)

            with st.spinner("Verifying claims..."):
                # Retrieve evidence for all claims combined
                combined_query = " ".join(lines)
                evidence_list = pipeline.retriever.retrieve(combined_query, top_k=top_k)

                claims = [Claim(text=line, claim_id=i) for i, line in enumerate(lines)]
                results = pipeline.verifier.verify_all_claims(claims, evidence_list)

            supported = sum(1 for r in results if r.is_supported)
            total = len(results)
            ratio = supported / total if total > 0 else 0.0

            st.markdown(f"**Results: {supported}/{total} claims supported ({ratio:.0%})**")
            st.progress(ratio)

            for vr in results:
                _render_claim(vr, score_label="score", with_entailment=False)

                if vr.best_evidence:
                    with st.expander(f"Evidence for: {vr.claim.text[:50]}..."):
                        st.markdown(
                            f"\n{html.escape(str(vr.best_evidence[:500]))}\n",
                            unsafe_allow_html=True,
                        )
        else:
            st.warning("Please enter at least one claim.")

# ═══ TAB 3: About ════════════════════════════════════════════════════════════
with tab_about:
    st.subheader("About VDHF")
    st.markdown(ABOUT_MD)