Update app.py
app.py
CHANGED
@@ -1,377 +1,135 @@
-"""
-This Gradio app compares:
-1) LLM-Only (sampling) — answers directly from the model (can hallucinate)
-2) RAG (strict deterministic) — retrieves context and answers ONLY from that context
-   - Deterministic decoding (no sampling)
-   - One-sentence answers, no explanations, no brackets/citations
-   - Guardrail for the "female US presidents" query
-   - Post-clean to remove any instruction echoes or meta-talk
-"""
 
-import os
-import io
-import re
-from typing import List, Optional, Tuple
-import faiss
-import gradio as gr
-from transformers import pipeline
 from sentence_transformers import SentenceTransformer
-
 
 # ----------------------------
-#
 # ----------------------------
-
-CHUNK_SIZE = 50
-CHUNK_OVERLAP = 5
-TOP_K = 3
 
 # ----------------------------
 # Utilities
 # ----------------------------
-def normalize_ws(text: str) -> str:
-    """Collapse runs of whitespace into single spaces."""
-    return re.sub(r"\s+", " ", text).strip()
-
-def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
-    """Split long text into overlapping chunks for retrieval."""
-    text = normalize_ws(text)
-    if len(text) <= chunk_size:
-        return [text]
-    chunks, start = [], 0
-    while start < len(text):
-        end = min(len(text), start + chunk_size)
-        chunks.append(text[start:end])
-        if end == len(text):
-            break
-        start = max(0, end - overlap)
-    return chunks
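A quick standalone sketch (not part of the commit) of how a 50-character window with a 5-character overlap behaves, useful for seeing why neighbouring chunks share their edges:

```python
# Standalone sketch of the overlapping splitter, assuming chunk_size=50 and overlap=5
# as configured above; the sample text is illustrative only.
def chunk_text_demo(text: str, chunk_size: int = 50, overlap: int = 5) -> list:
    text = " ".join(text.split())              # collapse whitespace
    if len(text) <= chunk_size:
        return [text]
    chunks, start = [], 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            break
        start = max(0, end - overlap)          # step back 5 chars so chunk edges overlap
    return chunks

sample = "George Washington was the first president of the United States and served two terms."
for i, c in enumerate(chunk_text_demo(sample), 1):
    print(i, repr(c))                          # chunk 2 repeats the last 5 characters of chunk 1
```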
-
-def read_txt_or_md(file_obj: io.BytesIO, filename: str) -> str:
-    """Read .txt/.md files as UTF-8; ignore other types for classroom simplicity."""
-    ext = os.path.splitext(filename.lower())[1]
-    if ext not in [".txt", ".md"]:
-        return ""
     try:
-        return file_obj.read().decode("utf-8", errors="ignore")
     except Exception:
-        return ""
 
-        embeds = embedder.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)
-        d = embeds.shape[1]
-        index = faiss.IndexFlatIP(d)  # inner product; with normalized vectors = cosine similarity
-        index.add(embeds)
-
-        return cls(
-            corpus_docs=seed_docs,
-            corpus_chunks=chunks,
-            embedder=embedder,
-            d=d,
-            index=index,
-            matrix=embeds
-        )
-
-    def add_documents(self, new_docs: List[str]):
-        """Add user-provided docs: clean → chunk → embed → index."""
-        clean = [normalize_ws(x) for x in new_docs if x and normalize_ws(x)]
-        if not clean:
-            return
-        self.corpus_docs.extend(clean)
-
-        new_chunks = []
-        for doc in clean:
-            new_chunks.extend(chunk_text(doc))
-        if not new_chunks:
-            return
-
-        new_embeds = self.embedder.encode(new_chunks, convert_to_numpy=True, normalize_embeddings=True)
-        self.index.add(new_embeds)
-
-        import numpy as np
-        self.matrix = np.vstack([self.matrix, new_embeds]) if self.matrix is not None else new_embeds
-        self.corpus_chunks.extend(new_chunks)
-
-    def retrieve(self, query: str, k: int = TOP_K) -> List[Tuple[float, str]]:
-        """Return top-k (score, chunk) pairs for the query."""
-        if not query.strip() or len(self.corpus_chunks) == 0:
-            return []
-        q = self.embedder.encode([normalize_ws(query)], convert_to_numpy=True, normalize_embeddings=True)
-        scores, idxs = self.index.search(q, min(k, len(self.corpus_chunks)))
-        hits = []
-        for score, idx in zip(scores[0], idxs[0]):
-            if idx == -1:
-                continue
-            hits.append((float(score), self.corpus_chunks[idx]))
-        return hits
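The scores returned here are cosine similarities: the embeddings are L2-normalized, so FAISS's inner-product index computes exactly the cosine. A minimal check with random stand-in vectors (no embedding model needed), not part of the commit:

```python
# Sketch: L2-normalized vectors + faiss.IndexFlatIP behaves like a cosine-similarity index.
import faiss
import numpy as np

rng = np.random.default_rng(0)
vecs = rng.normal(size=(5, 384)).astype(np.float32)      # stand-ins for chunk embeddings
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)      # normalize rows to unit length

index = faiss.IndexFlatIP(vecs.shape[1])                 # "IP" = inner product
index.add(vecs)

query = vecs[2:3].copy()                                  # query identical to chunk 2
scores, ids = index.search(query, 3)
print(ids[0], scores[0])                                  # chunk 2 ranks first with score ~1.0
```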
 
 # ----------------------------
-#
-# ----------------------------
-embedder = SentenceTransformer(EMBED_MODEL_ID)
-rag_store = RAGStore.create(embedder)
-generator = pipeline("text2text-generation", model=GEN_MODEL_ID)
-
-# ----------------------------
-# LLM-only (sampling) — baseline
-# ----------------------------
-def generate_llm_only(question: str, max_new_tokens: int = 128, temperature: float = 0.6, top_p: float = 0.9) -> str:
-    if not question.strip():
-        return "Please enter a question."
-    out = generator(
-        question.strip(),
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True,
-        temperature=float(temperature),
-        top_p=float(top_p),
-    )
-    return out[0]["generated_text"]
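This is the half of the comparison that can drift between runs. A small sketch (not from the commit) contrasting a sampled call with a deterministic beam-search call on the same pipeline; google/flan-t5-small is only a stand-in, since the diff does not show which GEN_MODEL_ID the old file used:

```python
# Sketch: sampled vs. deterministic decoding with the same text2text pipeline.
from transformers import pipeline, set_seed

gen = pipeline("text2text-generation", model="google/flan-t5-small")   # stand-in checkpoint
question = "Who was the first president of the United States?"

set_seed(7)   # sampling only repeats if the seed is pinned
sampled = gen(question, max_new_tokens=32, do_sample=True, temperature=0.9, top_p=0.9)
beamed = gen(question, max_new_tokens=32, do_sample=False, num_beams=4, early_stopping=True)

print("sampled:", sampled[0]["generated_text"])   # can change from run to run
print("beamed :", beamed[0]["generated_text"])    # identical on every run
```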
-
-# ----------------------------
-# STRICT deterministic RAG (concise + clean, no brackets)
 # ----------------------------
 STRICT_RAG_SYSTEM = (
-    "You are a careful assistant. Answer ONLY using the provided context. "
-    "Answer in ONE short sentence. "
-    "Do not include citations, brackets, or numbers in your answer. "
-    "If the context does not contain the answer, reply exactly: "
-    "\"I don't know based on the provided context.\" "
-    "Do not explain your reasoning. Do not include any extra text."
-)
-
-def _mentions_no_female_president(text: str) -> bool:
-    t = text.lower()
-    return ("never had a female president" in t) or ("no female president" in t)
-
-def _female_president_guard(question: str, context_chunks: List[str]) -> Optional[str]:
-    """
-    If user asks about female US presidents and our context asserts 'none',
-    return the definitive answer immediately.
-    """
-    q = question.lower()
-    if ("female" in q or "woman" in q or "women" in q) and ("president" in q) and ("united states" in q or "u.s." in q or "us " in q):
-        combined = " ".join(context_chunks).lower()
-        if _mentions_no_female_president(combined):
-            return "As of 2025, the United States has never had a female president."
-    return None
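The guard is a plain string check, so its behaviour can be verified without loading a model. A self-contained sketch that mirrors the two helpers above:

```python
# Self-contained check of the guardrail logic (mirrors the two helpers above).
from typing import List, Optional

def mentions_no_female_president(text: str) -> bool:
    t = text.lower()
    return ("never had a female president" in t) or ("no female president" in t)

def female_president_guard(question: str, context_chunks: List[str]) -> Optional[str]:
    q = question.lower()
    asks_female = ("female" in q or "woman" in q or "women" in q) and "president" in q
    about_us = "united states" in q or "u.s." in q or "us " in q
    if asks_female and about_us:
        if mentions_no_female_president(" ".join(context_chunks).lower()):
            return "As of 2025, the United States has never had a female president."
    return None

chunks = ["The United States has never had a female president."]
print(female_president_guard("How many female presidents has the United States had?", chunks))
# -> the canned answer; generation is skipped entirely
print(female_president_guard("Who is the president of France?", chunks))
# -> None; the normal RAG path runs instead
```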
-
-def _post_clean(answer: str) -> str:
-    """
-    Remove any instruction echoes or meta-justifications.
-    Keep only the first sentence; strip brackets/quotes; normalize spaces.
-    """
-    a = answer.strip()
-
-    # Trim if model echoed "Answer:" or instruction
-    if "Answer:" in a:
-        a = a.split("Answer:", 1)[-1].strip()
-
-    lowers = a.lower()
-    bad_starts = [
-        "answer only using the provided context",
-        "you are a careful assistant",
-        "this answer is correct",
-        "based solely",
-        "therefore,",
-        "therefore "
-    ]
-    for bs in bad_starts:
-        if lowers.startswith(bs):
-            a = a.split(".", 1)[-1].strip() or a
-            break
-
-    # Strip bracketed numeric citations like [1], [23], etc.
-    a = re.sub(r"\s*\[\d+\]\s*", " ", a).strip()
-
-    # Keep only the first sentence
-    if "." in a:
-        a = a.split(".", 1)[0].strip() + "."
-
-    # Strip surrounding quotes
-    a = a.strip(" \"'")
-
-    # Normalize internal whitespace
-    a = normalize_ws(a)
-
-    # If post-clean left us empty or only brackets, abstain
-    if not a or re.fullmatch(r"\[\d+\]", a):
-        a = "I don't know based on the provided context."
-
-    return a
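To make the effect concrete, here is what those steps do to one messy completion. The snippet repeats only the regex and string operations used above so it runs on its own; the sample text is invented:

```python
# Sketch of the post-clean steps applied to a noisy model completion.
import re

raw = 'Answer: "George Washington was the first president of the United States [1]. He served two terms."'

a = raw.split("Answer:", 1)[-1].strip()         # drop the echoed "Answer:" prefix
a = re.sub(r"\s*\[\d+\]\s*", " ", a).strip()    # strip bracketed citations like [1]
if "." in a:
    a = a.split(".", 1)[0].strip() + "."        # keep only the first sentence
a = a.strip(" \"'")                             # remove surrounding quotes
a = re.sub(r"\s+", " ", a)                      # normalize internal whitespace

print(a)  # George Washington was the first president of the United States.
```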
-
-def generate_rag_strict(question: str, k: int = TOP_K, max_new_tokens: int = 80):
-    if not question.strip():
-        return "Please enter a question.", []
-
-    # 1) Retrieve
-    hits = rag_store.retrieve(question, k=k)
-    chunks = [c for _, c in hits]
-
-    # 2) Guardrail: female-president question
-    override = _female_president_guard(question, chunks)
-    if override is not None:
-        return override, hits
-
-    # 3) Build context with bullets (no bracket labels)
-    context = "\n\n".join([f"- {c}" for c in chunks]) if chunks else ""
-
-    # 4) Build strict prompt
-    prompt = (
-        f"{STRICT_RAG_SYSTEM}\n\n"
-        f"Context:\n{context}\n\n"
-        f"Question: {question.strip()}\n"
-        f"Answer:"
-    )
-
-    # 5) Deterministic decoding (no sampling)
-    out = generator(
-        prompt,
-        max_new_tokens=int(max_new_tokens),
-        do_sample=False,          # no randomness
-        num_beams=4,              # explore a few safe paths
-        early_stopping=True,
-        length_penalty=0.9,
-        no_repeat_ngram_size=3,
-    )
-    raw = out[0]["generated_text"]
-
-    # 6) Post-clean the model text (remove echoes/explanations/brackets)
-    answer = _post_clean(raw)
-
-    # 7) Enforce abstention if no context present
-    if not context.strip() and "i don't know based on the provided context" not in answer.lower():
-        answer = "I don't know based on the provided context."
-
-    return answer, hits
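For one retrieved chunk, the prompt handed to the generator has this shape. The values below are toys and the system text is abbreviated, but the layout matches the f-string above:

```python
# Sketch: the assembled strict prompt, printed with toy values (system text abbreviated).
system = (
    "Answer ONLY using the provided context. "
    "If the context does not contain the answer, reply exactly: "
    '"I don\'t know based on the provided context."'
)
chunks = ["The United States has never had a female president."]
context = "\n\n".join(f"- {c}" for c in chunks)
question = "Has the United States ever had a female president?"

prompt = f"{system}\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
print(prompt)
```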
-
-# ----------------------------
-# Gradio UI
-# ----------------------------
-with gr.Blocks(fill_height=True, analytics_enabled=False) as demo:
-    gr.Markdown(
-        "# 🔎 Retrieval-Augmented Generation (RAG) — Presidents Edition\n"
-        "Compare **LLM-only** (sampling) vs **RAG-grounded** (strict & deterministic). "
-        "Add more facts to the corpus at left, then ask questions at right.\n\n"
-        "_Tip: keep outputs short on CPU. This demo may be incorrect; always verify facts._"
-    )
-
-    with gr.Row():
-        # Left: corpus management
-        with gr.Column(scale=1):
-            gr.Markdown("### 📚 Corpus\nPaste text or upload .txt/.md to add to the knowledge base.")
-            paste_box = gr.Textbox(lines=8, label="Paste text (optional)")
-            upload = gr.File(label="Upload .txt or .md", file_types=[".txt", ".md"], file_count="multiple")
-            add_btn = gr.Button("Add to Corpus", variant="secondary")
-            corpus_count = gr.Markdown(f"**Chunks indexed:** {len(rag_store.corpus_chunks)}")
-
-        # Right: Q&A panels
-        with gr.Column(scale=2):
-            question = gr.Textbox(
-                label="Your question",
-                placeholder="Example: Who is the current president of the United States?",
-                lines=3
-            )
-
-            with gr.Row():
-                # LLM-only (sampling)
-                with gr.Column():
-                    gr.Markdown("#### 🤖 LLM-Only (Sampling)")
-                    max_new_llm = gr.Slider(32, 256, value=128, step=8, label="Max new tokens")
-                    temp_llm = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature")
-                    topp_llm = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
-                    llm_btn = gr.Button("Generate (LLM-Only)")
-                    llm_out = gr.Textbox(label="LLM-Only Answer", lines=8)
-
-                # RAG (strict deterministic)
-                with gr.Column():
-                    gr.Markdown("#### 📎 RAG-Grounded (Strict Deterministic)")
-                    topk = gr.Slider(1, 8, value=3, step=1, label="Top-K chunks")
-                    max_new_rag = gr.Slider(32, 256, value=80, step=8, label="Max new tokens")
-                    temp_rag = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature (unused)", interactive=False)
-                    topp_rag = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p (unused)", interactive=False)
-                    rag_btn = gr.Button("Generate (RAG)")
-                    rag_out = gr.Textbox(label="RAG Answer", lines=8)
-                    retrieved = gr.Markdown("")
-
-    # ---------------- Callbacks ----------------
-    def _add_to_corpus(pasted: str, files: List[gr.File]) -> str:
-        """Add pasted text and uploaded files to the corpus; update chunk count."""
-        docs = []
-        if pasted and pasted.strip():
-            docs.append(pasted)
-        if files:
-            for f in files:
-                try:
-                    with open(f.name, "rb") as fh:
-                        content = read_txt_or_md(io.BytesIO(fh.read()), f.name)
-                    if content:
-                        docs.append(content)
-                except Exception:
-                    continue
-        if docs:
-            rag_store.add_documents(docs)
-        return f"**Chunks indexed:** {len(rag_store.corpus_chunks)}"
-
-    def _llm_only(q, mx, t, p):
-        return generate_llm_only(q, mx, t, p)
-
-    def _rag(q, k, mx, _t_unused, _p_unused):
-        ans, hits = generate_rag_strict(q, k=int(k), max_new_tokens=int(mx))
-        if hits:
-            md = "##### Retrieved Chunks\n" + "\n".join([f"- (score={score:.3f}) {chunk}" for score, chunk in hits])
-        else:
-            md = "_No chunks retrieved._"
-        return ans, md
-
-    # Wire UI
-    add_btn.click(_add_to_corpus, inputs=[paste_box, upload], outputs=[corpus_count])
-    llm_btn.click(_llm_only, inputs=[question, max_new_llm, temp_llm, topp_llm], outputs=[llm_out])
-    rag_btn.click(_rag, inputs=[question, topk, max_new_rag, temp_rag, topp_rag], outputs=[rag_out, retrieved])
-
-# Launch (HF Spaces will run this automatically)
-if __name__ == "__main__":
-    demo.launch()
+# app.py — ITC 754 Gradio demo (Deterministic + RAG with Beams & Length Penalty)
+# Place in: ~/ITC754/hf-demo/app.py
+# Corpus: ~/ITC754/hf-demo/corpus/ (put a few .txt files here)
 
+import os
+import glob
+import hashlib
+from typing import List, Dict, Any, Optional
 
+import numpy as np
+import faiss
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from sentence_transformers import SentenceTransformer
+
 
 # ----------------------------
+# Model configuration
 # ----------------------------
+GEN_MODEL_NAME = os.getenv("GEN_MODEL_NAME", "microsoft/Phi-3-mini-4k-instruct")
+EMB_MODEL_NAME = os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
+
+_tok = None
+_mdl = None
+_pipe = None
+_emb = None
+_faiss = None
+_docs: List[Dict[str, Any]] = []
 
 
 # ----------------------------
 # Utilities
 # ----------------------------
+def seed_all(seed: Optional[int]) -> None:
+    """Best-effort seeding that works even if torch isn't present."""
+    import random
+    s = 0 if seed is None else seed
+    random.seed(s)
     try:
+        import torch
+        torch.manual_seed(s)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(s)
     except Exception:
+        pass
+
+
+def get_pipe():
+    """Lazy-load a simple text-generation pipeline."""
+    global _pipe, _tok, _mdl
+    if _pipe is None:
+        _tok = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
+        _mdl = AutoModelForCausalLM.from_pretrained(GEN_MODEL_NAME)
+        _pipe = pipeline("text-generation", model=_mdl, tokenizer=_tok)
+    return _pipe
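The None globals above plus get_pipe() implement simple lazy initialization: nothing is downloaded at import time, and every caller shares one pipeline. A functionally similar sketch using functools.lru_cache, shown only for comparison (and note that, depending on the installed transformers version, Phi-3 checkpoints may additionally require trust_remote_code=True):

```python
# Alternative lazy-loading sketch with functools.lru_cache instead of module globals.
from functools import lru_cache

from transformers import pipeline

@lru_cache(maxsize=1)
def get_pipe_cached(model_name: str = "microsoft/Phi-3-mini-4k-instruct"):
    """First call loads the model; later calls return the same cached pipeline."""
    return pipeline("text-generation", model=model_name)

# p1 = get_pipe_cached()
# p2 = get_pipe_cached()
# assert p1 is p2   # one load, one shared object
```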
+
+
+def load_corpus(cdir: str = "./corpus") -> List[Dict[str, Any]]:
+    """Load *.txt corpus files into memory."""
+    os.makedirs(cdir, exist_ok=True)
+    out: List[Dict[str, Any]] = []
+    for p in sorted(glob.glob(os.path.join(cdir, "*.txt"))):
+        try:
+            with open(p, "r", encoding="utf-8", errors="ignore") as f:
+                txt = f.read().strip()
+            if txt:
+                out.append(
+                    {"id": hashlib.sha1(p.encode()).hexdigest()[:8], "text": txt, "path": p}
+                )
+        except Exception:
+            # Skip unreadable files
+            pass
+    return out
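Given the corpus/ layout from the header comment, each loaded document becomes a small dict with a short hash id, the raw text, and the source path. A standalone sketch of that loading step; the file name and sentence here are invented:

```python
# Sketch: seed a tiny corpus/ folder and load it the same way load_corpus does.
import glob
import hashlib
import os

os.makedirs("corpus", exist_ok=True)
with open("corpus/presidents.txt", "w", encoding="utf-8") as f:   # hypothetical file
    f.write("George Washington was the first president of the United States.")

docs = []
for p in sorted(glob.glob(os.path.join("corpus", "*.txt"))):
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        txt = f.read().strip()
    if txt:
        docs.append({"id": hashlib.sha1(p.encode()).hexdigest()[:8], "text": txt, "path": p})

print(docs[0]["id"], docs[0]["path"])   # an 8-character id plus the originating file path
```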
+
+
+def get_emb():
+    """Lazy-load the sentence embedding model."""
+    global _emb
+    if _emb is None:
+        _emb = SentenceTransformer(EMB_MODEL_NAME)
+    return _emb
+
+
+def embed(texts: List[str]) -> np.ndarray:
+    """Create normalized embeddings (cosine similarity via inner product)."""
+    E = get_emb()
+    vec = E.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
+    return vec.astype(np.float32)
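The empty-corpus fallback in build_index() below hard-codes 384 dimensions; that is the embedding width of the MiniLM model named in EMB_MODEL_NAME. A quick standalone check of the shape, dtype, and normalization that embed() is expected to produce:

```python
# Sketch: MiniLM-L6-v2 produces 384-dim, unit-length float32 vectors.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vec = model.encode(
    ["George Washington was the first US president."],
    normalize_embeddings=True,
    convert_to_numpy=True,
).astype(np.float32)

print(vec.shape)                      # (1, 384): one row per input text
print(vec.dtype)                      # float32, which is what faiss.IndexFlatIP expects
print(float(np.linalg.norm(vec[0])))  # ~1.0 thanks to normalize_embeddings=True
```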
+
+
+def build_index(docs: List[Dict[str, Any]]) -> None:
+    """Build an inner-product FAISS index."""
+    global _faiss
+    if not docs:
+        # Placeholder index with default dim used by MiniLM
+        _faiss = faiss.IndexFlatIP(384)
+        return
+    V = embed([d["text"] for d in docs])
+    _faiss = faiss.IndexFlatIP(V.shape[1])
+    _faiss.add(V)
+
+
+def retrieve(q: str, k: int = 4) -> List[Dict[str, Any]]:
+    """Return top-k docs with similarity scores."""
+    global _docs, _faiss
+    if _faiss is None or not _docs:
+        return []
+    qv = embed([q])
+    scores, idxs = _faiss.search(qv, min(k, len(_docs)))
+    out: List[Dict[str, Any]] = []
+    for s, i in zip(scores[0], idxs[0]):
+        if i < 0:
+            continue
+        d = dict(_docs[i])
+        d["score"] = float(s)
+        out.append(d)
+    return out
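A minimal smoke test of the retrieval path. These few lines assume they run at the bottom of this app.py, so that load_corpus, build_index, retrieve, and the module-level _docs are all in scope, and that corpus/ contains at least one .txt file:

```python
# Smoke test (assumes it runs at module level inside app.py).
if __name__ == "__main__":
    _docs = load_corpus("./corpus")    # rebind the module-level doc list
    build_index(_docs)                 # embed the docs into the FAISS index
    for hit in retrieve("Who was the first US president?", k=3):
        print(f"{hit['score']:.3f}  {hit['path']}")
```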
+
+
+def fmt_ctx(snips: List[Dict[str, Any]]) -> str:
+    """Label retrieved chunks [C1], [C2], ... for inline citations."""
+    lines: List[str] = []
+    for i, s in enumerate(snips, 1):
+        lines.append(f"[C{i}] (doc={s['id']}, score={s['score']:.3f})")
+        lines.append(s["text"].strip())
+        lines.append("")  # blank line between items
+    return "\n".join(lines).strip()
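Unlike the removed version, which stripped brackets from answers, this revision labels each retrieved chunk so the model can cite it inline. A self-contained sketch of the block that fmt_ctx produces for two toy hits:

```python
# Sketch: the [C1]/[C2]-labelled context block, built from two toy hits.
from typing import Any, Dict, List

def fmt_ctx_demo(snips: List[Dict[str, Any]]) -> str:
    lines: List[str] = []
    for i, s in enumerate(snips, 1):
        lines.append(f"[C{i}] (doc={s['id']}, score={s['score']:.3f})")
        lines.append(s["text"].strip())
        lines.append("")
    return "\n".join(lines).strip()

hits = [
    {"id": "a1b2c3d4", "score": 0.83, "text": "George Washington was the first US president."},
    {"id": "e5f6a7b8", "score": 0.41, "text": "The White House is in Washington, D.C."},
]
print(fmt_ctx_demo(hits))
# [C1] (doc=a1b2c3d4, score=0.830)
# George Washington was the first US president.
#
# [C2] (doc=e5f6a7b8, score=0.410)
# The White House is in Washington, D.C.
```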
 
 
 # ----------------------------
+# RAG prompt (relaxed strict)
 # ----------------------------
 STRICT_RAG_SYSTEM = (
+    'Role: You are a careful assistant. Your first duty is factual fidelity to the provided CONTEXT; '
+    'your second