Aluode commited on
Commit
f65b63e
·
verified ·
1 Parent(s): 4404bf7

Upload 5 files

Browse files
app.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ConjunctionReservoir Document Chat — HuggingFace Space
3
+ =======================================================
4
+ Upload any text or PDF document, then ask questions about it.
5
+ Retrieval uses sentence-level conjunction scoring (no embeddings needed).
6
+ Generation uses HuggingFace Inference API (free, no key required).
7
+ """
8
+
9
+ import re
10
+ import os
11
+ import time
12
+ import json
13
+ import gradio as gr
14
+ from pathlib import Path
15
+
16
+ # ── ConjunctionReservoir ──────────────────────────────────────────────────────
17
+ from conjunctionreservoir import ConjunctionReservoir
18
+
19
+ # ── HuggingFace Inference ─────────────────────────────────────────────────────
20
+ from huggingface_hub import InferenceClient
21
+
22
+ # ── PDF support (optional) ────────────────────────────────────────────────────
23
+ try:
24
+ import fitz # PyMuPDF
25
+ PDF_SUPPORT = True
26
+ except ImportError:
27
+ try:
28
+ import pypdf
29
+ PDF_SUPPORT = True
30
+ except ImportError:
31
+ PDF_SUPPORT = False
32
+
33
+ # ── Constants ─────────────────────────────────────────────────────────────────
34
+ DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
35
+ FALLBACK_MODEL = "HuggingFaceH4/zephyr-7b-beta"
36
+ MAX_TOKENS = 512
37
+ MAX_HISTORY = 6 # turns to keep in context
38
+
39
+ DEMO_TEXT = """The ConjunctionReservoir is a document retrieval system that asks not
40
+ "do these query terms appear somewhere in this chunk?" but rather
41
+ "do these query terms appear in the SAME SENTENCE?"
42
+
43
+ This is grounded in auditory neuroscience. Norman-Haignere et al. (2025)
44
+ showed that auditory cortex integration windows are time-yoked at approximately
45
+ 80ms — they are fixed clocks, not expanding to cover arbitrary structure.
46
+ The sentence is the text analog of this fixed window.
47
+
48
+ NMDA receptors implement coincidence detection by requiring simultaneous
49
+ presynaptic glutamate release and postsynaptic depolarization to open.
50
+ This is a hard AND gate, not a weighted average.
51
+
52
+ The conjunction_threshold parameter mirrors this: below the threshold,
53
+ a sentence contributes zero score to the chunk — it is absent, not degraded.
54
+
55
+ Benchmark results show ConjunctionReservoir achieves 100% Rank-1 Rate on
56
+ conjunction-specific queries, compared to 60% for both BM25 and SweepBrain.
57
+ It intentionally trades broad-query recall for precision on specific
58
+ co-occurrence queries. Use threshold=0.0 to approach standard TF-IDF."""
59
+
60
+ # ── Text extraction ────────────────────────────────────────────────────────────
61
+
62
def extract_text_from_file(filepath: str) -> str:
    """Extract plain text from a .txt/.md/.rst/.text or .pdf file.

    Returns the extracted text, or a string starting with "ERROR" on failure
    (callers check the prefix instead of catching exceptions).
    """
    path = Path(filepath)
    ext = path.suffix.lower()

    if ext == ".pdf":
        if not PDF_SUPPORT:
            return "ERROR: PDF support not available. Please install PyMuPDF or pypdf."
        try:
            import fitz
            # Use a context manager so the document handle is closed even if
            # a page raises mid-extraction (the original leaked the handle).
            with fitz.open(filepath) as doc:
                return "\n\n".join(page.get_text() for page in doc)
        except Exception:
            # PyMuPDF missing or failed on this file: fall back to pypdf.
            try:
                from pypdf import PdfReader
                reader = PdfReader(filepath)
                return "\n\n".join(p.extract_text() or "" for p in reader.pages)
            except Exception as e:
                return f"ERROR reading PDF: {e}"

    elif ext in (".txt", ".md", ".rst", ".text"):
        try:
            return path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            return f"ERROR reading file: {e}"

    else:
        # Unknown extension: optimistically try to read it as UTF-8 text.
        try:
            return path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            # Surface the underlying error instead of hiding it (the original
            # message discarded `e`, making failures undiagnosable).
            return f"ERROR: Unsupported file type {ext}. Try .txt or .pdf ({e})"
93
+
94
+
95
+ # ── LLM generation ────────────────────────────────────────────────────────────
96
+
97
def get_client(hf_token: str = "") -> InferenceClient:
    """Build an InferenceClient, preferring an explicit token over the HF_TOKEN env var."""
    token = hf_token.strip() or os.environ.get("HF_TOKEN", "")
    if not token:
        token = None
    return InferenceClient(token=token)
100
+
101
+
102
def format_messages(system: str, history: list, user_msg: str) -> list:
    """Build an OpenAI-style message list: system prompt, trimmed history, new user turn."""
    messages = [{"role": "system", "content": system}]
    recent_turns = history[-MAX_HISTORY:]
    for past_user, past_assistant in recent_turns:
        messages.append({"role": "user", "content": past_user})
        messages.append({"role": "assistant", "content": past_assistant})
    messages.append({"role": "user", "content": user_msg})
    return messages
109
+
110
+
111
def stream_response(client, model, messages):
    """Stream completion tokens from the HF Inference API.

    Yields text deltas as they arrive. On any failure with the requested
    model it retries once with FALLBACK_MODEL; if that also fails, it yields
    a human-readable error message instead of raising (the UI displays it
    inline as the assistant reply).
    """

    def _stream(model_id):
        # One streaming chat-completion request; yields non-empty deltas only.
        stream = client.chat.completions.create(
            model=model_id,
            messages=messages,
            max_tokens=MAX_TOKENS,
            stream=True,
            temperature=0.3,
        )
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                yield delta

    try:
        yield from _stream(model)
    except Exception as e:
        # Primary model failed (rate limit, cold start, …): try the fallback
        # unless the caller already asked for it.
        if model != FALLBACK_MODEL:
            try:
                yield from _stream(FALLBACK_MODEL)
                return
            except Exception:
                pass
        yield f"\n\n⚠️ Generation error: {e}\n\nTip: Add a HuggingFace token in Settings for better rate limits."
144
+
145
+
146
+ # ── Retrieval helpers ─────────────────────────────────────────────────────────
147
+
148
def best_sentence(chunk: str, q_tokens: set) -> tuple:
    """Return (sentence, coverage) for the sentence in *chunk* best covering *q_tokens*.

    Coverage is the fraction of query tokens with a substring match against
    the sentence's word tokens. Falls back to the chunk's first 80 characters
    at coverage 0.0 when no sentence scores.
    """
    candidates = [part.strip() for part in re.split(r'[.!?]+', chunk)]
    candidates = [cand for cand in candidates if len(cand) > 10]
    winner, winner_cov = chunk[:80], 0.0
    for cand in candidates:
        cand_tokens = set(re.findall(r'\b[a-zA-Z]{3,}\b', cand.lower()))
        hit_count = 0
        for qt in q_tokens:
            if any(qt in ct or ct in qt for ct in cand_tokens):
                hit_count += 1
        coverage = hit_count / len(q_tokens) if q_tokens else 0.0
        if coverage > winner_cov:
            winner_cov = coverage
            winner = cand
    return winner, winner_cov
158
+
159
+
160
def do_retrieve(retriever, query: str, threshold: float, n_chunks: int = 3):
    """Retrieve up to *n_chunks* positively-scored chunks at *threshold*.

    If nothing passes the conjunction gate, retry once with the gate fully
    open (threshold 0.0), restore the threshold, and return at most 2 hits.
    """
    retriever.conjunction_threshold = threshold
    results = retriever.retrieve(query, top_k=n_chunks, update_coverage=True)
    results = [(chunk, score) for chunk, score in results if score > 0]
    if results:
        return results
    # Loosen and retry
    previous = retriever.conjunction_threshold
    retriever.conjunction_threshold = 0.0
    loose = retriever.retrieve(query, top_k=2, update_coverage=False)
    retriever.conjunction_threshold = previous
    return [(chunk, score) for chunk, score in loose if score > 0][:2]
172
+
173
+
174
def format_context_for_llm(hits: list) -> str:
    """Render retrieved (chunk, score) pairs as numbered passages for the prompt."""
    if not hits:
        return "No relevant passages found."
    parts = []
    for idx, (chunk, score) in enumerate(hits, 1):
        parts.append(f"[Passage {idx} | relevance {score:.3f}]\n{chunk.strip()}")
    return "\n\n---\n\n".join(parts)
181
+
182
+
183
def format_retrieval_display(hits: list, q_tokens: set, elapsed_ms: float) -> str:
    """Render a compact markdown summary of retrieval results for the UI panel."""
    if not hits:
        return f"⚠️ No passages matched (try lowering threshold) • {elapsed_ms:.0f}ms"
    out = [f"📚 **{len(hits)} passages retrieved** • {elapsed_ms:.0f}ms\n"]
    for rank, (chunk, score) in enumerate(hits, 1):
        sentence, _cov = best_sentence(chunk, q_tokens)
        if len(sentence) > 120:
            preview = sentence[:120] + "…"
        else:
            preview = sentence
        out.append(f"**[{rank}]** score={score:.3f} → *\"{preview}\"*")
    return "\n".join(out)
192
+
193
+
194
+ # ── Main app state ─────────────────────────────────────────────────────────────
195
+
196
class AppState:
    """Mutable per-session state: the active retriever plus chat transcripts."""

    def __init__(self):
        self.retriever = None    # ConjunctionReservoir once a document is indexed
        self.doc_name = None     # display name of the loaded document
        self.doc_chars = 0       # raw character count of the loaded document
        self.chat_history = []   # list of (user, assistant) for display
        self.llm_history = []    # list of (user_with_context, assistant) for LLM

    def reset_chat(self):
        """Forget the conversation but keep the indexed document."""
        self.chat_history = []
        self.llm_history = []

    def reset_doc(self):
        """Drop the document, its index, and the conversation."""
        self.retriever = None
        self.doc_name = None
        self.doc_chars = 0
        self.reset_chat()
213
+
214
+
215
+ # ── Build the Gradio UI ────────────────────────────────────────────────────────
216
+
217
def create_app():
    """Build and return the Gradio Blocks app.

    Layout: left column for document loading (upload / paste / demo) and
    settings, right column for the chat. All callbacks close over a single
    module-lifetime AppState, so the Space effectively shares one document
    and one conversation across sessions.
    """
    state = AppState()

    # Load demo immediately
    def _load_demo():
        # Index the built-in demo text so the app is usable before any upload.
        state.reset_doc()
        r = ConjunctionReservoir(conjunction_threshold=0.4, coverage_decay=0.04)
        r.build_index(DEMO_TEXT, verbose=False)
        state.retriever = r
        state.doc_name = "ConjunctionReservoir Demo"
        state.doc_chars = len(DEMO_TEXT)
        s = r.summary()
        return (
            f"✅ **{state.doc_name}** loaded \n"
            f"{s['n_chunks']} chunks • {s['n_sentences']} sentences • vocab {s['vocab_size']}"
        )

    # ── Gradio layout ──────────────────────────────────────────────────────────
    css = """
    #doc-status { border-left: 4px solid #4CAF50; padding: 8px 12px; background: #f9f9f9; border-radius: 4px; }
    #retrieval-info { font-size: 0.85em; color: #555; background: #f5f5f5; padding: 8px; border-radius: 4px; }
    .setting-row { display: flex; gap: 12px; align-items: center; }
    footer { display: none !important; }
    """

    with gr.Blocks(
        title="ConjunctionReservoir Document Chat",
        css=css,
        theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
    ) as demo:

        # ── Header ─────────────────────────────────────────────────────────────
        gr.Markdown("""
        # 🧠 ConjunctionReservoir Document Chat
        **Sentence-level conjunction retrieval** — terms must co-appear *in the same sentence* to score.
        Grounded in auditory neuroscience (Norman-Haignere 2025, Vollan 2025). Zero embeddings. Millisecond retrieval.
        """)

        with gr.Row():
            # ── Left column: document + settings ──────────────────────────────
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("### 📄 Document")

                with gr.Tab("Upload File"):
                    file_input = gr.File(
                        label="Upload .txt or .pdf",
                        file_types=[".txt", ".pdf", ".md"],
                        type="filepath",
                    )
                    upload_btn = gr.Button("📥 Load File", variant="primary")

                with gr.Tab("Paste Text"):
                    text_input = gr.Textbox(
                        label="Paste your text here",
                        lines=8,
                        placeholder="Paste any text...",
                    )
                    paste_name = gr.Textbox(label="Document name", value="pasted_text", max_lines=1)
                    paste_btn = gr.Button("📥 Load Text", variant="primary")

                with gr.Tab("Demo"):
                    gr.Markdown("Load the built-in demo text about ConjunctionReservoir itself.")
                    demo_btn = gr.Button("🧪 Load Demo", variant="secondary")

                doc_status = gr.Markdown("*No document loaded*", elem_id="doc-status")

                gr.Markdown("### ⚙️ Settings")

                threshold_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.4, step=0.05,
                    label="Conjunction threshold",
                    info="Fraction of query terms that must co-appear in a sentence (0=TF-IDF, 1=strict AND)"
                )

                model_dropdown = gr.Dropdown(
                    choices=[
                        "mistralai/Mistral-7B-Instruct-v0.3",
                        "HuggingFaceH4/zephyr-7b-beta",
                        "microsoft/Phi-3-mini-4k-instruct",
                        "google/gemma-2-2b-it",
                        "Qwen/Qwen2.5-7B-Instruct",
                    ],
                    value=DEFAULT_MODEL,
                    label="LLM model",
                    info="HuggingFace Inference API (free)"
                )

                hf_token_input = gr.Textbox(
                    label="HuggingFace token (optional)",
                    placeholder="hf_...",
                    type="password",
                    info="Add for higher rate limits. Get one free at huggingface.co/settings/tokens"
                )

                show_retrieval_chk = gr.Checkbox(
                    label="Show retrieved passages",
                    value=True,
                )

                clear_btn = gr.Button("🗑️ Clear conversation", variant="stop", size="sm")

            # ── Right column: chat ─────────────────────────────────────────────
            with gr.Column(scale=2):
                gr.Markdown("### 💬 Chat")

                chatbot = gr.Chatbot(
                    label="",
                    height=480,
                    show_label=False,
                    bubble_full_width=False,
                    render_markdown=True,
                )

                retrieval_info = gr.Markdown("", elem_id="retrieval-info")

                with gr.Row():
                    msg_input = gr.Textbox(
                        placeholder="Ask anything about your document…",
                        show_label=False,
                        scale=5,
                        container=False,
                    )
                    send_btn = gr.Button("Send ▶", variant="primary", scale=1)

                gr.Markdown("""
                <small>
                **Tip:** Try queries that require two concepts together, e.g. *"NMDA coincidence detection"*.
                Commands: type `:coverage` to see sweep focus • `:summary` for index stats • `:threshold 0.7` to change on-the-fly
                </small>
                """)

        # ── Callbacks ──────────────────────────────────────────────────────────

        def load_file(filepath, threshold):
            # Returns (status_markdown, chatbot_value); clears the chat on success.
            if not filepath:
                return "*No file selected*", state.chat_history
            text = extract_text_from_file(filepath)
            if text.startswith("ERROR"):
                return f"❌ {text}", state.chat_history
            return _index_text(text, Path(filepath).name, threshold)

        def load_paste(text, name, threshold):
            if not text or not text.strip():
                return "*No text provided*", state.chat_history
            return _index_text(text.strip(), name or "pasted_text", threshold)

        def load_demo_cb(threshold):
            # NOTE(review): `threshold` is accepted (wired from the slider) but
            # ignored — _load_demo always indexes at 0.4.
            status = _load_demo()
            state.chat_history = []
            state.llm_history = []
            return status, []

        def _index_text(text, name, threshold):
            # Build a fresh index; on failure the old doc is already gone
            # (reset_doc runs first), so the previous chat is shown unchanged.
            state.reset_doc()
            try:
                r = ConjunctionReservoir(
                    conjunction_threshold=float(threshold),
                    coverage_decay=0.04
                )
                r.build_index(text, verbose=False)
                state.retriever = r
                state.doc_name = name
                state.doc_chars = len(text)
                s = r.summary()
                status = (
                    f"✅ **{name}** loaded \n"
                    f"{s['n_chunks']} chunks • {s['n_sentences']} sentences • "
                    f"vocab {s['vocab_size']} • {s['index_time_ms']:.0f}ms"
                )
                return status, []
            except Exception as e:
                return f"❌ Error indexing: {e}", state.chat_history

        def clear_chat():
            state.reset_chat()
            return [], ""

        def handle_command(msg: str):
            """Handle special : commands. Returns (response_str, is_command)."""
            cmd = msg.strip().lower()
            if cmd == ":coverage":
                if state.retriever is None:
                    return "No document loaded.", True
                p = state.retriever.coverage_profile()
                lines = [f"**Vollan sweep coverage** (after {p['n_queries']} queries) \n"]
                lines.append(f"Mean coverage: {p['mean_coverage']:.5f} \n")
                if p["most_covered"]:
                    lines.append("**Most visited sentences:**")
                    for sent, cov in p["most_covered"][:5]:
                        lines.append(f"- [{cov:.3f}] {sent[:80]}…")
                return "\n".join(lines), True

            if cmd == ":summary":
                if state.retriever is None:
                    return "No document loaded.", True
                s = state.retriever.summary()
                return (
                    f"**Index summary** \n"
                    + "\n".join(f"- **{k}**: {v}" for k, v in s.items())
                ), True

            if cmd.startswith(":threshold "):
                try:
                    val = float(cmd.split()[1])
                    # Clamp to the valid gate range.
                    val = max(0.0, min(1.0, val))
                    if state.retriever:
                        state.retriever.conjunction_threshold = val
                    return f"✅ Threshold set to **{val:.2f}**", True
                except Exception:
                    return "Usage: `:threshold 0.5`", True

            if cmd == ":help":
                return (
                    "**Commands:**\n"
                    "- `:coverage` — show Vollan sweep focus\n"
                    "- `:summary` — index statistics\n"
                    "- `:threshold N` — set conjunction gate (0.0–1.0)\n"
                    "- `:help` — this message"
                ), True

            return "", False

        def respond(msg, chat_history, threshold, model, hf_token, show_retrieval):
            # Generator callback: yields (chat_history, retrieval_info) so the
            # assistant bubble streams token by token.
            if not msg or not msg.strip():
                yield chat_history, ""
                return

            if state.retriever is None:
                chat_history = chat_history + [(msg, "⚠️ Please load a document first.")]
                yield chat_history, ""
                return

            # Handle commands
            cmd_response, is_cmd = handle_command(msg)
            if is_cmd:
                chat_history = chat_history + [(msg, cmd_response)]
                yield chat_history, ""
                return

            # Retrieve
            q_tokens = set(re.findall(r'\b[a-zA-Z]{3,}\b', msg.lower()))
            t0 = time.perf_counter()
            hits = do_retrieve(state.retriever, msg, float(threshold))
            elapsed = (time.perf_counter() - t0) * 1000

            retrieval_display = ""
            if show_retrieval:
                retrieval_display = format_retrieval_display(hits, q_tokens, elapsed)

            # Build LLM prompt
            context_str = format_context_for_llm(hits)
            system = (
                f'You are a document assistant helping the user understand "{state.doc_name}". '
                f'Answer based on the provided passages. Be specific and cite the text when useful. '
                f'If the answer is not in the passages, say so clearly. Keep answers concise.'
            )
            user_with_context = (
                f"Question: {msg}\n\n"
                f"Relevant passages from the document:\n\n{context_str}"
            )

            # NOTE(review): history is sliced to MAX_HISTORY here AND again
            # inside format_messages — redundant but harmless.
            messages = format_messages(system, state.llm_history[-MAX_HISTORY:], user_with_context)

            # Stream response
            client = get_client(hf_token)
            partial = ""
            chat_history = chat_history + [(msg, "")]
            for token in stream_response(client, model, messages):
                partial += token
                chat_history[-1] = (msg, partial)
                yield chat_history, retrieval_display

            # Save to history — the LLM side stores the plain question (without
            # the retrieved passages) to keep future prompts small.
            state.llm_history.append((f"Question: {msg}", partial))
            state.chat_history = chat_history

        # ── Wire events ────────────────────────────────────────────────────────

        upload_btn.click(
            load_file,
            inputs=[file_input, threshold_slider],
            outputs=[doc_status, chatbot],
        )

        paste_btn.click(
            load_paste,
            inputs=[text_input, paste_name, threshold_slider],
            outputs=[doc_status, chatbot],
        )

        demo_btn.click(
            load_demo_cb,
            inputs=[threshold_slider],
            outputs=[doc_status, chatbot],
        )

        clear_btn.click(clear_chat, outputs=[chatbot, retrieval_info])

        send_btn.click(
            respond,
            inputs=[msg_input, chatbot, threshold_slider, model_dropdown,
                    hf_token_input, show_retrieval_chk],
            outputs=[chatbot, retrieval_info],
        ).then(lambda: "", outputs=[msg_input])

        msg_input.submit(
            respond,
            inputs=[msg_input, chatbot, threshold_slider, model_dropdown,
                    hf_token_input, show_retrieval_chk],
            outputs=[chatbot, retrieval_info],
        ).then(lambda: "", outputs=[msg_input])

        # Load demo on startup
        demo.load(_load_demo, outputs=[doc_status])

    return demo
533
+
534
+
535
if __name__ == "__main__":
    # Entry point: build the UI and serve it locally (no public share link).
    create_app().launch(share=False)
conjunctionreservoir/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .retriever import ConjunctionReservoir
2
+
3
+ __version__ = "0.1.0"
4
+ __author__ = "Antti Luode"
5
+ __all__ = ["ConjunctionReservoir"]
conjunctionreservoir/retriever.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ConjunctionReservoir — core retriever
3
+ """
4
+
5
+ import numpy as np
6
+ import re
7
+ import time
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+
11
+ def split_sentences(text: str, min_len: int = 15) -> List[str]:
12
+ return [s.strip() for s in re.split(r"[.!?]+", text) if len(s.strip()) >= min_len]
13
+
14
+
15
def chunk_document(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """Split a document into chunks of at most *chunk_size* characters.

    The text is first split at email-style section boundaries (From:/Subject:/
    Date:/---). Sections shorter than 50 chars are dropped; longer sections
    are windowed with *overlap* characters of overlap between windows.
    """
    sections = re.split(r"\n(?=From:|Subject:|Date:|---)", text)
    # Guard the stride: overlap >= chunk_size would make range() raise
    # ValueError (step <= 0). Always advance by at least one character.
    step = max(1, chunk_size - overlap)
    chunks = []
    for section in sections:
        section = section.strip()
        if len(section) < 50:
            continue
        if len(section) <= chunk_size:
            chunks.append(section)
        else:
            for i in range(0, len(section), step):
                chunk = section[i : i + chunk_size].strip()
                if len(chunk) > 50:
                    chunks.append(chunk)
    return chunks
30
+
31
+
32
def build_vocab(texts: List[str], max_vocab: int = 2000) -> Dict[str, int]:
    """Map the *max_vocab* most frequent words (2+ letters, lowercased) to integer ids."""
    counts: Dict[str, int] = {}
    for text in texts:
        for word in re.findall(r"\b[a-zA-Z]{2,}\b", text.lower()):
            counts[word] = counts.get(word, 0) + 1
    # Stable sort by descending frequency (ties keep first-seen order).
    ranked = sorted(counts.items(), key=lambda kv: -kv[1])[:max_vocab]
    return {word: idx for idx, (word, _count) in enumerate(ranked)}
43
+
44
+
45
def tfidf_weights(sentences: List[str], vocab: Dict[str, int]) -> np.ndarray:
    """Smoothed IDF weight per vocab entry, from sentence-level document frequency."""
    n = len(sentences)
    df = np.zeros(len(vocab))
    for sentence in sentences:
        # Each word counts at most once per sentence for document frequency.
        for word in set(re.findall(r"\b[a-zA-Z]{2,}\b", sentence.lower())):
            idx = vocab.get(word)
            if idx is not None:
                df[idx] += 1
    return np.log((n + 1) / (df + 1)) + 1.0
53
+
54
+
55
def encode_text(text: str, vocab: Dict[str, int], idf: np.ndarray) -> np.ndarray:
    """L2-normalized TF-IDF vector for *text* over *vocab* (epsilon guards zero vectors)."""
    vec = np.zeros(len(vocab))
    for word in re.findall(r"\b[a-zA-Z]{2,}\b", text.lower()):
        idx = vocab.get(word)
        if idx is not None:
            vec[idx] += 1.0
    vec = vec * idf
    return vec / (np.linalg.norm(vec) + 1e-8)
63
+
64
+
65
class ConjunctionReservoir:
    """
    Document retriever with sentence-level conjunction scoring.

    A sentence scores only if at least ``conjunction_threshold`` of the query
    tokens (substring-)match within that single sentence — a hard AND gate.
    A chunk's score is its best sentence's score. Repeatedly retrieved
    sentences are down-weighted via an accumulated coverage vector.
    """

    def __init__(
        self,
        conjunction_threshold: float = 0.5,
        coverage_decay: float = 0.04,
        hebbian_lr: float = 0.01,
        max_vocab: int = 2000,
    ) -> None:
        """Configure the retriever. Call build_index() before retrieve().

        Args:
            conjunction_threshold: minimum fraction of query tokens that must
                co-match inside one sentence for it to score at all.
            coverage_decay: per-query decay applied to accumulated sentence
                coverage in retrieve().
            hebbian_lr: NOTE(review) — stored but not used by any method in
                this class; presumably reserved for future learning.
            max_vocab: cap on vocabulary size for the TF-IDF features.
        """
        self.conjunction_threshold = conjunction_threshold
        self.coverage_decay = coverage_decay
        self.hebbian_lr = hebbian_lr
        self.max_vocab = max_vocab

        # All of the following are populated by build_index():
        self.vocab: Optional[Dict[str, int]] = None          # word -> feature index
        self.idf: Optional[np.ndarray] = None                # per-word IDF weights
        self.chunk_texts: List[str] = []                     # retrievable units
        self.all_sentences: List[str] = []                   # flattened sentences
        self.sentence_to_chunk: List[int] = []               # sentence idx -> owning chunk idx
        self.sent_feats: Optional[np.ndarray] = None         # per-sentence TF-IDF vectors
        self.chunk_feats: Optional[np.ndarray] = None        # per-chunk TF-IDF vectors
        self.sentence_coverage: Optional[np.ndarray] = None  # accumulated retrieval mass
        self.n_queries: int = 0                              # coverage-updating queries served
        self.index_time: float = 0.0                         # build_index wall time (seconds)

    def build_index(
        self,
        text_or_chunks: Union[str, List[str]],
        verbose: bool = True,
    ) -> "ConjunctionReservoir":
        """Index a raw document string (auto-chunked) or a pre-chunked list.

        Raises ValueError if no chunks or no sentences can be extracted.
        Returns self for chaining. (`verbose` is currently unused here.)
        """
        t0 = time.perf_counter()

        if isinstance(text_or_chunks, str):
            self.chunk_texts = chunk_document(text_or_chunks)
        else:
            self.chunk_texts = list(text_or_chunks)

        if not self.chunk_texts:
            raise ValueError("No chunks found.")

        # Flatten chunks into sentences, remembering each sentence's owner.
        self.all_sentences = []
        self.sentence_to_chunk = []
        for chunk_idx, chunk in enumerate(self.chunk_texts):
            for s in split_sentences(chunk):
                self.all_sentences.append(s)
                self.sentence_to_chunk.append(chunk_idx)

        if not self.all_sentences:
            raise ValueError("No sentences extracted.")

        # Vocab is built over sentences AND whole chunks so chunk-level
        # fallback vectors share the same feature space.
        self.vocab = build_vocab(
            self.all_sentences + self.chunk_texts, max_vocab=self.max_vocab
        )
        self.idf = tfidf_weights(self.all_sentences, self.vocab)

        self.sent_feats = np.array(
            [encode_text(s, self.vocab, self.idf) for s in self.all_sentences]
        )
        self.chunk_feats = np.array(
            [encode_text(c, self.vocab, self.idf) for c in self.chunk_texts]
        )
        self.sentence_coverage = np.zeros(len(self.all_sentences))

        self.index_time = time.perf_counter() - t0
        return self

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        update_coverage: bool = True,
    ) -> List[Tuple[str, float]]:
        """Return up to *top_k* (chunk_text, score) pairs, best first.

        Raises RuntimeError if build_index() has not been called.
        """
        if self.vocab is None:
            raise RuntimeError("Call build_index() before retrieve().")

        q_tokens = set(re.findall(r"\b[a-zA-Z]{3,}\b", query.lower()))
        q_feat = encode_text(query, self.vocab, self.idf)
        sent_scores = np.zeros(len(self.all_sentences))

        for s_idx, sentence in enumerate(self.all_sentences):
            s_tokens = set(re.findall(r"\b[a-zA-Z]{3,}\b", sentence.lower()))
            # Loose matching: a query token counts if it contains or is
            # contained by any sentence token (covers simple inflections).
            matches = sum(
                1 for qt in q_tokens
                if any(qt in st or st in qt for st in s_tokens)
            )
            token_coverage = matches / len(q_tokens) if q_tokens else 0.0

            # Hard conjunction gate: below threshold the sentence contributes
            # exactly zero — absent, not degraded.
            if token_coverage < self.conjunction_threshold:
                continue

            tfidf_sim = float(self.sent_feats[s_idx] @ q_feat)
            # Squaring sharpens the preference for fuller conjunctions.
            conj_weight = token_coverage ** 2
            # Down-weight sentences already surfaced by earlier queries.
            vollan_w = 1.0 / (1.0 + self.sentence_coverage[s_idx])
            sent_scores[s_idx] = tfidf_sim * conj_weight * vollan_w

        # A chunk scores as its single best sentence (max, not sum).
        # NOTE(review): the enumerate() index s_idx is unused here.
        chunk_scores = np.zeros(len(self.chunk_texts))
        for s_idx, (score, chunk_idx) in enumerate(zip(sent_scores, self.sentence_to_chunk)):
            if score > chunk_scores[chunk_idx]:
                chunk_scores[chunk_idx] = score

        # Fallback: no sentence passed the gate — rank by chunk-level TF-IDF.
        if chunk_scores.max() == 0.0:
            chunk_scores = self.chunk_feats @ q_feat

        top_idx = chunk_scores.argsort()[-top_k:][::-1]
        results = [(self.chunk_texts[i], float(chunk_scores[i])) for i in top_idx]

        # Accumulate (decayed) coverage so future queries favor novelty.
        if update_coverage and sent_scores.max() > 0.0:
            norm = sent_scores / (sent_scores.max() + 1e-8)
            self.sentence_coverage = (
                self.sentence_coverage * (1.0 - self.coverage_decay) + norm
            )
            self.n_queries += 1

        return results

    def summary(self) -> Dict:
        """Index statistics (counts, parameters, indexing time in ms)."""
        return {
            "n_chunks": len(self.chunk_texts),
            "n_sentences": len(self.all_sentences),
            "avg_sentences_per_chunk": round(
                len(self.all_sentences) / max(1, len(self.chunk_texts)), 2
            ),
            "vocab_size": len(self.vocab) if self.vocab else 0,
            "conjunction_threshold": self.conjunction_threshold,
            "coverage_decay": self.coverage_decay,
            "n_queries": self.n_queries,
            "index_time_ms": round(self.index_time * 1000, 1),
        }

    def coverage_profile(self) -> Dict:
        """Top-10 most-covered sentences plus mean coverage; {} before indexing."""
        if self.sentence_coverage is None:
            return {}
        top_idx = self.sentence_coverage.argsort()[-10:][::-1]
        return {
            "most_covered": [
                (self.all_sentences[i], round(float(self.sentence_coverage[i]), 4))
                for i in top_idx
                if self.sentence_coverage[i] > 0
            ],
            "mean_coverage": round(float(self.sentence_coverage.mean()), 6),
            "n_queries": self.n_queries,
        }
readme.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ConjunctionReservoir Document Chat
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: Chat with docs via sentence-level retrieval
12
+ tags:
13
+ - rag
14
+ - retrieval
15
+ - nlp
16
+ - neuroscience
17
+ - document-qa
18
+ ---
19
+
20
+ # ConjunctionReservoir Document Chat
21
+
22
+ Upload any `.txt` or `.pdf` document and chat with it.
23
+
24
+ **What makes this different from standard RAG:**
25
+
26
+ Instead of asking *"do query terms appear somewhere in this chunk?"*, ConjunctionReservoir asks *"do query terms appear in the **same sentence**?"*
27
+
28
+ This is grounded in auditory neuroscience:
29
+ - **Norman-Haignere et al. (2025):** auditory cortex integration windows are time-yoked (~80ms fixed clocks)
30
+ - **NMDA receptor logic:** hard AND gate — both inputs must arrive simultaneously
31
+ - **Vollan et al. (2025):** coverage-maximizing theta sweep for exploration
32
+
33
+ **Benchmark:** 100% Rank-1 rate on conjunction queries vs 60% for BM25 and SweepBrain.
34
+
35
+ ## Usage
36
+
37
+ 1. Upload a `.txt` or `.pdf`, or paste text directly
38
+ 2. Ask questions — works best for queries requiring two concepts together
39
+ 3. Adjust the **conjunction threshold** slider to tune precision vs recall
40
+ 4. Use `:coverage`, `:summary`, `:threshold N` commands in chat
41
+
42
+ No dependencies beyond NumPy for retrieval. Generation via the HuggingFace Inference API (free).
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ numpy>=1.21
3
+ huggingface_hub>=0.20.0
4
+ PyMuPDF>=1.23.0