Aluode commited on
Commit
f65b63e
·
verified ·
1 Parent(s): 4404bf7

Upload 5 files

Browse files
app.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ConjunctionReservoir Document Chat — HuggingFace Space
3
+ =======================================================
4
+ Upload any text or PDF document, then ask questions about it.
5
+ Retrieval uses sentence-level conjunction scoring (no embeddings needed).
6
+ Generation uses HuggingFace Inference API (free, no key required).
7
+ """
8
+
9
+ import re
10
+ import os
11
+ import time
12
+ import json
13
+ import gradio as gr
14
+ from pathlib import Path
15
+
16
+ # ── ConjunctionReservoir ──────────────────────────────────────────────────────
17
+ from conjunctionreservoir import ConjunctionReservoir
18
+
19
+ # ── HuggingFace Inference ─────────────────────────────────────────────────────
20
+ from huggingface_hub import InferenceClient
21
+
22
+ # ── PDF support (optional) ────────────────────────────────────────────────────
23
+ try:
24
+ import fitz # PyMuPDF
25
+ PDF_SUPPORT = True
26
+ except ImportError:
27
+ try:
28
+ import pypdf
29
+ PDF_SUPPORT = True
30
+ except ImportError:
31
+ PDF_SUPPORT = False
32
+
33
+ # ── Constants ─────────────────────────────────────────────────────────────────
34
+ DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
35
+ FALLBACK_MODEL = "HuggingFaceH4/zephyr-7b-beta"
36
+ MAX_TOKENS = 512
37
+ MAX_HISTORY = 6 # turns to keep in context
38
+
39
+ DEMO_TEXT = """The ConjunctionReservoir is a document retrieval system that asks not
40
+ "do these query terms appear somewhere in this chunk?" but rather
41
+ "do these query terms appear in the SAME SENTENCE?"
42
+
43
+ This is grounded in auditory neuroscience. Norman-Haignere et al. (2025)
44
+ showed that auditory cortex integration windows are time-yoked at approximately
45
+ 80ms — they are fixed clocks, not expanding to cover arbitrary structure.
46
+ The sentence is the text analog of this fixed window.
47
+
48
+ NMDA receptors implement coincidence detection by requiring simultaneous
49
+ presynaptic glutamate release and postsynaptic depolarization to open.
50
+ This is a hard AND gate, not a weighted average.
51
+
52
+ The conjunction_threshold parameter mirrors this: below the threshold,
53
+ a sentence contributes zero score to the chunk — it is absent, not degraded.
54
+
55
+ Benchmark results show ConjunctionReservoir achieves 100% Rank-1 Rate on
56
+ conjunction-specific queries, compared to 60% for both BM25 and SweepBrain.
57
+ It intentionally trades broad-query recall for precision on specific
58
+ co-occurrence queries. Use threshold=0.0 to approach standard TF-IDF."""
59
+
60
+ # ── Text extraction ────────────────────────────────────────────────────────────
61
+
62
def extract_text_from_file(filepath: str) -> str:
    """Extract plain text from a .txt/.md/.rst/.text or .pdf file.

    Returns the extracted text, or a string starting with "ERROR" on failure
    (callers check the prefix instead of catching exceptions).
    """
    path = Path(filepath)
    ext = path.suffix.lower()

    if ext == ".pdf":
        if not PDF_SUPPORT:
            return "ERROR: PDF support not available. Please install PyMuPDF or pypdf."
        try:
            import fitz
            # Use a context manager so the document handle is closed even if
            # a page raises mid-extraction (the original leaked the handle).
            with fitz.open(filepath) as doc:
                return "\n\n".join(page.get_text() for page in doc)
        except Exception:
            # PyMuPDF missing or failed on this file: fall back to pypdf.
            try:
                from pypdf import PdfReader
                reader = PdfReader(filepath)
                return "\n\n".join(p.extract_text() or "" for p in reader.pages)
            except Exception as e:
                return f"ERROR reading PDF: {e}"

    elif ext in (".txt", ".md", ".rst", ".text"):
        try:
            return path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            return f"ERROR reading file: {e}"

    else:
        # Unknown extension: optimistically try to read it as UTF-8 text.
        try:
            return path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            # Surface the underlying error instead of hiding it (the original
            # message discarded `e`, making failures undiagnosable).
            return f"ERROR: Unsupported file type {ext}. Try .txt or .pdf ({e})"
93
+
94
+
95
+ # ── LLM generation ────────────────────────────────────────────────────────────
96
+
97
def get_client(hf_token: str = "") -> InferenceClient:
    """Build an InferenceClient, preferring an explicit token over the HF_TOKEN env var."""
    token = hf_token.strip() or os.environ.get("HF_TOKEN", "")
    if not token:
        token = None
    return InferenceClient(token=token)
100
+
101
+
102
def format_messages(system: str, history: list, user_msg: str) -> list:
    """Build an OpenAI-style message list: system prompt, trimmed history, new user turn."""
    messages = [{"role": "system", "content": system}]
    recent_turns = history[-MAX_HISTORY:]
    for past_user, past_assistant in recent_turns:
        messages.append({"role": "user", "content": past_user})
        messages.append({"role": "assistant", "content": past_assistant})
    messages.append({"role": "user", "content": user_msg})
    return messages
109
+
110
+
111
def stream_response(client, model, messages):
    """Stream completion tokens from the HF Inference API.

    Yields text deltas as they arrive. On any failure with the requested
    model it retries once with FALLBACK_MODEL; if that also fails, it yields
    a human-readable error message instead of raising (the UI displays it
    inline as the assistant reply).
    """

    def _stream(model_id):
        # One streaming chat-completion request; yields non-empty deltas only.
        stream = client.chat.completions.create(
            model=model_id,
            messages=messages,
            max_tokens=MAX_TOKENS,
            stream=True,
            temperature=0.3,
        )
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                yield delta

    try:
        yield from _stream(model)
    except Exception as e:
        # Primary model failed (rate limit, cold start, …): try the fallback
        # unless the caller already asked for it.
        if model != FALLBACK_MODEL:
            try:
                yield from _stream(FALLBACK_MODEL)
                return
            except Exception:
                pass
        yield f"\n\n⚠️ Generation error: {e}\n\nTip: Add a HuggingFace token in Settings for better rate limits."
144
+
145
+
146
+ # ── Retrieval helpers ─────────────────────────────────────────────────────────
147
+
148
def best_sentence(chunk: str, q_tokens: set) -> tuple:
    """Return (sentence, coverage) for the sentence in *chunk* best covering *q_tokens*.

    Coverage is the fraction of query tokens with a substring match against
    the sentence's word tokens. Falls back to the chunk's first 80 characters
    at coverage 0.0 when no sentence scores.
    """
    candidates = [part.strip() for part in re.split(r'[.!?]+', chunk)]
    candidates = [cand for cand in candidates if len(cand) > 10]
    winner, winner_cov = chunk[:80], 0.0
    for cand in candidates:
        cand_tokens = set(re.findall(r'\b[a-zA-Z]{3,}\b', cand.lower()))
        hit_count = 0
        for qt in q_tokens:
            if any(qt in ct or ct in qt for ct in cand_tokens):
                hit_count += 1
        coverage = hit_count / len(q_tokens) if q_tokens else 0.0
        if coverage > winner_cov:
            winner_cov = coverage
            winner = cand
    return winner, winner_cov
158
+
159
+
160
def do_retrieve(retriever, query: str, threshold: float, n_chunks: int = 3):
    """Retrieve up to *n_chunks* positively-scored chunks at *threshold*.

    If nothing passes the conjunction gate, retry once with the gate fully
    open (threshold 0.0), restore the threshold, and return at most 2 hits.
    """
    retriever.conjunction_threshold = threshold
    results = retriever.retrieve(query, top_k=n_chunks, update_coverage=True)
    results = [(chunk, score) for chunk, score in results if score > 0]
    if results:
        return results
    # Loosen and retry
    previous = retriever.conjunction_threshold
    retriever.conjunction_threshold = 0.0
    loose = retriever.retrieve(query, top_k=2, update_coverage=False)
    retriever.conjunction_threshold = previous
    return [(chunk, score) for chunk, score in loose if score > 0][:2]
172
+
173
+
174
def format_context_for_llm(hits: list) -> str:
    """Render retrieved (chunk, score) pairs as numbered passages for the prompt."""
    if not hits:
        return "No relevant passages found."
    parts = []
    for idx, (chunk, score) in enumerate(hits, 1):
        parts.append(f"[Passage {idx} | relevance {score:.3f}]\n{chunk.strip()}")
    return "\n\n---\n\n".join(parts)
181
+
182
+
183
def format_retrieval_display(hits: list, q_tokens: set, elapsed_ms: float) -> str:
    """Render a compact markdown summary of retrieval results for the UI panel."""
    if not hits:
        return f"⚠️ No passages matched (try lowering threshold) • {elapsed_ms:.0f}ms"
    out = [f"📚 **{len(hits)} passages retrieved** • {elapsed_ms:.0f}ms\n"]
    for rank, (chunk, score) in enumerate(hits, 1):
        sentence, _cov = best_sentence(chunk, q_tokens)
        if len(sentence) > 120:
            preview = sentence[:120] + "…"
        else:
            preview = sentence
        out.append(f"**[{rank}]** score={score:.3f} → *\"{preview}\"*")
    return "\n".join(out)
192
+
193
+
194
+ # ── Main app state ─────────────────────────────────────────────────────────────
195
+
196
class AppState:
    """Mutable per-session state: the active retriever plus chat transcripts."""

    def __init__(self):
        self.retriever = None    # ConjunctionReservoir once a document is indexed
        self.doc_name = None     # display name of the loaded document
        self.doc_chars = 0       # raw character count of the loaded document
        self.chat_history = []   # list of (user, assistant) for display
        self.llm_history = []    # list of (user_with_context, assistant) for LLM

    def reset_chat(self):
        """Forget the conversation but keep the indexed document."""
        self.chat_history = []
        self.llm_history = []

    def reset_doc(self):
        """Drop the document, its index, and the conversation."""
        self.retriever = None
        self.doc_name = None
        self.doc_chars = 0
        self.reset_chat()
213
+
214
+
215
+ # ── Build the Gradio UI ────────────────────────────────────────────────────────
216
+
217
def create_app():
    """Build and return the Gradio Blocks app.

    Layout: left column for document loading (upload / paste / demo) and
    settings, right column for the chat. All callbacks close over a single
    module-lifetime AppState, so the Space effectively shares one document
    and one conversation across sessions.
    """
    state = AppState()

    # Load demo immediately
    def _load_demo():
        # Index the built-in demo text so the app is usable before any upload.
        state.reset_doc()
        r = ConjunctionReservoir(conjunction_threshold=0.4, coverage_decay=0.04)
        r.build_index(DEMO_TEXT, verbose=False)
        state.retriever = r
        state.doc_name = "ConjunctionReservoir Demo"
        state.doc_chars = len(DEMO_TEXT)
        s = r.summary()
        return (
            f"✅ **{state.doc_name}** loaded \n"
            f"{s['n_chunks']} chunks • {s['n_sentences']} sentences • vocab {s['vocab_size']}"
        )

    # ── Gradio layout ──────────────────────────────────────────────────────────
    css = """
    #doc-status { border-left: 4px solid #4CAF50; padding: 8px 12px; background: #f9f9f9; border-radius: 4px; }
    #retrieval-info { font-size: 0.85em; color: #555; background: #f5f5f5; padding: 8px; border-radius: 4px; }
    .setting-row { display: flex; gap: 12px; align-items: center; }
    footer { display: none !important; }
    """

    with gr.Blocks(
        title="ConjunctionReservoir Document Chat",
        css=css,
        theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
    ) as demo:

        # ── Header ─────────────────────────────────────────────────────────────
        gr.Markdown("""
        # 🧠 ConjunctionReservoir Document Chat
        **Sentence-level conjunction retrieval** — terms must co-appear *in the same sentence* to score.
        Grounded in auditory neuroscience (Norman-Haignere 2025, Vollan 2025). Zero embeddings. Millisecond retrieval.
        """)

        with gr.Row():
            # ── Left column: document + settings ──────────────────────────────
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("### 📄 Document")

                with gr.Tab("Upload File"):
                    file_input = gr.File(
                        label="Upload .txt or .pdf",
                        file_types=[".txt", ".pdf", ".md"],
                        type="filepath",
                    )
                    upload_btn = gr.Button("📥 Load File", variant="primary")

                with gr.Tab("Paste Text"):
                    text_input = gr.Textbox(
                        label="Paste your text here",
                        lines=8,
                        placeholder="Paste any text...",
                    )
                    paste_name = gr.Textbox(label="Document name", value="pasted_text", max_lines=1)
                    paste_btn = gr.Button("📥 Load Text", variant="primary")

                with gr.Tab("Demo"):
                    gr.Markdown("Load the built-in demo text about ConjunctionReservoir itself.")
                    demo_btn = gr.Button("🧪 Load Demo", variant="secondary")

                doc_status = gr.Markdown("*No document loaded*", elem_id="doc-status")

                gr.Markdown("### ⚙️ Settings")

                threshold_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.4, step=0.05,
                    label="Conjunction threshold",
                    info="Fraction of query terms that must co-appear in a sentence (0=TF-IDF, 1=strict AND)"
                )

                model_dropdown = gr.Dropdown(
                    choices=[
                        "mistralai/Mistral-7B-Instruct-v0.3",
                        "HuggingFaceH4/zephyr-7b-beta",
                        "microsoft/Phi-3-mini-4k-instruct",
                        "google/gemma-2-2b-it",
                        "Qwen/Qwen2.5-7B-Instruct",
                    ],
                    value=DEFAULT_MODEL,
                    label="LLM model",
                    info="HuggingFace Inference API (free)"
                )

                hf_token_input = gr.Textbox(
                    label="HuggingFace token (optional)",
                    placeholder="hf_...",
                    type="password",
                    info="Add for higher rate limits. Get one free at huggingface.co/settings/tokens"
                )

                show_retrieval_chk = gr.Checkbox(
                    label="Show retrieved passages",
                    value=True,
                )

                clear_btn = gr.Button("🗑️ Clear conversation", variant="stop", size="sm")

            # ── Right column: chat ─────────────────────────────────────────────
            with gr.Column(scale=2):
                gr.Markdown("### 💬 Chat")

                chatbot = gr.Chatbot(
                    label="",
                    height=480,
                    show_label=False,
                    bubble_full_width=False,
                    render_markdown=True,
                )

                retrieval_info = gr.Markdown("", elem_id="retrieval-info")

                with gr.Row():
                    msg_input = gr.Textbox(
                        placeholder="Ask anything about your document…",
                        show_label=False,
                        scale=5,
                        container=False,
                    )
                    send_btn = gr.Button("Send ▶", variant="primary", scale=1)

                gr.Markdown("""
                <small>
                **Tip:** Try queries that require two concepts together, e.g. *"NMDA coincidence detection"*.
                Commands: type `:coverage` to see sweep focus • `:summary` for index stats • `:threshold 0.7` to change on-the-fly
                </small>
                """)

        # ── Callbacks ──────────────────────────────────────────────────────────

        def load_file(filepath, threshold):
            # Returns (status_markdown, chatbot_value); clears the chat on success.
            if not filepath:
                return "*No file selected*", state.chat_history
            text = extract_text_from_file(filepath)
            if text.startswith("ERROR"):
                return f"❌ {text}", state.chat_history
            return _index_text(text, Path(filepath).name, threshold)

        def load_paste(text, name, threshold):
            if not text or not text.strip():
                return "*No text provided*", state.chat_history
            return _index_text(text.strip(), name or "pasted_text", threshold)

        def load_demo_cb(threshold):
            # NOTE(review): `threshold` is accepted (wired from the slider) but
            # ignored — _load_demo always indexes at 0.4.
            status = _load_demo()
            state.chat_history = []
            state.llm_history = []
            return status, []

        def _index_text(text, name, threshold):
            # Build a fresh index; on failure the old doc is already gone
            # (reset_doc runs first), so the previous chat is shown unchanged.
            state.reset_doc()
            try:
                r = ConjunctionReservoir(
                    conjunction_threshold=float(threshold),
                    coverage_decay=0.04
                )
                r.build_index(text, verbose=False)
                state.retriever = r
                state.doc_name = name
                state.doc_chars = len(text)
                s = r.summary()
                status = (
                    f"✅ **{name}** loaded \n"
                    f"{s['n_chunks']} chunks • {s['n_sentences']} sentences • "
                    f"vocab {s['vocab_size']} • {s['index_time_ms']:.0f}ms"
                )
                return status, []
            except Exception as e:
                return f"❌ Error indexing: {e}", state.chat_history

        def clear_chat():
            state.reset_chat()
            return [], ""

        def handle_command(msg: str):
            """Handle special : commands. Returns (response_str, is_command)."""
            cmd = msg.strip().lower()
            if cmd == ":coverage":
                if state.retriever is None:
                    return "No document loaded.", True
                p = state.retriever.coverage_profile()
                lines = [f"**Vollan sweep coverage** (after {p['n_queries']} queries) \n"]
                lines.append(f"Mean coverage: {p['mean_coverage']:.5f} \n")
                if p["most_covered"]:
                    lines.append("**Most visited sentences:**")
                    for sent, cov in p["most_covered"][:5]:
                        lines.append(f"- [{cov:.3f}] {sent[:80]}…")
                return "\n".join(lines), True

            if cmd == ":summary":
                if state.retriever is None:
                    return "No document loaded.", True
                s = state.retriever.summary()
                return (
                    f"**Index summary** \n"
                    + "\n".join(f"- **{k}**: {v}" for k, v in s.items())
                ), True

            if cmd.startswith(":threshold "):
                try:
                    val = float(cmd.split()[1])
                    # Clamp to the valid gate range.
                    val = max(0.0, min(1.0, val))
                    if state.retriever:
                        state.retriever.conjunction_threshold = val
                    return f"✅ Threshold set to **{val:.2f}**", True
                except Exception:
                    return "Usage: `:threshold 0.5`", True

            if cmd == ":help":
                return (
                    "**Commands:**\n"
                    "- `:coverage` — show Vollan sweep focus\n"
                    "- `:summary` — index statistics\n"
                    "- `:threshold N` — set conjunction gate (0.0–1.0)\n"
                    "- `:help` — this message"
                ), True

            return "", False

        def respond(msg, chat_history, threshold, model, hf_token, show_retrieval):
            # Generator callback: yields (chat_history, retrieval_info) so the
            # assistant bubble streams token by token.
            if not msg or not msg.strip():
                yield chat_history, ""
                return

            if state.retriever is None:
                chat_history = chat_history + [(msg, "⚠️ Please load a document first.")]
                yield chat_history, ""
                return

            # Handle commands
            cmd_response, is_cmd = handle_command(msg)
            if is_cmd:
                chat_history = chat_history + [(msg, cmd_response)]
                yield chat_history, ""
                return

            # Retrieve
            q_tokens = set(re.findall(r'\b[a-zA-Z]{3,}\b', msg.lower()))
            t0 = time.perf_counter()
            hits = do_retrieve(state.retriever, msg, float(threshold))
            elapsed = (time.perf_counter() - t0) * 1000

            retrieval_display = ""
            if show_retrieval:
                retrieval_display = format_retrieval_display(hits, q_tokens, elapsed)

            # Build LLM prompt
            context_str = format_context_for_llm(hits)
            system = (
                f'You are a document assistant helping the user understand "{state.doc_name}". '
                f'Answer based on the provided passages. Be specific and cite the text when useful. '
                f'If the answer is not in the passages, say so clearly. Keep answers concise.'
            )
            user_with_context = (
                f"Question: {msg}\n\n"
                f"Relevant passages from the document:\n\n{context_str}"
            )

            # NOTE(review): history is sliced to MAX_HISTORY here AND again
            # inside format_messages — redundant but harmless.
            messages = format_messages(system, state.llm_history[-MAX_HISTORY:], user_with_context)

            # Stream response
            client = get_client(hf_token)
            partial = ""
            chat_history = chat_history + [(msg, "")]
            for token in stream_response(client, model, messages):
                partial += token
                chat_history[-1] = (msg, partial)
                yield chat_history, retrieval_display

            # Save to history — the LLM side stores the plain question (without
            # the retrieved passages) to keep future prompts small.
            state.llm_history.append((f"Question: {msg}", partial))
            state.chat_history = chat_history

        # ── Wire events ────────────────────────────────────────────────────────

        upload_btn.click(
            load_file,
            inputs=[file_input, threshold_slider],
            outputs=[doc_status, chatbot],
        )

        paste_btn.click(
            load_paste,
            inputs=[text_input, paste_name, threshold_slider],
            outputs=[doc_status, chatbot],
        )

        demo_btn.click(
            load_demo_cb,
            inputs=[threshold_slider],
            outputs=[doc_status, chatbot],
        )

        clear_btn.click(clear_chat, outputs=[chatbot, retrieval_info])

        send_btn.click(
            respond,
            inputs=[msg_input, chatbot, threshold_slider, model_dropdown,
                    hf_token_input, show_retrieval_chk],
            outputs=[chatbot, retrieval_info],
        ).then(lambda: "", outputs=[msg_input])

        msg_input.submit(
            respond,
            inputs=[msg_input, chatbot, threshold_slider, model_dropdown,
                    hf_token_input, show_retrieval_chk],
            outputs=[chatbot, retrieval_info],
        ).then(lambda: "", outputs=[msg_input])

        # Load demo on startup
        demo.load(_load_demo, outputs=[doc_status])

    return demo
533
+
534
+
535
if __name__ == "__main__":
    # Entry point: build the UI and serve it locally (no public share link).
    create_app().launch(share=False)
conjunctionreservoir/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .retriever import ConjunctionReservoir
2
+
3
+ __version__ = "0.1.0"
4
+ __author__ = "Antti Luode"
5
+ __all__ = ["ConjunctionReservoir"]
conjunctionreservoir/retriever.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ConjunctionReservoir — core retriever
3
+ """
4
+
5
+ import numpy as np
6
+ import re
7
+ import time
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+
11
+ def split_sentences(text: str, min_len: int = 15) -> List[str]:
12
+ return [s.strip() for s in re.split(r"[.!?]+", text) if len(s.strip()) >= min_len]
13
+
14
+
15
def chunk_document(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
    """Split a document into chunks of at most *chunk_size* characters.

    The text is first split at email-style section boundaries (From:/Subject:/
    Date:/---). Sections shorter than 50 chars are dropped; longer sections
    are windowed with *overlap* characters of overlap between windows.
    """
    sections = re.split(r"\n(?=From:|Subject:|Date:|---)", text)
    # Guard the stride: overlap >= chunk_size would make range() raise
    # ValueError (step <= 0). Always advance by at least one character.
    step = max(1, chunk_size - overlap)
    chunks = []
    for section in sections:
        section = section.strip()
        if len(section) < 50:
            continue
        if len(section) <= chunk_size:
            chunks.append(section)
        else:
            for i in range(0, len(section), step):
                chunk = section[i : i + chunk_size].strip()
                if len(chunk) > 50:
                    chunks.append(chunk)
    return chunks
30
+
31
+
32
def build_vocab(texts: List[str], max_vocab: int = 2000) -> Dict[str, int]:
    """Map the *max_vocab* most frequent words (2+ letters, lowercased) to integer ids."""
    counts: Dict[str, int] = {}
    for text in texts:
        for word in re.findall(r"\b[a-zA-Z]{2,}\b", text.lower()):
            counts[word] = counts.get(word, 0) + 1
    # Stable sort by descending frequency (ties keep first-seen order).
    ranked = sorted(counts.items(), key=lambda kv: -kv[1])[:max_vocab]
    return {word: idx for idx, (word, _count) in enumerate(ranked)}
43
+
44
+
45
def tfidf_weights(sentences: List[str], vocab: Dict[str, int]) -> np.ndarray:
    """Smoothed IDF weight per vocab entry, from sentence-level document frequency."""
    n = len(sentences)
    df = np.zeros(len(vocab))
    for sentence in sentences:
        # Each word counts at most once per sentence for document frequency.
        for word in set(re.findall(r"\b[a-zA-Z]{2,}\b", sentence.lower())):
            idx = vocab.get(word)
            if idx is not None:
                df[idx] += 1
    return np.log((n + 1) / (df + 1)) + 1.0
53
+
54
+
55
def encode_text(text: str, vocab: Dict[str, int], idf: np.ndarray) -> np.ndarray:
    """L2-normalized TF-IDF vector for *text* over *vocab* (epsilon guards zero vectors)."""
    vec = np.zeros(len(vocab))
    for word in re.findall(r"\b[a-zA-Z]{2,}\b", text.lower()):
        idx = vocab.get(word)
        if idx is not None:
            vec[idx] += 1.0
    vec = vec * idf
    return vec / (np.linalg.norm(vec) + 1e-8)
63
+
64
+
65
class ConjunctionReservoir:
    """
    Document retriever with sentence-level conjunction scoring.

    A sentence scores only if at least ``conjunction_threshold`` of the query
    tokens (substring-)match within that single sentence — a hard AND gate.
    A chunk's score is its best sentence's score. Repeatedly retrieved
    sentences are down-weighted via an accumulated coverage vector.
    """

    def __init__(
        self,
        conjunction_threshold: float = 0.5,
        coverage_decay: float = 0.04,
        hebbian_lr: float = 0.01,
        max_vocab: int = 2000,
    ) -> None:
        """Configure the retriever. Call build_index() before retrieve().

        Args:
            conjunction_threshold: minimum fraction of query tokens that must
                co-match inside one sentence for it to score at all.
            coverage_decay: per-query decay applied to accumulated sentence
                coverage in retrieve().
            hebbian_lr: NOTE(review) — stored but not used by any method in
                this class; presumably reserved for future learning.
            max_vocab: cap on vocabulary size for the TF-IDF features.
        """
        self.conjunction_threshold = conjunction_threshold
        self.coverage_decay = coverage_decay
        self.hebbian_lr = hebbian_lr
        self.max_vocab = max_vocab

        # All of the following are populated by build_index():
        self.vocab: Optional[Dict[str, int]] = None          # word -> feature index
        self.idf: Optional[np.ndarray] = None                # per-word IDF weights
        self.chunk_texts: List[str] = []                     # retrievable units
        self.all_sentences: List[str] = []                   # flattened sentences
        self.sentence_to_chunk: List[int] = []               # sentence idx -> owning chunk idx
        self.sent_feats: Optional[np.ndarray] = None         # per-sentence TF-IDF vectors
        self.chunk_feats: Optional[np.ndarray] = None        # per-chunk TF-IDF vectors
        self.sentence_coverage: Optional[np.ndarray] = None  # accumulated retrieval mass
        self.n_queries: int = 0                              # coverage-updating queries served
        self.index_time: float = 0.0                         # build_index wall time (seconds)

    def build_index(
        self,
        text_or_chunks: Union[str, List[str]],
        verbose: bool = True,
    ) -> "ConjunctionReservoir":
        """Index a raw document string (auto-chunked) or a pre-chunked list.

        Raises ValueError if no chunks or no sentences can be extracted.
        Returns self for chaining. (`verbose` is currently unused here.)
        """
        t0 = time.perf_counter()

        if isinstance(text_or_chunks, str):
            self.chunk_texts = chunk_document(text_or_chunks)
        else:
            self.chunk_texts = list(text_or_chunks)

        if not self.chunk_texts:
            raise ValueError("No chunks found.")

        # Flatten chunks into sentences, remembering each sentence's owner.
        self.all_sentences = []
        self.sentence_to_chunk = []
        for chunk_idx, chunk in enumerate(self.chunk_texts):
            for s in split_sentences(chunk):
                self.all_sentences.append(s)
                self.sentence_to_chunk.append(chunk_idx)

        if not self.all_sentences:
            raise ValueError("No sentences extracted.")

        # Vocab is built over sentences AND whole chunks so chunk-level
        # fallback vectors share the same feature space.
        self.vocab = build_vocab(
            self.all_sentences + self.chunk_texts, max_vocab=self.max_vocab
        )
        self.idf = tfidf_weights(self.all_sentences, self.vocab)

        self.sent_feats = np.array(
            [encode_text(s, self.vocab, self.idf) for s in self.all_sentences]
        )
        self.chunk_feats = np.array(
            [encode_text(c, self.vocab, self.idf) for c in self.chunk_texts]
        )
        self.sentence_coverage = np.zeros(len(self.all_sentences))

        self.index_time = time.perf_counter() - t0
        return self

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        update_coverage: bool = True,
    ) -> List[Tuple[str, float]]:
        """Return up to *top_k* (chunk_text, score) pairs, best first.

        Raises RuntimeError if build_index() has not been called.
        """
        if self.vocab is None:
            raise RuntimeError("Call build_index() before retrieve().")

        q_tokens = set(re.findall(r"\b[a-zA-Z]{3,}\b", query.lower()))
        q_feat = encode_text(query, self.vocab, self.idf)
        sent_scores = np.zeros(len(self.all_sentences))

        for s_idx, sentence in enumerate(self.all_sentences):
            s_tokens = set(re.findall(r"\b[a-zA-Z]{3,}\b", sentence.lower()))
            # Loose matching: a query token counts if it contains or is
            # contained by any sentence token (covers simple inflections).
            matches = sum(
                1 for qt in q_tokens
                if any(qt in st or st in qt for st in s_tokens)
            )
            token_coverage = matches / len(q_tokens) if q_tokens else 0.0

            # Hard conjunction gate: below threshold the sentence contributes
            # exactly zero — absent, not degraded.
            if token_coverage < self.conjunction_threshold:
                continue

            tfidf_sim = float(self.sent_feats[s_idx] @ q_feat)
            # Squaring sharpens the preference for fuller conjunctions.
            conj_weight = token_coverage ** 2
            # Down-weight sentences already surfaced by earlier queries.
            vollan_w = 1.0 / (1.0 + self.sentence_coverage[s_idx])
            sent_scores[s_idx] = tfidf_sim * conj_weight * vollan_w

        # A chunk scores as its single best sentence (max, not sum).
        # NOTE(review): the enumerate() index s_idx is unused here.
        chunk_scores = np.zeros(len(self.chunk_texts))
        for s_idx, (score, chunk_idx) in enumerate(zip(sent_scores, self.sentence_to_chunk)):
            if score > chunk_scores[chunk_idx]:
                chunk_scores[chunk_idx] = score

        # Fallback: no sentence passed the gate — rank by chunk-level TF-IDF.
        if chunk_scores.max() == 0.0:
            chunk_scores = self.chunk_feats @ q_feat

        top_idx = chunk_scores.argsort()[-top_k:][::-1]
        results = [(self.chunk_texts[i], float(chunk_scores[i])) for i in top_idx]

        # Accumulate (decayed) coverage so future queries favor novelty.
        if update_coverage and sent_scores.max() > 0.0:
            norm = sent_scores / (sent_scores.max() + 1e-8)
            self.sentence_coverage = (
                self.sentence_coverage * (1.0 - self.coverage_decay) + norm
            )
            self.n_queries += 1

        return results

    def summary(self) -> Dict:
        """Index statistics (counts, parameters, indexing time in ms)."""
        return {
            "n_chunks": len(self.chunk_texts),
            "n_sentences": len(self.all_sentences),
            "avg_sentences_per_chunk": round(
                len(self.all_sentences) / max(1, len(self.chunk_texts)), 2
            ),
            "vocab_size": len(self.vocab) if self.vocab else 0,
            "conjunction_threshold": self.conjunction_threshold,
            "coverage_decay": self.coverage_decay,
            "n_queries": self.n_queries,
            "index_time_ms": round(self.index_time * 1000, 1),
        }

    def coverage_profile(self) -> Dict:
        """Top-10 most-covered sentences plus mean coverage; {} before indexing."""
        if self.sentence_coverage is None:
            return {}
        top_idx = self.sentence_coverage.argsort()[-10:][::-1]
        return {
            "most_covered": [
                (self.all_sentences[i], round(float(self.sentence_coverage[i]), 4))
                for i in top_idx
                if self.sentence_coverage[i] > 0
            ],
            "mean_coverage": round(float(self.sentence_coverage.mean()), 6),
            "n_queries": self.n_queries,
        }
readme.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ConjunctionReservoir Document Chat
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: Chat with docs via sentence-level retrieval
12
+ tags:
13
+ - rag
14
+ - retrieval
15
+ - nlp
16
+ - neuroscience
17
+ - document-qa
18
+ ---
19
+
20
+ # ConjunctionReservoir Document Chat
21
+
22
+ Upload any `.txt` or `.pdf` document and chat with it.
23
+
24
+ **What makes this different from standard RAG:**
25
+
26
+ Instead of asking *"do query terms appear somewhere in this chunk?"*, ConjunctionReservoir asks *"do query terms appear in the **same sentence**?"*
27
+
28
+ This is grounded in auditory neuroscience:
29
+ - **Norman-Haignere et al. (2025):** auditory cortex integration windows are time-yoked (~80ms fixed clocks)
30
+ - **NMDA receptor logic:** hard AND gate — both inputs must arrive simultaneously
31
+ - **Vollan et al. (2025):** coverage-maximizing theta sweep for exploration
32
+
33
+ **Benchmark:** 100% Rank-1 rate on conjunction queries vs 60% for BM25 and SweepBrain.
34
+
35
+ ## Usage
36
+
37
+ 1. Upload a `.txt` or `.pdf`, or paste text directly
38
+ 2. Ask questions — works best for queries requiring two concepts together
39
+ 3. Adjust the **conjunction threshold** slider to tune precision vs recall
40
+ 4. Use `:coverage`, `:summary`, `:threshold N` commands in chat
41
+
42
+ No dependencies beyond NumPy for retrieval. Generation via the HuggingFace Inference API (free).
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ numpy>=1.21
3
+ huggingface_hub>=0.20.0
4
+ PyMuPDF>=1.23.0