thomascerniglia committed
Commit 4abd84c · 0 Parent(s)

Initial commit: add app and source
.env.example ADDED
@@ -0,0 +1,3 @@
+ # Add tokens here to enable the summarization bot
+ HF_TOKEN=
+ LOCAL_GENERATION_MODEL=
.gitignore ADDED
@@ -0,0 +1,40 @@
+ # Python cache and compiled files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+
+ # IDE and editor files
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Environment variables
+ .env
+ .env.local
+
+ # Storage and generated index files (rebuilt at runtime)
+ storage/
+ *.faiss
+ *.pkl
+
+ # Raw data folder (if large - upload docs separately or via git-lfs)
+ data/raw/
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter notebook checkpoints
+ .ipynb_checkpoints/
+
+ # Logs
+ *.log
README.MD ADDED
@@ -0,0 +1,13 @@
+ # KGB Document Chatbot
+
+ A Hugging Face Spaces-ready Gradio app for querying a corpus of declassified KGB documents. Currently retrieval-only.
+
+ Future usage: place documents in `data/raw` and run `python -m src.ingestion.build_index`.
+
+ ## Usage
+
+ ```bash
+ pip install -r requirements.txt
+ cp .env.example .env
+ python app.py
+ ```
app.py ADDED
@@ -0,0 +1,23 @@
+ import os
+ from src.ui.components import build_app
+ from src.retrieval.vectorstore import VectorStore
+ from src.retrieval.embedder import get_embedder
+ from src.config import Settings
+ from src.utils import ensure_dirs, bootstrap_demo_index
+
+ settings = Settings()
+ ensure_dirs()
+
+ # Make sure an index exists (a demo index is auto-created if none is found)
+ if not os.path.exists(settings.index_path) or not os.path.exists(settings.docs_path):
+     bootstrap_demo_index()
+
+ EMBEDDER = get_embedder(settings)
+ VSTORE = VectorStore(settings).load()
+
+ demo = build_app(settings=settings, embedder=EMBEDDER, vstore=VSTORE)
+
+ if __name__ == "__main__":
+     # For Hugging Face Spaces, the defaults are fine (no server_name/port needed);
+     # for local dev you can pass server_name="127.0.0.1", server_port=7860.
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio>=4.44.0
+ faiss-cpu>=1.8.0
+ sentence-transformers>=3.0.1
+ numpy>=1.26.4
+ pandas>=2.2.2
+ pydantic>=2.9.2
+ tqdm>=4.66.5
+ python-dotenv>=1.0.1
+ pdfminer.six>=20240706
+ pytesseract>=0.3.13
+ Pillow>=10.4.0
src/config.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ from dotenv import load_dotenv
+ from pydantic import BaseModel
+
+ # Load variables from a local .env file, if present (python-dotenv is in requirements)
+ load_dotenv()
+
+ class Settings(BaseModel):
+     raw_dir: str = os.path.join("data", "raw")
+     storage_dir: str = "storage"
+     index_path: str = os.path.join("storage", "index.faiss")
+     docs_path: str = os.path.join("storage", "docs.pkl")
+     meta_path: str = os.path.join("storage", "meta.json")
+
+     embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
+
+     local_generation_model: str = os.getenv("LOCAL_GENERATION_MODEL", "").strip()
+     hf_token: str = os.getenv("HF_TOKEN", "").strip()
+
+     top_k: int = 5
+     max_context_chars: int = 9000
+
+     title: str = "KGB Document Chatbot"
+     description: str = "Retrieval-only for now. Add a model later by setting HF_TOKEN."
+
+     @property
+     def mode(self) -> str:
+         return "rag" if (self.hf_token or self.local_generation_model) else "retrieval"
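Aside: the `mode` property above is what flips the app between excerpt-only and RAG output. A minimal sketch of the switch, assuming the repo root is the working directory (the model id is illustrative):

```python
import os

# Settings reads env vars at class-definition time, so set them before importing
os.environ["LOCAL_GENERATION_MODEL"] = "google/flan-t5-small"  # illustrative model id

from src.config import Settings

s = Settings()
print(s.mode)  # -> "rag" (a token or a local model name enables generation)
```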
src/ingestion/__init__.py ADDED
File without changes
src/ingestion/build_index.py ADDED
@@ -0,0 +1,38 @@
+ from tqdm import tqdm
+ from src.config import Settings
+ from src.ingestion.loaders import load_raw_corpus
+ from src.ingestion.preprocess import clean_text, simple_chunk, attach_metadata
+ from src.retrieval.embedder import get_embedder
+ from src.retrieval.vectorstore import VectorStore
+ from src.utils import ensure_dirs, write_json
+
+ def main():
+     settings = Settings()
+     ensure_dirs()
+
+     print("[ingest] Loading raw corpus...")
+     pairs = load_raw_corpus(settings.raw_dir)
+
+     print(f"[ingest] {len(pairs)} files loaded. Cleaning/chunking...")
+     all_chunks_text = []
+     for src, txt in tqdm(pairs):
+         cleaned = clean_text(txt)
+         chunks = simple_chunk(cleaned, max_chars=1200, overlap=150)
+         for c in attach_metadata(chunks, source_id=src):
+             payload = c["text"] + f"\n\n[Source: {c['source']}, chunk {c['meta']['chunk_id']}]"
+             all_chunks_text.append(payload)
+
+     print(f"[ingest] {len(all_chunks_text)} chunks. Embedding & indexing...")
+     embedder = get_embedder(settings)
+     vs = VectorStore(settings).build(all_chunks_text, embedder)
+
+     print("[ingest] Saving artifacts...")
+     # VectorStore.save() writes both the FAISS index and the pickled docs
+     vs.save()
+
+     write_json({"demo": False, "count": len(all_chunks_text)}, settings.meta_path)
+
+     print("[ingest] Done.")
+
+ if __name__ == "__main__":
+     main()
src/ingestion/loaders.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ from typing import List, Tuple
+ from pdfminer.high_level import extract_text
+ from PIL import Image
+ import pytesseract
+
+ def load_txt(path: str) -> str:
+     with open(path, "r", encoding="utf-8", errors="ignore") as f:
+         return f.read()
+
+ def load_pdf(path: str) -> str:
+     return extract_text(path) or ""
+
+ def load_image_ocr(path: str, lang: str = "eng") -> str:
+     img = Image.open(path)
+     return pytesseract.image_to_string(img, lang=lang)
+
+ def load_raw_corpus(raw_dir: str) -> List[Tuple[str, str]]:
+     docs = []
+     for root, _, files in os.walk(raw_dir):
+         for fn in files:
+             p = os.path.join(root, fn)
+             lower = fn.lower()
+             try:
+                 if lower.endswith(".txt"):
+                     text = load_txt(p)
+                 elif lower.endswith(".pdf"):
+                     text = load_pdf(p)
+                 elif lower.endswith((".png", ".jpg", ".jpeg", ".tif", ".tiff")):
+                     text = load_image_ocr(p, lang="eng")
+                 else:
+                     continue
+                 if text.strip():
+                     docs.append((os.path.relpath(p, raw_dir), text))
+             except Exception as e:
+                 print(f"[loader] Skipped {p}: {e}")
+     return docs
src/ingestion/preprocess.py ADDED
@@ -0,0 +1,42 @@
+ import re
+ from typing import List, Dict
+
+ def clean_text(s: str) -> str:
+     s = s.replace("\x00", " ")
+     s = re.sub(r"[ \t]+", " ", s)
+     s = re.sub(r"\n{3,}", "\n\n", s)
+     return s.strip()
+
+ def simple_chunk(text: str, max_chars: int = 1200, overlap: int = 150) -> List[str]:
+     paras = [p.strip() for p in text.split("\n\n") if p.strip()]
+     chunks: List[str] = []
+     cur = ""
+     for p in paras:
+         if len(cur) + len(p) + 2 <= max_chars:
+             cur = (cur + "\n\n" + p).strip() if cur else p
+         else:
+             if cur:
+                 chunks.append(cur)
+             if len(p) <= max_chars:
+                 cur = p
+             else:
+                 # Slide a max_chars window so consecutive chunks share `overlap` chars
+                 for i in range(0, len(p), max_chars - overlap):
+                     segment = p[i:i + max_chars]
+                     if segment:
+                         chunks.append(segment)
+                 cur = ""
+     if cur:
+         chunks.append(cur)
+     return chunks
+
+ def attach_metadata(chunks: List[str], source_id: str) -> List[Dict]:
+     out = []
+     for i, ch in enumerate(chunks):
+         out.append({
+             "text": ch,
+             "source": source_id,
+             "page": None,
+             "meta": {"chunk_id": i}
+         })
+     return out
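A quick sanity check of `simple_chunk` on a paragraph longer than `max_chars`, showing the window sizes and the shared overlap (numbers follow the defaults above):

```python
from src.ingestion.preprocess import clean_text, simple_chunk

text = clean_text("word " * 500)  # a single ~2.5k-char paragraph
chunks = simple_chunk(text, max_chars=1200, overlap=150)

print([len(c) for c in chunks])             # [1200, 1200, 399]
print(chunks[0][1050:] == chunks[1][:150])  # True: consecutive windows share 150 chars
```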
src/llm/__init__.py ADDED
File without changes
src/llm/answer.py ADDED
@@ -0,0 +1,39 @@
+ from typing import List, Tuple
+ from src.config import Settings
+
+ def compose_answer_retrieval_only(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
+     lines = []
+     lines.append("**Top relevant excerpts** (no model used):\n")
+     for i, (txt, score) in enumerate(hits, start=1):
+         source = "unknown"
+         body = txt
+         if "[Source:" in txt:
+             parts = txt.rsplit("[Source:", 1)
+             body = parts[0].strip()
+             source = "[Source:" + parts[1]
+         lines.append(f"**{i}.** {body}\n\n*{source}* \n*similarity: {score:.3f}*")
+     return "\n\n---\n\n".join(lines)
+
+ def compose_answer(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
+     if settings.mode == "retrieval" or not settings.local_generation_model:
+         # Generation needs a local model name; HF_TOKEN alone is not enough
+         return compose_answer_retrieval_only(query, hits, settings)
+
+     # If a summarizer is later enabled, this branch performs RAG generation
+     from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+     chunks = [t for t, _ in hits]
+     context = "\n\n---\n\n".join(chunks)
+     if len(context) > settings.max_context_chars:
+         context = context[:settings.max_context_chars] + "\n\n[Context truncated]"
+
+     system = (
+         "You are a cautious historian assistant. Answer ONLY from the context. "
+         "Cite sources as [Source: ...]. If unknown, say so."
+     )
+     prompt = f"{system}\n\nQUESTION:\n{query}\n\nCONTEXT:\n{context}\n\nANSWER:"
+
+     tok = AutoTokenizer.from_pretrained(settings.local_generation_model)
+     mdl = AutoModelForSeq2SeqLM.from_pretrained(settings.local_generation_model)
+     gen = pipeline("text2text-generation", model=mdl, tokenizer=tok)
+     out = gen(prompt, max_new_tokens=256, do_sample=False)[0]["generated_text"]
+     return out.strip()
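In retrieval mode the composer just formats excerpts. A sketch with a hand-made hit tuple, assuming no HF_TOKEN or LOCAL_GENERATION_MODEL is configured (text and score are illustrative):

```python
from src.config import Settings
from src.llm.answer import compose_answer

hits = [("Directive: Reinforce border surveillance. [Source: KGB/1963/SECTOR4]", 0.873)]
print(compose_answer("What happened in Sector 4?", hits, Settings()))
# Prints the excerpt list with its [Source: ...] tag and "similarity: 0.873"
```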
src/retrieval/__init__.py ADDED
@@ -0,0 +1 @@
+
src/retrieval/embedder.py ADDED
@@ -0,0 +1,14 @@
+ from sentence_transformers import SentenceTransformer
+ from functools import lru_cache
+ from src.config import Settings
+
+ @lru_cache(maxsize=1)
+ def _cached_model(name: str):
+     # Load model without authentication token (for public models)
+     return SentenceTransformer(name, token=False)
+
+ def get_embedder(settings: Settings):
+     model = _cached_model(settings.embedding_model)
+     def _encode(texts):
+         return model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
+     return _encode
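Because `normalize_embeddings=True`, every vector has unit length, so inner product equals cosine similarity; that is what lets the vector store below use `faiss.IndexFlatIP` directly. A quick check (downloads the MiniLM model on first run):

```python
import numpy as np
from src.config import Settings
from src.retrieval.embedder import get_embedder

embed = get_embedder(Settings())
X = embed(["border surveillance", "supply shortages"])

print(float(np.linalg.norm(X[0])))  # ~1.0: unit-normalized
print(float(X[0] @ X[1]))           # inner product == cosine similarity here
```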
src/retrieval/search.py ADDED
@@ -0,0 +1,13 @@
+ from typing import List, Dict, Any, Tuple
+
+ def retrieve(query: str, vstore, embedder, k: int) -> List[Tuple[str, float]]:
+     return vstore.search(query, embedder, k=k)
+
+ def format_citations(hits: List[Tuple[str, float]], max_items: int = 5) -> List[Dict[str, Any]]:
+     out = []
+     for txt, score in hits[:max_items]:
+         src = "unknown"
+         if "[Source:" in txt:
+             src = txt.split("[Source:")[-1].strip("] ").strip()
+         out.append({"source": src, "score": round(score, 3)})
+     return out
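For reference, `format_citations` peels the trailing source tag off each hit; with an illustrative hit tuple:

```python
from src.retrieval.search import format_citations

hits = [("Directive text... [Source: KGB/1963/SECTOR4]", 0.9134)]
print(format_citations(hits))  # [{'source': 'KGB/1963/SECTOR4', 'score': 0.913}]
```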
src/retrieval/vectorstore.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ import faiss
+ import pickle
+ from typing import List, Tuple
+ from src.config import Settings
+
+ class VectorStore:
+     def __init__(self, settings: Settings):
+         self.settings = settings
+         self.index = None
+         self.docs: List[str] = []
+
+     def build(self, texts: List[str], embedder) -> "VectorStore":
+         self.docs = texts
+         X = embedder(texts).astype("float32")
+         self.index = faiss.IndexFlatIP(X.shape[1])
+         self.index.add(X)
+         return self
+
+     def save(self):
+         faiss.write_index(self.index, self.settings.index_path)
+         with open(self.settings.docs_path, "wb") as f:
+             pickle.dump(self.docs, f)
+
+     def load(self) -> "VectorStore":
+         if os.path.exists(self.settings.index_path) and os.path.exists(self.settings.docs_path):
+             self.index = faiss.read_index(self.settings.index_path)
+             with open(self.settings.docs_path, "rb") as f:
+                 self.docs = pickle.load(f)
+         else:
+             raise FileNotFoundError("Index or docs not found. Run ingestion first.")
+         return self
+
+     def search(self, query: str, embedder, k: int = 5) -> List[Tuple[str, float]]:
+         q = embedder([query]).astype("float32")
+         sims, ids = self.index.search(q, k)
+         hits = []
+         for idx, score in zip(ids[0], sims[0]):
+             if idx == -1:
+                 continue
+             hits.append((self.docs[idx], float(score)))
+         return hits
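A minimal round trip of the store, kept entirely in memory until `save()` is called (the two documents are illustrative):

```python
from src.config import Settings
from src.retrieval.embedder import get_embedder
from src.retrieval.vectorstore import VectorStore

settings = Settings()
embedder = get_embedder(settings)
docs = ["Border directive. [Source: A]", "Logistics memo. [Source: B]"]  # illustrative

vs = VectorStore(settings).build(docs, embedder)
for text, score in vs.search("directive about the border", embedder, k=2):
    print(round(score, 3), text)  # most similar document first
```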
src/schema.py ADDED
@@ -0,0 +1,8 @@
+ from typing import Optional, Dict, Any
+ from pydantic import BaseModel
+
+ class DocChunk(BaseModel):
+     text: str
+     source: str
+     page: Optional[int] = None
+     meta: Optional[Dict[str, Any]] = None
src/ui/__init__.py ADDED
File without changes
src/ui/components.py ADDED
@@ -0,0 +1,39 @@
+ import gradio as gr
+ from typing import List, Tuple
+ from src.config import Settings
+ from src.retrieval.search import retrieve, format_citations
+ from src.llm.answer import compose_answer
+
+ def _ask(query: str, settings, embedder, vstore) -> Tuple[str, List[dict]]:
+     if not query or not query.strip():
+         return "Please enter a question about the documents.", []
+     hits: List[Tuple[str, float]] = retrieve(query, vstore=vstore, embedder=embedder, k=settings.top_k)
+     if not hits:
+         return "No relevant passages found. Try adjusting your query.", []
+     answer = compose_answer(query, hits, settings)
+     citations = format_citations(hits, max_items=settings.top_k)
+     return answer, citations
+
+ def build_app(settings: Settings, embedder, vstore):
+     with gr.Blocks(title=settings.title) as demo:
+         gr.Markdown(f"# {settings.title}\n{settings.description}")
+         gr.Markdown(
+             f"**Mode:** `{settings.mode}` "
+             + ("— no LLM used; showing excerpts only. An LLM will be added later for summarization." if settings.mode == "retrieval"
+                else "— retrieval + summarizer enabled.")
+         )
+
+         query = gr.Textbox(label="Your question", placeholder="e.g., Orders about Sector 4 in 1963?")
+         ask_btn = gr.Button("Search", variant="primary")
+         answer = gr.Markdown("Ask a question to see excerpts.")
+         with gr.Accordion("Citations (top matches)", open=False):
+             citations = gr.JSON(label="Source & similarity")
+
+         def on_ask(q):
+             a, c = _ask(q, settings, embedder, vstore)
+             return a, c
+
+         ask_btn.click(on_ask, inputs=[query], outputs=[answer, citations])
+         query.submit(on_ask, inputs=[query], outputs=[answer, citations])
+
+     return demo
src/utils.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ import pickle
+ import json
+ from src.config import Settings
+ from src.retrieval.vectorstore import VectorStore
+ from src.retrieval.embedder import get_embedder
+
+ def ensure_dirs():
+     for p in ["data/raw", "storage"]:
+         os.makedirs(p, exist_ok=True)
+
+ def save_pickle(obj, path: str):
+     with open(path, "wb") as f:
+         pickle.dump(obj, f)
+
+ def load_pickle(path: str):
+     with open(path, "rb") as f:
+         return pickle.load(f)
+
+ def write_json(obj, path: str):
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(obj, f, ensure_ascii=False, indent=2)
+
+ def bootstrap_demo_index():
+     """Create a minimal index so the app works before ingestion."""
+     settings = Settings()
+     demo_docs = [
+         "Directive: Reinforce border surveillance along Sector 4. [Source: KGB/1963/SECTOR4]",
+         "Report: Intercepted correspondence near Murmansk. [Source: KGB/1972/MUR-OPS]",
+         "Memo: Field notes suggest supply shortages in winter 1979. [Source: KGB/1979/LOG-WS]"
+     ]
+     embedder = get_embedder(settings)
+     vs = VectorStore(settings).build(demo_docs, embedder)
+     vs.save()  # writes both the FAISS index and the pickled docs
+     write_json({"demo": True, "count": len(demo_docs)}, settings.meta_path)