thomascerniglia committed
Commit 4abd84c · 0 Parent(s)

Initial commit: add app and source
.env.example ADDED
@@ -0,0 +1,3 @@
+ # Add tokens here to enable the summarization bot
+ HF_TOKEN=
+ LOCAL_GENERATION_MODEL=
.gitignore ADDED
@@ -0,0 +1,40 @@
+ # Python cache and compiled files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+
+ # IDE and editor files
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Environment variables
+ .env
+ .env.local
+
+ # Storage and generated index files (rebuilt at runtime)
+ storage/
+ *.faiss
+ *.pkl
+
+ # Raw data folder (if large - upload docs separately or via git-lfs)
+ data/raw/
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter notebook checkpoints
+ .ipynb_checkpoints/
+
+ # Logs
+ *.log
README.MD ADDED
@@ -0,0 +1,13 @@
+ # KGB Document Chatbot
+
+ A Hugging Face Spaces-ready Gradio app for querying a corpus of declassified KGB documents. Currently retrieval-only.
+
+ Future usage: place documents in `data/raw` and run `python -m src.ingestion.build_index`.
+
+ ## Usage
+
+ ```bash
+ pip install -r requirements.txt
+ cp .env.example .env
+ python app.py
+ ```
app.py ADDED
@@ -0,0 +1,23 @@
+ import os
+ from src.ui.components import build_app
+ from src.retrieval.vectorstore import VectorStore
+ from src.retrieval.embedder import get_embedder
+ from src.config import Settings
+ from src.utils import ensure_dirs, bootstrap_demo_index
+
+ settings = Settings()
+ ensure_dirs()
+
+ # Make sure an index exists (a demo index is auto-created if none is found)
+ if not os.path.exists(settings.index_path) or not os.path.exists(settings.docs_path):
+     bootstrap_demo_index()
+
+ EMBEDDER = get_embedder(settings)
+ VSTORE = VectorStore(settings).load()
+
+ demo = build_app(settings=settings, embedder=EMBEDDER, vstore=VSTORE)
+
+ if __name__ == "__main__":
+     # For Hugging Face Spaces, the defaults are fine (no server_name/port needed);
+     # for local dev you can pass server_name="127.0.0.1", server_port=7860.
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio>=4.44.0
+ faiss-cpu>=1.8.0
+ sentence-transformers>=3.0.1
+ numpy>=1.26.4
+ pandas>=2.2.2
+ pydantic>=2.9.2
+ tqdm>=4.66.5
+ python-dotenv>=1.0.1
+ pdfminer.six>=20240706
+ pytesseract>=0.3.13
+ Pillow>=10.4.0
src/config.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ from dotenv import load_dotenv
+ from pydantic import BaseModel
+
+ # Load variables from a local .env file, if present (python-dotenv is in requirements)
+ load_dotenv()
+
+ class Settings(BaseModel):
+     raw_dir: str = os.path.join("data", "raw")
+     storage_dir: str = "storage"
+     index_path: str = os.path.join("storage", "index.faiss")
+     docs_path: str = os.path.join("storage", "docs.pkl")
+     meta_path: str = os.path.join("storage", "meta.json")
+
+     embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
+
+     local_generation_model: str = os.getenv("LOCAL_GENERATION_MODEL", "").strip()
+     hf_token: str = os.getenv("HF_TOKEN", "").strip()
+
+     top_k: int = 5
+     max_context_chars: int = 9000
+
+     title: str = "KGB Document Chatbot"
+     description: str = "Retrieval-only for now. Add a model later by setting HF_TOKEN."
+
+     @property
+     def mode(self) -> str:
+         return "rag" if (self.hf_token or self.local_generation_model) else "retrieval"
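Aside: the `mode` property above is what flips the app between excerpt-only and RAG output. A minimal sketch of the switch, assuming the repo root is the working directory (the model id is illustrative):

```python
import os

# Settings reads env vars at class-definition time, so set them before importing
os.environ["LOCAL_GENERATION_MODEL"] = "google/flan-t5-small"  # illustrative model id

from src.config import Settings

s = Settings()
print(s.mode)  # -> "rag" (a token or a local model name enables generation)
```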
src/ingestion/__init__.py ADDED
File without changes
src/ingestion/build_index.py ADDED
@@ -0,0 +1,38 @@
+ from tqdm import tqdm
+ from src.config import Settings
+ from src.ingestion.loaders import load_raw_corpus
+ from src.ingestion.preprocess import clean_text, simple_chunk, attach_metadata
+ from src.retrieval.embedder import get_embedder
+ from src.retrieval.vectorstore import VectorStore
+ from src.utils import ensure_dirs, write_json
+
+ def main():
+     settings = Settings()
+     ensure_dirs()
+
+     print("[ingest] Loading raw corpus...")
+     pairs = load_raw_corpus(settings.raw_dir)
+
+     print(f"[ingest] {len(pairs)} files loaded. Cleaning/chunking...")
+     all_chunks_text = []
+     for src, txt in tqdm(pairs):
+         cleaned = clean_text(txt)
+         chunks = simple_chunk(cleaned, max_chars=1200, overlap=150)
+         for c in attach_metadata(chunks, source_id=src):
+             payload = c["text"] + f"\n\n[Source: {c['source']}, chunk {c['meta']['chunk_id']}]"
+             all_chunks_text.append(payload)
+
+     print(f"[ingest] {len(all_chunks_text)} chunks. Embedding & indexing...")
+     embedder = get_embedder(settings)
+     vs = VectorStore(settings).build(all_chunks_text, embedder)
+
+     print("[ingest] Saving artifacts...")
+     # VectorStore.save() writes both the FAISS index and the pickled docs
+     vs.save()
+
+     write_json({"demo": False, "count": len(all_chunks_text)}, settings.meta_path)
+
+     print("[ingest] Done.")
+
+ if __name__ == "__main__":
+     main()
src/ingestion/loaders.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ from typing import List, Tuple
+ from pdfminer.high_level import extract_text
+ from PIL import Image
+ import pytesseract
+
+ def load_txt(path: str) -> str:
+     with open(path, "r", encoding="utf-8", errors="ignore") as f:
+         return f.read()
+
+ def load_pdf(path: str) -> str:
+     return extract_text(path) or ""
+
+ def load_image_ocr(path: str, lang: str = "eng") -> str:
+     img = Image.open(path)
+     return pytesseract.image_to_string(img, lang=lang)
+
+ def load_raw_corpus(raw_dir: str) -> List[Tuple[str, str]]:
+     docs = []
+     for root, _, files in os.walk(raw_dir):
+         for fn in files:
+             p = os.path.join(root, fn)
+             lower = fn.lower()
+             try:
+                 if lower.endswith(".txt"):
+                     text = load_txt(p)
+                 elif lower.endswith(".pdf"):
+                     text = load_pdf(p)
+                 elif lower.endswith((".png", ".jpg", ".jpeg", ".tif", ".tiff")):
+                     text = load_image_ocr(p, lang="eng")
+                 else:
+                     continue
+                 if text.strip():
+                     docs.append((os.path.relpath(p, raw_dir), text))
+             except Exception as e:
+                 print(f"[loader] Skipped {p}: {e}")
+     return docs
src/ingestion/preprocess.py ADDED
@@ -0,0 +1,42 @@
+ import re
+ from typing import List, Dict
+
+ def clean_text(s: str) -> str:
+     s = s.replace("\x00", " ")
+     s = re.sub(r"[ \t]+", " ", s)
+     s = re.sub(r"\n{3,}", "\n\n", s)
+     return s.strip()
+
+ def simple_chunk(text: str, max_chars: int = 1200, overlap: int = 150) -> List[str]:
+     paras = [p.strip() for p in text.split("\n\n") if p.strip()]
+     chunks: List[str] = []
+     cur = ""
+     for p in paras:
+         if len(cur) + len(p) + 2 <= max_chars:
+             cur = (cur + "\n\n" + p).strip() if cur else p
+         else:
+             if cur:
+                 chunks.append(cur)
+             if len(p) <= max_chars:
+                 cur = p
+             else:
+                 # Slide a max_chars window so consecutive chunks share `overlap` chars
+                 for i in range(0, len(p), max_chars - overlap):
+                     segment = p[i:i + max_chars]
+                     if segment:
+                         chunks.append(segment)
+                 cur = ""
+     if cur:
+         chunks.append(cur)
+     return chunks
+
+ def attach_metadata(chunks: List[str], source_id: str) -> List[Dict]:
+     out = []
+     for i, ch in enumerate(chunks):
+         out.append({
+             "text": ch,
+             "source": source_id,
+             "page": None,
+             "meta": {"chunk_id": i}
+         })
+     return out
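A quick sanity check of `simple_chunk` on a paragraph longer than `max_chars`, showing the window sizes and the shared overlap (numbers follow the defaults above):

```python
from src.ingestion.preprocess import clean_text, simple_chunk

text = clean_text("word " * 500)  # a single ~2.5k-char paragraph
chunks = simple_chunk(text, max_chars=1200, overlap=150)

print([len(c) for c in chunks])             # [1200, 1200, 399]
print(chunks[0][1050:] == chunks[1][:150])  # True: consecutive windows share 150 chars
```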
src/llm/__init__.py ADDED
File without changes
src/llm/answer.py ADDED
@@ -0,0 +1,39 @@
+ from typing import List, Tuple
+ from src.config import Settings
+
+ def compose_answer_retrieval_only(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
+     lines = []
+     lines.append("**Top relevant excerpts** (no model used):\n")
+     for i, (txt, score) in enumerate(hits, start=1):
+         source = "unknown"
+         body = txt
+         if "[Source:" in txt:
+             parts = txt.rsplit("[Source:", 1)
+             body = parts[0].strip()
+             source = "[Source:" + parts[1]
+         lines.append(f"**{i}.** {body}\n\n*{source}* \n*similarity: {score:.3f}*")
+     return "\n\n---\n\n".join(lines)
+
+ def compose_answer(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
+     if settings.mode == "retrieval" or not settings.local_generation_model:
+         # Generation needs a local model name; HF_TOKEN alone is not enough
+         return compose_answer_retrieval_only(query, hits, settings)
+
+     # If a summarizer is later enabled, this branch performs RAG generation
+     from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+     chunks = [t for t, _ in hits]
+     context = "\n\n---\n\n".join(chunks)
+     if len(context) > settings.max_context_chars:
+         context = context[:settings.max_context_chars] + "\n\n[Context truncated]"
+
+     system = (
+         "You are a cautious historian assistant. Answer ONLY from the context. "
+         "Cite sources as [Source: ...]. If unknown, say so."
+     )
+     prompt = f"{system}\n\nQUESTION:\n{query}\n\nCONTEXT:\n{context}\n\nANSWER:"
+
+     tok = AutoTokenizer.from_pretrained(settings.local_generation_model)
+     mdl = AutoModelForSeq2SeqLM.from_pretrained(settings.local_generation_model)
+     gen = pipeline("text2text-generation", model=mdl, tokenizer=tok)
+     out = gen(prompt, max_new_tokens=256, do_sample=False)[0]["generated_text"]
+     return out.strip()
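In retrieval mode the composer just formats excerpts. A sketch with a hand-made hit tuple, assuming no HF_TOKEN or LOCAL_GENERATION_MODEL is configured (text and score are illustrative):

```python
from src.config import Settings
from src.llm.answer import compose_answer

hits = [("Directive: Reinforce border surveillance. [Source: KGB/1963/SECTOR4]", 0.873)]
print(compose_answer("What happened in Sector 4?", hits, Settings()))
# Prints the excerpt list with its [Source: ...] tag and "similarity: 0.873"
```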
src/retrieval/__init__.py ADDED
@@ -0,0 +1 @@
+
src/retrieval/embedder.py ADDED
@@ -0,0 +1,14 @@
+ from sentence_transformers import SentenceTransformer
+ from functools import lru_cache
+ from src.config import Settings
+
+ @lru_cache(maxsize=1)
+ def _cached_model(name: str):
+     # Load model without authentication token (for public models)
+     return SentenceTransformer(name, token=False)
+
+ def get_embedder(settings: Settings):
+     model = _cached_model(settings.embedding_model)
+     def _encode(texts):
+         return model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
+     return _encode
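Because `normalize_embeddings=True`, every vector has unit length, so inner product equals cosine similarity; that is what lets the vector store below use `faiss.IndexFlatIP` directly. A quick check (downloads the MiniLM model on first run):

```python
import numpy as np
from src.config import Settings
from src.retrieval.embedder import get_embedder

embed = get_embedder(Settings())
X = embed(["border surveillance", "supply shortages"])

print(float(np.linalg.norm(X[0])))  # ~1.0: unit-normalized
print(float(X[0] @ X[1]))           # inner product == cosine similarity here
```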
src/retrieval/search.py ADDED
@@ -0,0 +1,13 @@
+ from typing import List, Dict, Any, Tuple
+
+ def retrieve(query: str, vstore, embedder, k: int) -> List[Tuple[str, float]]:
+     return vstore.search(query, embedder, k=k)
+
+ def format_citations(hits: List[Tuple[str, float]], max_items: int = 5) -> List[Dict[str, Any]]:
+     out = []
+     for txt, score in hits[:max_items]:
+         src = "unknown"
+         if "[Source:" in txt:
+             src = txt.split("[Source:")[-1].strip("] ").strip()
+         out.append({"source": src, "score": round(score, 3)})
+     return out
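For reference, `format_citations` peels the trailing source tag off each hit; with an illustrative hit tuple:

```python
from src.retrieval.search import format_citations

hits = [("Directive text... [Source: KGB/1963/SECTOR4]", 0.9134)]
print(format_citations(hits))  # [{'source': 'KGB/1963/SECTOR4', 'score': 0.913}]
```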
src/retrieval/vectorstore.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ import faiss
+ import pickle
+ from typing import List, Tuple
+ from src.config import Settings
+
+ class VectorStore:
+     def __init__(self, settings: Settings):
+         self.settings = settings
+         self.index = None
+         self.docs: List[str] = []
+
+     def build(self, texts: List[str], embedder) -> "VectorStore":
+         self.docs = texts
+         X = embedder(texts).astype("float32")
+         self.index = faiss.IndexFlatIP(X.shape[1])
+         self.index.add(X)
+         return self
+
+     def save(self):
+         faiss.write_index(self.index, self.settings.index_path)
+         with open(self.settings.docs_path, "wb") as f:
+             pickle.dump(self.docs, f)
+
+     def load(self) -> "VectorStore":
+         if os.path.exists(self.settings.index_path) and os.path.exists(self.settings.docs_path):
+             self.index = faiss.read_index(self.settings.index_path)
+             with open(self.settings.docs_path, "rb") as f:
+                 self.docs = pickle.load(f)
+         else:
+             raise FileNotFoundError("Index or docs not found. Run ingestion first.")
+         return self
+
+     def search(self, query: str, embedder, k: int = 5) -> List[Tuple[str, float]]:
+         q = embedder([query]).astype("float32")
+         sims, ids = self.index.search(q, k)
+         hits = []
+         for idx, score in zip(ids[0], sims[0]):
+             if idx == -1:
+                 continue
+             hits.append((self.docs[idx], float(score)))
+         return hits
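A minimal round trip of the store, kept entirely in memory until `save()` is called (the two documents are illustrative):

```python
from src.config import Settings
from src.retrieval.embedder import get_embedder
from src.retrieval.vectorstore import VectorStore

settings = Settings()
embedder = get_embedder(settings)
docs = ["Border directive. [Source: A]", "Logistics memo. [Source: B]"]  # illustrative

vs = VectorStore(settings).build(docs, embedder)
for text, score in vs.search("directive about the border", embedder, k=2):
    print(round(score, 3), text)  # most similar document first
```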
src/schema.py ADDED
@@ -0,0 +1,8 @@
+ from typing import Optional, Dict, Any
+ from pydantic import BaseModel
+
+ class DocChunk(BaseModel):
+     text: str
+     source: str
+     page: Optional[int] = None
+     meta: Optional[Dict[str, Any]] = None
src/ui/__init__.py ADDED
File without changes
src/ui/components.py ADDED
@@ -0,0 +1,39 @@
+ import gradio as gr
+ from typing import List, Tuple
+ from src.config import Settings
+ from src.retrieval.search import retrieve, format_citations
+ from src.llm.answer import compose_answer
+
+ def _ask(query: str, settings, embedder, vstore) -> Tuple[str, List[dict]]:
+     if not query or not query.strip():
+         return "Please enter a question about the documents.", []
+     hits: List[Tuple[str, float]] = retrieve(query, vstore=vstore, embedder=embedder, k=settings.top_k)
+     if not hits:
+         return "No relevant passages found. Try adjusting your query.", []
+     answer = compose_answer(query, hits, settings)
+     citations = format_citations(hits, max_items=settings.top_k)
+     return answer, citations
+
+ def build_app(settings: Settings, embedder, vstore):
+     with gr.Blocks(title=settings.title) as demo:
+         gr.Markdown(f"# {settings.title}\n{settings.description}")
+         gr.Markdown(
+             f"**Mode:** `{settings.mode}` "
+             + ("— no LLM used; showing excerpts only. An LLM will be added later for summarization." if settings.mode == "retrieval"
+                else "— retrieval + summarizer enabled.")
+         )
+
+         query = gr.Textbox(label="Your question", placeholder="e.g., Orders about Sector 4 in 1963?")
+         ask_btn = gr.Button("Search", variant="primary")
+         answer = gr.Markdown("Ask a question to see excerpts.")
+         with gr.Accordion("Citations (top matches)", open=False):
+             citations = gr.JSON(label="Source & similarity")
+
+         def on_ask(q):
+             a, c = _ask(q, settings, embedder, vstore)
+             return a, c
+
+         ask_btn.click(on_ask, inputs=[query], outputs=[answer, citations])
+         query.submit(on_ask, inputs=[query], outputs=[answer, citations])
+
+     return demo
src/utils.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ import pickle
+ import json
+ from src.config import Settings
+ from src.retrieval.vectorstore import VectorStore
+ from src.retrieval.embedder import get_embedder
+
+ def ensure_dirs():
+     for p in ["data/raw", "storage"]:
+         os.makedirs(p, exist_ok=True)
+
+ def save_pickle(obj, path: str):
+     with open(path, "wb") as f:
+         pickle.dump(obj, f)
+
+ def load_pickle(path: str):
+     with open(path, "rb") as f:
+         return pickle.load(f)
+
+ def write_json(obj, path: str):
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(obj, f, ensure_ascii=False, indent=2)
+
+ def bootstrap_demo_index():
+     """Create a minimal index so the app works before ingestion."""
+     settings = Settings()
+     demo_docs = [
+         "Directive: Reinforce border surveillance along Sector 4. [Source: KGB/1963/SECTOR4]",
+         "Report: Intercepted correspondence near Murmansk. [Source: KGB/1972/MUR-OPS]",
+         "Memo: Field notes suggest supply shortages in winter 1979. [Source: KGB/1979/LOG-WS]"
+     ]
+     embedder = get_embedder(settings)
+     vs = VectorStore(settings).build(demo_docs, embedder)
+     vs.save()  # writes both the FAISS index and the pickled docs
+     write_json({"demo": True, "count": len(demo_docs)}, settings.meta_path)