Spaces:
Sleeping
Sleeping
Commit ·
4abd84c
0
Parent(s):
Initial commit: add app and source
Browse files- .env.example +3 -0
- .gitignore +40 -0
- README.MD +13 -0
- app.py +24 -0
- requirements.txt +11 -0
- src/config.py +24 -0
- src/ingestion/__init__.py +0 -0
- src/ingestion/build_index.py +42 -0
- src/ingestion/loaders.py +37 -0
- src/ingestion/preprocess.py +41 -0
- src/llm/__init__.py +0 -0
- src/llm/answer.py +39 -0
- src/retrieval/__init__.py +1 -0
- src/retrieval/embedder.py +14 -0
- src/retrieval/search.py +13 -0
- src/retrieval/vectorstore.py +43 -0
- src/schema.py +8 -0
- src/ui/__init__.py +0 -0
- src/ui/components.py +39 -0
- src/utils.py +36 -0
.env.example
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Add tokens here to enable the summarization bot
|
| 2 |
+
HF_TOKEN=
|
| 3 |
+
LOCAL_GENERATION_MODEL=
|
.gitignore
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python cache and compiled files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
|
| 8 |
+
# Virtual environments
|
| 9 |
+
venv/
|
| 10 |
+
env/
|
| 11 |
+
ENV/
|
| 12 |
+
|
| 13 |
+
# IDE and editor files
|
| 14 |
+
.vscode/
|
| 15 |
+
.idea/
|
| 16 |
+
*.swp
|
| 17 |
+
*.swo
|
| 18 |
+
*~
|
| 19 |
+
|
| 20 |
+
# Environment variables
|
| 21 |
+
.env
|
| 22 |
+
.env.local
|
| 23 |
+
|
| 24 |
+
# Storage and generated index files (rebuilt at runtime)
|
| 25 |
+
storage/
|
| 26 |
+
*.faiss
|
| 27 |
+
*.pkl
|
| 28 |
+
|
| 29 |
+
# Raw data folder (if large - upload docs separately or via git-lfs)
|
| 30 |
+
data/raw/
|
| 31 |
+
|
| 32 |
+
# OS files
|
| 33 |
+
.DS_Store
|
| 34 |
+
Thumbs.db
|
| 35 |
+
|
| 36 |
+
# Jupyter notebooks checkpoints
|
| 37 |
+
.ipynb_checkpoints/
|
| 38 |
+
|
| 39 |
+
# Logs
|
| 40 |
+
*.log
|
README.MD
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KGB Document Chatbot
|
| 2 |
+
|
| 3 |
+
A Hugging Face Spaces-ready Gradio app for querying a corpus of declassified KGB documents. Currently retrieval only.
|
| 4 |
+
|
| 5 |
+
Future usage:
|
| 6 |
+
Place documents in `data/raw` and run `python -m src.ingestion.build_index`.
|
| 7 |
+
|
| 8 |
+
## Usage
|
| 9 |
+
|
| 10 |
+
```bash
|
| 11 |
+
pip install -r requirements.txt
|
| 12 |
+
cp .env.example .env
|
| 13 |
+
python app.py
|
app.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Launch the KGB Document Chatbot Gradio app.

Ensures storage directories and a (demo) vector index exist, loads the
embedding model and vector store once, then builds and launches the UI.
"""
import sys, subprocess, os  # NOTE(review): `subprocess` appears unused here — confirm before removing
# NOTE(review): adds src/ to sys.path, but the imports below use the
# `src.` prefix (resolved from the repo root) — confirm this append is needed.
sys.path.append(os.path.join(os.path.dirname(__file__), "src"))
from src.ui.components import build_app
from src.retrieval.vectorstore import VectorStore
from src.retrieval.embedder import get_embedder
from src.config import Settings
from src.utils import ensure_dirs, bootstrap_demo_index

settings = Settings()
ensure_dirs()

# Make sure an index exists (demo index auto-created if empty)
if not os.path.exists(settings.index_path) or not os.path.exists(settings.docs_path):
    bootstrap_demo_index()

# Loaded once at import time so requests are served without reloading the model.
EMBEDDER = get_embedder(settings)
VSTORE = VectorStore(settings).load()

# NOTE(review): Hugging Face Spaces conventionally serves a module-level `demo`.
demo = build_app(settings=settings, embedder=EMBEDDER, vstore=VSTORE)

if __name__ == "__main__":
    # For Hugging Face Spaces, use default settings (no server_name/port needed)
    # For local dev, you can add: server_name="127.0.0.1", server_port=7860
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.44.0
|
| 2 |
+
faiss-cpu>=1.8.0
|
| 3 |
+
sentence-transformers>=3.0.1
|
| 4 |
+
numpy>=1.26.4
|
| 5 |
+
pandas>=2.2.2
|
| 6 |
+
pydantic>=2.9.2
|
| 7 |
+
tqdm>=4.66.5
|
| 8 |
+
python-dotenv>=1.0.1
|
| 9 |
+
pdfminer.six>=20240706
|
| 10 |
+
pytesseract>=0.3.13
|
| 11 |
+
Pillow>=10.4.0
|
src/config.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
|
| 4 |
+
class Settings(BaseModel):
    """Central application configuration: paths, model names, retrieval knobs.

    NOTE(review): the `os.getenv` defaults below are evaluated once, at
    class-definition (import) time — changing the environment afterwards has
    no effect on new instances. Nothing visible in this module loads a .env
    file; confirm HF_TOKEN / LOCAL_GENERATION_MODEL are exported in the
    process environment.
    """
    # Input documents (pre-ingestion) live here.
    raw_dir: str = os.path.join("data", "raw")
    # All generated artifacts are written under this directory.
    storage_dir: str = "storage"
    # FAISS index file produced by ingestion.
    index_path: str = os.path.join("storage", "index.faiss")
    # Pickled list of chunk texts, aligned row-for-row with the index.
    docs_path: str = os.path.join("storage", "docs.pkl")
    # Small JSON sidecar describing the index (demo flag, chunk count).
    meta_path: str = os.path.join("storage", "meta.json")

    # Sentence-embedding model used for both indexing and querying.
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"

    # Optional generation settings; empty strings mean "retrieval only".
    local_generation_model: str = os.getenv("LOCAL_GENERATION_MODEL", "").strip()
    hf_token: str = os.getenv("HF_TOKEN", "").strip()

    @property
    def mode(self) -> str:
        # "rag" as soon as any generation credential/model is configured.
        return "rag" if (self.hf_token or self.local_generation_model) else "retrieval"

    # Number of chunks retrieved per query.
    top_k: int = 5
    # Hard cap on the context string passed to a generator model.
    max_context_chars: int = 9000

    # NOTE(review): trailing space in the title string — confirm intentional.
    title: str = "KGB Document Chatbot "
    description: str = "Retrieval-only for now. Add a model later by setting HF_TOKEN"
|
src/ingestion/__init__.py
ADDED
|
File without changes
|
src/ingestion/build_index.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
import pickle
|
| 4 |
+
from src.config import Settings
|
| 5 |
+
from src.ingestion.loaders import load_raw_corpus
|
| 6 |
+
from src.ingestion.preprocess import clean_text, simple_chunk, attach_metadata
|
| 7 |
+
from src.retrieval.embedder import get_embedder
|
| 8 |
+
from src.retrieval.vectorstore import VectorStore
|
| 9 |
+
from src.utils import ensure_dirs
|
| 10 |
+
|
| 11 |
+
def main():
    """Build the retrieval index from everything under the raw data dir.

    Pipeline: load raw files -> clean & chunk -> embed -> persist the FAISS
    index, the pickled chunk list, and a small meta.json sidecar.
    """
    import json  # stdlib; used only for the meta sidecar below

    settings = Settings()
    ensure_dirs()

    print("[ingest] Loading raw corpus...")
    pairs = load_raw_corpus(settings.raw_dir)

    print(f"[ingest] {len(pairs)} files loaded. Cleaning/chunking...")
    all_chunks_text = []
    for src, txt in tqdm(pairs):
        cleaned = clean_text(txt)
        chunks = simple_chunk(cleaned, max_chars=1200, overlap=150)
        for c in attach_metadata(chunks, source_id=src):
            # Inline the citation so it travels with the chunk text.
            payload = c["text"] + f"\n\n[Source: {c['source']}, chunk {c['meta']['chunk_id']}]"
            all_chunks_text.append(payload)

    print(f"[ingest] {len(all_chunks_text)} chunks. Embedding & indexing...")
    embedder = get_embedder(settings)
    vs = VectorStore(settings).build(all_chunks_text, embedder)

    print("[ingest] Saving artifacts...")
    # vs.save() persists both the FAISS index and the docs pickle; the extra
    # pickle.dump of the same list that used to live here was redundant.
    vs.save()

    with open(settings.meta_path, "w", encoding="utf-8") as f:
        # json.dump instead of a hand-formatted string: identical output,
        # no risk of emitting invalid JSON if fields change later.
        json.dump({"demo": False, "count": len(all_chunks_text)}, f)

    print("[ingest] Done.")

if __name__ == "__main__":
    main()
|
src/ingestion/loaders.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from pdfminer.high_level import extract_text
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import pytesseract
|
| 6 |
+
|
| 7 |
+
def load_txt(path: str) -> str:
    """Read a text file as UTF-8, silently dropping undecodable bytes."""
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
|
| 10 |
+
|
| 11 |
+
def load_pdf(path: str) -> str:
    """Extract text from a PDF; return an empty string when extraction yields nothing."""
    text = extract_text(path)
    return text if text else ""
|
| 13 |
+
|
| 14 |
+
def load_image_ocr(path: str, lang: str = "eng") -> str:
    """OCR a single image file with pytesseract and return the recognized text.

    Args:
        path: Path to an image file Pillow can open.
        lang: Tesseract language code (default "eng").
    """
    # Use a context manager so the file handle PIL opens lazily is closed
    # after OCR — the original leaked it.
    with Image.open(path) as img:
        return pytesseract.image_to_string(img, lang=lang)
|
| 17 |
+
|
| 18 |
+
def load_raw_corpus(raw_dir: str) -> List[Tuple[str, str]]:
    """Walk raw_dir and load every supported document.

    Returns (relative_path, text) pairs. Unsupported extensions and
    empty extractions are skipped; per-file failures are logged and
    do not abort the walk.
    """
    corpus: List[Tuple[str, str]] = []
    image_exts = (".png", ".jpg", ".jpeg", ".tif", ".tiff")
    for dirpath, _, filenames in os.walk(raw_dir):
        for name in filenames:
            full = os.path.join(dirpath, name)
            key = name.lower()
            try:
                if key.endswith(".txt"):
                    content = load_txt(full)
                elif key.endswith(".pdf"):
                    content = load_pdf(full)
                elif key.endswith(image_exts):
                    content = load_image_ocr(full, lang="eng")
                else:
                    continue
                if content.strip():
                    corpus.append((os.path.relpath(full, raw_dir), content))
            except Exception as e:
                print(f"[loader] Skipped {full}: {e}")
    return corpus
|
src/ingestion/preprocess.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
|
| 4 |
+
def clean_text(s: str) -> str:
    """Normalize whitespace in raw document text.

    NUL bytes become spaces, runs of spaces/tabs collapse to one space,
    three-or-more newlines collapse to a blank line, and the result is
    stripped of leading/trailing whitespace.
    """
    no_nul = s.replace("\x00", " ")
    single_spaced = re.sub(r"[ \t]+", " ", no_nul)
    collapsed = re.sub(r"\n{3,}", "\n\n", single_spaced)
    return collapsed.strip()
|
| 9 |
+
|
| 10 |
+
def simple_chunk(text: str, max_chars: int = 1200, overlap: int = 150) -> List[str]:
    """Split text into chunks of at most max_chars characters.

    Blank-line-separated paragraphs are packed greedily into chunks. A
    paragraph longer than max_chars is sliced into max_chars-sized windows
    whose starts advance by (max_chars - overlap), so consecutive windows
    share `overlap` characters of context.
    """
    paras = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks: List[str] = []
    cur = ""
    # Guard against a non-positive step (overlap >= max_chars), which would
    # otherwise make the slicing loop never advance.
    step = max(1, max_chars - overlap)
    for p in paras:
        if len(cur) + len(p) + 2 <= max_chars:
            cur = (cur + "\n\n" + p).strip() if cur else p
        else:
            if cur:
                chunks.append(cur)
            if len(p) <= max_chars:
                cur = p
            else:
                # Bug fix: slice windows of length max_chars (the original
                # sliced length (max_chars - overlap) with the same step, so
                # windows were contiguous and `overlap` was never applied).
                for i in range(0, len(p), step):
                    segment = p[i:i + max_chars]
                    if segment:
                        chunks.append(segment)
                cur = ""
    if cur:
        chunks.append(cur)
    return chunks
|
| 31 |
+
|
| 32 |
+
def attach_metadata(chunks: List[str], source_id: str) -> List[Dict]:
    """Wrap each chunk string in a metadata dict.

    Every entry carries the source identifier, a None page placeholder,
    and a sequential chunk_id.
    """
    return [
        {"text": chunk, "source": source_id, "page": None, "meta": {"chunk_id": idx}}
        for idx, chunk in enumerate(chunks)
    ]
|
src/llm/__init__.py
ADDED
|
File without changes
|
src/llm/answer.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple
|
| 2 |
+
from src.config import Settings
|
| 3 |
+
|
| 4 |
+
def compose_answer_retrieval_only(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Render retrieved excerpts as markdown without invoking any model.

    Each hit is split into body text and its trailing "[Source: ...]"
    citation (when present) and rendered with its similarity score.
    """
    sections = ["**Top relevant excerpts** (no model used):\n"]
    for rank, (passage, similarity) in enumerate(hits, start=1):
        if "[Source:" in passage:
            excerpt, _, tail = passage.rpartition("[Source:")
            excerpt = excerpt.strip()
            citation = "[Source:" + tail
        else:
            excerpt, citation = passage, "unknown"
        sections.append(
            f"**{rank}.** {excerpt}\n\n*{citation}* \n*similarity: {similarity:.3f}*"
        )
    return "\n\n---\n\n".join(sections)
|
| 16 |
+
|
| 17 |
+
def compose_answer(query: str, hits: List[Tuple[str, float]], settings: Settings) -> str:
    """Answer the query from the retrieved hits.

    In "retrieval" mode, return plain excerpts. Otherwise run a local
    seq2seq model over the concatenated context (RAG-style generation).
    """
    if settings.mode == "retrieval":
        return compose_answer_retrieval_only(query, hits, settings)

    # If summarizer is later enabled, logic here will use RAG generation.
    # transformers is imported lazily so retrieval-only deployments never need it.
    from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

    context = "\n\n---\n\n".join(passage for passage, _ in hits)
    if len(context) > settings.max_context_chars:
        context = context[:settings.max_context_chars] + "\n\n[Context truncated]"

    system = (
        "You are a cautious historian assistant. Answer ONLY from the context. "
        "Cite sources as [Source: ...]. If unknown, say so."
    )
    prompt = f"{system}\n\nQUESTION:\n{query}\n\nCONTEXT:\n{context}\n\nANSWER:"

    tokenizer = AutoTokenizer.from_pretrained(settings.local_generation_model)
    model = AutoModelForSeq2SeqLM.from_pretrained(settings.local_generation_model)
    generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    result = generator(prompt, max_new_tokens=256, do_sample=False)[0]["generated_text"]
    return result.strip()
|
| 39 |
+
|
src/retrieval/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/retrieval/embedder.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
from src.config import Settings
|
| 4 |
+
|
| 5 |
+
@lru_cache(maxsize=1)
def _cached_model(name: str):
    """Load and memoize a single SentenceTransformer model by name.

    maxsize=1 keeps exactly one model resident, so repeated get_embedder()
    calls with the same settings never reload the weights.
    """
    # Load model without authentication token (for public models)
    return SentenceTransformer(name, token=False)
|
| 9 |
+
|
| 10 |
+
def get_embedder(settings: Settings):
    """Return a callable mapping a list of texts to unit-normalized numpy vectors."""
    model = _cached_model(settings.embedding_model)

    def encode(batch):
        # normalize_embeddings=True makes inner product equal cosine similarity.
        return model.encode(batch, convert_to_numpy=True, normalize_embeddings=True)

    return encode
|
src/retrieval/search.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict, Any, Tuple
|
| 2 |
+
|
| 3 |
+
def retrieve(query: str, vstore, embedder, k: int) -> List[Tuple[str, float]]:
    """Delegate to the vector store and return its top-k (text, score) hits."""
    hits = vstore.search(query, embedder, k=k)
    return hits
|
| 5 |
+
|
| 6 |
+
def format_citations(hits: List[Tuple[str, float]], max_items: int = 5) -> List[Dict[str, Any]]:
    """Summarize hits as [{source, score}] dicts for UI display.

    At most max_items entries are produced; passages without a
    "[Source: ...]" marker are attributed to "unknown".
    """
    summaries: List[Dict[str, Any]] = []
    for passage, similarity in hits[:max_items]:
        if "[Source:" in passage:
            origin = passage.split("[Source:")[-1].strip("] ").strip()
        else:
            origin = "unknown"
        summaries.append({"source": origin, "score": round(similarity, 3)})
    return summaries
|
src/retrieval/vectorstore.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import faiss
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pickle
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
+
from src.config import Settings
|
| 7 |
+
|
| 8 |
+
class VectorStore:
    """FAISS-backed store mapping chunk texts to inner-product vectors.

    NOTE(review): assumes the embedder yields L2-normalized vectors so
    inner product equals cosine similarity — the default embedder does.
    """

    def __init__(self, settings: Settings):
        self.settings = settings
        self.index = None          # faiss index once built/loaded
        self.docs: List[str] = []  # chunk texts, aligned with index rows

    def build(self, texts: List[str], embedder) -> "VectorStore":
        """Embed texts and build a fresh flat inner-product index."""
        self.docs = texts
        vectors = embedder(texts).astype("float32")
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        self.index = index
        return self

    def save(self):
        """Persist the FAISS index and the parallel docs list to disk."""
        faiss.write_index(self.index, self.settings.index_path)
        with open(self.settings.docs_path, "wb") as fh:
            pickle.dump(self.docs, fh)

    def load(self) -> "VectorStore":
        """Load a previously saved index + docs; raise if either file is missing."""
        have_index = os.path.exists(self.settings.index_path)
        have_docs = os.path.exists(self.settings.docs_path)
        if not (have_index and have_docs):
            raise FileNotFoundError("Index or docs not found. Run ingestion first.")
        self.index = faiss.read_index(self.settings.index_path)
        with open(self.settings.docs_path, "rb") as fh:
            self.docs = pickle.load(fh)
        return self

    def search(self, query: str, embedder, k: int = 5) -> List[Tuple[str, float]]:
        """Return up to k (text, similarity) pairs for the query."""
        query_vec = embedder([query]).astype("float32")
        scores, indices = self.index.search(query_vec, k)
        # faiss pads with -1 when fewer than k vectors exist; drop those.
        return [
            (self.docs[i], float(s))
            for i, s in zip(indices[0], scores[0])
            if i != -1
        ]
|
src/schema.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Dict, Any
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
|
| 4 |
+
class DocChunk(BaseModel):
    """A chunk of a source document plus retrieval metadata."""
    # The chunk's text content.
    text: str
    # Identifier of the originating file (relative path).
    source: str
    # Page number when known; None otherwise.
    page: Optional[int] = None
    # Bug fix: was `Optional[ict[str, Any]]` — a NameError at import time;
    # `Dict` is already imported from typing at the top of this module.
    meta: Optional[Dict[str, Any]] = None
|
src/ui/__init__.py
ADDED
|
File without changes
|
src/ui/components.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from src.config import Settings
|
| 4 |
+
from src.retrieval.search import retrieve, format_citations
|
| 5 |
+
from src.llm.answer import compose_answer
|
| 6 |
+
|
| 7 |
+
def _ask(query: str, settings, embedder, vstore) -> tuple[str, List[dict]]:
    """Run one retrieval round-trip for the UI.

    Returns (markdown answer, citation dicts); friendly placeholder text
    when the query is blank or nothing is retrieved.
    """
    if not (query and query.strip()):
        return "Please enter a question about the documents.", []
    matches: List[Tuple[str, float]] = retrieve(query, vstore=vstore, embedder=embedder, k=settings.top_k)
    if not matches:
        return "No relevant passages found. Try adjusting your query.", []
    answer_md = compose_answer(query, matches, settings)
    cite_list = format_citations(matches, max_items=settings.top_k)
    return answer_md, cite_list
|
| 16 |
+
|
| 17 |
+
def build_app(settings: Settings, embedder, vstore):
    """Assemble the Gradio Blocks UI wired to retrieval and answering."""
    with gr.Blocks(title=settings.title) as demo:
        gr.Markdown(f"# {settings.title}\n{settings.description}")
        mode_note = (
            "— no LLM used, showing excerpts only. LLM will be added later for summarization"
            if settings.mode == "retrieval"
            else "— retrieval + summarizer enabled."
        )
        gr.Markdown(f"**Mode:** `{settings.mode}` " + mode_note)

        query = gr.Textbox(label="Your question", placeholder="e.g., Orders about Sector 4 in 1963?")
        ask_btn = gr.Button("Search", variant="primary")
        answer = gr.Markdown("Ask a question to see excerpts.")
        with gr.Accordion("Citations (top matches)", open=False):
            citations = gr.JSON(label="Source & similarity")

        def on_ask(q):
            # Thin adapter: binds settings/embedder/vstore so Gradio only
            # passes the query text.
            return _ask(q, settings, embedder, vstore)

        ask_btn.click(on_ask, inputs=[query], outputs=[answer, citations])
        query.submit(on_ask, inputs=[query], outputs=[answer, citations])

    return demo
|
src/utils.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
import json
|
| 4 |
+
from src.config import Settings
|
| 5 |
+
from src.retrieval.vectorstore import VectorStore
|
| 6 |
+
from src.retrieval.embedder import get_embedder
|
| 7 |
+
|
| 8 |
+
def ensure_dirs():
    """Create the raw-data and storage directories if they do not exist."""
    os.makedirs("data/raw", exist_ok=True)
    os.makedirs("storage", exist_ok=True)
|
| 11 |
+
|
| 12 |
+
def save_pickle(obj, path: str):
    """Serialize obj to path with pickle (binary mode)."""
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)
|
| 15 |
+
|
| 16 |
+
def load_pickle(path: str):
    """Deserialize and return the pickled object stored at path."""
    with open(path, "rb") as fh:
        return pickle.load(fh)
|
| 19 |
+
|
| 20 |
+
def write_json(obj, path: str):
    """Write obj to path as pretty-printed UTF-8 JSON (non-ASCII preserved)."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(obj, fh, ensure_ascii=False, indent=2)
|
| 23 |
+
|
| 24 |
+
def bootstrap_demo_index():
    """Create a minimal index so the app works before ingestion.

    Builds a three-document demo corpus, embeds it, persists the FAISS
    index and docs pickle, and marks meta.json with demo=True so a later
    real ingestion run can be distinguished.
    """
    settings = Settings()
    demo_docs = [
        "Directive: Reinforce border surveillance along Sector 4. [Source: KGB/1963/SECTOR4]",
        "Report: Intercepted correspondence near Murmansk. [Source: KGB/1972/MUR-OPS]",
        "Memo: Field notes suggest supply shortages in winter 1979. [Source: KGB/1979/LOG-WS]"
    ]
    embedder = get_embedder(settings)
    vs = VectorStore(settings).build(demo_docs, embedder)
    # vs.save() writes both the index and the docs pickle; the separate
    # save_pickle(demo_docs, docs_path) call that used to precede it was
    # redundant (same list written twice to the same file).
    vs.save()
    write_json({"demo": True, "count": len(demo_docs)}, settings.meta_path)
|