ciorant commited on
Commit
90ed798
·
verified ·
1 Parent(s): b54ee36

Upload 9 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.faiss filter=lfs diff=lfs merge=lfs -text
2
+ *.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
documents.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf9c7941cca8d55333bc9d1c0232934f0a8edf7bb17219a728acd6e6476fd897
3
+ size 2235712
faiss_index/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.faiss filter=lfs diff=lfs merge=lfs -text
2
+ *.pkl filter=lfs diff=lfs merge=lfs -text
faiss_index/documents.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf9c7941cca8d55333bc9d1c0232934f0a8edf7bb17219a728acd6e6476fd897
3
+ size 2235712
faiss_index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35d72c5b12d9f320e9cfd1836133df9514417bec14ddb4ab7937d746886c5abf
3
+ size 29995053
src/app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ from chatbot import BioethicsChatbot
4
+ import time
5
+ import io
6
+ import sys
7
+ from contextlib import redirect_stdout, redirect_stderr
8
+
9
+ st.set_page_config(
10
+ page_title="Bioethics AI Assistant",
11
+ page_icon="🧬",
12
+ layout="wide"
13
+ )
14
+
15
+ st.title("🧬 Bioethics AI Assistant")
16
+ st.markdown("*Ask questions about medical ethics, informed consent, research ethics, and more*")
17
+
18
+ # Custom CSS to hide debug output
19
+ st.markdown("""
20
+ <style>
21
+ .debug-output {
22
+ background-color: #f0f0f0;
23
+ padding: 10px;
24
+ border-radius: 5px;
25
+ font-family: monospace;
26
+ font-size: 12px;
27
+ color: #666;
28
+ }
29
+ </style>
30
+ """, unsafe_allow_html=True)
31
+
32
+ # Sidebar info
33
+ with st.sidebar:
34
+ st.markdown("### About")
35
+ st.write("This demo uses Retrieval-Augmented Generation (RAG) with open-access bioethics papers.")
36
+
37
+ st.markdown("### Sample Questions")
38
+ sample_questions = [
39
+ "What is informed consent in medical research?",
40
+ "What are the ethical issues with genetic testing?",
41
+ "How should AI bias in healthcare be addressed?",
42
+ "What is the principle of beneficence?",
43
+ "What are the ethics of end-of-life care?"
44
+ ]
45
+
46
+ for q in sample_questions:
47
+ if st.button(q, key=q, use_container_width=True):
48
+ st.session_state.current_question = q
49
+
50
+ st.markdown("---")
51
+ st.markdown("### Demo Info")
52
+ st.info("💡 This demo shows sources found and similarity scores for transparency")
53
+
54
+ # Rate limiting
55
+ if 'query_count' not in st.session_state:
56
+ st.session_state.query_count = 0
57
+
58
+
59
+ # Initialize chatbot (only once)
60
+ @st.cache_resource
61
+ def load_chatbot():
62
+ """Load chatbot once and cache it"""
63
+ return BioethicsChatbot("data/")
64
+
65
+
66
+ # Main interface
67
+ col1, col2 = st.columns([4, 1])
68
+
69
+ with col1:
70
+ question = st.text_input(
71
+ "Your question:",
72
+ value=st.session_state.get('current_question', ''),
73
+ placeholder="e.g., What are the ethical considerations in clinical trials?",
74
+ key="question_input"
75
+ )
76
+
77
+ with col2:
78
+ st.metric("Queries Used", f"{st.session_state.query_count}/50")
79
+
80
+ # Clear the current_question after it's been used
81
+ if 'current_question' in st.session_state:
82
+ del st.session_state.current_question
83
+
84
+ if question and st.session_state.query_count < 50:
85
+
86
+ # Load chatbot
87
+ try:
88
+ if 'bot' not in st.session_state:
89
+ with st.spinner("🔄 Loading bioethics knowledge base..."):
90
+ st.session_state.bot = load_chatbot()
91
+ st.success("✅ Knowledge base loaded!")
92
+
93
+ st.session_state.query_count += 1
94
+
95
+ # Create columns for response
96
+ response_col, debug_col = st.columns([2, 1])
97
+
98
+ with response_col:
99
+ st.markdown("### 🤖 Assistant Response")
100
+
101
+ # Capture the streaming output and debug info
102
+ start_time = time.time()
103
+
104
+ # Capture stdout to get debug prints
105
+ old_stdout = sys.stdout
106
+ old_stderr = sys.stderr
107
+ stdout_capture = io.StringIO()
108
+ stderr_capture = io.StringIO()
109
+
110
+ try:
111
+ # Redirect prints to capture debug info
112
+ sys.stdout = stdout_capture
113
+ sys.stderr = stderr_capture
114
+
115
+ # Get the answer (this will stream to captured stdout)
116
+ answer = st.session_state.bot.ask(question)
117
+
118
+ finally:
119
+ # Restore stdout/stderr
120
+ sys.stdout = old_stdout
121
+ sys.stderr = old_stderr
122
+
123
+ response_time = time.time() - start_time
124
+
125
+ # Display the final answer
126
+ st.write(answer)
127
+
128
+ with debug_col:
129
+ st.markdown("### 🔍 Debug Info")
130
+
131
+ # Show search results info
132
+ if 'bot' in st.session_state:
133
+ # Get search results for debug display
134
+ search_results = st.session_state.bot.vector_store.search(question, k=3)
135
+ with st.expander("📊 Search Results", expanded=True):
136
+ for i, r in enumerate(search_results):
137
+ st.write(f"**Result {i + 1}** (Score: {r.get('similarity_score', 0):.3f})")
138
+ st.write(f"Source: {r['metadata'].get('filename', 'Unknown')}")
139
+ st.write(f"Preview: {r['content'][:200]}...")
140
+ st.write("---")
141
+
142
+ # Show response metadata
143
+ st.metric("Response Time", f"{response_time:.2f}s")
144
+ st.metric("Model", "GPT-4o-mini")
145
+
146
+ # Show conversation history count
147
+ if hasattr(st.session_state.bot, 'history'):
148
+ st.metric("Conversation Turn", len(st.session_state.bot.history))
149
+
150
+ # Show source information
151
+ with st.expander("📚 About the Sources"):
152
+ st.markdown("""
153
+ This assistant searches through open-access bioethics papers to find relevant information.
154
+
155
+ **Search Process:**
156
+ 1. Your question is converted to embeddings
157
+ 2. Similar text chunks are found using FAISS vector search
158
+ 3. Only chunks with similarity score ≥ 0.65 are used for citations
159
+ 4. The language model synthesizes an answer from these sources
160
+ """)
161
+
162
+ except Exception as e:
163
+ st.error(f"❌ Error: {str(e)}")
164
+ st.info("Please try refreshing the page or try a different question.")
165
+
166
+ elif st.session_state.query_count >= 50:
167
+ st.error("📈 Demo limit reached for today. This prevents API abuse.")
168
+ st.info("💡 For unlimited use, clone the repository and use your own API key.")
169
+
170
+ with st.expander("🚀 How to run locally"):
171
+ st.code("""
172
+ # Clone the repository
173
+ git clone your-repo-url
174
+ cd bioethics-chatbot
175
+
176
+ # Install dependencies
177
+ pip install -r requirements.txt
178
+
179
+ # Set your OpenAI API key
180
+ export OPENAI_API_KEY="your-key-here"
181
+
182
+ # Run locally
183
+ streamlit run app.py
184
+ """, language="bash")
185
+
186
+ # Footer
187
+ st.markdown("---")
188
+ col1, col2, col3 = st.columns(3)
189
+
190
+ with col1:
191
+ st.markdown("**🔗 Links**")
192
+ st.markdown("- [GitHub Repository](your-repo-link)")
193
+ st.markdown("- [Open Source Papers Used](./data/LICENSE_INFO.md)")
194
+
195
+ with col2:
196
+ st.markdown("**🛠️ Tech Stack**")
197
+ st.markdown("- Python & Streamlit")
198
+ st.markdown("- OpenAI GPT-4o-mini")
199
+ st.markdown("- FAISS Vector Search")
200
+ st.markdown("- LangChain")
201
+
202
+ with col3:
203
+ st.markdown("**📊 Demo Stats**")
204
+ if 'bot' in st.session_state and hasattr(st.session_state.bot, 'vector_store'):
205
+ doc_count = len(st.session_state.bot.vector_store.documents)
206
+ st.markdown(f"- {doc_count} text chunks indexed")
207
+ st.markdown(f"- Vector dimension: {st.session_state.bot.vector_store.dimension}")
208
+ st.markdown(f"- Queries today: {st.session_state.query_count}")
209
+
210
+ # Add some spacing
211
+ st.markdown("<br>", unsafe_allow_html=True)
src/chatbot.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from document_processor import DocumentProcessor
2
+ from vector_store import FAISSVectorStore
3
+ from langchain_openai import ChatOpenAI
4
+ from pathlib import Path
5
+ import logging
6
+ import os
7
+
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
+ from langchain.callbacks.base import BaseCallbackHandler
12
+
13
+ class StreamHandler(BaseCallbackHandler):
14
+ def __init__(self):
15
+ self.current_text = ""
16
+
17
+ def on_llm_new_token(self, token: str, **kwargs):
18
+ print(token, end="", flush=True) # stream to console
19
+ self.current_text += token
20
+
21
+ def get_text(self):
22
+ return self.current_text
23
+
24
+
25
class BioethicsChatbot:
    """RAG chatbot over a FAISS index of open-access bioethics papers.

    Retrieves the most similar chunks for a question, builds a prompt with
    confidence-graded citations, and streams the LLM answer through a
    ``StreamHandler`` callback.
    """

    def __init__(self, data_dir: str = "data/sample_papers"):
        self.processor = DocumentProcessor()
        self.vector_store = FAISSVectorStore()
        self.history = []  # list of (question, answer) tuples
        # Similarity-score cutoffs used to grade citation confidence.
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.65,
            'low': 0.5}

        # Build the index from the PDFs in data_dir only if nothing is on disk.
        if not self.vector_store.load_index():
            print("No existing vector store, creating one...")
            pdf_files = list(Path(data_dir).glob("*.pdf"))
            if not pdf_files:
                raise ValueError(f"No PDFs found in {data_dir}")

            chunks = self.processor.process_documents([str(p) for p in pdf_files])
            self.vector_store.add_documents(chunks)
            logger.info("Indexed %d documents.", len(chunks))
        else:
            logger.info("Index loaded from disk")

        self.stream_handler = StreamHandler()
        self.llm = ChatOpenAI(model="gpt-4o-mini", streaming=True,
                              callbacks=[self.stream_handler])

    def add_new_document(self, pdf_path: str):
        """Index a single PDF unless a file with the same name is already indexed."""
        filename = Path(pdf_path).name

        # Check if already in the index
        existing_files = {doc["metadata"].get("filename") for doc in self.vector_store.documents}
        if filename in existing_files:
            # Bug fix: previously printed the literal text "(unknown)" because
            # the f-string had no placeholder.
            print(f"Skipping {filename}: already indexed.")
            return

        # Otherwise process & add
        chunks = self.processor.process_document(pdf_path)
        self.vector_store.add_documents(chunks)
        print(f"Added {len(chunks)} chunks from {pdf_path}")

    def get_citation_confidence(self, similarity_score: float) -> str:
        """Map a cosine-similarity score to a confidence bucket name.

        Returns one of "high_confidence", "medium_confidence",
        "low_confidence", or "context_only" (below every threshold).
        """
        if similarity_score >= self.confidence_thresholds['high']:
            return "high_confidence"
        elif similarity_score >= self.confidence_thresholds['medium']:
            return "medium_confidence"
        elif similarity_score >= self.confidence_thresholds['low']:
            return "low_confidence"
        return "context_only"

    def ask(self, question: str, k: int = 10) -> str:
        """Answer `question` from the top-k retrieved chunks; returns the answer text.

        Side effects: streams tokens to stdout (via StreamHandler) and appends
        the (question, answer) pair to ``self.history``.
        """
        # Step 1: Retrieve relevant chunks
        results = self.vector_store.search(question, k=k)

        # DEBUG: show what was found (captured by the UI layer).
        print(f"Found {len(results)} results for query: '{question}'")
        for i, r in enumerate(results[:3]):  # Show top 3
            score = r.get('similarity_score')
            # Bug fix: applying :.3f to the 'N/A' string fallback raised
            # ValueError; format only when the score is numeric.
            score_text = f"{score:.3f}" if isinstance(score, (int, float)) else "N/A"
            print(f"Result {i + 1} (score: {score_text}): {r['content'][:200]}...")

        if not results:
            return "I couldn't find relevant information in the documents."

        # Step 2: Build context from retrieved chunks
        context_blocks = []
        citation_groups = {
            'high_confidence': [],
            'medium_confidence': [],
            'low_confidence': [],
        }
        for r in results:
            title = r["metadata"].get("title", None)
            authors = r["metadata"].get("authors", None)
            year = r["metadata"].get("year", "n.d.")

            confidence = self.get_citation_confidence(r["similarity_score"])

            block = (
                f"Source: {authors} ({year}). *{title}* "
                f"[chunk {r['metadata'].get('chunk_id', '?')}, confidence: {confidence}]\n"
                f"{r['content']}\n"
            )

            context_blocks.append(block)
            # Bug fix: "context_only" has no citation group; indexing it
            # raised KeyError for scores below the 'low' threshold. Such
            # chunks still inform the answer but are never cited.
            if authors is not None and authors != "Unknown Author(s)" and confidence in citation_groups:
                citation_groups[confidence].append(block)

        history_text = "\n".join(
            f"User: {u}\nBot: {b}" for u, b in self.history[-4:]
        ) or "No previous conversation."

        # Pre-join the source lists: plain variables keep the f-string below
        # compatible with Python < 3.12 (quotes may not be reused inside
        # f-string expressions before PEP 701).
        all_sources = "\n\n".join(context_blocks)
        high_sources = "\n\n".join(citation_groups['high_confidence']) or "None"
        medium_sources = "\n\n".join(citation_groups['medium_confidence']) or "None"
        low_sources = "\n\n".join(citation_groups['low_confidence']) or "None"

        context = f"""
        Conversation so far:
        {history_text}

        Relevant sources (use them to guide your answer, but cite only the ones in citation groups):
        {all_sources}

        Do not cite if the author is "Unknown Author(s)".
        CITATION GUIDELINES:
        - HIGH CONFIDENCE sources: Use direct citations "(Author, Year)"
        - MEDIUM CONFIDENCE sources: Use "According to Author (Year)..."
        - LOW CONFIDENCE sources: Use "(see Author, Year)"

        High confidence sources:
        {high_sources}

        Medium confidence sources:
        {medium_sources}

        Low confidence sources:
        {low_sources}

        """

        # Step 3: Construct prompt
        prompt = f"""
        You are a bioethics expert assistant.
        Answer the user's question using the context provided below.
        Draw justified connections between concepts even if not explicitly stated.
        If you need to make reasonable inferences based on the context, do so.
        If the context doesn't contain enough information, say what you do know from the context and indicate what information is missing.
        If the question doesn't concern neither bioethics nor previous questions, inform the user about it and don't answer it.
        Context:
        {context}

        Question: {question}
        Answer:
        """

        # Reset the stream buffer, then let the callback collect the answer.
        self.stream_handler.current_text = ""

        _ = self.llm.invoke(prompt)  # streaming happens here, via StreamHandler
        print()  # newline after streaming

        answer = self.stream_handler.get_text()
        self.history.append((question, answer))

        return answer
163
+
164
+
165
+
src/document_processor.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import re
3
+ from typing import List, Dict
4
+ from pathlib import Path
5
+ import logging
6
+ import PyPDF2
7
+
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
class DocumentProcessor:
    """Extracts, cleans, and chunks PDF text, and pulls basic bibliographic metadata."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size        # target max characters per chunk
        self.chunk_overlap = chunk_overlap  # trailing chars carried into the next chunk

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from a PDF, appending a '--- Page N ---' marker after each page.

        Returns "" on any extraction failure (logged, not raised).
        """
        try:
            doc = fitz.open(pdf_path)
            text = ""

            for page in doc:
                text += page.get_text()
                text += f"\n--- Page {page.number + 1} ---\n"  # page.number is 0-indexed

            logger.info(f"Extracted text from {pdf_path}: {len(text)} characters, {len(doc)} pages")
            doc.close()
            return text

        except Exception as e:
            logger.error(f"Error extracting text from {pdf_path}: {e}")
            return ""

    def clean_text(self, text: str) -> str:
        """Normalize whitespace and strip page markers, headers, and figure/table refs."""
        text = re.sub(r'\n{2,}', '\n', text)  # keep single newlines
        text = re.sub(r'[ \t]+', ' ', text)   # collapse spaces/tabs

        # Bug fix: remove the markers injected by extract_text_from_pdf first;
        # the generic header pattern below only matched 'Page N ...\n' and
        # left a dangling '--- ' fragment in the cleaned text.
        text = re.sub(r'--- Page \d+ ---\n?', '', text)

        # Remove page headers/footers
        text = re.sub(r'Page \d+.*?\n', '', text)

        # Remove references to figures/tables
        text = re.sub(r'\[Figure \d+\]|\[Table \d+\]', '', text)

        return text.strip()

    def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict]:
        """Split text into overlapping chunks.

        Each chunk dict carries: 'text', 'metadata' (its OWN copy, including
        'chunk_id'), and a top-level 'chunk_id' for backward compatibility.
        """
        if not text:
            return []

        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        def _emit(chunk_body: str):
            # Bug fix: previously every chunk shared ONE metadata dict by
            # reference and chunk_id lived only at top level, so consumers
            # reading metadata['chunk_id'] always saw it missing.
            meta = dict(metadata or {})
            meta["chunk_id"] = len(chunks)
            chunks.append({
                "text": chunk_body.strip(),
                "metadata": meta,
                "chunk_id": len(chunks)
            })

        for sentence in sentences:
            # If adding this sentence would exceed chunk size
            if len(current_chunk) + len(sentence) > self.chunk_size:
                if current_chunk:
                    _emit(current_chunk)
                    # Start new chunk with overlap
                    overlap_text = (current_chunk[-self.chunk_overlap:]
                                    if len(current_chunk) > self.chunk_overlap
                                    else current_chunk)
                    current_chunk = overlap_text + " " + sentence
                else:
                    current_chunk = sentence
            else:
                current_chunk += ". " + sentence if current_chunk else sentence

        # Add final chunk
        if current_chunk:
            _emit(current_chunk)

        logger.info(f"Created {len(chunks)} chunks")
        return chunks

    def extract_metadata(self, pdf_path: str) -> dict:
        """Extract metadata (title, authors, year, filename, file_size) from a PDF."""

        metadata = {
            "filename": Path(pdf_path).name,
            "file_size": Path(pdf_path).stat().st_size,
            "title": None,
            "authors": None,
            "year": None
        }

        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)

            # 1. Try embedded PDF metadata. Bug fix: PyPDF2 may return None
            # for these entries, which crashed the .strip() calls — coalesce.
            pdf_meta = reader.metadata
            if pdf_meta:
                title = (pdf_meta.get("/Title") or "").strip()
                author = (pdf_meta.get("/Author") or "").strip()

                if title and title.lower() not in ["", "untitled", "unknown"]:
                    metadata["title"] = title

                if author and author.lower() not in ["", "anonymous", "unknown"]:
                    metadata["authors"] = author

            # 2. Fallback: crude heuristics on the first page.
            if not metadata["title"] or not metadata["authors"]:
                try:
                    first_page = reader.pages[0].extract_text() or ""
                    lines = [line.strip() for line in first_page.split("\n") if line.strip()]

                    # crude heuristic: first line = title
                    if not metadata["title"] and lines:
                        metadata["title"] = lines[0]

                    # crude heuristic: authors in line(s) after title
                    if not metadata["authors"] and len(lines) > 1:
                        possible_authors = lines[1]
                        if re.search(r"[A-Z][a-z]+(?: [A-Z][a-z]+)*", possible_authors):
                            metadata["authors"] = possible_authors

                    # crude heuristic: first plausible year (e.g., 2023)
                    year_match = re.search(r"\b(19|20)\d{2}\b", first_page)
                    if year_match:
                        metadata["year"] = year_match.group(0)

                except Exception:
                    # Heuristics are best-effort; fall through to defaults.
                    pass

        # Defaults if missing (authors deliberately stays None so callers can
        # suppress citations for unknown authors).
        metadata["title"] = metadata["title"] or "Unknown Title"
        metadata["year"] = metadata["year"] or "n.d."

        return metadata

    def process_document(self, pdf_path: str) -> List[Dict]:
        """Complete pipeline for one PDF: metadata -> extract -> clean -> chunk."""
        try:
            file_path = Path(pdf_path)  # early validation of the path argument

        except TypeError as e:  # pdf_path is the wrong type entirely
            logger.error(f"Invalid path type: {pdf_path}: {e}")
            raise
        except OSError as e:  # other filesystem-related errors
            logger.error(f"OS error with path: {pdf_path}: {e}")
            raise

        metadata = self.extract_metadata(pdf_path)

        raw_text = self.extract_text_from_pdf(pdf_path)
        clean_text = self.clean_text(raw_text)
        chunks = self.chunk_text(clean_text, metadata)
        logger.info(f"Processed {pdf_path}: {len(chunks)} chunks created")
        return chunks

    def process_documents(self, pdf_paths: List[str]) -> List[Dict]:
        """Process many PDFs and concatenate their chunk lists."""
        documents = []
        for path in pdf_paths:
            documents.extend(self.process_document(path))
        return documents
src/vector_store.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import pickle
4
+ from pathlib import Path
5
+ from langchain_openai import OpenAIEmbeddings
6
+ from threading import Lock
7
+ from typing import List, Dict, Any
8
+ import logging
9
+
10
+ logger = logging.getLogger(__name__)
11
+ logger.addHandler(logging.NullHandler())
12
+
13
+
14
+ DocumentChunk = Dict[str, Any]
15
+
16
class FAISSVectorStore:
    """Thread-safe FAISS inner-product index over OpenAI embeddings.

    Vectors are L2-normalized before add/search, so inner product equals
    cosine similarity. ``self.documents[i]`` corresponds positionally to
    FAISS vector i.
    """

    def __init__(
        self,
        dimension: int = 3072,
        index_path: str = "data/faiss_index",
        embedding_model: str = "text-embedding-3-large",  # 3072-dim vectors
    ):
        if OpenAIEmbeddings is None:
            raise ImportError(
                "Could not import OpenAIEmbeddings from langchain. "
                "Install langchain or adapt the import to your environment."
            )

        self.dimension = dimension
        self.index_path = Path(index_path)
        self._lock = Lock()
        self.index_path.mkdir(parents=True, exist_ok=True)

        # Instantiate embeddings (API calls happen later, when embedding).
        self.embeddings = OpenAIEmbeddings(model=embedding_model)

        # Positional mapping: index position -> entry in this list.
        self.documents: List[Dict[str, Any]] = []

        # Create a fresh index; replaced by load_index() if a saved one exists.
        self.index = faiss.IndexFlatIP(self.dimension)
        self.load_index()  # safe: returns False if nothing to load

    def _ensure_index_dim(self, d: int):
        """Ensure the FAISS index has dimension d (recreate only while empty)."""
        if getattr(self.index, "ntotal", 0) == 0 and getattr(self.index, "d", None) != d:
            logger.info("Recreating an empty index with dimension %d", d)
            self.dimension = d
            self.index = faiss.IndexFlatIP(self.dimension)
        elif getattr(self.index, "d", None) is not None and self.index.d != d:
            raise ValueError(f"Embedding dimension ({d}) does not match existing index dimension ({self.index.d}).")

    def add_documents(self, chunks: List[Dict[str, Any]], save: bool = True):
        """
        Add chunks to the FAISS index. Each chunk MUST contain a string 'text'.
        Chunks with empty/whitespace-only text are skipped entirely.
        If the index is empty and the embedding dimension differs, the index
        is re-created.

        Raises ValueError for malformed chunks.
        """
        with self._lock:
            if not chunks:
                logger.debug("No chunks to add.")
                return

            texts = []
            valid_chunks = []  # bug fix: track which chunks actually get vectors
            for i, chunk in enumerate(chunks):
                if not isinstance(chunk, dict):
                    raise ValueError(f"Chunk {i} is not a dictionary")
                if "text" not in chunk:
                    raise ValueError(f"Chunk {i} missing required 'text' field")
                if not isinstance(chunk["text"], str):
                    raise ValueError(f"Chunk {i} 'text' field must be a string")
                if not chunk["text"].strip():
                    logger.warning(f"Chunk {i} has empty text content")
                    continue
                texts.append(chunk["text"])
                valid_chunks.append(chunk)

            if not texts:
                logger.debug("All chunks had empty text; nothing to add.")
                return

            # Get embeddings from the embedding provider (remote model call).
            embeddings = self.embeddings.embed_documents(texts)
            embeddings_np = np.asarray(embeddings, dtype=np.float32)

            if embeddings_np.ndim == 1:
                # single vector returned as 1D array -> reshape to (1, d)
                embeddings_np = embeddings_np.reshape(1, -1)

            emb_d = embeddings_np.shape[1]
            # Recreate the index dimension if needed (only while index is empty).
            self._ensure_index_dim(emb_d)

            if emb_d != self.index.d:
                raise ValueError(f"Embedding dim {emb_d} != index dim {self.index.d}")

            # L2-normalize rows (in place) so inner product == cosine similarity
            faiss.normalize_L2(embeddings_np)
            self.index.add(embeddings_np)

            # Bug fix: previously ALL chunks were appended here — including
            # the empty-text ones skipped above — which desynchronized FAISS
            # positions from self.documents and made search return the wrong
            # documents. Only chunks that received a vector are appended.
            self.documents.extend(valid_chunks)

            if save:
                self.save_index()

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """
        Search similar documents for `query`. Returns up to k results.
        Each result: { "content": <text>, "metadata": <metadata>, "similarity_score": <float> }
        similarity_score is the inner product of normalized vectors => cosine similarity in [-1,1].
        """
        with self._lock:
            # guard: no vectors at all
            if getattr(self.index, "ntotal", 0) == 0:
                logger.debug("Search called but index is empty.")
                return []

            # embed query; wrapping in a list guarantees a (1, d) array
            q_np = np.asarray([self.embeddings.embed_query(query)], dtype=np.float32)

            if q_np.shape[1] != self.index.d:
                raise ValueError(f"Query embedding dim {q_np.shape[1]} does not match index dimension {self.index.d}")

            faiss.normalize_L2(q_np)

            # clamp k to the number of stored vectors
            k = min(k, int(self.index.ntotal))

            distances, indices = self.index.search(q_np, k)  # both shaped (1, k)

            results = []
            for score, idx in zip(distances[0], indices[0]):
                if idx < 0:
                    continue  # FAISS pads with -1 when fewer hits exist
                if idx >= len(self.documents):
                    logger.warning("Index returned idx %d but documents list has length %d", idx, len(self.documents))
                    continue
                doc = self.documents[idx]
                results.append({
                    "content": doc.get("text"),
                    "metadata": doc.get("metadata", {}),
                    "similarity_score": float(score)  # already cosine because of normalization
                })
            return results

    def save_index(self):
        """Persist index and documents to disk."""
        self.index_path.mkdir(parents=True, exist_ok=True)
        faiss.write_index(self.index, str(self.index_path / "index.faiss"))
        with open(self.index_path / "documents.pkl", "wb") as f:
            pickle.dump(self.documents, f)
        logger.debug("FAISS index and documents saved to %s", self.index_path)

    def load_index(self) -> bool:
        """Load index and documents from disk. Returns True if loaded.

        A zero-dimension index or a document/vector count mismatch is treated
        as corruption: both files are deleted and False is returned.
        """
        index_file = self.index_path / "index.faiss"
        docs_file = self.index_path / "documents.pkl"

        if not (index_file.exists() and docs_file.exists()):
            return False

        self.index = faiss.read_index(str(index_file))
        # SECURITY NOTE: pickle.load executes arbitrary code from the file —
        # only load documents.pkl files produced by this application.
        with open(docs_file, "rb") as f:
            self.documents = pickle.load(f)

        # Keep self.dimension in sync with the loaded index.
        if getattr(self.index, "d", None) is not None:
            self.dimension = int(self.index.d)

        if self.index.d == 0 or len(self.documents) != self.index.ntotal:
            logger.error("Corrupted index detected, deleting...")
            index_file.unlink()
            docs_file.unlink()
            return False

        # (Dead code removed: a count-mismatch warning followed here, but it
        # was unreachable — the corruption check above already returns on
        # any mismatch.)
        logger.info("Loaded FAISS index from %s (ntotal=%d, dim=%d)",
                    index_file, int(self.index.ntotal), int(self.index.d))
        return True
195
+