| """ |
| app.py — FECB RAG Search Application |
| |
| Loads a pre-built FAISS index (produced by ingest.py) and provides a |
| Gradio interface for semantic search and AI-assisted Q&A over your PDF documents. |
| |
| Environment variables: |
| ANTHROPIC_API_KEY — Anthropic API key (required) |
| CLAUDE_MODEL — Claude model ID (default: claude-sonnet-4-6) |
| EMBED_MODEL — Embedding model (default: BAAI/bge-small-en-v1.5) |
| TOP_K — Initial number of chunks to retrieve (default: 8) |
| INDEX_DIR — Path to FAISS index (default: faiss_index) |
| META_FILE — Path to metadata JSON (default: metadata.json) |
| |
| Run: |
| python app.py |
| """ |
|
|
| import json |
| import os |
| import re |
| from pathlib import Path |
|
|
| import anthropic |
| import gradio as gr |
| from langchain_community.vectorstores import FAISS |
| from langchain_huggingface import HuggingFaceEmbeddings |
|
|
| |
# Runtime configuration, resolved once at import time. Every value can be
# overridden through the environment variable of the same name.
API_KEY = os.environ.get("ANTHROPIC_API_KEY")  # None when unset; checked before the client is built
CLAUDE_MODEL = os.environ.get("CLAUDE_MODEL", "claude-sonnet-4-6")
EMBED_MODEL = os.environ.get("EMBED_MODEL", "BAAI/bge-small-en-v1.5")
TOP_K = int(os.environ.get("TOP_K", "8"))  # int() raises ValueError on non-numeric text
INDEX_DIR = Path(os.environ.get("INDEX_DIR", "faiss_index"))
META_FILE = Path(os.environ.get("META_FILE", "metadata.json"))
|
|
# System prompt sent with every Claude request: one instruction per entry,
# joined with single spaces into one paragraph.
SYSTEM_PROMPT = " ".join(
    (
        "You are a knowledgeable research assistant.",
        "You help users find relevant information from a document collection "
        "and synthesize key findings.",
        "When answering, cite the specific document(s) by their bracketed number [N].",
        "Be concise and precise.",
        "If the context doesn't contain enough information to answer fully, "
        "say so clearly.",
    )
)
|
|
| |
# Fail fast: verify the index exists *before* loading the embedding model,
# so a missing index is reported immediately instead of after the (slow)
# model load.
if not INDEX_DIR.exists():
    raise FileNotFoundError(
        f"FAISS index not found at '{INDEX_DIR}'. "
        "Run 'python ingest.py' first to build the index from your PDFs."
    )

# Embedding model used to embed queries. Vectors are L2-normalised and the
# model runs on CPU.
# NOTE(review): this must match the model ingest.py used to build the index;
# that cannot be verified from this file.
print(f"Loading embedding model: {EMBED_MODEL}")
_embeddings = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# NOTE(security): allow_dangerous_deserialization=True opts in to loading
# pickled data from INDEX_DIR. Only point INDEX_DIR at an index you built
# yourself; never at files from an untrusted source.
print(f"Loading FAISS index from: {INDEX_DIR}")
_vectorstore = FAISS.load_local(
    str(INDEX_DIR), _embeddings, allow_dangerous_deserialization=True
)
|
|
# Optional per-document metadata, keyed by doc_id. Stays empty when META_FILE
# is absent; that case only produces a warning, not an error.
_metadata: dict[str, dict] = {}
if META_FILE.exists():
    print(f"Loading metadata from: {META_FILE}")
    with open(META_FILE, encoding="utf-8") as f:
        # The file must decode to an iterable of records that each carry a
        # "doc_id" key; a record without one raises KeyError at startup.
        _metadata = {record["doc_id"]: record for record in json.load(f)}
    print(f" Loaded metadata for {len(_metadata)} documents")
else:
    # The dash here was mis-encoded in the original message; restored to "—".
    print(f" [WARN] {META_FILE} not found — document names will be inferred from IDs")
|
|
# Build the Anthropic client, refusing to start when no API key is configured
# (an unset or empty ANTHROPIC_API_KEY both count as missing).
if API_KEY:
    _client = anthropic.Anthropic(api_key=API_KEY)
else:
    raise EnvironmentError("ANTHROPIC_API_KEY is not set. Export it before running.")

print(f"Claude model: {CLAUDE_MODEL}")
print("Ready.\n")
|
|
|
|
| |
|
|
def retrieve(query: str, n_chunks: int) -> list[tuple]:
    """Return the ``n_chunks`` chunks closest to *query*, best match first.

    No per-document de-duplication is done, so several chunks of the same
    document may be returned.

    Args:
        query: Search text handed to the vector store.
        n_chunks: Number of chunks to fetch. The value comes from a UI
            widget, so it is coerced to ``int`` and floored at 1 before
            being used as ``k``.

    Returns:
        ``(document, score)`` pairs sorted by ascending score. The score is
        treated as a distance elsewhere in this file (lower = closer).
    """
    k = max(1, int(n_chunks))
    raw = _vectorstore.similarity_search_with_score(query, k=k)
    return sorted(raw, key=lambda hit: hit[1])
|
|
|
|
def build_context(hits: list[tuple]) -> str:
    """Render retrieved chunks as numbered excerpts for the LLM prompt.

    Args:
        hits: ``(document, score)`` pairs; the score is ignored here. Each
            document must expose ``metadata`` (a mapping) and
            ``page_content`` (a string).

    Returns:
        One ``[N] filename (ID: doc_id)`` header plus the stripped chunk
        text per hit, numbered from 1 and separated by ``---`` rules.
        An empty string when *hits* is empty.
    """

    def _render(position, doc):
        # Missing metadata falls back to a positional id and "<id>.pdf".
        ident = doc.metadata.get("doc_id", f"doc_{position}")
        name = doc.metadata.get("filename", f"{ident}.pdf")
        body = doc.page_content.strip()
        return f"[{position}] {name} (ID: {ident})\n{body}"

    return "\n\n---\n\n".join(
        _render(position, doc)
        for position, (doc, _score) in enumerate(hits, start=1)
    )
|
|
|
|
def ask_claude(query: str, context: str) -> str:
    """Ask Claude to answer *query* using only the supplied excerpts.

    Args:
        query: The user's question.
        context: Numbered document excerpts, as produced by
            ``build_context``.

    Returns:
        The model's answer as stripped text. If the API call fails with
        ``anthropic.APIError``, or the reply contains no text, a
        human-readable message is returned instead of raising, so the UI
        always has something to display.
    """
    user_content = (
        "Using the document excerpts below, answer the following question. "
        "Cite documents by their bracketed number.\n\n"
        f"Question: {query}\n\nContext:\n{context}"
    )

    # Keep the try body limited to the network call so that only API
    # failures are turned into the "could not reach" message.
    try:
        message = _client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=800,
            system=SYSTEM_PROMPT,
            messages=[{"role": "user", "content": user_content}],
        )
    except anthropic.APIError as exc:
        return (
            f"Could not reach Claude ({exc}).\n\n"
            "Check that **ANTHROPIC_API_KEY** is set and valid."
        )

    # The reply is a list of content blocks. Indexing [0].text raises when
    # the list is empty or the first block carries no text, so gather the
    # text of every block that has some.
    text = "".join(
        block.text
        for block in (message.content or [])
        if isinstance(getattr(block, "text", None), str)
    ).strip()
    return text or "Claude returned an empty response. Please try rephrasing the question."
|
|
|
|
def cosine_to_pct(score: float) -> str:
    """Map a FAISS distance score to a relevance string in the range 0–100%.

    The score is clamped to ``[0, 2]`` and mapped linearly: 0 gives
    ``"100.0%"``, 2 (or more) gives ``"0.0%"``.

    NOTE(review): presumably the score is a squared L2 distance between
    unit-normalised embeddings, in which case the result equals cosine
    similarity floored at zero. Confirm against the index's distance metric.
    """
    clamped = score
    if clamped < 0.0:
        clamped = 0.0
    elif clamped > 2.0:
        clamped = 2.0
    relevance = (1.0 - clamped / 2.0) * 100
    return f"{relevance:.1f}%"
|
|
|
|
| |
|
|
def rag_search(query: str, n_docs: int) -> tuple[str, str]:
    """Answer *query* from the indexed documents and list the supporting chunks.

    This is the Gradio event handler for both the Search button and the
    textbox submit event.

    Args:
        query: Free-text question or keywords. ``None`` and whitespace-only
            input are treated as empty.
        n_docs: Number of chunks to retrieve; passed through to ``retrieve``.

    Returns:
        ``(answer_markdown, sources_markdown)``. For an empty query or an
        empty result set the first element is a hint for the user and the
        second is ``""``.
    """
    # A cleared textbox may arrive as None rather than "".
    query = (query or "").strip()
    if not query:
        return "Please enter a question or keyword.", ""

    hits = retrieve(query, n_docs)
    if not hits:
        return "No relevant documents found. Try different keywords.", ""

    answer = ask_claude(query, build_context(hits))

    max_snippet_chars = 350
    cards = []
    for i, (doc, score) in enumerate(hits, 1):
        doc_id = doc.metadata.get("doc_id", f"doc_{i}")
        filename = doc.metadata.get("filename", f"{doc_id}.pdf")
        rel = cosine_to_pct(score)

        # Flatten newlines so the whole snippet stays inside one blockquote
        # line, and mark it with an ellipsis only when text was cut off.
        flat = doc.page_content.replace("\n", " ").strip()
        snippet = flat[:max_snippet_chars]
        if len(flat) > max_snippet_chars:
            snippet += "…"

        # Two trailing spaces before "\n" are a Markdown hard line break;
        # with one space, Relevance and ID render on the same line.
        cards.append(
            f"### [{i}] {filename}\n"
            f"**Relevance:** {rel}  \n"
            f"**ID:** {doc_id}  \n"
            f"> {snippet}"
        )

    return answer, "\n\n---\n\n".join(cards)
|
|
|
|
| |
|
|
# Declare the Gradio UI at import time so `demo` exists as a module-level
# object; the server itself is started only under the __main__ guard.
with gr.Blocks(title="FECB Document Search") as demo:

    # Header text. Built from explicit lines so rendering does not depend on
    # the indentation of a triple-quoted literal, and showing the configured
    # embedding model instead of a hard-coded name.
    gr.Markdown(
        "# FECB Document Search — AI-Powered RAG\n"
        "\n"
        "Search your document collection using semantic AI search.\n"
        "Ask a question or enter keywords; the app retrieves the most relevant\n"
        "documents and generates a synthesised answer with citations.\n"
        "\n"
        f"> **Powered by** `{EMBED_MODEL}` embeddings · Claude via Anthropic API\n"
    )

    # Input row: wide query box plus a narrow slider for the chunk count.
    with gr.Row():
        with gr.Column(scale=5):
            query_box = gr.Textbox(
                label="Question or keywords",
                placeholder="e.g. 'What are the main findings about X?'",
                lines=2,
                elem_id="query-box",
            )
        with gr.Column(scale=1, min_width=160):
            n_slider = gr.Slider(
                minimum=3,
                maximum=20,
                # TOP_K comes from the environment and may lie outside the
                # slider's range; clamp so the initial value is always valid.
                value=min(max(TOP_K, 3), 20),
                step=1,
                label="Chunks to retrieve",
            )

    search_btn = gr.Button("Search", variant="primary", size="lg")

    gr.Markdown("---")

    # Output row: generated answer on the left, source cards on the right.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### AI Answer")
            answer_md = gr.Markdown(value="*Results will appear here after searching.*")

        with gr.Column(scale=3):
            gr.Markdown("### Relevant Documents")
            papers_md = gr.Markdown(value="")

    # Clicking the button and pressing Enter in the textbox run the same search.
    search_btn.click(rag_search, [query_box, n_slider], [answer_md, papers_md])
    query_box.submit(rag_search, [query_box, n_slider], [answer_md, papers_md])
|
|
|
|
# Start the web server only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    # NOTE(review): `theme` and `css` are passed to launch() here; some Gradio
    # releases accept them on gr.Blocks(...) instead. Confirm against the
    # Gradio version this project pins.
    demo.launch(
        server_name="0.0.0.0",  # bind address: all network interfaces
        server_port=7860,
        share=False,
        theme=gr.themes.Soft(primary_hue="blue", font=gr.themes.GoogleFont("Inter")),
        # Page-level CSS: cap and centre the container width, and enlarge the
        # text of the query box (matched through its elem_id, "query-box").
        css="""
        .gradio-container { max-width: 1100px; margin: auto; }
        #query-box textarea { font-size: 16px; }
        """,
    )
|
|