Spaces:

CGIAR
/

fecb-rag

Running

App Files Files Community

fecb-rag / app.py

malaporte

Upload folder using huggingface_hub

32bcdac verified about 2 months ago

Raw

History Blame Contribute Delete

8.06 kB

	"""
	app.py — FECB RAG Search Application

	Loads a pre-built FAISS index (produced by ingest.py) and provides a
	Gradio interface for semantic search and AI-assisted Q&A over your PDF documents.

	Environment variables:
	ANTHROPIC_API_KEY — Anthropic API key (required)
	CLAUDE_MODEL — Claude model ID (default: claude-sonnet-4-6)
	EMBED_MODEL — Embedding model (default: BAAI/bge-small-en-v1.5)
	TOP_K — Default number of chunks to retrieve (default: 8)
	INDEX_DIR — Path to FAISS index (default: faiss_index)
	META_FILE — Path to metadata JSON (default: metadata.json)

	Run:
	python app.py
	"""

	import json
	import os
	import re
	from pathlib import Path

	import anthropic
	import gradio as gr
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings

	# ── Config ────────────────────────────────────────────────────────────────────
	# Every setting is read from the process environment once, at import time.
	# The second argument of each lookup is the fallback used when the variable
	# is unset; ANTHROPIC_API_KEY has no fallback and is None when missing.
	EMBED_MODEL = os.environ.get("EMBED_MODEL", "BAAI/bge-small-en-v1.5")
	CLAUDE_MODEL = os.environ.get("CLAUDE_MODEL", "claude-sonnet-4-6")
	API_KEY = os.environ.get("ANTHROPIC_API_KEY")
	# Used as the initial value of the "Chunks to retrieve" slider.
	TOP_K = int(os.environ.get("TOP_K", "8"))
	INDEX_DIR = Path(os.environ.get("INDEX_DIR", "faiss_index"))
	META_FILE = Path(os.environ.get("META_FILE", "metadata.json"))

	# System prompt sent with every Claude request. Adjacent string literals are
	# concatenated by the parser, so this is a single-line string at runtime.
	SYSTEM_PROMPT = (
	    "You are a knowledgeable research assistant. You help users find relevant "
	    "information from a document collection and synthesize key findings. "
	    "When answering, cite the specific document(s) by their bracketed number [N]. "
	    "Be concise and precise. If the context doesn't contain enough information "
	    "to answer fully, say so clearly."
	)

	# ── Load resources ────────────────────────────────────────────────────────────
	# Validate the cheap preconditions before any expensive loading, so that a
	# misconfigured deployment fails immediately instead of after the embedding
	# model and index have been loaded. The two checks keep their original
	# relative order (index first, then API key).
	if not INDEX_DIR.exists():
	    raise FileNotFoundError(
	        f"FAISS index not found at '{INDEX_DIR}'. "
	        "Run 'python ingest.py' first to build the index from your PDFs."
	    )

	if not API_KEY:
	    raise EnvironmentError("ANTHROPIC_API_KEY is not set. Export it before running.")

	# Embedding model used to encode queries; runs on CPU and L2-normalises
	# every vector it produces.
	print(f"Loading embedding model: {EMBED_MODEL}")
	_embeddings = HuggingFaceEmbeddings(
	    model_name=EMBED_MODEL,
	    model_kwargs={"device": "cpu"},
	    encode_kwargs={"normalize_embeddings": True},
	)

	print(f"Loading FAISS index from: {INDEX_DIR}")
	# SECURITY NOTE: allow_dangerous_deserialization=True lets load_local
	# unpickle data stored next to the index. Unpickling can execute arbitrary
	# code, so INDEX_DIR must only ever point at an index built by a trusted
	# process — never at user-supplied or downloaded files.
	_vectorstore = FAISS.load_local(
	    str(INDEX_DIR), _embeddings, allow_dangerous_deserialization=True
	)

	# Optional per-document metadata, keyed by each record's "doc_id".
	# The file is expected to hold a JSON list of objects.
	_metadata: dict[str, dict] = {}
	if META_FILE.exists():
	    print(f"Loading metadata from: {META_FILE}")
	    with open(META_FILE, encoding="utf-8") as f:
	        for record in json.load(f):
	            _metadata[record["doc_id"]] = record
	    print(f" Loaded metadata for {len(_metadata)} documents")
	else:
	    print(f" [WARN] {META_FILE} not found — document names will be inferred from IDs")

	_client = anthropic.Anthropic(api_key=API_KEY)
	print(f"Claude model: {CLAUDE_MODEL}")
	print("Ready.\n")


	# ── RAG helpers ───────────────────────────────────────────────────────────────

	def retrieve(query: str, n_chunks: int) -> list[tuple]:
	    """Fetch the ``n_chunks`` best-scoring chunks for ``query``.

	    Runs a scored similarity search against the module-level vector store and
	    returns the resulting ``(document, score)`` pairs ordered by ascending
	    score. Hits are not grouped or de-duplicated, so several chunks of the
	    same document may appear in the result.
	    """

	    def _score(hit: tuple) -> float:
	        # Each hit is a (document, score) pair; order by the score element.
	        return hit[1]

	    scored_hits = _vectorstore.similarity_search_with_score(query, k=n_chunks)
	    return sorted(scored_hits, key=_score)


	def build_context(hits: list[tuple]) -> str:
	    """Render retrieved chunks as one numbered context string for the prompt.

	    Each ``(document, score)`` hit becomes a ``[N] <filename> (ID: <doc_id>)``
	    header followed by the chunk text with surrounding whitespace removed,
	    where N counts from 1. Missing ``doc_id`` metadata falls back to
	    ``doc_<N>`` and a missing ``filename`` to ``<doc_id>.pdf``. Entries are
	    separated by a ``---`` line; an empty ``hits`` list yields ``""``.
	    """

	    def _entry(position: int, document) -> str:
	        meta = document.metadata
	        ident = meta.get("doc_id", f"doc_{position}")
	        name = meta.get("filename", f"{ident}.pdf")
	        body = document.page_content.strip()
	        return f"[{position}] {name} (ID: {ident})\n{body}"

	    return "\n\n---\n\n".join(
	        _entry(position, document)
	        for position, (document, _unused_score) in enumerate(hits, 1)
	    )


	def ask_claude(query: str, context: str) -> str:
	    """Ask Claude to answer ``query`` from the supplied document excerpts.

	    Sends one user message containing the question and ``context`` to the
	    model named by ``CLAUDE_MODEL``, with ``SYSTEM_PROMPT`` as the system
	    prompt and an 800-token output cap.

	    Returns:
	        The model's answer text with surrounding whitespace stripped. If the
	        API call raises ``anthropic.APIError``, or the response carries no
	        text, a human-readable message is returned instead of raising, so
	        the caller can always display the result.
	    """
	    user_content = (
	        "Using the document excerpts below, answer the following question. "
	        "Cite documents by their bracketed number.\n\n"
	        f"Question: {query}\n\nContext:\n{context}"
	    )
	    # Keep the try body to the one call that can raise an API error.
	    try:
	        message = _client.messages.create(
	            model=CLAUDE_MODEL,
	            max_tokens=800,
	            system=SYSTEM_PROMPT,
	            messages=[{"role": "user", "content": user_content}],
	        )
	    except anthropic.APIError as exc:
	        return (
	            f"Could not reach Claude ({exc}).\n\n"
	            "Check that ANTHROPIC_API_KEY is set and valid."
	        )

	    # The response content is a list of blocks. Indexing [0].text would raise
	    # on an empty list or a non-text first block, so gather the text of every
	    # text block instead.
	    answer = "".join(
	        block.text
	        for block in message.content
	        if getattr(block, "type", None) == "text"
	    ).strip()
	    if not answer:
	        return "Claude returned an empty response. Please try rephrasing the question."
	    return answer


	def cosine_to_pct(score: float) -> str:
	"""Convert FAISS L2 distance (normalised embeddings) to 0–100% relevance."""
	pct = (1.0 - min(max(score, 0.0), 2.0) / 2.0) * 100
	return f"{pct:.1f}%"


	# ── Main search function ──────────────────────────────────────────────────────

	def rag_search(query: str, n_docs: int) -> tuple[str, str]:
	    """Run one retrieve-then-answer round for the UI.

	    Args:
	        query: The user's question or keywords. ``None``, empty and
	            whitespace-only values are treated as "no query".
	        n_docs: Number of chunks to retrieve. Coerced to ``int`` because the
	            UI slider may deliver the value as a float.

	    Returns:
	        A ``(answer, cards)`` pair of Markdown strings: the generated answer
	        and one card per retrieved chunk (heading, relevance, ID and a
	        snippet of at most 350 characters), separated by ``---`` rules.
	        When there is no query or nothing was retrieved, the first element
	        is a short notice and the second is ``""``.
	    """
	    snippet_chars = 350

	    # A cleared textbox may hand over None; treat it like an empty query
	    # instead of failing on .strip().
	    query = (query or "").strip()
	    if not query:
	        return "Please enter a question or keyword.", ""

	    hits = retrieve(query, int(n_docs))
	    if not hits:
	        return "No relevant documents found. Try different keywords.", ""

	    context = build_context(hits)
	    answer = ask_claude(query, context)

	    cards = []
	    for i, (doc, score) in enumerate(hits, 1):
	        doc_id = doc.metadata.get("doc_id", f"doc_{i}")
	        filename = doc.metadata.get("filename", f"{doc_id}.pdf")
	        rel = cosine_to_pct(score)

	        flattened = doc.page_content.replace("\n", " ").strip()
	        snippet = flattened[:snippet_chars]
	        # Only signal truncation when text was actually cut off.
	        if len(flattened) > snippet_chars:
	            snippet += "…"

	        # Two trailing spaces before "\n" form a Markdown hard line break;
	        # with fewer, the Relevance and ID lines are merged into one line.
	        cards.append(
	            f"### [{i}] {filename}\n"
	            f"Relevance: {rel}  \n"
	            f"ID: {doc_id}  \n"
	            f"> {snippet}"
	        )

	    return answer, "\n\n---\n\n".join(cards)


	# ── Gradio UI ─────────────────────────────────────────────────────────────────

	with gr.Blocks(title="FECB Document Search") as demo:

	    # Page header and short usage description.
	    gr.Markdown(
	        """
	# FECB Document Search — AI-Powered RAG

	Search your document collection using semantic AI search.
	Ask a question or enter keywords; the app retrieves the most relevant
	documents and generates a synthesised answer with citations.

	> Powered by `BAAI/bge-small-en-v1.5` embeddings · Claude via Anthropic API
	"""
	    )

	    # Input row: a wide query box next to a narrow chunk-count slider.
	    with gr.Row():
	        with gr.Column(scale=5):
	            query_box = gr.Textbox(
	                label="Question or keywords",
	                placeholder="e.g. 'What are the main findings about X?'",
	                lines=2,
	                elem_id="query-box",
	            )
	        with gr.Column(scale=1, min_width=160):
	            n_slider = gr.Slider(
	                minimum=3,
	                maximum=20,
	                value=TOP_K,
	                step=1,
	                label="Chunks to retrieve",
	            )

	    search_btn = gr.Button("Search", variant="primary", size="lg")

	    gr.Markdown("---")

	    # Output row: generated answer on the left, source cards on the right.
	    with gr.Row():
	        with gr.Column(scale=2):
	            gr.Markdown("### AI Answer")
	            answer_md = gr.Markdown(value="Results will appear here after searching.")

	        with gr.Column(scale=3):
	            gr.Markdown("### Relevant Documents")
	            papers_md = gr.Markdown(value="")

	    # The button's click event and the textbox's submit event both run the
	    # same search with the same inputs and outputs (click is wired first).
	    for _trigger in (search_btn.click, query_box.submit):
	        _trigger(
	            fn=rag_search,
	            inputs=[query_box, n_slider],
	            outputs=[answer_md, papers_md],
	        )


	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    _launch_options = dict(
	        # Listen on all network interfaces, on a fixed port, without
	        # requesting a public share link.
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=False,
	        theme=gr.themes.Soft(primary_hue="blue", font=gr.themes.GoogleFont("Inter")),
	        # Cap the page width and enlarge the text of the query box.
	        css="""
	.gradio-container { max-width: 1100px; margin: auto; }
	#query-box textarea { font-size: 16px; }
	""",
	    )
	    demo.launch(**_launch_options)