Spaces:

eaglelandsonce
/

Cross_Encoding_Example

Runtime error

App Files Files Community

Cross_Encoding_Example / app.py

eaglelandsonce

Create app.py

d105d84 verified about 1 month ago

raw

history blame

24.3 kB

	import os
	import uuid
	import json
	from typing import List, Tuple, Dict, Any, Optional

	import chromadb
	from chromadb.config import Settings
	from openai import OpenAI
	import gradio as gr
	from pypdf import PdfReader

	# Cross-encoder (Hugging Face / sentence-transformers)
	# pip install sentence-transformers torch
	from sentence_transformers import CrossEncoder


	# =========================
	# Chroma Client (Persistent)
	# =========================

	# Persistent Chroma client rooted at the relative path "chroma_db", with anonymized telemetry off.
	chroma_client = chromadb.PersistentClient(path="chroma_db", settings=Settings(anonymized_telemetry=False))

	# The one collection every function in this module reads and writes; its metadata sets
	# "hnsw:space" to "cosine".
	collection = chroma_client.get_or_create_collection(name="rag_docs", metadata={"hnsw:space": "cosine"})


	# =========================
	# Cross-Encoder (lazy global)
	# =========================

	# Module-level cache for the reranker; remains None until get_cross_encoder() is first called.
	_CROSS_ENCODER: Optional[CrossEncoder] = None
	CROSS_ENCODER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"


	def get_cross_encoder() -> CrossEncoder:
	    """Return the shared CrossEncoder, constructing it from CROSS_ENCODER_MODEL_NAME on first use."""
	    global _CROSS_ENCODER
	    encoder = _CROSS_ENCODER
	    if encoder is None:
	        encoder = CrossEncoder(CROSS_ENCODER_MODEL_NAME)
	        _CROSS_ENCODER = encoder
	    return encoder


	# =========================
	# Helper Functions
	# =========================

	def get_openai_client(api_key: str) -> OpenAI:
	    """Build an OpenAI client from a user-supplied key.

	    Args:
	        api_key: the key as typed by the user; surrounding whitespace is removed.

	    Raises:
	        ValueError: if the key is falsy or contains only whitespace.
	    """
	    cleaned = api_key.strip() if api_key else ""
	    if not cleaned:
	        raise ValueError("OpenAI API key is missing.")
	    return OpenAI(api_key=cleaned)


	def extract_text_from_file(file_path: str) -> str:
	    """Return the text content of the file at ``file_path``.

	    Files whose extension is ``.pdf`` (case-insensitive) are read page by page with
	    ``PdfReader``; pages that yield no text are skipped and the rest are joined with
	    newlines. Every other extension is read as UTF-8 text with undecodable bytes ignored.
	    """
	    _, suffix = os.path.splitext(file_path)

	    if suffix.lower() == ".pdf":
	        page_texts = (page.extract_text() for page in PdfReader(file_path).pages)
	        return "\n".join(content for content in page_texts if content)

	    # .txt, .md and any unrecognised extension all take the same plain-text path.
	    with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
	        return handle.read()


	def chunk_text(text: str, chunk_size: int = 800, overlap: int = 200) -> List[str]:
	    """Split ``text`` into fixed-size character windows that overlap.

	    Args:
	        text: input text; ``\\r\\n`` and ``\\r`` line endings are normalised to ``\\n`` first.
	        chunk_size: maximum number of characters per chunk; must be positive.
	        overlap: number of characters shared by consecutive chunks; must satisfy
	            ``0 <= overlap < chunk_size`` so that the window always advances.

	    Returns:
	        The chunks in document order; an empty list for empty text.

	    Raises:
	        ValueError: if ``chunk_size`` or ``overlap`` would stall the window or skip text.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer.")
	    if not 0 <= overlap < chunk_size:
	        # overlap >= chunk_size would give a step of zero or less, i.e. an endless loop.
	        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

	    text = text.replace("\r\n", "\n").replace("\r", "\n")
	    step = chunk_size - overlap
	    length = len(text)

	    chunks = []
	    start = 0
	    while start < length:
	        end = start + chunk_size
	        chunks.append(text[start:end])
	        if end >= length:
	            # This window already reached the end of the text; another step would only
	            # produce a tail that is fully contained in the chunk just emitted.
	            break
	        start += step
	    return chunks


	def embed_texts(texts: List[str], api_key: str, batch_size: int = 512) -> List[List[float]]:
	    """Embed ``texts`` with OpenAI's ``text-embedding-3-small`` model.

	    The inputs are sent in slices of at most ``batch_size`` items because the embeddings
	    endpoint limits how many inputs a single request may carry (2048 per the API reference
	    at the time of writing); a long document can easily produce more chunks than that.

	    Args:
	        texts: strings to embed; an empty list returns ``[]`` without any API call.
	        api_key: OpenAI API key, passed to ``get_openai_client``.
	        batch_size: maximum number of inputs per request; must be positive.

	    Returns:
	        One embedding per input, in the order the API returns them for each slice.

	    Raises:
	        ValueError: if ``batch_size`` is not positive, or the API key is missing.
	    """
	    if not texts:
	        return []
	    if batch_size <= 0:
	        raise ValueError("batch_size must be a positive integer.")

	    client = get_openai_client(api_key)
	    embeddings: List[List[float]] = []
	    for start in range(0, len(texts), batch_size):
	        resp = client.embeddings.create(
	            model="text-embedding-3-small",
	            input=texts[start:start + batch_size],
	        )
	        embeddings.extend(item.embedding for item in resp.data)
	    return embeddings


	def add_documents_to_chroma(file_paths: List[str], api_key: str) -> str:
	    """Extract, chunk, embed and store each file in the module-level Chroma ``collection``.

	    Args:
	        file_paths: paths of the files to index.
	        api_key: OpenAI API key used to compute the embeddings.

	    Returns:
	        A human-readable status line. It reports the number of files that actually
	        produced chunks; files with no extractable text are counted as skipped rather
	        than as indexed.
	    """
	    if not file_paths:
	        return "⚠️ No files were provided."

	    total_chunks = 0
	    indexed_files = 0
	    for file_path in file_paths:
	        file_name = os.path.basename(file_path)
	        raw_text = extract_text_from_file(file_path)

	        if not raw_text.strip():
	            # Nothing to embed (e.g. an empty file or a PDF without a text layer).
	            continue

	        chunks = chunk_text(raw_text)
	        embeddings = embed_texts(chunks, api_key)

	        # A fresh UUID per chunk keeps ids unique even when the same file is indexed twice.
	        ids = [f"{file_name}-{uuid.uuid4()}" for _ in chunks]
	        metadatas = [{"source": file_name} for _ in chunks]

	        collection.add(
	            ids=ids,
	            documents=chunks,
	            metadatas=metadatas,
	            embeddings=embeddings,
	        )

	        total_chunks += len(chunks)
	        indexed_files += 1

	    count = collection.count()
	    status = (
	        f"✅ Indexed {indexed_files} file(s) into Chroma with {total_chunks} chunks. "
	        f"Collection now has {count} vectors."
	    )
	    skipped = len(file_paths) - indexed_files
	    if skipped:
	        status += f" Skipped {skipped} file(s) with no extractable text."
	    return status


	# =========================
	# Query Expansion
	# =========================

	def query_expansion(user_query: str, api_key: str) -> List[str]:
	    """Ask the chat model for alternative phrasings of ``user_query``.

	    Args:
	        user_query: the question typed by the user; ``None`` or blank input yields ``[]``.
	        api_key: OpenAI API key.

	    Returns:
	        A de-duplicated list whose first element is always the original (stripped) query,
	        followed by the model's rewordings in the order received. If the model reply is
	        missing, is not valid JSON, or does not have the requested shape, the list contains
	        only the original query.
	    """
	    user_query = (user_query or "").strip()
	    if not user_query:
	        return []

	    client = get_openai_client(api_key)

	    system_prompt = (
	        "You are an expert in information retrieval systems, particularly skilled in enhancing queries "
	        "for document search efficiency."
	    )

	    user_prompt = f"""
	Perform query expansion on the received question by considering alternative phrasings or synonyms commonly used in document retrieval contexts.
	If there are multiple ways to phrase the user's question or common synonyms for key terms, provide several reworded versions.
	If there are acronyms or words you are not familiar with, do not try to rephrase them.
	Return at least 3 versions of the question.
	Return ONLY valid JSON with this exact shape:
	{{
	"expanded": ["q1", "q2", "q3"]
	}}
	Question:
	{user_query}
	""".strip()

	    completion = client.chat.completions.create(
	        model="gpt-4.1-mini",
	        temperature=0.2,
	        response_format={"type": "json_object"},
	        messages=[
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_prompt},
	        ],
	    )

	    raw = completion.choices[0].message.content
	    try:
	        data = json.loads(raw)
	    except (json.JSONDecodeError, TypeError):
	        # TypeError covers a reply whose content is None.
	        data = None

	    # Only trust the reply when it has the requested {"expanded": [...]} shape; a bare
	    # string under "expanded" would otherwise be iterated character by character.
	    candidates = data.get("expanded") if isinstance(data, dict) else None
	    if not isinstance(candidates, list):
	        candidates = []

	    expanded = [q.strip() for q in candidates if isinstance(q, str) and q.strip()]

	    # Original query first, then the rewordings, dropping repeats while keeping order.
	    return list(dict.fromkeys([user_query, *expanded]))


	def format_expansions_md(expanded: List[str]) -> str:
	    """Render the expanded queries as a numbered Markdown list under a heading.

	    Returns a placeholder sentence when ``expanded`` is empty.
	    """
	    if expanded:
	        numbered = "\n".join(f"{position}. {text}" for position, text in enumerate(expanded, start=1))
	        return "### 🧠 Expanded Queries\n\n" + numbered
	    return "(No expansions yet — type a question and press Enter.)"


	# =========================
	# LLM Self-Evaluation Helper
	# =========================

	def evaluate_answer(question: str, context: str, answer: str, api_key: str) -> dict:
	    """Have the chat model grade a RAG answer on five 1-5 metrics.

	    Args:
	        question: the user query that was answered.
	        context: the retrieved context that was shown to the answering model.
	        answer: the answer being evaluated.
	        api_key: OpenAI API key.

	    Returns:
	        The evaluator's JSON object as a dict. If the reply is missing, is not valid JSON,
	        or is not a JSON object, a placeholder dict with the same keys is returned instead:
	        all scores are ``None`` and the raw reply is kept in the relevance justification.
	    """
	    client = get_openai_client(api_key)

	    system_prompt = (
	        "You are an impartial evaluator for a Retrieval-Augmented Generation (RAG) system. "
	        "You will receive: (1) the user query, (2) the retrieved context, and (3) the model's answer. "
	        "You must evaluate the answer on five metrics, each scored from 1 (very poor) to 5 (excellent):\n"
	        "- Groundedness: Is the answer supported by the retrieved CONTEXT (not outside knowledge)?\n"
	        "- Relevance: Does the answer address the USER QUERY directly and appropriately?\n"
	        "- Faithfulness: Are the statements logically valid and consistent with the context (no contradictions)?\n"
	        "- Context Precision: Does the answer avoid including irrelevant details from the context?\n"
	        "- Context Recall: Does the answer capture all IMPORTANT information from the context needed to answer well?\n\n"
	        "Return ONLY a single JSON object with this exact structure:\n"
	        "{\n"
	        ' "query": string,\n'
	        ' "response": string,\n'
	        ' "groundedness_evaluation": {"score": int, "justification": string},\n'
	        ' "relevance_evaluation": {"score": int, "justification": string},\n'
	        ' "faithfulness_evaluation": {"score": int, "justification": string},\n'
	        ' "context_precision_evaluation": {"score": int, "justification": string},\n'
	        ' "context_recall_evaluation": {"score": int, "justification": string}\n'
	        "}"
	    )

	    user_prompt = (
	        f"USER QUERY:\n{question}\n\n"
	        f"RETRIEVED CONTEXT:\n{context}\n\n"
	        f"MODEL ANSWER:\n{answer}"
	    )

	    completion = client.chat.completions.create(
	        model="gpt-4.1-mini",
	        temperature=0.0,
	        response_format={"type": "json_object"},
	        messages=[
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_prompt},
	        ],
	    )

	    raw = completion.choices[0].message.content
	    try:
	        parsed = json.loads(raw)
	    except (json.JSONDecodeError, TypeError):
	        # TypeError covers a reply whose content is None.
	        parsed = None

	    # Callers read the result with .get(), so anything other than a JSON object is
	    # treated the same as a parse failure.
	    if isinstance(parsed, dict):
	        return parsed

	    return {
	        "query": question,
	        "response": answer,
	        "groundedness_evaluation": {"score": None, "justification": "Failed to parse JSON evaluation."},
	        "relevance_evaluation": {"score": None, "justification": raw},
	        "faithfulness_evaluation": {"score": None, "justification": ""},
	        "context_precision_evaluation": {"score": None, "justification": ""},
	        "context_recall_evaluation": {"score": None, "justification": ""},
	    }


	# =========================================================
	# REQUIRED: Chroma Retrieval + Cross-Encoder Rerank + Prompt
	# =========================================================

	def retrieve_from_chroma(query: str, top_k: int, api_key: str) -> List[Dict[str, Any]]:
	    """Retrieve up to ``top_k`` passages from the Chroma ``collection`` by embedding similarity.

	    Args:
	        query: search text; ``None`` or blank yields ``[]``.
	        top_k: maximum number of passages wanted; values below 1 yield ``[]``.
	        api_key: OpenAI API key used to embed the query.

	    Returns:
	        A list of dicts, in the order Chroma returned them, with keys:
	        - id: str
	        - text: str
	        - metadata: dict
	        - distance: float | None
	    """
	    query = (query or "").strip()
	    if not query or top_k <= 0:
	        return []

	    available = collection.count()
	    if available == 0:
	        return []

	    q_emb = embed_texts([query], api_key)[0]
	    results = collection.query(
	        query_embeddings=[q_emb],
	        # Never ask for more rows than exist; depending on the Chroma version that
	        # either logs a warning or fails (NOTE(review): confirm for the pinned version).
	        n_results=min(top_k, available),
	    )

	    def first_row(key: str) -> list:
	        # Each field is a list with one row per query embedding; a field that is absent
	        # or None is treated as an empty row.
	        rows = results.get(key) or [[]]
	        return rows[0] or []

	    ids = first_row("ids")
	    docs = first_row("documents")
	    metas = first_row("metadatas")
	    dists = first_row("distances")

	    return [
	        {
	            "id": chunk_id,
	            "text": text,
	            "metadata": meta or {},
	            "distance": dists[i] if i < len(dists) else None,
	        }
	        for i, (chunk_id, text, meta) in enumerate(zip(ids, docs, metas))
	    ]


	def cross_encoder_rerank(query: str, docs: List[Dict[str, Any]], top_n: int) -> List[Dict[str, Any]]:
	    """Score each passage against ``query`` with the shared cross-encoder and keep the best.

	    Args:
	        query: the text every passage is scored against; blank or ``None`` yields ``[]``.
	        docs: passage dicts (the ``"text"`` entry is what gets scored); empty yields ``[]``.
	        top_n: slice bound applied to the ranked list.

	    Returns:
	        Shallow copies of the input dicts, each with an added float ``"score"`` entry,
	        sorted by score from highest to lowest and cut to ``[:top_n]``.
	    """
	    cleaned_query = (query or "").strip()
	    if not (cleaned_query and docs):
	        return []

	    encoder = get_cross_encoder()
	    relevance = encoder.predict([(cleaned_query, doc.get("text", "")) for doc in docs])

	    # Copy each passage so the caller's dicts are left untouched.
	    scored = [{**doc, "score": float(value)} for doc, value in zip(docs, relevance)]
	    ranked = sorted(scored, key=lambda item: item.get("score", float("-inf")), reverse=True)
	    return ranked[:top_n]


	def build_prompt(query: str, reranked_docs: List[Dict[str, Any]]) -> Tuple[str, str]:
	    """Build the final context string and the full LLM prompt.

	    Each passage becomes ``"Source: <source>[ | Page: <page>]"`` followed by its text;
	    passages are separated by a ``---`` rule. The page is taken from the first of the
	    metadata keys ``page``, ``page_number``, ``pageno`` that is present.

	    Args:
	        query: the question to place in the prompt.
	        reranked_docs: passage dicts with ``"text"`` and optional ``"metadata"`` entries.

	    Returns:
	        ``(context, prompt)`` where ``context`` is the joined passages and ``prompt`` is
	        the instruction text wrapping that context and the question.
	    """
	    parts = []
	    for doc in reranked_docs:
	        md = doc.get("metadata", {}) or {}
	        source = md.get("source", "unknown")
	        page = md.get("page", md.get("page_number", md.get("pageno", "")))

	        header = f"Source: {source}"
	        if page != "" and page is not None:
	            # Plain pipe: the previous "\|" was an invalid escape sequence and leaked a
	            # literal backslash into the context shown to the model.
	            header += f" | Page: {page}"

	        parts.append(f"{header}\n{doc.get('text', '')}".strip())

	    context = "\n\n---\n\n".join(parts).strip()

	    prompt = (
	        "You are a helpful assistant that answers questions ONLY using the provided document context. "
	        "If the context does not contain the answer, say you do not know.\n\n"
	        f"Context from documents:\n\n{context}\n\n"
	        f"Question: {query}\n\n"
	        "Answer based only on the context above."
	    )

	    return context, prompt


	# =========================
	# Existing Multi-Query RAG (unchanged behavior)
	# =========================

	def _merge_docs_by_id(doc_lists: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
	"""
	Merge/dedupe docs (dicts) by Chroma chunk id. Keeps the best (lowest) distance if present.
	"""
	merged: Dict[str, Dict[str, Any]] = {}
	for docs in doc_lists:
	for d in docs:
	cid = d.get("id")
	if not cid:
	continue
	if cid not in merged:
	merged[cid] = d
	else:
	# keep best distance if both have it
	old_dist = merged[cid].get("distance")
	new_dist = d.get("distance")
	if old_dist is not None and new_dist is not None and new_dist < old_dist:
	merged[cid] = d
	return list(merged.values())


	def query_rag_multi(selected_queries: List[str], api_key: str) -> str:
	    """Answer from the indexed documents using several query phrasings at once.

	    Each selected query is embedded and used to fetch up to 5 passages from Chroma; the
	    passages are merged by chunk id, the 5 with the smallest distance form the context,
	    the chat model answers from that context, and the answer is then self-evaluated.

	    Args:
	        selected_queries: query strings chosen by the user; blanks and non-strings are ignored.
	        api_key: OpenAI API key.

	    Returns:
	        Markdown containing the answer followed by the self-evaluation as JSON, or by a
	        warning if the evaluation step failed. A plain warning string is returned when no
	        query is selected, the collection is empty, or nothing was retrieved.
	    """
	    selected_queries = [q.strip() for q in (selected_queries or []) if isinstance(q, str) and q.strip()]
	    if not selected_queries:
	        return "⚠️ Please select at least one expanded query."

	    available = collection.count()
	    if available == 0:
	        return "⚠️ No documents in the database yet. Upload and index some documents first."

	    q_embs = embed_texts(selected_queries, api_key)
	    results = collection.query(
	        query_embeddings=q_embs,
	        # Never ask for more rows than exist; depending on the Chroma version that
	        # either logs a warning or fails (NOTE(review): confirm for the pinned version).
	        n_results=min(5, available),
	    )

	    # Every field holds one row per query embedding; a missing or None field is treated as empty.
	    all_ids = results.get("ids") or []
	    all_docs = results.get("documents") or []
	    all_metas = results.get("metadatas") or []
	    all_dist = results.get("distances") or []

	    doc_lists: List[List[Dict[str, Any]]] = []
	    for qi, docs_i in enumerate(all_docs):
	        docs_i = docs_i or []
	        ids_i = (all_ids[qi] if qi < len(all_ids) else None) or []
	        metas_i = (all_metas[qi] if qi < len(all_metas) else None) or []
	        dist_i = (all_dist[qi] if qi < len(all_dist) else None) or [None] * len(docs_i)
	        doc_lists.append([
	            {"id": cid, "text": doc, "metadata": meta or {}, "distance": dist}
	            for cid, doc, meta, dist in zip(ids_i, docs_i, metas_i, dist_i)
	        ])

	    merged = _merge_docs_by_id(doc_lists)
	    if not merged:
	        return "I couldn't find any relevant context in the indexed documents."

	    def by_distance(doc: Dict[str, Any]) -> Tuple[bool, float]:
	        # Passages without a distance sort last; the 0.0 is only a placeholder so that a
	        # None is never compared with a float.
	        dist = doc.get("distance")
	        return (dist is None, 0.0 if dist is None else dist)

	    top = sorted(merged, key=by_distance)[:5]

	    context = "\n\n---\n\n".join(
	        f"Source: {(d.get('metadata', {}) or {}).get('source', 'unknown')}\n{d.get('text', '')}"
	        for d in top
	    )

	    client = get_openai_client(api_key)
	    system_prompt = (
	        "You are a helpful assistant that answers questions ONLY using the provided document context. "
	        "If the context does not contain the answer, say you do not know."
	    )
	    user_prompt = (
	        f"Context from documents:\n\n{context}\n\n"
	        "Selected expanded queries:\n- " + "\n- ".join(selected_queries) + "\n\n"
	        "Answer based only on the context above."
	    )

	    completion = client.chat.completions.create(
	        model="gpt-4.1-mini",
	        messages=[
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_prompt},
	        ],
	        temperature=0.1,
	    )

	    # content can be None; treat that as an empty answer instead of failing on .strip().
	    response_text = (completion.choices[0].message.content or "").strip()

	    try:
	        eval_dict = evaluate_answer(
	            # Plain " | " separator: the previous " \| " sent literal backslashes to the
	            # evaluator and echoed them back in the displayed "query" field.
	            question=" | ".join(selected_queries),
	            context=context,
	            answer=response_text,
	            api_key=api_key,
	        )

	        # Keep only the documented fields, in a fixed order, for display.
	        log_keys = (
	            "query",
	            "response",
	            "groundedness_evaluation",
	            "relevance_evaluation",
	            "faithfulness_evaluation",
	            "context_precision_evaluation",
	            "context_recall_evaluation",
	        )
	        log_record = {key: eval_dict.get(key) for key in log_keys}

	        return (
	            f"### 💬 Answer\n\n{response_text}\n\n"
	            f"---\n\n"
	            f"### 🔍 Self-evaluation (1–5)\n\n"
	            f"```json\n{json.dumps(log_record, indent=2)}\n```"
	        )
	    except Exception as e:
	        # The evaluation is best-effort: still show the answer if it fails.
	        return (
	            f"### 💬 Answer\n\n{response_text}\n\n"
	            f"---\n\n"
	            f"⚠️ Self-evaluation failed: {e}"
	        )


	# =========================
	# Cross-Encode Stage UI Helpers
	# =========================

	def format_rerank_results_md(query: str, reranked: List[Dict[str, Any]], top_n: int) -> str:
	    """Render reranked passages as a Markdown table (rank, score, source, page, snippet).

	    Args:
	        query: accepted for call compatibility; not used in the output.
	        reranked: passage dicts with ``"text"``, optional ``"metadata"`` and ``"score"``.
	        top_n: number shown in the heading.

	    Returns:
	        The Markdown table, or a placeholder sentence when ``reranked`` is empty. Snippets
	        are flattened to one line and cut to 160 characters; a missing or non-numeric
	        score is shown as ``n/a``.
	    """
	    if not reranked:
	        return "(No reranked results to display.)"

	    def cell(value: Any) -> str:
	        # A raw "|" inside a cell would be read as a column break, so escape it.
	        return str(value).replace("|", "\\|")

	    # The delimiters must be plain pipes: with "\|" Markdown treats every pipe as literal
	    # text and the table is never rendered.
	    lines = [
	        f"### 🎯 Cross-Encoder Rerank Results (top {top_n})",
	        "",
	        "| Rank | Score | Source | Page | Snippet |",
	        "|---:|---:|---|---:|---|",
	    ]

	    for rank, doc in enumerate(reranked, start=1):
	        md = doc.get("metadata", {}) or {}
	        source = md.get("source", "unknown")
	        page = md.get("page", md.get("page_number", md.get("pageno", "")))

	        score = doc.get("score")
	        score_text = f"{score:.4f}" if isinstance(score, (int, float)) else "n/a"

	        snippet = (doc.get("text", "") or "").replace("\n", " ").strip()
	        if len(snippet) > 160:
	            snippet = snippet[:160] + "…"

	        page_text = page if page is not None else ""
	        lines.append(f"| {rank} | {score_text} | {cell(source)} | {cell(page_text)} | {cell(snippet)} |")

	    return "\n".join(lines)


	# =========================
	# Gradio Wrappers
	# =========================

	def gradio_ingest(files, api_key):
	    """Gradio handler for the indexing button.

	    Validates the API key and the upload, then indexes the files. Any exception raised
	    while indexing is reported as an error string instead of propagating.
	    """
	    if not (api_key and api_key.strip()):
	        return "❌ Please enter your OpenAI API key before indexing."
	    if not files:
	        return "⚠️ Please drop at least one document."

	    # A single (non-list) value is wrapped so the indexer always receives a list.
	    paths = files if isinstance(files, list) else [files]

	    try:
	        return add_documents_to_chroma(paths, api_key)
	    except Exception as exc:
	        return f"❌ Error during indexing: {exc}"


	def gradio_expand(question: str, api_key: str):
	    """Gradio handler that expands the question and fills the checkbox group.

	    Returns:
	        ``(checkbox_update, markdown)``: the update sets the expanded queries as choices
	        with the first one pre-selected, and the markdown previews them. On a missing key
	        or a failed expansion the choices are cleared and the markdown carries the error.
	    """
	    if not api_key or not api_key.strip():
	        return gr.update(choices=[], value=[]), "❌ Please enter your OpenAI API key first."

	    # Report failures in the UI, as the other handlers in this module do, rather than
	    # letting the exception escape into Gradio.
	    try:
	        expanded = query_expansion(question, api_key)
	    except Exception as e:
	        return gr.update(choices=[], value=[]), f"❌ Error during query expansion: {e}"

	    md = format_expansions_md(expanded)
	    default_value = expanded[:1] if expanded else []
	    return gr.update(choices=expanded, value=default_value), md


	def gradio_run_selected(selected_queries: List[str], api_key: str) -> str:
	    """Gradio handler for the search button.

	    Checks that a key and at least one query are present, then runs the multi-query
	    pipeline. Any exception from the pipeline is returned as an error string.
	    """
	    key_present = bool(api_key and api_key.strip())
	    if not key_present:
	        return "❌ Please enter your OpenAI API key before searching."
	    if not selected_queries:
	        return "⚠️ Please expand a question and select one or more to run."

	    try:
	        answer = query_rag_multi(selected_queries, api_key)
	    except Exception as exc:
	        answer = f"❌ Error during question answering: {exc}"
	    return answer


	def gradio_cross_encode(original_question: str, selected_queries: List[str], api_key: str) -> Tuple[str, str]:
	    """Gradio handler for the cross-encode button.

	    Retrieves up to 20 passages per retrieval query from Chroma, merges them by chunk id,
	    reranks them with the cross-encoder and keeps the top 5.

	    Args:
	        original_question: the question as typed; used as the scoring query when present.
	        selected_queries: selected expansions; used for retrieval when any are selected,
	            otherwise the original question is used.
	        api_key: OpenAI API key.

	    Returns:
	        ``(results_markdown, context_markdown)``: the table of reranked passages with
	        their scores, and the final context string. On a validation problem or an error
	        the first element carries the message and the second is empty.
	    """
	    if not api_key or not api_key.strip():
	        return "❌ Please enter your OpenAI API key first.", ""

	    if collection.count() == 0:
	        return "⚠️ No documents in the database yet. Upload and index some documents first.", ""

	    original_question = (original_question or "").strip()
	    selected_queries = [q.strip() for q in (selected_queries or []) if isinstance(q, str) and q.strip()]

	    if not original_question and not selected_queries:
	        return "⚠️ Please type a question and/or select expansions first.", ""

	    retrieval_queries = selected_queries if selected_queries else [original_question]
	    # Score against what the user actually asked when available, else the first retrieval query.
	    scoring_query = original_question if original_question else retrieval_queries[0]

	    # Embedding, retrieval and model loading can all fail; report that in the UI, as the
	    # other handlers in this module do, instead of letting the exception escape.
	    try:
	        retrieved_lists = [retrieve_from_chroma(q, top_k=20, api_key=api_key) for q in retrieval_queries]
	        merged_docs = _merge_docs_by_id(retrieved_lists)

	        if not merged_docs:
	            return "I couldn't find any relevant context in the indexed documents.", ""

	        reranked = cross_encoder_rerank(scoring_query, merged_docs, top_n=5)

	        # Only the context is displayed here; the prompt itself is not used.
	        context, _prompt = build_prompt(scoring_query, reranked)
	        md = format_rerank_results_md(scoring_query, reranked, top_n=5)
	    except Exception as e:
	        return f"❌ Error during cross-encoding: {e}", ""

	    return md, f"### 🧩 Final Context (for LLM)\n\n{context}"


	# =========================
	# Gradio Interface
	# =========================

	# Gradio UI definition: builds the `demo` Blocks app, creates the widgets and wires the
	# buttons to the handler functions defined above.
	# NOTE(review): this copy of the file has lost its indentation, so which widgets sit inside
	# which Row/Column cannot be read off the text; the comments below describe the statements
	# in order only — confirm the nesting against the original file.
	with gr.Blocks(title="RAG with ChromaDB") as demo:
	# Usage instructions rendered as Markdown.
	gr.Markdown(
	"""
	# 📚 RAG Q&A with ChromaDB + Gradio (Multi-Select Query Expansion + Cross-Encoder Rerank)
	1. Paste your OpenAI API key below.
	2. Drag & drop one or more documents into the upload box.
	3. Click "Index documents" to store them in a Chroma vector database.
	4. Type a question and press Enter (or click Expand) to generate expanded queries.
	5. Select one or more expanded queries.
	6. Click Run Search for the normal pipeline, or Cross Encode to view reranked passages + scores + final context.
	"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	# API key input; type="password" masks the typed value.
	api_key_box = gr.Textbox(
	label="OpenAI API Key",
	placeholder="sk-... (this is kept in memory only for this session)",
	type="password",
	)

	# Multi-file upload; type="filepath" means the handler receives file paths.
	file_input = gr.File(
	label="Drop your document(s) here",
	file_count="multiple",
	type="filepath",
	)
	ingest_button = gr.Button("Index documents")
	# Shows the status string returned by gradio_ingest.
	ingest_status = gr.Markdown("⚙️ Waiting for documents...")

	with gr.Column(scale=1):
	question_box = gr.Textbox(
	label="Type a question, then press Enter to expand",
	placeholder="e.g., What are the main findings in the report?",
	lines=3,
	)

	# The three action buttons.
	with gr.Row():
	expand_button = gr.Button("Expand")
	run_button = gr.Button("Run Search")
	cross_button = gr.Button("Cross Encode")

	# Starts empty; gradio_expand fills in the choices and the default selection.
	expanded_checks = gr.CheckboxGroup(
	label="Choose one or more expanded queries to run",
	choices=[],
	value=[],
	interactive=True,
	)

	expansions_preview = gr.Markdown("(No expansions yet — type a question and press Enter.)")
	answer_box = gr.Markdown("💬 Answer will appear here (with self-evaluation).")

	# Separator followed by the two outputs of gradio_cross_encode.
	gr.Markdown("---")
	rerank_results_box = gr.Markdown("(Cross-encoder rerank results will appear here.)")
	rerank_context_box = gr.Markdown("(Final context for the LLM will appear here.)")

	# Index the uploaded files and show the resulting status.
	ingest_button.click(
	fn=gradio_ingest,
	inputs=[file_input, api_key_box],
	outputs=[ingest_status],
	)

	# Expand on Enter
	question_box.submit(
	fn=gradio_expand,
	inputs=[question_box, api_key_box],
	outputs=[expanded_checks, expansions_preview],
	)

	# Expand on button click
	expand_button.click(
	fn=gradio_expand,
	inputs=[question_box, api_key_box],
	outputs=[expanded_checks, expansions_preview],
	)

	# Run selected expanded queries (existing pipeline)
	run_button.click(
	fn=gradio_run_selected,
	inputs=[expanded_checks, api_key_box],
	outputs=[answer_box],
	)

	# Cross-encoder rerank (new button + UI outputs)
	cross_button.click(
	fn=gradio_cross_encode,
	inputs=[question_box, expanded_checks, api_key_box],
	outputs=[rerank_results_box, rerank_context_box],
	)

	# Launch the Gradio app only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()