# bioethics-rag / src/chatbot.py
# (provenance: commit 66b97d2, "Patching metadata", by ciorant)
# Imports grouped per PEP 8 (stdlib / third-party / local); the
# BaseCallbackHandler import previously sat mid-file after executable code.
import logging
import os
from pathlib import Path

from dotenv import load_dotenv
from langchain.callbacks.base import BaseCallbackHandler
from langchain_openai import ChatOpenAI

from src.document_processor import DocumentProcessor
from src.metadata_patcher import patch_metadata_for_store
from src.vector_store import FAISSVectorStore

# Load .env first so the key check below sees values from the file,
# then fail fast if the OpenAI key is still missing.
load_dotenv()
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("OPENAI_API_KEY environment variable is not set")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class StreamHandler(BaseCallbackHandler):
    """LangChain callback that accumulates streamed tokens.

    The growing text is mirrored into a UI placeholder (if one has been
    attached by the UI layer) so the in-flight answer renders live.
    """

    def __init__(self):
        self.current_text = ""
        self.placeholder = None  # attached later by the UI

    def on_llm_new_token(self, token: str, **kwargs):
        """Append the new token and refresh the live placeholder."""
        self.current_text += token
        if self.placeholder is None:
            return
        try:
            # keep an element id so JS can find & center the in-flight answer
            self.placeholder.markdown(
                f"<div id='assistant-inflight'>{self.current_text}▌</div>",
                unsafe_allow_html=True
            )
        except Exception:
            # placeholder may be invalid during reruns; ignore errors
            pass

    def get_text(self):
        """Return all text streamed so far."""
        return self.current_text
class BioethicsChatbot:
    """RAG chatbot over a corpus of bioethics PDFs.

    Retrieves relevant chunks from a FAISS vector store and answers with a
    streaming OpenAI chat model, grouping sources into citation-confidence
    tiers that the prompt instructs the model to cite differently.
    """

    def __init__(self, data_dir: str = "data/sample_papers"):
        """Load the vector index from disk, or build it from PDFs in ``data_dir``.

        Raises:
            ValueError: if no index exists on disk and ``data_dir`` contains no PDFs.
        """
        self.processor = DocumentProcessor()
        self.vector_store = FAISSVectorStore()
        self.history = []
        # Similarity-score cutoffs for the citation tiers used by
        # get_citation_confidence().
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.65,
            'low': 0.5,
        }
        if not self.vector_store.load_index():
            # Consistency fix: use the module logger instead of a bare print.
            logger.info("No existing vector store, creating one...")
            pdf_files = list(Path(data_dir).glob("*.pdf"))
            if not pdf_files:
                raise ValueError(f"No PDFs found in {data_dir}")
            chunks = self.processor.process_documents([str(p) for p in pdf_files])
            self.vector_store.add_documents(chunks)
            logger.info("Indexed %d documents.", len(chunks))
        else:
            logger.info("Index loaded from disk")
        # Known papers whose extracted metadata is wrong or missing;
        # patch them in place so citations come out correctly.
        metadata_fixes = {
            "A_Theory_of_Bioethics.pdf": {"authors": "DeGrazia and Millum", "year": "2021"},
            "588.full": {"authors": "Wilkinson et al.", "year": "2024"},
            "The Concept of Personal Utility in Genomic Testing Three Ethical Tensions": {"authors": "Watts and Newson", "year": "2025"},
        }
        patch_metadata_for_store(self.vector_store, metadata_fixes)
        self.stream_handler = StreamHandler()
        self.llm = ChatOpenAI(model="gpt-4o-mini", streaming=True,
                              callbacks=[self.stream_handler])

    def add_new_document(self, pdf_path: str):
        """Process ``pdf_path`` and add its chunks to the index, skipping duplicates."""
        filename = Path(pdf_path).name
        # Check if already in the index
        existing_files = {doc["metadata"].get("filename") for doc in self.vector_store.documents}
        if filename in existing_files:
            # BUG FIX: the original f-string had no placeholder and printed
            # the literal text "(unknown)" instead of the filename.
            print(f"Skipping {filename}: already indexed.")
            return
        # Otherwise process & add
        chunks = self.processor.process_document(pdf_path)
        self.vector_store.add_documents(chunks)
        print(f"Added {len(chunks)} chunks from {pdf_path}")

    def get_citation_confidence(self, similarity_score: float) -> str:
        """Map a similarity score onto a citation-confidence tier.

        Returns:
            One of ``"high_confidence"``, ``"medium_confidence"``,
            ``"low_confidence"`` or ``"context_only"``.
        """
        if similarity_score >= self.confidence_thresholds['high']:
            return "high_confidence"
        if similarity_score >= self.confidence_thresholds['medium']:
            return "medium_confidence"
        if similarity_score >= self.confidence_thresholds['low']:
            return "low_confidence"
        return "context_only"

    def _build_context(self, results, history_pairs):
        """Assemble the prompt context: history, all sources, and citation groups."""
        context_blocks = []
        citation_groups = {
            'high_confidence': [],
            'medium_confidence': [],
            'low_confidence': [],
            'context_only': []
        }
        for r in results:
            meta = r["metadata"]
            title = meta.get("title")
            authors = meta.get("authors")
            year = meta.get("year", "n.d.")
            confidence = self.get_citation_confidence(r["similarity_score"])
            block = (
                f"Source: {authors} ({year}). *{title}* "
                f"[chunk {meta.get('chunk_id', '?')}, confidence: {confidence}]\n"
                f"{r['content']}\n"
            )
            context_blocks.append(block)
            # Only sources with a known author are eligible for citation.
            if authors is not None and authors != "Unknown Author(s)":
                citation_groups[confidence].append(block)
        if history_pairs:
            # Keep only the last 4 turns to bound prompt size.
            limited = history_pairs[-4:]
            history_text = "\n".join(f"User: {u}\nBot: {b}" for u, b in limited)
        else:
            history_text = "No previous conversation."
        joined_context = "\n\n".join(context_blocks)
        joined_high = "\n\n".join(citation_groups['high_confidence']) or "None"
        joined_medium = "\n\n".join(citation_groups['medium_confidence']) or "None"
        joined_low = "\n\n".join(citation_groups['low_confidence']) or "None"
        return f"""
Conversation so far:
{history_text}
Relevant sources (use them to guide your answer, but cite only the ones in citation groups):
{joined_context}
DO NOT CITE IF THE AUTHOR IS "Unknown Author(s)".
CITATION GUIDELINES:
- HIGH CONFIDENCE sources: Use direct citations "(Author, Year)"
- MEDIUM CONFIDENCE sources: Use "According to Author (Year)..."
- LOW CONFIDENCE sources: Use "(see Author, Year)"
High confidence sources:
{joined_high}
Medium confidence sources:
{joined_medium}
Low confidence sources:
{joined_low}
"""

    def ask(self, question: str, k: int = 10, history_pairs=None) -> str:
        """Answer ``question`` using the top-``k`` retrieved chunks.

        Args:
            question: The user's question.
            k: Number of chunks to retrieve from the vector store.
            history_pairs: Optional list of ``(user, bot)`` turns; only the
                last four are included in the prompt.

        Returns:
            The streamed answer text (also mirrored to the UI via the
            stream handler's placeholder).
        """
        # Step 1: Retrieve relevant chunks
        results = self.vector_store.search(question, k=k)
        # DEBUG: Print what we found
        print(f"Found {len(results)} results for query: '{question}'")
        for i, r in enumerate(results[:3]):  # Show top 3
            score = r.get('similarity_score')
            # BUG FIX: the original applied ':.3f' to the 'N/A' fallback
            # string, raising ValueError whenever the score was missing.
            score_text = f"{score:.3f}" if isinstance(score, (int, float)) else "N/A"
            print(f"Result {i + 1} (score: {score_text}): {r['content'][:200]}...")
        if not results:
            return "I couldn't find relevant information in the documents."
        # Step 2: Build context from retrieved chunks (history + citation groups)
        context = self._build_context(results, history_pairs)
        # Step 3: Construct prompt
        # (Grammar fix in the off-topic rule: the original double negative
        # "doesn't concern neither ... nor" garbled the instruction.)
        prompt = f"""
You are a bioethics expert assistant.
Answer the user's question using the context provided below.
Draw justified connections between concepts even if not explicitly stated.
If you need to make reasonable inferences based on the context, do so.
If the context doesn't contain enough information, say what you do know from the context and indicate what information is missing.
If the question concerns neither bioethics nor previous questions, inform the user about it and don't answer it. Do not
be rude; respond to a greeting or goodbye.
Context:
{context}
Question: {question}
Answer:
"""
        # Reset the stream buffer so this call's tokens start fresh.
        self.stream_handler.current_text = ""
        # streaming happens here; the invoke() return value is unused because
        # the callback collects the tokens.
        _ = self.llm.invoke(prompt)
        return self.stream_handler.get_text()