# Haneen211's picture
# Update app.py
# 4fb5de4 verified
import gradio as gr
from helper import download_hugging_face_embeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.memory import ConversationBufferMemory
from prompt import *
import os
import re
# --- Module-level setup: secrets, vector store, LLM, and RAG chain ---

# Get API keys from environment (Hugging Face Spaces secrets).
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Fail fast with an actionable message if either secret is missing.
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY not found! Please add it in Space Settings > Secrets")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found! Please add it in Space Settings > Secrets")

print("API keys loaded successfully!")

# Embedding model used to query the existing Pinecone index.
embeddings = download_hugging_face_embeddings()

index_name = "medical-chatbot"

# Pass API key explicitly to Pinecone.
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Connect to the pre-built index and retrieve the top-10 most similar chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 10})

# NOTE(review): "gpt-5.2" — confirm this is a valid OpenAI model id; an
# unknown id will raise at request time, not here.
chatModel = ChatOpenAI(model="gpt-5.2", api_key=OPENAI_API_KEY)

# One process-global conversation memory: all users of this Space share a
# single session (no per-user isolation).
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# system_prompt comes from `prompt.py` (star import above).
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("system", "Conversation so far: {chat_history}"),
    ("human", "{input}"),
])

question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
def format_citation(filename):
    """
    Convert a source filename into a human-readable citation.

    'Author_Year - Title' becomes 'Author (Year) – Title'; author parts
    containing 'et al.' or '&' are preserved as-is.

    Examples:
        'Ng et al._2023 - Problems and Solutions' -> 'Ng et al. (2023) – Problems and Solutions'
        'Godley & Xia_2016 - Physics Guide' -> 'Godley & Xia (2016) – Physics Guide'
        'Khan (2003) - Therapy' -> 'Khan (2003) – Therapy'
    """
    # Drop a trailing .pdf/.txt extension, case-insensitively.
    base = re.sub(r'\.(pdf|txt)$', '', filename, flags=re.IGNORECASE)

    # Try "Author_YEAR - Title" first, then "Author (YEAR) - Title".
    for pattern in (r'^(.+?)_(\d{4})\s*-\s*(.+)$',
                    r'^(.+?)\s*\((\d{4})\)\s*-\s*(.+)$'):
        hit = re.match(pattern, base)
        if hit:
            author, year, title = hit.groups()
            # Underscores inside the author part stand in for spaces.
            return f"{author.replace('_', ' ')} ({year}) – {title}"

    # Fallback: tidy underscores and turn hyphen separators into en dashes.
    return re.sub(r'\s*-\s*', ' – ', base.replace('_', ' '))
def format_latex_for_gradio(text):
    """Convert LaTeX delimiters to Gradio-friendly format while preserving markdown."""
    # Display math: \[ ... \] -> $$ ... $$ (DOTALL so it may span lines).
    converted = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', text, flags=re.DOTALL)
    # Inline math: \( ... \) -> $ ... $ (single-line only).
    return re.sub(r'\\\((.*?)\\\)', r'$\1$', converted)
def chat_function(message, history):
    """
    Gradio chat callback: run the RAG chain, append verified citations,
    and format LaTeX delimiters for display.

    Args:
        message: The user's latest message.
        history: Gradio-managed history (unused; this app keeps its own
            global ConversationBufferMemory instead).

    Returns:
        The model's answer with a trailing "**Sources:**" section.
    """
    memory.chat_memory.add_user_message(message)

    # NOTE(review): chat_history already contains the message added just
    # above, so the model sees the current question both in the history and
    # as {input} — confirm this duplication is intended.
    response = rag_chain.invoke({
        "input": message,
        "chat_history": memory.load_memory_variables({})["chat_history"]
    })
    final_answer = response["answer"]

    # Collect unique, nicely formatted citations from the retrieved docs.
    source_documents = response.get("context", [])
    unique_sources = []
    seen_sources = set()
    for doc in source_documents:
        source = doc.metadata.get('source', None)
        if source and source not in seen_sources:
            unique_sources.append(format_citation(source))
            seen_sources.add(source)

    # Remove ANY existing "Sources:" section the LLM generated. The pattern
    # also consumes surrounding markdown bold ("**Sources:**"); the previous
    # pattern (r'\n*Sources:\s*') left a dangling "**" in the answer.
    if "Sources:" in final_answer:
        final_answer = re.split(r'\n*\**Sources:\**\s*', final_answer)[0].strip()

    # Detect when the model says the retrieved context lacked the answer.
    not_found_phrases = [
        "retrieved documents do not contain",
        "not found in the retrieved",
        "no information about this",
        "retrieved documents do not mention",
        "not available in the retrieved"
    ]
    info_not_in_docs = any(phrase in final_answer.lower() for phrase in not_found_phrases)

    # Enforce correct citations: only cite documents we actually retrieved.
    if info_not_in_docs:
        final_answer += "\n\n**Sources:**\n\nNone - Answer based on general medical physics knowledge"
    elif unique_sources:
        final_answer += "\n\n**Sources:**\n\n"
        for source in unique_sources:
            final_answer += f"- {source}\n"
    else:
        final_answer += "\n\n**Sources:**\n\nNone available"

    # Convert \( \) / \[ \] delimiters to $ / $$ for Gradio's renderer.
    final_answer = format_latex_for_gradio(final_answer)

    memory.chat_memory.add_ai_message(final_answer)
    return final_answer
# Build the Gradio chat UI; the Chatbot component is configured separately so
# the LaTeX delimiter settings are easy to spot and adjust.
_chatbot_display = gr.Chatbot(
    latex_delimiters=[
        {"left": "$$", "right": "$$", "display": True},
        {"left": "$", "right": "$", "display": False},
    ],
    height=600,
)

demo = gr.ChatInterface(
    fn=chat_function,
    title="☒️ Radiotherapy Chatbot",
    description="By: Haneen Sakaji",
    examples=[
        "What is an organ at risk?",
        "What are the guidelines for single photon beam use?",
        "Calculate the activity of an Ir-192 source after 2 months if initial activity is 13.5 Ci"
    ],
    chatbot=_chatbot_display,
)

if __name__ == "__main__":
    demo.launch()