# Haneen211's picture
# Update app.py
# 4fb5de4 verified
import gradio as gr
from helper import download_hugging_face_embeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.memory import ConversationBufferMemory
from prompt import *
import os
import re
# --- Module-level setup: secrets, vector store, LLM, and RAG chain ---

# Get API keys from environment (Hugging Face Spaces secrets).
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Fail fast with an actionable message if either secret is missing.
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY not found! Please add it in Space Settings > Secrets")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found! Please add it in Space Settings > Secrets")

print("API keys loaded successfully!")

# Embedding model used to query the existing Pinecone index.
embeddings = download_hugging_face_embeddings()

index_name = "medical-chatbot"

# Pass API key explicitly to Pinecone.
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Connect to the pre-built index and retrieve the top-10 most similar chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 10})

# NOTE(review): "gpt-5.2" — confirm this is a valid OpenAI model id; an
# unknown id will raise at request time, not here.
chatModel = ChatOpenAI(model="gpt-5.2", api_key=OPENAI_API_KEY)

# One process-global conversation memory: all users of this Space share a
# single session (no per-user isolation).
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# system_prompt comes from `prompt.py` (star import above).
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("system", "Conversation so far: {chat_history}"),
    ("human", "{input}"),
])

question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
def format_citation(filename):
    """
    Convert a source filename into a human-readable citation.

    'Author_Year - Title' becomes 'Author (Year) – Title'; author parts
    containing 'et al.' or '&' are preserved as-is.

    Examples:
        'Ng et al._2023 - Problems and Solutions' -> 'Ng et al. (2023) – Problems and Solutions'
        'Godley & Xia_2016 - Physics Guide' -> 'Godley & Xia (2016) – Physics Guide'
        'Khan (2003) - Therapy' -> 'Khan (2003) – Therapy'
    """
    # Drop a trailing .pdf/.txt extension, case-insensitively.
    base = re.sub(r'\.(pdf|txt)$', '', filename, flags=re.IGNORECASE)

    # Try "Author_YEAR - Title" first, then "Author (YEAR) - Title".
    for pattern in (r'^(.+?)_(\d{4})\s*-\s*(.+)$',
                    r'^(.+?)\s*\((\d{4})\)\s*-\s*(.+)$'):
        hit = re.match(pattern, base)
        if hit:
            author, year, title = hit.groups()
            # Underscores inside the author part stand in for spaces.
            return f"{author.replace('_', ' ')} ({year}) – {title}"

    # Fallback: tidy underscores and turn hyphen separators into en dashes.
    return re.sub(r'\s*-\s*', ' – ', base.replace('_', ' '))
def format_latex_for_gradio(text):
    """Convert LaTeX delimiters to Gradio-friendly format while preserving markdown."""
    # Display math: \[ ... \] -> $$ ... $$ (DOTALL so it may span lines).
    converted = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', text, flags=re.DOTALL)
    # Inline math: \( ... \) -> $ ... $ (single-line only).
    return re.sub(r'\\\((.*?)\\\)', r'$\1$', converted)
def chat_function(message, history):
    """
    Gradio chat callback: run the RAG chain, append verified citations,
    and format LaTeX delimiters for display.

    Args:
        message: The user's latest message.
        history: Gradio-managed history (unused; this app keeps its own
            global ConversationBufferMemory instead).

    Returns:
        The model's answer with a trailing "**Sources:**" section.
    """
    memory.chat_memory.add_user_message(message)

    # NOTE(review): chat_history already contains the message added just
    # above, so the model sees the current question both in the history and
    # as {input} — confirm this duplication is intended.
    response = rag_chain.invoke({
        "input": message,
        "chat_history": memory.load_memory_variables({})["chat_history"]
    })
    final_answer = response["answer"]

    # Collect unique, nicely formatted citations from the retrieved docs.
    source_documents = response.get("context", [])
    unique_sources = []
    seen_sources = set()
    for doc in source_documents:
        source = doc.metadata.get('source', None)
        if source and source not in seen_sources:
            unique_sources.append(format_citation(source))
            seen_sources.add(source)

    # Remove ANY existing "Sources:" section the LLM generated. The pattern
    # also consumes surrounding markdown bold ("**Sources:**"); the previous
    # pattern (r'\n*Sources:\s*') left a dangling "**" in the answer.
    if "Sources:" in final_answer:
        final_answer = re.split(r'\n*\**Sources:\**\s*', final_answer)[0].strip()

    # Detect when the model says the retrieved context lacked the answer.
    not_found_phrases = [
        "retrieved documents do not contain",
        "not found in the retrieved",
        "no information about this",
        "retrieved documents do not mention",
        "not available in the retrieved"
    ]
    info_not_in_docs = any(phrase in final_answer.lower() for phrase in not_found_phrases)

    # Enforce correct citations: only cite documents we actually retrieved.
    if info_not_in_docs:
        final_answer += "\n\n**Sources:**\n\nNone - Answer based on general medical physics knowledge"
    elif unique_sources:
        final_answer += "\n\n**Sources:**\n\n"
        for source in unique_sources:
            final_answer += f"- {source}\n"
    else:
        final_answer += "\n\n**Sources:**\n\nNone available"

    # Convert \( \) / \[ \] delimiters to $ / $$ for Gradio's renderer.
    final_answer = format_latex_for_gradio(final_answer)

    memory.chat_memory.add_ai_message(final_answer)
    return final_answer
# Build the Gradio chat UI; the Chatbot component is configured separately so
# the LaTeX delimiter settings are easy to spot and adjust.
_chatbot_display = gr.Chatbot(
    latex_delimiters=[
        {"left": "$$", "right": "$$", "display": True},
        {"left": "$", "right": "$", "display": False},
    ],
    height=600,
)

demo = gr.ChatInterface(
    fn=chat_function,
    title="☒️ Radiotherapy Chatbot",
    description="By: Haneen Sakaji",
    examples=[
        "What is an organ at risk?",
        "What are the guidelines for single photon beam use?",
        "Calculate the activity of an Ir-192 source after 2 months if initial activity is 13.5 Ci"
    ],
    chatbot=_chatbot_display,
)

if __name__ == "__main__":
    demo.launch()