# Spaces: Sleeping  (Hugging Face Spaces page-status residue from web
# extraction — not part of the source; commented out so the file parses)
| import streamlit as st | |
| import os | |
| from typing import List, Tuple, Optional | |
| from pinecone import Pinecone | |
| from langchain_pinecone import PineconeVectorStore | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.prompts import PromptTemplate | |
| from dotenv import load_dotenv | |
| from RAG import RAG | |
| import logging | |
| from image_scraper import DigitalCommonwealthScraper | |
| import shutil | |
# Configure module-level logging for the app.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Page configuration — must run before any other Streamlit call.
st.set_page_config(
    page_title="Boston Public Library Chatbot",
    # NOTE(review): the original icon was mojibake ("π€"); restored to the
    # robot emoji it most likely encoded — confirm against the deployed app.
    page_icon="🤖",
    layout="wide",
)
def initialize_models() -> Tuple[Optional[ChatOpenAI], Optional[HuggingFaceEmbeddings]]:
    """Initialize and cache the language model, embeddings, and vector store.

    All objects are cached in ``st.session_state`` so they are built only
    once per Streamlit session.

    Returns:
        ``(llm, embeddings)`` on success, or ``(None, None)`` on failure.
        (Bug fix: the original implicitly returned ``None`` on success even
        though the signature promises a tuple.)
    """
    try:
        load_dotenv()

        if "llm" not in st.session_state:
            # Initialize OpenAI chat model.
            st.session_state.llm = ChatOpenAI(
                model="gpt-3.5-turbo",
                temperature=0,
                timeout=60,  # reasonable timeout so requests cannot hang forever
                max_retries=2,
            )

        if "embeddings" not in st.session_state:
            # Sentence-transformers model used to embed queries/documents.
            st.session_state.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-mpnet-base-v2"
                # model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

        if "pinecone" not in st.session_state:
            pinecone_api_key = os.getenv("PINECONE_API_KEY")
            INDEX_NAME = 'bpl-test'
            # Initialize the Pinecone-backed vector store.
            pc = Pinecone(api_key=pinecone_api_key)
            index = pc.Index(INDEX_NAME)
            st.session_state.pinecone = PineconeVectorStore(
                index=index, embedding=st.session_state.embeddings
            )

        if "vectorstore" not in st.session_state:
            # st.session_state.vectorstore = CloudSQLVectorStore(embedding=st.session_state.embeddings)
            st.session_state.vectorstore = st.session_state.pinecone

        return st.session_state.llm, st.session_state.embeddings
    except Exception as e:
        logger.error(f"Error initializing models: {str(e)}")
        st.error(f"Failed to initialize models: {str(e)}")
        return None, None
def process_message(
    query: str,
    llm: ChatOpenAI,
    vectorstore: PineconeVectorStore,
) -> Tuple[str, List]:
    """Run the user's question through the RAG pipeline.

    Args:
        query: The user's question text.
        llm: Chat model used to generate the answer.
        vectorstore: Vector store searched for supporting documents.

    Returns:
        A ``(response, sources)`` pair. On failure the response is an error
        message string and sources is an empty list.
    """
    try:
        answer, docs = RAG(
            query=query,
            llm=llm,
            vectorstore=vectorstore,
        )
    except Exception as e:
        logger.error(f"Error in process_message: {str(e)}")
        return f"Error processing message: {str(e)}", []
    return answer, docs
def display_sources(sources: List) -> None:
    """Display sources with minimal output: content preview, source ID, URL,
    and an image (or an audio placeholder) where available.

    Args:
        sources: List of retrieved documents; each is expected to carry a
            ``metadata`` dict and (usually) ``page_content``.
    """
    if not sources:
        st.info("No sources available for this response.")
        return

    st.subheader("Sources")
    for doc in sources:
        try:
            metadata = doc.metadata
            source = metadata.get("source", "Unknown Source")
            title = metadata.get("title_info_primary_tsi", "Unknown Title")
            format_type = metadata.get("format", "").lower()
            is_audio = "audio" in format_type

            # NOTE(review): the audio prefix was mojibake ("π") in the
            # original; restored to the speaker emoji it most likely encoded.
            expander_title = f"🔊 {title}" if is_audio else title
            with st.expander(expander_title):
                # Content preview (first 300 characters).
                if hasattr(doc, 'page_content'):
                    st.markdown(f"**Content:** {doc.page_content[:300]} ...")

                # Fall back to a Digital Commonwealth search URL when the
                # metadata does not provide one.
                doc_url = metadata.get("URL", "").strip()
                if not doc_url and source:
                    doc_url = f"https://www.digitalcommonwealth.org/search/{source}"

                st.markdown(f"**Source ID:** {source}")
                st.markdown(f"**Format:** {format_type if format_type else 'Not specified'}")
                st.markdown(f"**URL:** {doc_url}")

                if is_audio:
                    # No playable media URL is exposed in metadata yet; label
                    # the entry instead of embedding a player.
                    st.info("This is an audio entry.")
                    # Optionally:
                    # st.audio("https://example.com/audio-file.mp3")  # replace with real audio URL
                else:
                    # Show the first image scraped from the source page.
                    scraper = DigitalCommonwealthScraper()
                    images = scraper.extract_images(doc_url)[:1]
                    if images:
                        # Clear previously downloaded images so stale files
                        # are never shown for the wrong document.
                        output_dir = 'downloaded_images'
                        if os.path.exists(output_dir):
                            shutil.rmtree(output_dir)
                        downloaded_files = scraper.download_images(images)
                        st.image(
                            downloaded_files,
                            width=400,
                            # Bug fix: original used f'Image' (an f-string
                            # with no placeholder); a plain literal suffices.
                            caption=[img.get('alt', 'Image') for img in images],
                        )
        except Exception as e:
            logger.warning(f"[display_sources] Error displaying document: {e}")
            st.error("Error displaying one of the sources.")
def main():
    """Render the chat UI and drive the retrieval-augmented Q&A loop."""
    # NOTE(review): UI emoji below were mojibake in the original ("π€",
    # "βοΈ", "β"); restored to the characters they most likely encoded.
    st.title("Digital Commonwealth RAG 🤖")
    # Removed unused local INDEX_NAME = 'bpl-rag': it was never read here and
    # contradicted the 'bpl-test' index actually used in initialize_models().

    # Initialize per-session state.
    if "messages" not in st.session_state:
        st.session_state.messages = []
    if "show_settings" not in st.session_state:
        st.session_state.show_settings = False
    if "num_sources" not in st.session_state:
        st.session_state.num_sources = 10

    initialize_models()

    # Settings button opens a simple inline settings panel.
    if st.button("⚙️ Settings"):
        st.session_state.show_settings = True

    if st.session_state.show_settings:
        with st.container():
            st.markdown("---")
            st.markdown("### ⚙️ Settings")
            num_sources = st.number_input(
                "Number of Sources to Display",
                min_value=1,
                max_value=100,
                value=st.session_state.num_sources,
                step=1,
            )
            st.session_state.num_sources = num_sources
            if st.button("❌ Close Settings"):
                st.session_state.show_settings = False
            st.markdown("---")

    # Replay chat history.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input box is always pinned to the bottom of the page.
    user_input = st.chat_input("Type your question here...")
    if user_input:
        with st.chat_message("user"):
            st.markdown(user_input)
        st.session_state.messages.append({"role": "user", "content": user_input})

        with st.chat_message("assistant"):
            with st.spinner("Thinking... Please be patient..."):
                response, sources = process_message(
                    query=user_input,
                    llm=st.session_state.llm,
                    vectorstore=st.session_state.vectorstore,
                )
                if isinstance(response, str):
                    st.markdown(response)
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response,
                    })
                    display_sources(sources[:int(st.session_state.num_sources)])
                else:
                    st.error("Received an invalid response format")

    # Footer (rendered above the pinned chat input).
    st.markdown("---")
    st.markdown(
        "Built with Langchain + Streamlit + Pinecone",
        help="Natural Language Querying for Digital Commonwealth"
    )
    st.markdown(
        "The Digital Commonwealth site provides access to photographs, manuscripts, books, "
        "audio recordings, and other materials of historical interest that have been digitized "
        "and made available by members of Digital Commonwealth."
    )


if __name__ == "__main__":
    main()