Spaces:

anl139
/

test

Sleeping

App Files Files Community

test / app.py

anl139

Update app.py

3f89aff verified 11 months ago

raw

history blame contribute delete

13.4 kB

	import os
	import time
	import gradio as gr
	from dotenv import load_dotenv
	from pathlib import Path
	import re
	import json
	from langchain_community.document_loaders import JSONLoader
	# Import Document from your LangChain module.
	# (Adjust the import if your version of LangChain uses a different path.)
	from langchain_core.documents import Document

	# Import additional libraries from LangChain
	from langchain_chroma import Chroma
	from langchain_openai import OpenAIEmbeddings
	from langchain_community.retrievers import BM25Retriever
	from langchain.retrievers import EnsembleRetriever
	from langchain_core.runnables import RunnablePassthrough
	from langchain_core.output_parsers import StrOutputParser
	from langchain_openai import ChatOpenAI
	from langchain.chains import create_retrieval_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain_core.prompts import ChatPromptTemplate

	# Load environment variables (e.g. from a local .env file) that hold the
	# LangChain and OpenAI API keys.
	load_dotenv()
	# Re-export each key only when it is actually set: assigning None to
	# os.environ raises TypeError, which used to crash the app at import time
	# whenever one of the keys was missing.
	for _env_key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
	    _env_value = os.getenv(_env_key)
	    if _env_value is not None:
	        os.environ[_env_key] = _env_value

	# -------------------------------
	# Utility Functions
	# -------------------------------

	def flatten_metadata(metadata):
	    """Flatten a metadata mapping into a single human-readable string.

	    Args:
	        metadata: Usually a dict; any other value is accepted too.

	    Returns:
	        For a dict, its ``key: value`` pairs (in iteration order) joined by
	        ``" | "``; for anything else, ``str(metadata)``.
	    """
	    if isinstance(metadata, dict):
	        # Plain " | " separator. The previous literal contained a backslash
	        # before the pipe, which is an invalid escape sequence (warning on
	        # recent Python versions) and leaked a stray backslash into the output.
	        return " | ".join(f"{key}: {value}" for key, value in metadata.items())
	    # Not a dict: fall back to the plain string form.
	    return str(metadata)

	def metadata_func(record, additional_fields=None):
	    """Build the metadata dict attached to each document loaded from the JSON data.

	    Passed to ``JSONLoader`` as its ``metadata_func`` callback.

	    Args:
	        record: One JSON record (a dict); missing keys default to "".
	        additional_fields: Accepted for the callback signature; not used here.

	    Returns:
	        A flat dict of project, organization, grant-status, link and location
	        fields. The URL and social-media groups are collapsed into single
	        strings via ``flatten_metadata``.
	    """
	    # NOTE: an unused ``is_winner`` flag used to be computed here with
	    # ``record.get("Ranking", "").lower()``. It was never read and raised
	    # AttributeError when "Ranking" was present but not a string, so it is gone.
	    return {
	        "Project Title": record.get("Title", ""),
	        "Organization": record.get("Organization", ""),
	        "LA 2050 Grant Status": record.get("Ranking", ""),
	        "Impact Metrics": record.get("Impact Metrics", ""),
	        "LA 2050 Year": record.get("Year", ""),
	        "Organizations urls": flatten_metadata({
	            "Organization website": record.get("Website", ""),
	            "Organization newsletter": record.get("Newsletter", ""),
	            "volunteer": record.get("Volunteer", ""),
	            "LA2050 website": record.get("LA2050", ""),
	        }),
	        "social": flatten_metadata({
	            "twitter": record.get("Twitter", ""),
	            "instagram": record.get("Instagram", ""),
	            "facebook": record.get("FaceBook", ""),
	        }),
	        "working_area": record.get("Working Areas in LA", ""),
	        "zipcode": record.get("Zipcode", ""),
	    }
	# Read data.json: jq_schema '.[]' selects each top-level element, its
	# "Summary" field becomes the document text, and metadata_func supplies
	# the metadata stored alongside it.
	loader = JSONLoader(
	    metadata_func=metadata_func,
	    content_key='Summary',
	    jq_schema='.[]',
	    file_path='data.json',
	)
	data = loader.load()


	# Splitter that cuts document text into chunks of up to 1600 characters
	# with a 150-character overlap, trying the separators in the listed order.
	# (If you find that key fields are getting split, consider implementing a custom splitter.)
	# NOTE(review): add_start_index appears to matter only for
	# create_documents/split_documents, while this file calls split_text - confirm.
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	text_splitter = RecursiveCharacterTextSplitter(
	    separators=["\n\n", "\n", ". ", " ", ""],
	    add_start_index=True,
	    chunk_overlap=150,
	    chunk_size=1600,
	)
	def split_document_with_metadata(document):
	    """Split one document into chunks that each carry the document's metadata.

	    Args:
	        document: An object with ``page_content`` (text) and ``metadata`` (dict).

	    Returns:
	        A list of ``Document`` objects, one per chunk produced by
	        ``text_splitter.split_text``; empty when the splitter yields no chunks.
	    """
	    # Split the document text into chunks.
	    chunks = text_splitter.split_text(document.page_content)
	    # Give every chunk its own shallow copy of the metadata. Sharing the one
	    # dict object meant that mutating a chunk's metadata silently changed
	    # every sibling chunk and the source document as well.
	    return [
	        Document(page_content=chunk, metadata=dict(document.metadata))
	        for chunk in chunks
	    ]

	# Flatten the per-document chunk lists into one list, in document order.
	all_splits = [
	    piece
	    for source_document in data
	    for piece in split_document_with_metadata(source_document)
	]
	# -------------------------------
	# Set Up Retrievers
	# -------------------------------

	# Create (or reuse) a Chroma vector store for the document splits.
	persist_directory = "path_to_persist_directory"
	_embeddings = OpenAIEmbeddings()

	# Reuse a persisted store when one exists and already holds data. This used
	# to call Chroma.load(), which does not appear to exist in langchain_chroma,
	# so the load always failed, the broad except swallowed it, and every
	# start-up re-embedded the whole corpus and appended duplicates to the
	# persisted collection.
	# NOTE: a reused store is not refreshed when data.json changes; delete the
	# directory to force re-embedding.
	vectorstore = None
	if os.path.exists(persist_directory):
	    try:
	        _persisted_store = Chroma(
	            persist_directory=persist_directory,
	            embedding_function=_embeddings,
	        )
	        # An existing directory may still contain an empty collection;
	        # only reuse the store if at least one record is present.
	        if _persisted_store.get(limit=1)["ids"]:
	            vectorstore = _persisted_store
	            print("Loaded existing vector store from persisted directory.")
	    except Exception as e:
	        # Start-up boundary: report and fall back to rebuilding the store.
	        print(f"Error loading vector store: {e}. Proceeding to create a new one.")

	if vectorstore is None:
	    # No usable persisted store: embed the splits and persist them.
	    vectorstore = Chroma.from_documents(
	        documents=all_splits,
	        embedding=_embeddings,
	        persist_directory=persist_directory,
	    )
	    print("Created new vector store and persisted embeddings.")

	# Hybrid retrieval: a BM25 retriever built from the splits is combined with
	# the vector-store retriever, weighted 0.9 (vector store) to 0.1 (BM25).
	bm25_retriever = BM25Retriever.from_documents(all_splits)
	_hybrid_members = [vectorstore.as_retriever(), bm25_retriever]
	ensemble_retriever = EnsembleRetriever(retrievers=_hybrid_members, weights=[0.9, 0.1])
	# Alias used by the chain and the helper functions.
	retriever = ensemble_retriever

	# -------------------------------
	# Prepare Retrieval and Generation Chain
	# -------------------------------

	# System prompt for the chat model, assembled from adjacent string literals.
	# It defines the assistant's persona, the required answer format (organization
	# name, short description, link), the preference for grant winners, and the
	# instruction to answer only from the supplied documents.
	# The trailing "{context}" is a template placeholder; presumably it is filled
	# with the retrieved documents by the stuff-documents chain - confirm.
	# NOTE(review): the closing quote in "(disregard if it has 'Submitted)'" looks
	# misplaced; it is left untouched here because the literal is runtime text.
	system_prompt = (
	"You are the LA2050 Navigator, an AI-powered chatbot created to help users discover organizations and community initiatives featured in the Goldhirsh Foundation’s LA2050 Ideas Hub. "
	"Your role is to deliver succinct, personalized recommendations, guide users toward supporting these initiatives, and answer questions about the Goldhirsh Foundation, LA2050, and its projects. "
	"When responding, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website (labeled as Organization website) or social media; (please do not alter the URL). "
	"If an organization’s personal website is unavailable, refer to its LA2050 URL. "
	"Prioritize nonprofit organizations designated as 'winners' by the Goldhirsh Foundation and those with multiple proposal submissions. "
	"If a user inquires about the LA2050 grant winners for a specific year, be sure to look out for 'LA 2050 Grant Status'-explicitly noting if the organization was awarded the grant that year(disregard if it has 'Submitted)'. "
	"Use the data files as your primary source of information. These files have been pre-processed into context-rich segments using a recursive text-splitting approach to ensure key details are preserved. "
	"If some information is missing, acknowledge it and direct the user to additional resources. "
	"Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
	"If the user responds with a follow-up confirmation (e.g., 'yes') after an initial answer, please expand on that topic with further details. "
	"\n\nIMPORTANT: Answer the question using ONLY the information provided in the following documents. DO NOT invent or include any organizations that are not present in the retrieved evidence. "
	"Before giving your final answer, perform the following steps: "
	"Step 1: Identify all organizations mentioned in the retrieved documents. "
	"Step 2: Check if there are any organizations beyond those provided that could be considered 'new'. "
	"Step 3: If no additional organizations exist, clearly state that based on the current dataset, these are all the organizations we have information on. "
	"\n\n{context}"
	)




	# Two-message chat prompt: the system instructions followed by the user's input.
	prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])

	# Build the chain: a gpt-4o-mini chat model (temperature 0) answers from the
	# documents combined into the prompt, and the retrieval chain feeds it the
	# documents fetched by `retriever`.
	_chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
	question_answer_chain = create_stuff_documents_chain(_chat_model, prompt)
	rag_chain = create_retrieval_chain(retriever, question_answer_chain)

	def post_process_answer(answer, retrieved_docs):
	    """Return ``answer`` followed by a note listing the retrieved organizations.

	    The note names every distinct, non-blank "Organization" metadata value of
	    ``retrieved_docs`` (whitespace-stripped, sorted). The answer text itself is
	    neither inspected nor filtered; only the note is appended.

	    Args:
	        answer: The generated answer text.
	        retrieved_docs: Iterable of objects exposing a ``metadata`` dict.

	    Returns:
	        The answer with the bracketed disclaimer appended.
	    """
	    org_names = set()
	    for item in retrieved_docs:
	        name = item.metadata.get("Organization", "").strip()
	        # Blank or missing organization names are left out of the listing.
	        if name:
	            org_names.add(name)
	    listing = ", ".join(sorted(org_names))
	    return (
	        answer
	        + "\n\n[Answer verified against retrieved documents: Only organizations present in the evidence were included. Allowed organizations: "
	        + listing
	        + ".]"
	    )

	def debug_retrieved_docs(user_input):
	    """Retrieve documents for ``user_input`` and print their metadata.

	    Args:
	        user_input: The query string handed to the module-level ``retriever``.

	    Returns:
	        The list of retrieved documents.
	    """
	    # Use invoke(), as the chat handler does; get_relevant_documents() is the
	    # deprecated spelling of the same call.
	    retrieved_docs = retriever.invoke(user_input)
	    print(f"DEBUG: Retrieved {len(retrieved_docs)} documents.")
	    for i, doc in enumerate(retrieved_docs, start=1):
	        print(f"Doc {i}: {doc.metadata}")
	    return retrieved_docs
	# -------------------------------
	# Gradio Interface and Conversation Handling
	# -------------------------------

	# Custom green colour scale used as the theme's primary hue.
	_brand_palette = gr.themes.Color(
	    c50="#00A168",
	    c100="#57B485",
	    c200="#D7ECE0",
	    c300="#FFFFFF",
	    c400="#EAE9E9",
	    c500="#000000",
	    c600="#3A905E",
	    c700="#2A774A",
	    c800="#1A5E36",
	    c900="#0A4512",
	    c950="#052A08",
	)

	# Space Grotesk first, then generic sans-serif fallbacks.
	_font_stack = [gr.themes.GoogleFont('Space Grotesk'), 'ui-sans-serif', 'system-ui', 'sans-serif']

	# Individual theme variables overridden on top of the base theme.
	_theme_overrides = {
	    "body_background_fill": '#00A168',
	    "body_text_color": '#000000',
	    "background_fill_primary": '#FFFFFF',
	    "background_fill_secondary": '#FFFFFF',
	    "border_color_accent": '#57B485',
	    "border_color_accent_subdued": '#EAE9E9',
	    "color_accent": '#57B485',
	    "color_accent_soft": '#D7ECE0',
	    "checkbox_background_color": '#FFFFFF',
	    "button_primary_background_fill": '#57B485',
	    "button_primary_background_fill_hover": '#3A905E',
	    "button_secondary_background_fill": '#D7ECE0',
	    "button_secondary_text_color": '#000000',
	}

	# Theme handed to gr.Blocks below.
	green_theme = gr.themes.Base(primary_hue=_brand_palette, font=_font_stack).set(**_theme_overrides)


	def message_and_history(message, history):
	    """Handle one chat submission and stream the assistant's reply.

	    Generator used as the textbox submit handler. Each yield is the pair
	    ``(history, history)``: the message list for the chatbot and the same
	    list for the conversation state.

	    Args:
	        message: The submitted value, either a plain string or a dict with a
	            "text" entry. A None/empty payload is treated as an empty message.
	        history: List of ``{"role", "content"}`` message dicts from earlier
	            turns. When non-empty it is extended in place; when empty a new
	            list starting with the welcome message is created.

	    Yields:
	        ``(history, history)`` after every streamed character and once more
	        when the reply is complete.
	    """
	    prefix = "<b>LA2050 Navigator:</b><br>"

	    # Initialize conversation with a welcome message if history is empty.
	    if not history:
	        history = [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]

	    # The message may be a string or a dict; a missing or None "text" (or a
	    # None payload) becomes "" instead of raising AttributeError.
	    if isinstance(message, str):
	        user_text = message
	    else:
	        user_text = (message or {}).get("text") or ""
	    history.append({"role": "user", "content": user_text})

	    # Short fixed pause before responding (kept from the original behaviour).
	    time.sleep(1)

	    # Reject empty and whitespace-only input without calling the model.
	    if not user_text.strip():
	        history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
	        yield history, history
	        return

	    # Build the query from the latest message only (the user input appended
	    # above), formatted as "role: content" with the assistant prefix removed.
	    # NOTE(review): earlier turns are not included, so a bare follow-up such
	    # as "yes" reaches the chain without context - widen the slice if wanted.
	    conversation_context = "\n".join(
	        f"{msg['role']}: {msg['content'].replace(prefix, '')}" for msg in history[-1:]
	    )

	    try:
	        # The retrieval chain fetches the documents itself and returns them
	        # under "context", so no separate retriever call is needed. The old
	        # extra call doubled the retrieval work per message and ran outside
	        # this try, so a retrieval failure crashed the handler.
	        response = rag_chain.invoke({"input": conversation_context})
	        answer = response["answer"]
	    except Exception as e:
	        # UI boundary: show the failure in the chat instead of raising.
	        answer = f"An error occurred: {e}"
	    else:
	        retrieved_docs = response.get("context", [])
	        print(f"DEBUG: Retrieved {len(retrieved_docs)} documents.")
	        for i, doc in enumerate(retrieved_docs, start=1):
	            print(f"Doc {i} Page Content: {doc.page_content}")

	    # Remove the prefix if the model includes it.
	    if answer.startswith(prefix):
	        answer = answer[len(prefix):]

	    # Start the assistant message with the prefix; the answer is appended below.
	    assistant_response = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
	    history.append(assistant_response)

	    # Stream the answer character by character. assistant_response is the
	    # same dict object as history[-1], so updating it updates the history.
	    for character in answer:
	        assistant_response["content"] += character
	        yield history, history

	    # Final yield so the caller receives the complete reply even when the
	    # answer is empty and the loop above produced nothing.
	    yield history, history

	# Set Gradio to light mode via JavaScript.
	# The snippet is passed to gr.Blocks through its `js` argument. It reads the
	# page URL and, when the `__theme` query parameter is not 'light', sets it to
	# 'light' and navigates to the updated URL (which reloads the page).
	js_func = """
	function refresh() {
	const url = new URL(window.location);
	if (url.searchParams.get('__theme') !== 'light') {
	url.searchParams.set('__theme', 'light');
	window.location.href = url.href;
	}
	}
	"""

	# Custom CSS passed to gr.Blocks: centres the page header and renders it in
	# white. The first rule now uses `color`; the former `text-color` is not a
	# CSS property, so browsers ignored that declaration.
	css = """
	.chat-header {
	    color: #FFFFFF;
	    text-align: center;
	}
	.gradio-container .prose .chat-header h1 {
	    color: #FFFFFF;
	    text-align: center;
	}
	"""

	# Greeting shown in the chat window before the user sends anything.
	_welcome_message = {
	    "role": "assistant",
	    "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?",
	}

	# Page layout: header, chat window, conversation state and input box.
	with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
	    gr.HTML('<div class="chat-header"><h1>LA2050 Navigator</h1></div>')

	    chatbot = gr.Chatbot(value=[_welcome_message], type="messages", bubble_full_width=False)

	    # Conversation history kept between submissions; starts empty, so the
	    # handler adds the welcome message itself on the first turn.
	    state = gr.State([])

	    message = gr.MultimodalTextbox(
	        interactive=True,
	        file_count="multiple",
	        placeholder="Type a message",
	        label="",
	        elem_classes="custom-textbox",
	        scale=3,
	        show_label=False,
	    )

	    # On submit, run the chat handler with the new input and the stored
	    # history, then clear the input box once the handler has finished.
	    _submit_event = message.submit(
	        message_and_history,
	        inputs=[message, state],
	        outputs=[chatbot, state],
	    )
	    _submit_event.then(lambda: "", inputs=[], outputs=message)

	# Start the app (debug mode, with a public share link requested).
	block.launch(debug=True, share=True)