File size: 13,439 Bytes
6b74882 e6a37df 6b74882 caca53e 2098069 4ac8a78 8e12a2a 4ac8a78 8e12a2a 4ac8a78 8e12a2a caca53e 4ac8a78 caca53e 4ac8a78 8e12a2a 4ac8a78 caca53e 4ac8a78 6b6cdaa 4ac8a78 43551fe db281a9 0a15c33 6b74882 db281a9 a3052bc 9d8bc7f 6d0b309 db281a9 a256849 4f3c8f5 a256849 4f3c8f5 a256849 db281a9 a256849 afb88fe a256849 43551fe 6b74882 a444de9 4f3c8f5 a444de9 4c80355 6b74882 1ae1f3d 8e12a2a a30cb90 5146408 a30cb90 a3052bc a30cb90 8e12a2a db281a9 43551fe 85d9ea1 8e12a2a 81bfe4a 8e12a2a 3f89aff 8b6f5b2 8e12a2a 4c80355 8e12a2a 81bfe4a 7b879ab 85d9ea1 4c80355 2098069 cf08f6f db281a9 6b74882 4c80355 43551fe 81bfe4a 1ae1f3d 8099edb 4c80355 cf08f6f db281a9 81bfe4a 40fdda1 6b74882 f752e13 4c80355 db281a9 6b74882 4c80355 db281a9 6b74882 73e480f 1dbb807 d7c5f43 b9aff09 ff77dbe 556e2af 8099edb d8e40b3 4c80355 db281a9 4c80355 db281a9 81bfe4a c564996 db281a9 6b74882 73e480f 81bfe4a 6b74882 dd586dd 6b74882 db281a9 dd586dd db281a9 6b74882 f752e13 73e480f db281a9 6b74882 cf08f6f 4c80355 cf08f6f 4c80355 cf08f6f 5a90b6e db281a9 5a90b6e db281a9 5a90b6e 5d67d40 cf08f6f 5d67d40 cf08f6f 5a90b6e f752e13 cf08f6f db281a9 cf08f6f db281a9 cf08f6f db281a9 5a90b6e 43551fe f752e13 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 |
import os
import time
import gradio as gr
from dotenv import load_dotenv
from pathlib import Path
import re
import json
from langchain_community.document_loaders import JSONLoader
# Import Document from your LangChain module.
# (Adjust the import if your version of LangChain uses a different path.)
from langchain_core.documents import Document
# Import additional libraries from LangChain
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
# Load environment variables for LangChain and OpenAI (read from a .env file when one is present).
load_dotenv()
# os.environ only accepts str values, so assigning os.getenv(...) for a key that
# is not set would raise TypeError while the module is still being imported.
# Re-export each key only when it actually has a value.
for _env_key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    _env_value = os.getenv(_env_key)
    if _env_value is not None:
        os.environ[_env_key] = _env_value
# -------------------------------
# Utility Functions
# -------------------------------
def flatten_metadata(metadata):
    """Render *metadata* as a single ``key: value | key: value`` string.

    Args:
        metadata: Usually a dict; any other object is accepted as well.

    Returns:
        For a dict, its items formatted as ``key: value`` and joined with
        ``" | "`` in iteration order (an empty dict gives ``""``). For anything
        else, ``str(metadata)``.
    """
    # Non-dict values have nothing to flatten; fall back to their string form.
    if not isinstance(metadata, dict):
        return str(metadata)
    rendered_pairs = (f"{name}: {content}" for name, content in metadata.items())
    return " | ".join(rendered_pairs)
def metadata_func(record, additional_fields=None):
    """Build the metadata dict attached to the Document created from *record*.

    Args:
        record: One JSON object from the data file, as a dict of raw fields.
        additional_fields: Accepted so the function can be called with two
            positional arguments; it is not used, so whatever is passed here
            does not appear in the result.
            NOTE(review): JSONLoader is expected to pass its own default
            metadata in this position - confirm that dropping it is intended.

    Returns:
        A flat dict of display-oriented fields. Missing record keys default to
        an empty string. The URL and social-media fields are collapsed into
        single strings via ``flatten_metadata``.
    """
    # Group the link-style fields first so the returned dict stays flat.
    organization_urls = flatten_metadata({
        "Organization website": record.get("Website", ""),
        "Organization newsletter": record.get("Newsletter", ""),
        "volunteer": record.get("Volunteer", ""),
        "LA2050 website": record.get("LA2050", "")
    })
    social_links = flatten_metadata({
        "twitter": record.get("Twitter", ""),
        "instagram": record.get("Instagram", ""),
        "facebook": record.get("FaceBook", "")
    })
    return {
        "Project Title": record.get("Title", ""),
        "Organization": record.get("Organization", ""),
        # Raw ranking value is stored as-is; no winner flag is derived here.
        "LA 2050 Grant Status": record.get("Ranking", ""),
        "Impact Metrics": record.get("Impact Metrics", ""),
        "LA 2050 Year": record.get("Year", ""),
        "Organizations urls": organization_urls,
        "social": social_links,
        "working_area": record.get("Working Areas in LA", ""),
        "zipcode": record.get("Zipcode", "")
    }
# Load the JSON data with custom metadata and content key
# - file_path is relative, so it is resolved against the process's working directory.
# - jq_schema '.[]' iterates the elements of the top-level JSON value.
# - content_key selects each record's 'Summary' field as the document text.
# - metadata_func (defined above) supplies the metadata stored with each document.
loader = JSONLoader(
    file_path='data.json',
    jq_schema='.[]',
    content_key='Summary',
    metadata_func=metadata_func
)
# Read the file once at import time; `data` is the list of loaded documents.
data = loader.load()
# Use a text splitter to create chunks from the documents.
# (If you find that key fields are getting split, consider implementing a custom splitter.)
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Separators are listed from coarsest (blank line) to finest (empty string).
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (no custom length function is passed) - confirm against the splitter docs.
# NOTE(review): this file only calls text_splitter.split_text(...) and rebuilds
# Document objects by hand, so add_start_index has no visible effect on the
# chunks produced here - confirm whether it is still wanted.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1600,
    chunk_overlap=150,
    add_start_index=True,
    separators=["\n\n", "\n", ". ", " ", ""]
)
def split_document_with_metadata(document):
    """Split one document into chunk Documents that keep its metadata.

    Args:
        document: An object exposing ``page_content`` (text) and ``metadata``.

    Returns:
        A list with one ``Document`` per text chunk produced by the
        module-level ``text_splitter``; each is built with the source
        document's ``metadata``.
    """
    source_metadata = document.metadata
    pieces = text_splitter.split_text(document.page_content)
    # Every piece gets the full original metadata, not a per-chunk subset.
    return [
        Document(page_content=piece, metadata=source_metadata)
        for piece in pieces
    ]
# Flatten the per-document chunk lists into one list, preserving document order.
all_splits = [
    chunk
    for source_doc in data
    for chunk in split_document_with_metadata(source_doc)
]
# -------------------------------
# Set Up Retrievers
# -------------------------------
# Create (or reopen) a Chroma vector store for the document splits.
persist_directory = "path_to_persist_directory"
# One embedding client shared by the reopen and the create paths.
_embeddings = OpenAIEmbeddings()

vectorstore = None
# Check if the directory exists and contains a persisted vector store
if os.path.exists(persist_directory):
    try:
        # A persisted collection is reopened through the constructor. The
        # previous `Chroma.load(...)` call is not part of the langchain_chroma
        # API, so it always failed and every start re-embedded all chunks and
        # added them to the existing collection again.
        _persisted = Chroma(
            persist_directory=persist_directory,
            embedding_function=_embeddings
        )
        # The directory can exist without any stored vectors (e.g. an
        # interrupted first run); only reuse the store when it holds data.
        if _persisted.get(limit=1)["ids"]:
            vectorstore = _persisted
            print("Loaded existing vector store from persisted directory.")
        else:
            print("Persisted vector store is empty. Proceeding to create a new one.")
    except Exception as e:
        # Best-effort load: report the problem and fall through to a rebuild.
        print(f"Error loading vector store: {e}. Proceeding to create a new one.")

if vectorstore is None:
    # Embed all chunks and persist them under persist_directory.
    vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=_embeddings,
        persist_directory=persist_directory
    )
    print("Created new vector store and persisted embeddings.")
# Keyword-based (BM25) retriever built over the same chunks as the vector store.
bm25_retriever = BM25Retriever.from_documents(all_splits)
# Combine the vector-store retriever and BM25; the weights follow the order of
# `retrievers`, so the vector store gets 0.9 and BM25 gets 0.1.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore.as_retriever(), bm25_retriever],
    weights=[0.9, 0.1],
)
# `retriever` is the name the rest of the file uses.
retriever = ensemble_retriever
# -------------------------------
# Prepare Retrieval and Generation Chain
# -------------------------------
# System prompt for the answer-generation step. Adjacent string literals are
# concatenated into one string; the trailing "{context}" placeholder is a
# template variable for the prompt built from this string.
# NOTE(review): the follow-up instruction (e.g. 'yes') relies on the model
# seeing earlier turns, but message_and_history builds its input from
# history[-1:] only - confirm that earlier turns are meant to be omitted.
# NOTE(review): "(disregard if it has 'Submitted)'" looks like a misplaced
# closing quote; left unchanged because the text is sent to the model as-is.
system_prompt = (
    "You are the LA2050 Navigator, an AI-powered chatbot created to help users discover organizations and community initiatives featured in the Goldhirsh Foundation’s LA2050 Ideas Hub. "
    "Your role is to deliver succinct, personalized recommendations, guide users toward supporting these initiatives, and answer questions about the Goldhirsh Foundation, LA2050, and its projects. "
    "When responding, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website (labeled as Organization website) or social media; (please do not alter the URL). "
    "If an organization’s personal website is unavailable, refer to its LA2050 URL. "
    "Prioritize nonprofit organizations designated as 'winners' by the Goldhirsh Foundation and those with multiple proposal submissions. "
    "If a user inquires about the LA2050 grant winners for a specific year, be sure to look out for 'LA 2050 Grant Status'-explicitly noting if the organization was awarded the grant that year(disregard if it has 'Submitted)'. "
    "Use the data files as your primary source of information. These files have been pre-processed into context-rich segments using a recursive text-splitting approach to ensure key details are preserved. "
    "If some information is missing, acknowledge it and direct the user to additional resources. "
    "Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
    "If the user responds with a follow-up confirmation (e.g., 'yes') after an initial answer, please expand on that topic with further details. "
    "\n\nIMPORTANT: Answer the question using ONLY the information provided in the following documents. DO NOT invent or include any organizations that are not present in the retrieved evidence. "
    "Before giving your final answer, perform the following steps: "
    "Step 1: Identify all organizations mentioned in the retrieved documents. "
    "Step 2: Check if there are any organizations beyond those provided that could be considered 'new'. "
    "Step 3: If no additional organizations exist, clearly state that based on the current dataset, these are all the organizations we have information on. "
    "\n\n{context}"
)
# Two-message chat template: the system instructions (with their {context}
# slot) followed by the user's text in {input}.
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])
# temperature=0 is passed explicitly to the chat model used for answering.
_chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
# Build the chain that will combine documents with the prompt.
# NOTE(review): no document prompt is supplied, so how much of each document's
# metadata (organization name, URLs, grant status) reaches the model depends on
# the library default - verify that these fields are actually rendered.
question_answer_chain = create_stuff_documents_chain(_chat_model, prompt)
# Retrieval followed by answer generation, invoked with {"input": ...}.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
def post_process_answer(answer, retrieved_docs):
    """Append a disclaimer listing the organizations found in the retrieved documents.

    The answer text itself is not inspected or filtered; the function only
    appends a bracketed note naming the organizations present in the evidence.
    (A more advanced implementation might parse and filter out any
    hallucinated names.)

    Args:
        answer: The generated answer text.
        retrieved_docs: Iterable of objects exposing a ``metadata`` mapping;
            ``None`` is treated as an empty collection.

    Returns:
        ``answer`` followed by the disclaimer. Organization names are stripped,
        de-duplicated and sorted; documents whose "Organization" value is
        missing, ``None`` or blank are skipped.
    """
    allowed_orgs = set()
    for doc in retrieved_docs or ():
        raw_name = doc.metadata.get("Organization")
        # The value comes from loaded data, so it may be null or a non-string;
        # coerce before stripping instead of assuming a str.
        name = "" if raw_name is None else str(raw_name).strip()
        if name:
            allowed_orgs.add(name)
    disclaimer = (
        "\n\n[Answer verified against retrieved documents: Only organizations present in the evidence were included. Allowed organizations: "
        + ", ".join(sorted(allowed_orgs))
        + ".]"
    )
    return answer + disclaimer
def debug_retrieved_docs(user_input):
    """Run the module-level retriever for *user_input* and print what it found.

    Args:
        user_input: Query text passed to the retriever.

    Returns:
        The list of retrieved documents, unchanged, so callers can reuse it.
    """
    # Use invoke(), matching the retriever call in message_and_history;
    # get_relevant_documents() is the deprecated spelling of the same call.
    retrieved_docs = retriever.invoke(user_input)
    print(f"DEBUG: Retrieved {len(retrieved_docs)} documents.")
    for position, doc in enumerate(retrieved_docs, start=1):
        print(f"Doc {position}: {doc.metadata}")
    return retrieved_docs
# -------------------------------
# Gradio Interface and Conversation Handling
# -------------------------------
# Custom Gradio theme: a Base theme with a hand-picked primary palette and the
# Space Grotesk Google font (with generic sans-serif fallbacks), followed by
# explicit overrides for the page background, text, accents and buttons.
# NOTE(review): the primary palette mixes greens with white (c300), grey (c400)
# and black (c500) stops rather than a light-to-dark ramp - confirm intended.
green_theme = gr.themes.Base(
    primary_hue=gr.themes.Color(
        c50="#00A168",
        c100="#57B485",
        c200="#D7ECE0",
        c300="#FFFFFF",
        c400="#EAE9E9",
        c500="#000000",
        c600="#3A905E",
        c700="#2A774A",
        c800="#1A5E36",
        c900="#0A4512",
        c950="#052A08"
    ),
    font=[gr.themes.GoogleFont('Space Grotesk'), 'ui-sans-serif', 'system-ui', 'sans-serif']
).set(
    # Green page background with black body text; panels are white.
    body_background_fill='#00A168',
    body_text_color='#000000',
    background_fill_primary='#FFFFFF',
    background_fill_secondary='#FFFFFF',
    border_color_accent='#57B485',
    border_color_accent_subdued='#EAE9E9',
    color_accent='#57B485',
    color_accent_soft='#D7ECE0',
    checkbox_background_color='#FFFFFF',
    # Primary buttons darken on hover; secondary buttons use the pale green.
    button_primary_background_fill='#57B485',
    button_primary_background_fill_hover='#3A905E',
    button_secondary_background_fill='#D7ECE0',
    button_secondary_text_color='#000000'
)
def message_and_history(message, history):
    """Handle one chat submission and stream the assistant's reply.

    Generator used as the submit callback of the chat UI. It appends the user
    turn to the history, asks ``rag_chain`` for an answer, and yields the
    growing history one character at a time so the reply appears to stream.

    Args:
        message: The submitted input, either a plain string or a dict that
            carries the text under the "text" key.
        history: List of ``{"role", "content"}`` message dicts. When empty or
            ``None`` a new list seeded with the welcome message is created;
            otherwise the given list is extended in place.

    Yields:
        ``(history, history)`` tuples - the same list twice, one per output.
        An empty message yields a single "Please enter a valid message."
        update. Chain failures are reported as the assistant's reply rather
        than raised.
    """
    prefix = "<b>LA2050 Navigator:</b><br>"

    # Initialize conversation with a welcome message if history is empty.
    if not history:
        history = [{"role": "assistant", "content": f"{prefix} Welcome to the LA2050 ideas hub! How can I help you today?"}]

    # Handle if message is provided as a string or a dict. A missing or null
    # "text" entry is normalized to "" so history content is always a string.
    if isinstance(message, str):
        user_text = message
    else:
        user_text = (message or {}).get("text") or ""
    history.append({"role": "user", "content": user_text})

    if not user_text:
        history.append({"role": "assistant", "content": f"{prefix} Please enter a valid message."})
        yield history, history
        return

    # The chain input is built from the newest turn only (history[-1:], i.e.
    # the user message just appended), rendered as "role: content".
    conversation_context = "\n".join(
        f"{msg['role']}: {msg['content'].replace(prefix, '')}" for msg in history[-1:]
    )

    try:
        # The retrieval chain performs the document lookup itself, so no
        # separate retriever call is made here.
        response = rag_chain.invoke({"input": conversation_context})
        answer = response["answer"]
    except Exception as e:
        # UI boundary: surface the failure in the chat instead of crashing the
        # generator mid-stream.
        answer = f"An error occurred: {e}"
    else:
        # Debug output of the documents the chain actually used.
        retrieved_docs = response.get("context", [])
        print(f"DEBUG: Retrieved {len(retrieved_docs)} documents.")
        for i, doc in enumerate(retrieved_docs):
            print(f"Doc {i+1} Page Content: {doc.page_content}")

    # Remove the prefix if the model includes it.
    if answer.startswith(prefix):
        answer = answer[len(prefix):]

    # Initialize the assistant's response with the prefix.
    assistant_response = {"role": "assistant", "content": f"{prefix} "}
    history.append(assistant_response)

    # Stream the answer character by character; assistant_response is the same
    # dict object stored in history, so each yield shows the longer text.
    for character in answer:
        assistant_response["content"] += character
        yield history, history

    # Final update, which also covers the case of an empty answer.
    yield history, history
# Set Gradio to light mode via JavaScript
# The snippet defines refresh(): if the page URL's `__theme` query parameter is
# not 'light', it sets it to 'light' and navigates to the updated URL.
# The string is passed as the `js` argument of gr.Blocks below.
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
# Extra CSS for the page header: white, centered title text.
# `color` is the CSS property for text colour; the previous `text-color`
# declaration is not a CSS property, so it had no effect.
css = """
.chat-header {
color: #FFFFFF;
text-align: center;
}
.gradio-container .prose .chat-header h1 {
color: #FFFFFF;
text-align: center;
}
"""
with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
    # Page header; styled by the .chat-header rules in `css`.
    gr.HTML('<div class="chat-header"><h1>LA2050 Navigator</h1></div>')

    # Chat window, pre-seeded with the assistant's greeting.
    greeting_turn = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}
    chatbot = gr.Chatbot(value=[greeting_turn], type="messages", bubble_full_width=False)

    # Conversation state handed to the callback; it starts empty and the
    # callback seeds it with the welcome message on the first submission.
    state = gr.State([])

    message = gr.MultimodalTextbox(
        placeholder="Type a message",
        label="",
        show_label=False,
        interactive=True,
        file_count="multiple",
        elem_classes="custom-textbox",
        scale=3,
    )

    # On submit, run the callback with the new input and the stored history,
    # then clear the input box once the callback has finished.
    submit_event = message.submit(
        message_and_history,
        inputs=[message, state],
        outputs=[chatbot, state],
    )
    submit_event.then(lambda: "", inputs=[], outputs=message)

block.launch(debug=True, share=True)
|