anl139 committed on
Commit
f752e13
·
verified ·
1 Parent(s): 028664e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -42
app.py CHANGED
@@ -32,12 +32,6 @@ os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
32
  # Utility Functions
33
  # -------------------------------
34
 
35
- import re
36
- import json
37
- from pathlib import Path
38
- # Make sure to import your Document class from your LangChain module.
39
- from langchain_core.documents import Document
40
-
41
  def extract_metadata(text: str) -> dict:
42
  metadata = {}
43
 
@@ -50,35 +44,38 @@ def extract_metadata(text: str) -> dict:
50
  if title_match:
51
  metadata["title"] = title_match.group(1).strip()
52
 
53
- # Extract the Organization field
54
- org_match = re.search(
55
- r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
56
- text,
57
- re.IGNORECASE | re.DOTALL
58
- )
59
- if org_match:
60
- metadata["organization"] = org_match.group(1).strip()
61
-
62
- # Extract the Ranking field with a more flexible pattern:
63
  ranking_match = re.search(
64
- r"Ranking:\s*(.*?)\s*(?:Impact Metrics:|$)",
65
  text,
66
  re.IGNORECASE | re.DOTALL
67
  )
68
  if ranking_match:
69
- metadata["ranking"] = ranking_match.group(1).strip()
 
 
70
 
71
  # Extract the Year field (assuming a four-digit year)
72
  year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
73
  if year_match:
74
  metadata["year"] = year_match.group(1).strip()
75
 
76
- # Extract URLs for Website, Volunteer, and Newsletter
 
 
 
 
 
 
 
 
 
77
  urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", text)
78
  for key, url in urls:
79
  metadata[key.lower()] = url.strip()
80
 
81
- # Extract social handles (Twitter, Instagram, FaceBook)
82
  social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", text)
83
  for platform, handle in social:
84
  if handle.startswith("http"):
@@ -90,6 +87,11 @@ def extract_metadata(text: str) -> dict:
90
 
91
 
92
  def load_and_process_data(file_path: str):
 
 
 
 
 
93
  try:
94
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
95
  docs = []
@@ -98,7 +100,7 @@ def load_and_process_data(file_path: str):
98
  if not org_text:
99
  continue
100
  metadata = extract_metadata(org_text)
101
- # Optionally, prioritize winners
102
  if metadata.get("ranking", "").lower() == "winner":
103
  docs.insert(0, Document(page_content=org_text, metadata=metadata))
104
  else:
@@ -120,7 +122,7 @@ docs = load_and_process_data(file_path)
120
  # (If you find that key fields are getting split, consider implementing a custom splitter.)
121
  from langchain_text_splitters import RecursiveCharacterTextSplitter
122
  text_splitter = RecursiveCharacterTextSplitter(
123
- chunk_size=1500,
124
  chunk_overlap=150,
125
  add_start_index=True
126
  )
@@ -152,7 +154,7 @@ bm25_retriever = BM25Retriever.from_documents(all_splits)
152
  # Combine the retrievers using an ensemble approach.
153
  ensemble_retriever = EnsembleRetriever(
154
  retrievers=[vectorstore.as_retriever(search_kwargs={"k": 6}), bm25_retriever],
155
- weights=[0.7, 0.3]
156
  )
157
  retriever = ensemble_retriever
158
 
@@ -160,31 +162,19 @@ retriever = ensemble_retriever
160
  # Prepare Retrieval and Generation Chain
161
  # -------------------------------
162
 
163
- # Updated system prompt: Note the explicit instructions to use only the provided context and to avoid mixing details.
164
  system_prompt = (
165
-
166
  "You are the LA2050 Navigator, an AI-powered chatbot designed to help users explore organizations and community initiatives within the Goldhirsh Foundation’s LA2050 Ideas Hub. "
167
-
168
  "Your role is to provide concise, personalized recommendations, guide users toward supporting these organizations and initiatives, and answer relevant questions about the Goldhirsh Foundation, LA2050, and its projects. "
169
-
170
  "When answering, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website or social media (as provided under the website column; please do not alter or normalize the URL). "
171
-
172
  "If a company's personal website is unavailable, navigate to the LA2050 URLs. "
173
-
174
  "Prioritize nonprofit organizations awarded by the Goldhirsh Foundation (designated 'Winner' under ranking column) and those with multiple proposal submissions. "
175
-
176
  "Use the data files as your primary source of information. If information is unavailable, acknowledge it and guide the user to relevant resources. "
177
-
178
  "Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
179
-
180
  "If the user responds with a follow-up confirmation (e.g. 'yes') after a previous answer, please expand on that topic with additional information. "
181
- "When answering questions about grant winners, only list organizations whose metadata ranking field is marked as 'Winner'"
182
-
183
  "\n\n{context}"
184
-
185
  )
186
 
187
-
188
  prompt = ChatPromptTemplate.from_messages(
189
  [
190
  ("system", system_prompt),
@@ -233,13 +223,15 @@ green_theme = gr.themes.Base(
233
 
234
  def message_and_history(message, history):
235
  # Initialize conversation with a welcome message if history is empty.
236
- history = history or [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]
237
- user_text = message.get("text", "")
 
 
 
238
  history.append({"role": "user", "content": user_text})
239
 
240
  time.sleep(1)
241
 
242
- # If the user did not provide any input, ask for a valid message.
243
  if not user_text:
244
  history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
245
  yield history, history
@@ -261,7 +253,7 @@ def message_and_history(message, history):
261
  # Remove the prefix if the model includes it.
262
  if answer.startswith("<b>LA2050 Navigator:</b><br>"):
263
  answer = answer[len("<b>LA2050 Navigator:</b><br>"):]
264
-
265
  # Initialize the assistant's response with the prefix.
266
  assistant_response = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
267
  history.append(assistant_response)
@@ -271,7 +263,7 @@ def message_and_history(message, history):
271
  assistant_response["content"] += character
272
  yield history, history
273
 
274
- # Finalize the answer without re-adding the prefix.
275
  history[-1]["content"] = assistant_response["content"]
276
  yield history, history
277
 
@@ -318,7 +310,7 @@ with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
318
  show_label=False
319
  )
320
 
321
- # When a message is submitted, the function now sends the recent conversation history along with the new input.
322
  message.submit(
323
  message_and_history,
324
  inputs=[message, state],
@@ -328,3 +320,4 @@ with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
328
  )
329
 
330
  block.launch(debug=True, share=True)
 
 
32
  # Utility Functions
33
  # -------------------------------
34
 
 
 
 
 
 
 
35
  def extract_metadata(text: str) -> dict:
36
  metadata = {}
37
 
 
44
  if title_match:
45
  metadata["title"] = title_match.group(1).strip()
46
 
47
+ # Extract the Ranking field but only add it if the value is "winner"
48
+ # (Using \s* after the captured group to allow for no trailing whitespace)
 
 
 
 
 
 
 
 
49
  ranking_match = re.search(
50
+ r"Ranking:\s*(.*?)\s*(?=Impact Metrics:|$)",
51
  text,
52
  re.IGNORECASE | re.DOTALL
53
  )
54
  if ranking_match:
55
+ ranking_value = ranking_match.group(1).strip()
56
+ if ranking_value.lower() == "winner":
57
+ metadata["ranking"] = ranking_value
58
 
59
  # Extract the Year field (assuming a four-digit year)
60
  year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
61
  if year_match:
62
  metadata["year"] = year_match.group(1).strip()
63
 
64
+ # Extract the Organization field
65
+ org_match = re.search(
66
+ r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
67
+ text,
68
+ re.IGNORECASE | re.DOTALL
69
+ )
70
+ if org_match:
71
+ metadata["organization"] = org_match.group(1).strip()
72
+
73
+ # Modified URL extraction: make http/https optional.
74
  urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", text)
75
  for key, url in urls:
76
  metadata[key.lower()] = url.strip()
77
 
78
+ # Adjust social handle extraction to capture full URLs.
79
  social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", text)
80
  for platform, handle in social:
81
  if handle.startswith("http"):
 
87
 
88
 
89
  def load_and_process_data(file_path: str):
90
+ """
91
+ Loads JSON data from a file, extracts organization text and metadata,
92
+ and returns a list of Documents. Documents will have the ranking metadata
93
+ only if the organization is marked as a winner.
94
+ """
95
  try:
96
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
97
  docs = []
 
100
  if not org_text:
101
  continue
102
  metadata = extract_metadata(org_text)
103
+ # Insert winners at the beginning of the list
104
  if metadata.get("ranking", "").lower() == "winner":
105
  docs.insert(0, Document(page_content=org_text, metadata=metadata))
106
  else:
 
122
  # (If you find that key fields are getting split, consider implementing a custom splitter.)
123
  from langchain_text_splitters import RecursiveCharacterTextSplitter
124
  text_splitter = RecursiveCharacterTextSplitter(
125
+ chunk_size=2000,
126
  chunk_overlap=150,
127
  add_start_index=True
128
  )
 
154
  # Combine the retrievers using an ensemble approach.
155
  ensemble_retriever = EnsembleRetriever(
156
  retrievers=[vectorstore.as_retriever(search_kwargs={"k": 6}), bm25_retriever],
157
+ weights=[0.8, 0.3]
158
  )
159
  retriever = ensemble_retriever
160
 
 
162
  # Prepare Retrieval and Generation Chain
163
  # -------------------------------
164
 
 
165
  system_prompt = (
 
166
  "You are the LA2050 Navigator, an AI-powered chatbot designed to help users explore organizations and community initiatives within the Goldhirsh Foundation’s LA2050 Ideas Hub. "
 
167
  "Your role is to provide concise, personalized recommendations, guide users toward supporting these organizations and initiatives, and answer relevant questions about the Goldhirsh Foundation, LA2050, and its projects. "
 
168
  "When answering, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website or social media (as provided under the website column; please do not alter or normalize the URL). "
 
169
  "If a company's personal website is unavailable, navigate to the LA2050 URLs. "
 
170
  "Prioritize nonprofit organizations awarded by the Goldhirsh Foundation (designated 'Winner' under ranking column) and those with multiple proposal submissions. "
 
171
  "Use the data files as your primary source of information. If information is unavailable, acknowledge it and guide the user to relevant resources. "
 
172
  "Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
 
173
  "If the user responds with a follow-up confirmation (e.g. 'yes') after a previous answer, please expand on that topic with additional information. "
174
+ "When answering questions about grant winners, only list organizations whose metadata ranking field is marked as 'Winner'."
 
175
  "\n\n{context}"
 
176
  )
177
 
 
178
  prompt = ChatPromptTemplate.from_messages(
179
  [
180
  ("system", system_prompt),
 
223
 
224
  def message_and_history(message, history):
225
  # Initialize conversation with a welcome message if history is empty.
226
+ if not history:
227
+ history = [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]
228
+
229
+ # Handle if message is provided as a string or a dict.
230
+ user_text = message if isinstance(message, str) else message.get("text", "")
231
  history.append({"role": "user", "content": user_text})
232
 
233
  time.sleep(1)
234
 
 
235
  if not user_text:
236
  history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
237
  yield history, history
 
253
  # Remove the prefix if the model includes it.
254
  if answer.startswith("<b>LA2050 Navigator:</b><br>"):
255
  answer = answer[len("<b>LA2050 Navigator:</b><br>"):]
256
+
257
  # Initialize the assistant's response with the prefix.
258
  assistant_response = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
259
  history.append(assistant_response)
 
263
  assistant_response["content"] += character
264
  yield history, history
265
 
266
+ # Finalize the answer.
267
  history[-1]["content"] = assistant_response["content"]
268
  yield history, history
269
 
 
310
  show_label=False
311
  )
312
 
313
+ # When a message is submitted, the function sends the recent conversation history along with the new input.
314
  message.submit(
315
  message_and_history,
316
  inputs=[message, state],
 
320
  )
321
 
322
  block.launch(debug=True, share=True)
323
+