Spaces:

anl139
/

test

Sleeping

App Files Files Community

anl139 committed on Feb 11, 2025

Commit

f752e13

verified ·

1 Parent(s): 028664e

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -42

app.py CHANGED Viewed

@@ -32,12 +32,6 @@ os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
 # Utility Functions
 # -------------------------------
-import re
-import json
-from pathlib import Path
-# Make sure to import your Document class from your LangChain module.
-from langchain_core.documents import Document
 def extract_metadata(text: str) -> dict:
     metadata = {}
@@ -50,35 +44,38 @@ def extract_metadata(text: str) -> dict:
     if title_match:
         metadata["title"] = title_match.group(1).strip()
-    # Extract the Organization field
-    org_match = re.search(
-        r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
-        text,
-        re.IGNORECASE | re.DOTALL
-    )
-    if org_match:
-        metadata["organization"] = org_match.group(1).strip()
-    # Extract the Ranking field with a more flexible pattern:
     ranking_match = re.search(
-        r"Ranking:\s*(.*?)\s*(?:Impact Metrics:|$)",
         text,
         re.IGNORECASE | re.DOTALL
     )
     if ranking_match:
-        metadata["ranking"] = ranking_match.group(1).strip()
     # Extract the Year field (assuming a four-digit year)
     year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
     if year_match:
         metadata["year"] = year_match.group(1).strip()
-    # Extract URLs for Website, Volunteer, and Newsletter
     urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", text)
     for key, url in urls:
         metadata[key.lower()] = url.strip()
-    # Extract social handles (Twitter, Instagram, FaceBook)
     social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", text)
     for platform, handle in social:
         if handle.startswith("http"):
@@ -90,6 +87,11 @@ def extract_metadata(text: str) -> dict:
 def load_and_process_data(file_path: str):
     try:
         data = json.loads(Path(file_path).read_text(encoding='utf-8'))
         docs = []
@@ -98,7 +100,7 @@ def load_and_process_data(file_path: str):
             if not org_text:
                 continue
             metadata = extract_metadata(org_text)
-            # Optionally, prioritize winners
             if metadata.get("ranking", "").lower() == "winner":
                 docs.insert(0, Document(page_content=org_text, metadata=metadata))
             else:
@@ -120,7 +122,7 @@ docs = load_and_process_data(file_path)
 # (If you find that key fields are getting split, consider implementing a custom splitter.)
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=1500,
     chunk_overlap=150,
     add_start_index=True
 )
@@ -152,7 +154,7 @@ bm25_retriever = BM25Retriever.from_documents(all_splits)
 # Combine the retrievers using an ensemble approach.
 ensemble_retriever = EnsembleRetriever(
     retrievers=[vectorstore.as_retriever(search_kwargs={"k": 6}), bm25_retriever],
-    weights=[0.7, 0.3]
 )
 retriever = ensemble_retriever
@@ -160,31 +162,19 @@ retriever = ensemble_retriever
 # Prepare Retrieval and Generation Chain
 # -------------------------------
-# Updated system prompt: Note the explicit instructions to use only the provided context and to avoid mixing details.
 system_prompt = (
     "You are the LA2050 Navigator, an AI-powered chatbot designed to help users explore organizations and community initiatives within the Goldhirsh Foundation’s LA2050 Ideas Hub. "
     "Your role is to provide concise, personalized recommendations, guide users toward supporting these organizations and initiatives, and answer relevant questions about the Goldhirsh Foundation, LA2050, and its projects. "
     "When answering, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website or social media (as provided under the website column; please do not alter or normalize the URL). "
     "If a company's personal website is unavailable, navigate to the LA2050 URLs. "
     "Prioritize nonprofit organizations awarded by the Goldhirsh Foundation (designated 'Winner' under ranking column) and those with multiple proposal submissions. "
     "Use the data files as your primary source of information. If information is unavailable, acknowledge it and guide the user to relevant resources. "
     "Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
     "If the user responds with a follow-up confirmation (e.g. 'yes') after a previous answer, please expand on that topic with additional information. "
-    "When answering questions about grant winners, only list organizations whose metadata ranking field is marked as 'Winner'"
     "\n\n{context}"
 )
 prompt = ChatPromptTemplate.from_messages(
     [
         ("system", system_prompt),
@@ -233,13 +223,15 @@ green_theme = gr.themes.Base(
 def message_and_history(message, history):
     # Initialize conversation with a welcome message if history is empty.
-    history = history or [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]
-    user_text = message.get("text", "")
     history.append({"role": "user", "content": user_text})
     time.sleep(1)
-    # If the user did not provide any input, ask for a valid message.
     if not user_text:
         history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
         yield history, history
@@ -261,7 +253,7 @@ def message_and_history(message, history):
     # Remove the prefix if the model includes it.
     if answer.startswith("<b>LA2050 Navigator:</b><br>"):
         answer = answer[len("<b>LA2050 Navigator:</b><br>"):]
     # Initialize the assistant's response with the prefix.
     assistant_response = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
     history.append(assistant_response)
@@ -271,7 +263,7 @@ def message_and_history(message, history):
         assistant_response["content"] += character
         yield history, history
-    # Finalize the answer without re-adding the prefix.
     history[-1]["content"] = assistant_response["content"]
     yield history, history
@@ -318,7 +310,7 @@ with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
         show_label=False
     )
-    # When a message is submitted, the function now sends the recent conversation history along with the new input.
     message.submit(
         message_and_history,
         inputs=[message, state],
@@ -328,3 +320,4 @@ with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
     )
 block.launch(debug=True, share=True)

 # Utility Functions
 # -------------------------------
 def extract_metadata(text: str) -> dict:
     metadata = {}
     if title_match:
         metadata["title"] = title_match.group(1).strip()
+    # Extract the Ranking field but only add it if the value is "winner"
+    # (Using \s* after the captured group to allow for no trailing whitespace)
     ranking_match = re.search(
+        r"Ranking:\s*(.*?)\s*(?=Impact Metrics:|$)",
         text,
         re.IGNORECASE | re.DOTALL
     )
     if ranking_match:
+        ranking_value = ranking_match.group(1).strip()
+        if ranking_value.lower() == "winner":
+            metadata["ranking"] = ranking_value
     # Extract the Year field (assuming a four-digit year)
     year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
     if year_match:
         metadata["year"] = year_match.group(1).strip()
+    # Extract the Organization field
+    org_match = re.search(
+        r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
+        text,
+        re.IGNORECASE | re.DOTALL
+    )
+    if org_match:
+        metadata["organization"] = org_match.group(1).strip()
+    # Modified URL extraction: make http/https optional.
     urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", text)
     for key, url in urls:
         metadata[key.lower()] = url.strip()
+    # Adjust social handle extraction to capture full URLs.
     social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", text)
     for platform, handle in social:
         if handle.startswith("http"):
 def load_and_process_data(file_path: str):
+    """
+    Loads JSON data from a file, extracts organization text and metadata,
+    and returns a list of Documents. Documents will have the ranking metadata
+    only if the organization is marked as a winner.
+    """
     try:
         data = json.loads(Path(file_path).read_text(encoding='utf-8'))
         docs = []
             if not org_text:
                 continue
             metadata = extract_metadata(org_text)
+            # Insert winners at the beginning of the list
             if metadata.get("ranking", "").lower() == "winner":
                 docs.insert(0, Document(page_content=org_text, metadata=metadata))
             else:
 # (If you find that key fields are getting split, consider implementing a custom splitter.)
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=2000,
     chunk_overlap=150,
     add_start_index=True
 )
 # Combine the retrievers using an ensemble approach.
 ensemble_retriever = EnsembleRetriever(
     retrievers=[vectorstore.as_retriever(search_kwargs={"k": 6}), bm25_retriever],
+    weights=[0.8, 0.3]
 )
 retriever = ensemble_retriever
 # Prepare Retrieval and Generation Chain
 # -------------------------------
 system_prompt = (
     "You are the LA2050 Navigator, an AI-powered chatbot designed to help users explore organizations and community initiatives within the Goldhirsh Foundation’s LA2050 Ideas Hub. "
     "Your role is to provide concise, personalized recommendations, guide users toward supporting these organizations and initiatives, and answer relevant questions about the Goldhirsh Foundation, LA2050, and its projects. "
     "When answering, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website or social media (as provided under the website column; please do not alter or normalize the URL). "
     "If a company's personal website is unavailable, navigate to the LA2050 URLs. "
     "Prioritize nonprofit organizations awarded by the Goldhirsh Foundation (designated 'Winner' under ranking column) and those with multiple proposal submissions. "
     "Use the data files as your primary source of information. If information is unavailable, acknowledge it and guide the user to relevant resources. "
     "Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
     "If the user responds with a follow-up confirmation (e.g. 'yes') after a previous answer, please expand on that topic with additional information. "
+    "When answering questions about grant winners, only list organizations whose metadata ranking field is marked as 'Winner'."
     "\n\n{context}"
 )
 prompt = ChatPromptTemplate.from_messages(
     [
         ("system", system_prompt),
 def message_and_history(message, history):
     # Initialize conversation with a welcome message if history is empty.
+    if not history:
+        history = [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]
+    # Handle if message is provided as a string or a dict.
+    user_text = message if isinstance(message, str) else message.get("text", "")
     history.append({"role": "user", "content": user_text})
     time.sleep(1)
     if not user_text:
         history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
         yield history, history
     # Remove the prefix if the model includes it.
     if answer.startswith("<b>LA2050 Navigator:</b><br>"):
         answer = answer[len("<b>LA2050 Navigator:</b><br>"):]
     # Initialize the assistant's response with the prefix.
     assistant_response = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
     history.append(assistant_response)
         assistant_response["content"] += character
         yield history, history
+    # Finalize the answer.
     history[-1]["content"] = assistant_response["content"]
     yield history, history
         show_label=False
     )
+    # When a message is submitted, the function sends the recent conversation history along with the new input.
     message.submit(
         message_and_history,
         inputs=[message, state],
     )
 block.launch(debug=True, share=True)