anl139 committed on
Commit
6b6cdaa
·
verified ·
1 Parent(s): 4378ccc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -83
app.py CHANGED
@@ -1,41 +1,7 @@
1
- import os
2
- import time
3
- import gradio as gr
4
- from dotenv import load_dotenv
5
- from pathlib import Path
6
- import re
7
- import json
8
-
9
- # Import Document from your LangChain module.
10
- # (Adjust the import if your version of LangChain uses a different path.)
11
- from langchain_core.documents import Document
12
-
13
- # Import additional libraries from LangChain
14
- from langchain_chroma import Chroma
15
- from langchain_openai import OpenAIEmbeddings
16
- from langchain_community.retrievers import BM25Retriever
17
- from langchain.retrievers import EnsembleRetriever
18
- from langchain_core.runnables import RunnablePassthrough
19
- from langchain_core.output_parsers import StrOutputParser
20
- from langchain_openai import ChatOpenAI
21
- from langchain.chains import create_retrieval_chain
22
- from langchain.chains.combine_documents import create_stuff_documents_chain
23
- from langchain_core.prompts import ChatPromptTemplate
24
-
25
- # Load environment variables for Hugging Face and OpenAI
26
- load_dotenv()
27
- os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY')
28
- os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
29
-
30
-
31
- # -------------------------------
32
- # Utility Functions
33
- # -------------------------------
34
-
35
  import re
36
  import json
37
  from pathlib import Path
38
- # Make sure to import your Document class from your LangChain module.
39
  from langchain_core.documents import Document
40
 
41
  def extract_metadata(text: str) -> tuple[dict, str]:
@@ -50,7 +16,6 @@ def extract_metadata(text: str) -> tuple[dict, str]:
50
  )
51
  if title_match:
52
  metadata["title"] = title_match.group(1).strip()
53
- # Remove Title from cleaned_text
54
  cleaned_text = re.sub(
55
  r"Title:\s*.*?(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
56
  "",
@@ -68,7 +33,6 @@ def extract_metadata(text: str) -> tuple[dict, str]:
68
  ranking_value = ranking_match.group(1).strip()
69
  if ranking_value.lower() == "winner":
70
  metadata["ranking"] = ranking_value
71
- # Remove Ranking from cleaned_text
72
  cleaned_text = re.sub(
73
  r"Ranking:\s*.*?(?=Impact Metrics:|$)",
74
  "",
@@ -80,7 +44,6 @@ def extract_metadata(text: str) -> tuple[dict, str]:
80
  year_match = re.search(r"Year:\s*(\d{4})", cleaned_text, re.IGNORECASE)
81
  if year_match:
82
  metadata["year"] = year_match.group(1).strip()
83
- # Remove Year from cleaned_text
84
  cleaned_text = re.sub(r"Year:\s*\d{4}", "", cleaned_text, flags=re.IGNORECASE)
85
 
86
  # Extract and remove Organization
@@ -91,7 +54,6 @@ def extract_metadata(text: str) -> tuple[dict, str]:
91
  )
92
  if org_match:
93
  metadata["organization"] = org_match.group(1).strip()
94
- # Remove Organization from cleaned_text
95
  cleaned_text = re.sub(
96
  r"Organization:\s*.*?(?=Goal:|Ranking:|Impact Metrics:)",
97
  "",
@@ -103,7 +65,6 @@ def extract_metadata(text: str) -> tuple[dict, str]:
103
  urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", cleaned_text)
104
  for key, url in urls:
105
  metadata[key.lower()] = url.strip()
106
- # Remove URL from cleaned_text
107
  cleaned_text = re.sub(
108
  rf"{key}:\s*{re.escape(url)}",
109
  "",
@@ -111,14 +72,13 @@ def extract_metadata(text: str) -> tuple[dict, str]:
111
  flags=re.IGNORECASE
112
  )
113
 
114
- # Extract and remove social handles
115
  social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", cleaned_text)
116
  for platform, handle in social:
117
  if handle.startswith("http"):
118
  metadata[platform.lower()] = handle.strip()
119
  else:
120
  metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle.strip()}"
121
- # Remove social handle from cleaned_text
122
  cleaned_text = re.sub(
123
  rf"{platform}:\s*{re.escape(handle)}",
124
  "",
@@ -126,13 +86,54 @@ def extract_metadata(text: str) -> tuple[dict, str]:
126
  flags=re.IGNORECASE
127
  )
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  # Clean up extra whitespace
130
  cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
131
 
132
- return metadata, cleaned_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
 
135
  def load_and_process_data(file_path: str):
 
 
 
 
 
136
  try:
137
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
138
  docs = []
@@ -140,15 +141,16 @@ def load_and_process_data(file_path: str):
140
  org_text = entry.get("OrganizationText", "")
141
  if not org_text:
142
  continue
143
- metadata, cleaned_text = extract_metadata(org_text) # Now returns cleaned text
144
  if metadata.get("ranking", "").lower() == "winner":
145
- docs.insert(0, Document(page_content=cleaned_text, metadata=metadata))
146
  else:
147
- docs.append(Document(page_content=cleaned_text, metadata=metadata))
148
  return docs
149
  except Exception as e:
150
  print(f"Error loading JSON: {e}")
151
  return []
 
152
  # -------------------------------
153
  # Data Loading and Preprocessing
154
  # -------------------------------
@@ -157,11 +159,10 @@ file_path = './data.json' # Ensure this file is available in your environment.
157
  docs = load_and_process_data(file_path)
158
 
159
  # Use a text splitter to create chunks from the documents.
160
- # (If you find that key fields are getting split, consider implementing a custom splitter.)
161
  from langchain_text_splitters import RecursiveCharacterTextSplitter
162
  text_splitter = RecursiveCharacterTextSplitter(
163
- chunk_size=1500,
164
- chunk_overlap=150,
165
  add_start_index=True
166
  )
167
  all_splits = text_splitter.split_documents(docs)
@@ -170,7 +171,11 @@ all_splits = text_splitter.split_documents(docs)
170
  # Set Up Retrievers
171
  # -------------------------------
172
 
173
- # Create a Chroma vector store using the document splits.
 
 
 
 
174
  persist_directory = "./chroma_db"
175
  if os.path.exists(persist_directory) and os.listdir(persist_directory):
176
  vectorstore = Chroma(
@@ -186,13 +191,10 @@ else:
186
  )
187
  print("Created new vector store and persisted embeddings.")
188
 
189
- # Create a BM25 retriever from the document splits.
190
  bm25_retriever = BM25Retriever.from_documents(all_splits)
191
-
192
- # Combine the retrievers using an ensemble approach.
193
  ensemble_retriever = EnsembleRetriever(
194
  retrievers=[vectorstore.as_retriever(search_kwargs={"k": 6}), bm25_retriever],
195
- weights=[0.8, 0.3]
196
  )
197
  retriever = ensemble_retriever
198
 
@@ -200,31 +202,24 @@ retriever = ensemble_retriever
200
  # Prepare Retrieval and Generation Chain
201
  # -------------------------------
202
 
203
- # Updated system prompt: Note the explicit instructions to use only the provided context and to avoid mixing details.
204
- system_prompt = (
 
 
205
 
 
206
  "You are the LA2050 Navigator, an AI-powered chatbot designed to help users explore organizations and community initiatives within the Goldhirsh Foundation’s LA2050 Ideas Hub. "
207
-
208
  "Your role is to provide concise, personalized recommendations, guide users toward supporting these organizations and initiatives, and answer relevant questions about the Goldhirsh Foundation, LA2050, and its projects. "
209
-
210
  "When answering, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website or social media (as provided under the website column; please do not alter or normalize the URL). "
211
-
212
  "If a company's personal website is unavailable, navigate to the LA2050 URLs. "
213
-
214
  "Prioritize nonprofit organizations awarded by the Goldhirsh Foundation (designated 'Winner' under ranking column) and those with multiple proposal submissions. "
215
-
216
  "Use the data files as your primary source of information. If information is unavailable, acknowledge it and guide the user to relevant resources. "
217
-
218
  "Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
219
-
220
  "If the user responds with a follow-up confirmation (e.g. 'yes') after a previous answer, please expand on that topic with additional information. "
221
- "When answering questions about grant winners, only list organizations whose metadata ranking field is marked as 'Winner'"
222
-
223
  "\n\n{context}"
224
-
225
  )
226
 
227
-
228
  prompt = ChatPromptTemplate.from_messages(
229
  [
230
  ("system", system_prompt),
@@ -232,7 +227,6 @@ prompt = ChatPromptTemplate.from_messages(
232
  ]
233
  )
234
 
235
- # Build the chain that will combine documents with the prompt.
236
  question_answer_chain = create_stuff_documents_chain(ChatOpenAI(model_name="gpt-4o-mini", temperature=0), prompt)
237
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
238
 
@@ -240,6 +234,11 @@ rag_chain = create_retrieval_chain(retriever, question_answer_chain)
240
  # Gradio Interface and Conversation Handling
241
  # -------------------------------
242
 
 
 
 
 
 
243
  green_theme = gr.themes.Base(
244
  primary_hue=gr.themes.Color(
245
  c50="#00A168",
@@ -272,24 +271,19 @@ green_theme = gr.themes.Base(
272
  )
273
 
274
  def message_and_history(message, history):
275
- # Initialize conversation with a welcome message if history is empty.
276
  history = history or [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]
277
  user_text = message.get("text", "")
278
  history.append({"role": "user", "content": user_text})
279
 
280
  time.sleep(1)
281
-
282
- # If the user did not provide any input, ask for a valid message.
283
  if not user_text:
284
  history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
285
  yield history, history
286
  return
287
 
288
- # Combine the most recent conversation turns, excluding the assistant's prefix.
289
  conversation_context = "\n".join(
290
  [f"{msg['role']}: {msg['content'].replace('<b>LA2050 Navigator:</b><br>', '')}" for msg in history[-3:]]
291
  )
292
-
293
  chain_input = {"input": conversation_context}
294
 
295
  try:
@@ -297,25 +291,20 @@ def message_and_history(message, history):
297
  answer = response["answer"]
298
  except Exception as e:
299
  answer = f"An error occurred: {e}"
300
-
301
- # Remove the prefix if the model includes it.
302
  if answer.startswith("<b>LA2050 Navigator:</b><br>"):
303
  answer = answer[len("<b>LA2050 Navigator:</b><br>"):]
304
-
305
- # Initialize the assistant's response with the prefix.
306
  assistant_response = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
307
  history.append(assistant_response)
308
-
309
- # Stream the answer character by character.
310
  for character in answer:
311
  assistant_response["content"] += character
312
  yield history, history
313
-
314
- # Finalize the answer without re-adding the prefix.
315
  history[-1]["content"] = assistant_response["content"]
316
  yield history, history
317
 
318
- # Set Gradio to light mode via JavaScript
319
  js_func = """
320
  function refresh() {
321
  const url = new URL(window.location);
@@ -358,7 +347,6 @@ with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
358
  show_label=False
359
  )
360
 
361
- # When a message is submitted, the function now sends the recent conversation history along with the new input.
362
  message.submit(
363
  message_and_history,
364
  inputs=[message, state],
@@ -368,3 +356,5 @@ with gr.Blocks(theme=green_theme, js=js_func, css=css) as block:
368
  )
369
 
370
  block.launch(debug=True, share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  import json
3
  from pathlib import Path
4
+ # Import your Document class from your LangChain module.
5
  from langchain_core.documents import Document
6
 
7
  def extract_metadata(text: str) -> tuple[dict, str]:
 
16
  )
17
  if title_match:
18
  metadata["title"] = title_match.group(1).strip()
 
19
  cleaned_text = re.sub(
20
  r"Title:\s*.*?(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
21
  "",
 
33
  ranking_value = ranking_match.group(1).strip()
34
  if ranking_value.lower() == "winner":
35
  metadata["ranking"] = ranking_value
 
36
  cleaned_text = re.sub(
37
  r"Ranking:\s*.*?(?=Impact Metrics:|$)",
38
  "",
 
44
  year_match = re.search(r"Year:\s*(\d{4})", cleaned_text, re.IGNORECASE)
45
  if year_match:
46
  metadata["year"] = year_match.group(1).strip()
 
47
  cleaned_text = re.sub(r"Year:\s*\d{4}", "", cleaned_text, flags=re.IGNORECASE)
48
 
49
  # Extract and remove Organization
 
54
  )
55
  if org_match:
56
  metadata["organization"] = org_match.group(1).strip()
 
57
  cleaned_text = re.sub(
58
  r"Organization:\s*.*?(?=Goal:|Ranking:|Impact Metrics:)",
59
  "",
 
65
  urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", cleaned_text)
66
  for key, url in urls:
67
  metadata[key.lower()] = url.strip()
 
68
  cleaned_text = re.sub(
69
  rf"{key}:\s*{re.escape(url)}",
70
  "",
 
72
  flags=re.IGNORECASE
73
  )
74
 
75
+ # Extract and remove social handles (Twitter, Instagram, FaceBook)
76
  social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", cleaned_text)
77
  for platform, handle in social:
78
  if handle.startswith("http"):
79
  metadata[platform.lower()] = handle.strip()
80
  else:
81
  metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle.strip()}"
 
82
  cleaned_text = re.sub(
83
  rf"{platform}:\s*{re.escape(handle)}",
84
  "",
 
86
  flags=re.IGNORECASE
87
  )
88
 
89
+ # Extract and remove Working Areas in LA
90
+ working_match = re.search(
91
+ r"Working Areas in LA:\s*(.*?)\s+(?=Summary:|Ranking:|Impact Metrics:|$)",
92
+ cleaned_text,
93
+ re.IGNORECASE | re.DOTALL
94
+ )
95
+ if working_match:
96
+ metadata["working_areas"] = working_match.group(1).strip()
97
+ cleaned_text = re.sub(
98
+ r"Working Areas in LA:\s*.*?(?=Summary:|Ranking:|Impact Metrics:|$)",
99
+ "",
100
+ cleaned_text,
101
+ flags=re.IGNORECASE | re.DOTALL
102
+ )
103
+
104
+ # Extract and remove Zipcode (assuming 5-digit US zipcodes)
105
+ zipcode_match = re.search(r"Zipcode:\s*(\d{5})", cleaned_text, re.IGNORECASE)
106
+ if zipcode_match:
107
+ metadata["zipcode"] = zipcode_match.group(1).strip()
108
+ cleaned_text = re.sub(r"Zipcode:\s*\d{5}", "", cleaned_text, flags=re.IGNORECASE)
109
+
110
  # Clean up extra whitespace
111
  cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
112
 
113
+ # Create a metadata summary to append to the cleaned text.
114
+ meta_summary = ""
115
+ if "year" in metadata:
116
+ meta_summary += f"Year: {metadata['year']}. "
117
+ if "ranking" in metadata:
118
+ meta_summary += f"Ranking: {metadata['ranking']}. "
119
+ if "organization" in metadata:
120
+ meta_summary += f"Organization: {metadata['organization']}. "
121
+ if "working_areas" in metadata:
122
+ meta_summary += f"Working Areas in LA: {metadata['working_areas']}. "
123
+ if "zipcode" in metadata:
124
+ meta_summary += f"Zipcode: {metadata['zipcode']}. "
125
+
126
+ combined_text = meta_summary + "\n" + cleaned_text if meta_summary else cleaned_text
127
+
128
+ return metadata, combined_text
129
 
130
 
131
  def load_and_process_data(file_path: str):
132
+ """
133
+ Loads JSON data from a file, extracts organization text and metadata (including working areas and zipcode),
134
+ cleans the text by removing redundant metadata, and returns a list of Documents.
135
+ Documents with a "winner" ranking are inserted at the beginning of the list.
136
+ """
137
  try:
138
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
139
  docs = []
 
141
  org_text = entry.get("OrganizationText", "")
142
  if not org_text:
143
  continue
144
+ metadata, combined_text = extract_metadata(org_text)
145
  if metadata.get("ranking", "").lower() == "winner":
146
+ docs.insert(0, Document(page_content=combined_text, metadata=metadata))
147
  else:
148
+ docs.append(Document(page_content=combined_text, metadata=metadata))
149
  return docs
150
  except Exception as e:
151
  print(f"Error loading JSON: {e}")
152
  return []
153
+
154
  # -------------------------------
155
  # Data Loading and Preprocessing
156
  # -------------------------------
 
159
  docs = load_and_process_data(file_path)
160
 
161
  # Use a text splitter to create chunks from the documents.
 
162
  from langchain_text_splitters import RecursiveCharacterTextSplitter
163
  text_splitter = RecursiveCharacterTextSplitter(
164
+ chunk_size=1800,
165
+ chunk_overlap=200,
166
  add_start_index=True
167
  )
168
  all_splits = text_splitter.split_documents(docs)
 
171
  # Set Up Retrievers
172
  # -------------------------------
173
 
174
+ from langchain_chroma import Chroma
175
+ from langchain_openai import OpenAIEmbeddings
176
+ from langchain_community.retrievers import BM25Retriever
177
+ from langchain.retrievers import EnsembleRetriever
178
+
179
  persist_directory = "./chroma_db"
180
  if os.path.exists(persist_directory) and os.listdir(persist_directory):
181
  vectorstore = Chroma(
 
191
  )
192
  print("Created new vector store and persisted embeddings.")
193
 
 
194
  bm25_retriever = BM25Retriever.from_documents(all_splits)
 
 
195
  ensemble_retriever = EnsembleRetriever(
196
  retrievers=[vectorstore.as_retriever(search_kwargs={"k": 6}), bm25_retriever],
197
+ weights=[0.7, 0.3]
198
  )
199
  retriever = ensemble_retriever
200
 
 
202
  # Prepare Retrieval and Generation Chain
203
  # -------------------------------
204
 
205
+ from langchain_openai import ChatOpenAI
206
+ from langchain.chains import create_retrieval_chain
207
+ from langchain.chains.combine_documents import create_stuff_documents_chain
208
+ from langchain_core.prompts import ChatPromptTemplate
209
 
210
+ system_prompt = (
211
  "You are the LA2050 Navigator, an AI-powered chatbot designed to help users explore organizations and community initiatives within the Goldhirsh Foundation’s LA2050 Ideas Hub. "
 
212
  "Your role is to provide concise, personalized recommendations, guide users toward supporting these organizations and initiatives, and answer relevant questions about the Goldhirsh Foundation, LA2050, and its projects. "
 
213
  "When answering, include the full name of the organization, a brief (1-2 sentence) description, and a link to its website or social media (as provided under the website column; please do not alter or normalize the URL). "
 
214
  "If a company's personal website is unavailable, navigate to the LA2050 URLs. "
 
215
  "Prioritize nonprofit organizations awarded by the Goldhirsh Foundation (designated 'Winner' under ranking column) and those with multiple proposal submissions. "
 
216
  "Use the data files as your primary source of information. If information is unavailable, acknowledge it and guide the user to relevant resources. "
 
217
  "Maintain a polite, helpful, respectful, and enthusiastic tone at all times. "
 
218
  "If the user responds with a follow-up confirmation (e.g. 'yes') after a previous answer, please expand on that topic with additional information. "
219
+ "When answering questions about grant winners, only list organizations whose metadata ranking field is marked as 'Winner'."
 
220
  "\n\n{context}"
 
221
  )
222
 
 
223
  prompt = ChatPromptTemplate.from_messages(
224
  [
225
  ("system", system_prompt),
 
227
  ]
228
  )
229
 
 
230
  question_answer_chain = create_stuff_documents_chain(ChatOpenAI(model_name="gpt-4o-mini", temperature=0), prompt)
231
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
232
 
 
234
  # Gradio Interface and Conversation Handling
235
  # -------------------------------
236
 
237
+ import gradio as gr
238
+ import time
239
+ from dotenv import load_dotenv
240
+ load_dotenv()
241
+
242
  green_theme = gr.themes.Base(
243
  primary_hue=gr.themes.Color(
244
  c50="#00A168",
 
271
  )
272
 
273
  def message_and_history(message, history):
 
274
  history = history or [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]
275
  user_text = message.get("text", "")
276
  history.append({"role": "user", "content": user_text})
277
 
278
  time.sleep(1)
 
 
279
  if not user_text:
280
  history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
281
  yield history, history
282
  return
283
 
 
284
  conversation_context = "\n".join(
285
  [f"{msg['role']}: {msg['content'].replace('<b>LA2050 Navigator:</b><br>', '')}" for msg in history[-3:]]
286
  )
 
287
  chain_input = {"input": conversation_context}
288
 
289
  try:
 
291
  answer = response["answer"]
292
  except Exception as e:
293
  answer = f"An error occurred: {e}"
294
+
 
295
  if answer.startswith("<b>LA2050 Navigator:</b><br>"):
296
  answer = answer[len("<b>LA2050 Navigator:</b><br>"):]
297
+
 
298
  assistant_response = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
299
  history.append(assistant_response)
300
+
 
301
  for character in answer:
302
  assistant_response["content"] += character
303
  yield history, history
304
+
 
305
  history[-1]["content"] = assistant_response["content"]
306
  yield history, history
307
 
 
308
  js_func = """
309
  function refresh() {
310
  const url = new URL(window.location);
 
347
  show_label=False
348
  )
349
 
 
350
  message.submit(
351
  message_and_history,
352
  inputs=[message, state],
 
356
  )
357
 
358
  block.launch(debug=True, share=True)
359
+
360
+