anl139 committed on
Commit
43551fe
·
verified ·
1 Parent(s): 8c96287

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -35
app.py CHANGED
@@ -4,9 +4,14 @@ import gradio as gr
4
  from dotenv import load_dotenv
5
  from langchain_community.document_loaders import JSONLoader
6
  from pathlib import Path
7
- from langchain_core.documents import Document
8
  import re
9
  import json
 
 
 
 
 
 
10
  from langchain_chroma import Chroma
11
  from langchain_openai import OpenAIEmbeddings
12
  from langchain_community.retrievers import BM25Retriever
@@ -19,13 +24,18 @@ from langchain.chains import create_retrieval_chain
19
  from langchain.chains.combine_documents import create_stuff_documents_chain
20
  from langchain_core.prompts import ChatPromptTemplate
21
 
22
- # Load environment variables for Hugging Face
23
  load_dotenv()
24
  os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY')
25
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
26
 
27
- # Extract metadata from text
 
 
 
 
28
  def extract_metadata(text: str) -> dict:
 
29
  metadata = {}
30
  urls = re.findall(r"(Website|Volunteer|Newsletter):\s*(https?://\S+)", text)
31
  for key, url in urls:
@@ -35,8 +45,9 @@ def extract_metadata(text: str) -> dict:
35
  metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle}"
36
  return metadata
37
 
38
- # Load and process JSON data
39
  def load_and_process_data(file_path: str):
 
40
  try:
41
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
42
  docs = []
@@ -51,11 +62,15 @@ def load_and_process_data(file_path: str):
51
  print(f"Error loading JSON: {e}")
52
  return []
53
 
54
- # Set up document loading and processing
55
- file_path = './2024data.json' # In Hugging Face, you'll need to upload this file or access it from Hugging Face Datasets
 
 
 
 
56
  docs = load_and_process_data(file_path)
57
 
58
- # Set up LangChain text splitter
59
  from langchain_text_splitters import RecursiveCharacterTextSplitter
60
  text_splitter = RecursiveCharacterTextSplitter(
61
  chunk_size=1000,
@@ -64,73 +79,126 @@ text_splitter = RecursiveCharacterTextSplitter(
64
  )
65
  all_splits = text_splitter.split_documents(docs)
66
 
67
- # Set up retrievers
68
- vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings(), persist_directory="./chroma_db")
 
 
 
 
 
 
 
 
 
 
69
  bm25_retriever = BM25Retriever.from_documents(all_splits)
 
 
70
  ensemble_retriever = EnsembleRetriever(
71
  retrievers=[vectorstore.as_retriever(search_kwargs={"k": 4}), bm25_retriever],
72
  weights=[0.7, 0.3]
73
  )
74
  retriever = ensemble_retriever
75
 
76
- # Prepare for retrieval and generation
 
 
 
 
77
  prompt = hub.pull("rlm/rag-prompt")
78
- llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
79
 
80
- def format_docs(docs):
81
- return "\n\n".join(doc.page_content for doc in docs)
82
 
 
83
  question_answer_chain = create_stuff_documents_chain(llm, prompt)
84
- rag_chain = create_retrieval_chain(retriever, question_answer_chain)
85
 
86
- # Set up Gradio interface
87
- green_theme = gr.themes.Base(
88
- primary_hue=gr.themes.Color(c50="#00A168", c100="#57B485", c200="#D7ECE0", c300="#FFFFFF", c400="#EAE9E9", c500="#000000", c600="#3A905E", c700="#2A774A", c800="#1A5E36", c900="#0A4512", c950="#052A08")
89
- )
90
 
91
- # Define response logic
92
- def message_and_history(message, history):
93
- history = history or [{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}]
94
- history.append({"role": "user", "content": message.get("text", "")})
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  time.sleep(1)
97
-
98
- user_input = message.get("text", "")
99
- if not user_input:
100
  history.append({"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Please enter a valid message."})
101
  yield history, history
102
  return
103
 
104
  try:
105
- response = rag_chain.invoke({"input": user_input})
 
106
  answer = response["answer"]
107
  except Exception as e:
108
  answer = f"An error occurred: {e}"
109
 
 
110
  dynamic_message = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> "}
111
  history.append(dynamic_message)
112
 
 
113
  for character in answer:
114
  dynamic_message["content"] += character
115
  yield history, history
116
 
 
117
  history[-1]["content"] = f"<b>LA2050 Navigator:</b><br> {answer}"
118
  yield history, history
119
 
120
- # Set up the Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  with gr.Blocks(theme=green_theme) as block:
122
  gr.HTML('<div class="chat-header"><h1>LA2050 Navigator</h1></div>')
123
- chatbot = gr.Chatbot(value=[{"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}], type="messages")
 
 
 
 
 
 
 
124
  state = gr.State([])
125
-
126
- message = gr.Textbox(placeholder="Type a message", scale=3, show_label=False)
127
-
128
- message.submit(
 
 
129
  message_and_history,
130
- inputs=[message, state],
131
  outputs=[chatbot, state]
132
  ).then(
133
- lambda: "", inputs=[], outputs=message
134
  )
135
 
136
- block.launch(debug=True,share=True)
 
 
4
  from dotenv import load_dotenv
5
  from langchain_community.document_loaders import JSONLoader
6
  from pathlib import Path
 
7
  import re
8
  import json
9
+
10
+ # Import Document from your LangChain module.
11
+ # (If your version of LangChain uses a different path, update accordingly.)
12
+ from langchain_core.documents import Document
13
+
14
+ # Import additional libraries from LangChain
15
  from langchain_chroma import Chroma
16
  from langchain_openai import OpenAIEmbeddings
17
  from langchain_community.retrievers import BM25Retriever
 
24
  from langchain.chains.combine_documents import create_stuff_documents_chain
25
  from langchain_core.prompts import ChatPromptTemplate
26
 
27
# Load environment variables for Hugging Face and OpenAI.
load_dotenv()

# os.environ only accepts string values, so assigning os.getenv(...) directly
# raises TypeError at import time whenever a key is missing. Re-export only the
# keys that are actually set and report the ones that are not.
for _key_name in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    _key_value = os.getenv(_key_name)
    if _key_value is None:
        print(f"Warning: environment variable {_key_name} is not set.")
    else:
        os.environ[_key_name] = _key_value
31
 
32
+
33
+ # -------------------------------
34
+ # Utility Functions
35
+ # -------------------------------
36
+
37
  def extract_metadata(text: str) -> dict:
38
+ """Extracts URLs and social handles from the given text."""
39
  metadata = {}
40
  urls = re.findall(r"(Website|Volunteer|Newsletter):\s*(https?://\S+)", text)
41
  for key, url in urls:
 
45
  metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle}"
46
  return metadata
47
 
48
+
49
  def load_and_process_data(file_path: str):
50
+ """Loads JSON data from a file, extracts organization text and metadata, and returns a list of Documents."""
51
  try:
52
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
53
  docs = []
 
62
  print(f"Error loading JSON: {e}")
63
  return []
64
 
65
+
66
+ # -------------------------------
67
+ # Data Loading and Preprocessing
68
+ # -------------------------------
69
+
70
+ file_path = './2024data.json' # Ensure this file is available in your environment.
71
  docs = load_and_process_data(file_path)
72
 
73
+ # Use a text splitter to create chunks from the documents
74
  from langchain_text_splitters import RecursiveCharacterTextSplitter
75
  text_splitter = RecursiveCharacterTextSplitter(
76
  chunk_size=1000,
 
79
  )
80
  all_splits = text_splitter.split_documents(docs)
81
 
82
+ # -------------------------------
83
+ # Set Up Retrievers
84
+ # -------------------------------
85
+
86
# Embedding-based retrieval: index every chunk in a Chroma store that is
# given "./chroma_db" as its persist directory.
# NOTE(review): this runs on every start-up; confirm that re-indexing into an
# existing ./chroma_db does not accumulate duplicate chunks.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db",
    embedding=OpenAIEmbeddings(),
    documents=all_splits,
)

# Keyword-based retrieval over the same chunks.
bm25_retriever = BM25Retriever.from_documents(all_splits)

# Blend both retrievers: weight 0.7 for the vector store (queried with k=4)
# and weight 0.3 for BM25.
_dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
ensemble_retriever = EnsembleRetriever(
    retrievers=[_dense_retriever, bm25_retriever],
    weights=[0.7, 0.3],
)
retriever = ensemble_retriever
102
 
103
+ # -------------------------------
104
+ # Prepare Retrieval and Generation Chain
105
+ # -------------------------------
106
+
107
# Chat model used to generate answers (temperature 0).
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Prompt template fetched from the LangChain hub at start-up.
# NOTE(review): the chain below is invoked with an "input" key; confirm that
# the variables declared by the pulled prompt are all supplied by the caller.
prompt = hub.pull("rlm/rag-prompt")

# "Stuff" chain: feeds the retrieved documents and the prompt to the model.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG pipeline: retrieve with `retriever`, then answer with the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
 
 
118
 
 
 
 
 
119
 
120
+ # -------------------------------
121
+ # Define the Chat Callback Function
122
+ # -------------------------------
123
+
124
def message_and_history(user_message, history):
    """
    Handle one chat turn: record the user's message, answer it through the
    RAG chain, and stream the reply back one character at a time.

    This is a generator. Each item it yields is the pair
    ``(history, history)``: the same list object twice.

    Args:
        user_message: Text submitted by the user. ``None`` is treated as an
            empty message.
        history: List of ``{"role": ..., "content": ...}`` message dicts from
            earlier turns. A falsy value (``None`` or ``[]``) starts a new
            conversation seeded with the welcome message.

    Yields:
        tuple: ``(history, history)`` after every update to the conversation.
    """
    prefix = "<b>LA2050 Navigator:</b><br> "

    # Guard against a missing value so the .strip() check below cannot raise.
    if user_message is None:
        user_message = ""

    # Initialize history if empty
    if not history:
        history = [{"role": "assistant", "content": prefix + "Welcome to the LA2050 ideas hub! How can I help you today?"}]

    # Append the user's message to history
    history.append({"role": "user", "content": user_message})

    # Validate before doing any waiting or retrieval so that an empty
    # submission gets its feedback immediately.
    if not user_message.strip():
        history.append({"role": "assistant", "content": prefix + "Please enter a valid message."})
        yield history, history
        return

    # Simulate a brief delay (optional)
    time.sleep(1)

    try:
        # The retrieval chain reads the query from "input". The same text is
        # also supplied as "question" so that a prompt template written
        # against a {question} variable can be rendered; keys the prompt does
        # not use are ignored.
        response = rag_chain.invoke({"input": user_message, "question": user_message})
        answer = response["answer"]
    except Exception as e:
        # Top-level UI boundary: show the failure in the chat instead of
        # crashing the callback.
        answer = f"An error occurred: {e}"

    # Prepare a dynamic response that simulates streaming text
    dynamic_message = {"role": "assistant", "content": prefix}
    history.append(dynamic_message)

    # Stream the answer character by character (each step yields an update)
    for character in answer:
        dynamic_message["content"] += character
        yield history, history

    # Final yield: also covers the case where the answer is empty and the
    # loop above produced no update at all.
    history[-1]["content"] = f"{prefix}{answer}"
    yield history, history
164
 
165
+
166
+ # -------------------------------
167
+ # Set Up the Gradio Interface
168
+ # -------------------------------
169
+
170
# Custom green theme: a Base theme whose primary hue is an explicit palette.
green_theme = gr.themes.Base(
    primary_hue=gr.themes.Color(
        c50="#00A168",
        c100="#57B485",
        c200="#D7ECE0",
        c300="#FFFFFF",
        c400="#EAE9E9",
        c500="#000000",
        c600="#3A905E",
        c700="#2A774A",
        c800="#1A5E36",
        c900="#0A4512",
        c950="#052A08",
    )
)

with gr.Blocks(theme=green_theme) as block:
    gr.HTML('<div class="chat-header"><h1>LA2050 Navigator</h1></div>')

    # The chat window opens with the assistant's welcome message.
    welcome_turn = {"role": "assistant", "content": "<b>LA2050 Navigator:</b><br> Welcome to the LA2050 ideas hub! How can I help you today?"}
    chatbot = gr.Chatbot(value=[welcome_turn], type="messages")

    # Conversation history carried between callback invocations.
    state = gr.State([])

    # Single-line input for the user's message.
    user_input_box = gr.Textbox(placeholder="Type a message", scale=3, show_label=False)

    # On submit: run the chat callback, then clear the textbox.
    submit_event = user_input_box.submit(
        message_and_history,
        inputs=[user_input_box, state],
        outputs=[chatbot, state],
    )
    submit_event.then(lambda: "", inputs=[], outputs=user_input_box)

block.launch(debug=True, share=True)
204
+