Rabbit-Innotech commited on
Commit
bd83779
·
verified ·
1 Parent(s): 7f2bd5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -188
app.py CHANGED
@@ -928,6 +928,7 @@
928
  # if __name__ == "__main__":
929
  # demo.launch(share=True, inbrowser=True, debug=True)
930
 
 
931
  import os
932
  from langchain_groq import ChatGroq
933
  from langchain.prompts import ChatPromptTemplate, PromptTemplate
@@ -942,14 +943,39 @@ from langchain_core.prompts import ChatPromptTemplate
942
  import gradio as gr
943
  from PyPDF2 import PdfReader
944
  from langchain_huggingface import HuggingFaceEmbeddings
945
- from langchain_core.messages import HumanMessage, AIMessage
946
- from langchain_core.runnables import RunnablePassthrough
947
  from langchain_core.output_parsers import StrOutputParser
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948
 
949
- # Set up environment variables
950
- groq_api_key = os.environ.get('GBV')
 
 
951
 
952
- # Initialize embedding model
953
  embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
954
 
955
  def scrape_websites(base_urls):
@@ -1042,12 +1068,12 @@ def extract_pdf_text(pdf_url):
1042
 
1043
  def clean_body_content(html_content):
1044
  soup = BeautifulSoup(html_content, "html.parser")
 
1045
 
1046
- # Remove scripts and styles
1047
  for script_or_style in soup(["script", "style"]):
1048
  script_or_style.extract()
 
1049
 
1050
- # Get cleaned text
1051
  cleaned_content = soup.get_text(separator="\n")
1052
  cleaned_content = "\n".join(
1053
  line.strip() for line in cleaned_content.splitlines() if line.strip()
@@ -1055,91 +1081,51 @@ def clean_body_content(html_content):
1055
  return cleaned_content
1056
 
1057
 
1058
- def chunk_string(s, chunk_size=1000):
1059
- return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
1060
-
 
 
1061
 
1062
- # Setup vectorstore for RAG
1063
- def setup_vectorstore():
1064
- if __name__ == "__main__":
1065
- website = ["https://haguruka.org.rw/"]
1066
- all_content = scrape_websites(website)
1067
 
1068
- temp_list = []
1069
- for url, content in all_content.items():
1070
- temp_list.append((url, content))
1071
 
1072
- processed_texts = []
1073
-
1074
- for element in temp_list:
1075
- if isinstance(element, tuple):
1076
- url, content = element
1077
- processed_texts.append(f"url: {url}, content: {content}")
1078
- elif isinstance(element, str):
1079
- processed_texts.append(element)
1080
- else:
1081
- processed_texts.append(str(element))
1082
-
1083
- chunked_texts = []
1084
- for text in processed_texts:
1085
- chunked_texts.extend(chunk_string(text))
1086
-
1087
- vectorstore = Chroma(
1088
- collection_name="GBVR_Dataset",
1089
- embedding_function=embed_model,
1090
- persist_directory="./",
1091
- )
1092
-
1093
- vectorstore.add_texts(chunked_texts)
1094
- return vectorstore
1095
  else:
1096
- # If imported as a module, just load the existing vectorstore
1097
- vectorstore = Chroma(
1098
- collection_name="GBVR_Dataset",
1099
- embedding_function=embed_model,
1100
- persist_directory="./",
1101
- )
1102
- return vectorstore
1103
 
 
 
1104
 
1105
- # Session Manager class to handle conversation history
1106
- class SessionManager:
1107
- def __init__(self):
1108
- self.sessions = {}
1109
-
1110
- def get_session(self, session_id):
1111
- if session_id not in self.sessions:
1112
- self.sessions[session_id] = []
1113
- return self.sessions[session_id]
1114
-
1115
- def add_message(self, session_id, role, content):
1116
- session = self.get_session(session_id)
1117
- if role == "human":
1118
- session.append(HumanMessage(content=content))
1119
- elif role == "ai":
1120
- session.append(AIMessage(content=content))
1121
-
1122
- def get_history_as_string(self, session_id, max_turns=5):
1123
- """Convert recent conversation history to string format for context"""
1124
- session = self.get_session(session_id)
1125
-
1126
- # Get the most recent conversations (limited to max_turns)
1127
- recent_messages = session[-max_turns*2:] if len(session) > max_turns*2 else session
1128
-
1129
- history_str = ""
1130
- for msg in recent_messages:
1131
- role = "User" if isinstance(msg, HumanMessage) else "Assistant"
1132
- history_str += f"{role}: {msg.content}\n"
1133
-
1134
- return history_str.strip()
1135
 
 
 
1136
 
1137
- # Initialize session manager
1138
- session_manager = SessionManager()
1139
 
1140
- # Modified template to include conversation history
1141
- template = """
1142
- You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context and assist the user effectively. Follow these guidelines:
 
 
 
 
 
 
 
 
 
 
1143
 
1144
  1. **Warm & Natural Interaction**
1145
  - If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
@@ -1148,7 +1134,7 @@ template = """
1148
  - "Hello! What can I do for you? 🚀"
1149
 
1150
  2. **Precise Information Extraction**
1151
- - Provide only the relevant details from the given context.
1152
  - Do not generate extra content or assumptions beyond the provided information.
1153
 
1154
  3. **Conversational & Engaging Tone**
@@ -1164,133 +1150,81 @@ template = """
1164
 
1165
  6. **Personalized Interaction**
1166
  - Use the conversation history to provide more personalized and contextually relevant responses.
 
1167
 
1168
  7. **Direct, Concise Responses**
1169
  - If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.
1170
 
1171
  8. **Extracting Relevant Links**
1172
- - If the user asks for a link related to their request, extract the most relevant URL from the context and provide it directly.
1173
  - Example response:
1174
  - "Here is the link you requested: [URL]"
1175
 
1176
- **Context from knowledge base:** {context}
1177
-
1178
- **Previous conversation history:**
1179
- {history}
1180
-
1181
- **Current User's Question:** {question}
1182
-
1183
- **Your Response:**
1184
- """
1185
 
1186
- # Create prompt template with history
1187
  rag_prompt = PromptTemplate.from_template(template)
1188
 
1189
- # Initialize Groq LLM
 
1190
  llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
1191
 
 
 
 
1192
  # Define the RAG chain with session history
1193
- def get_rag_chain(vectorstore):
1194
- retriever = vectorstore.as_retriever()
 
1195
 
1196
- def rag_chain_with_history(query, session_id):
1197
- # Get conversation history
1198
- history = session_manager.get_history_as_string(session_id)
1199
-
1200
- # Get relevant documents from retriever
1201
- retrieved_docs = retriever.invoke(query)
1202
- context = "\n".join([doc.page_content for doc in retrieved_docs])
1203
-
1204
- # Create the prompt with context and history
1205
- prompt = rag_prompt.format(
1206
- context=context,
1207
- history=history,
1208
- question=query
1209
- )
1210
-
1211
- # Generate response
1212
- response = llm.invoke(prompt)
1213
-
1214
- # Add to session history
1215
- session_manager.add_message(session_id, "human", query)
1216
- session_manager.add_message(session_id, "ai", response.content)
1217
-
1218
- return response.content
1219
 
1220
- return rag_chain_with_history
1221
-
1222
- # Initialize the vectorstore
1223
- vectorstore = setup_vectorstore()
1224
-
1225
- # Get the RAG chain
1226
- rag_chain_fn = get_rag_chain(vectorstore)
1227
-
1228
- # Define the streaming function for Gradio
1229
- def rag_memory_stream(message, history, session_id=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1230
  if session_id is None:
1231
- # Generate a simple session ID if none provided
1232
- # In a production app, you would use something more sophisticated
1233
  session_id = "default_session"
1234
 
1235
- # Process the message and get the response
1236
- response = rag_chain_fn(message, session_id)
1237
 
1238
  # Stream the response word by word
1239
- words = response.split()
1240
- partial_response = ""
1241
-
1242
  for word in words:
1243
- partial_response += word + " "
1244
- yield partial_response.strip()
1245
-
1246
- # Create the Chat Interface with session management
1247
- def create_chat_interface():
1248
- with gr.Blocks(theme="soft", css=custom_css) as demo:
1249
- gr.Markdown(f"# {title}")
1250
-
1251
- # Hidden session ID - in a real app, this would be managed by authentication
1252
- session_id = gr.State(value="default_session")
1253
-
1254
- chatbot = gr.Chatbot(height=600)
1255
- msg = gr.Textbox(
1256
- placeholder="Ask me anything about GBV resources...",
1257
- container=False,
1258
- scale=7
1259
- )
1260
-
1261
- def user_input(message, chat_history, session_id_val):
1262
- if message.strip() == "":
1263
- return "", chat_history
1264
-
1265
- chat_history.append([message, None])
1266
- return "", chat_history
1267
-
1268
- def bot_response(chat_history, session_id_val):
1269
- if chat_history and chat_history[-1][1] is None:
1270
- user_message = chat_history[-1][0]
1271
- bot_message = ""
1272
-
1273
- for chunk in rag_memory_stream(user_message, chat_history, session_id_val):
1274
- bot_message = chunk
1275
- chat_history[-1][1] = bot_message
1276
- yield chat_history
1277
-
1278
- send = gr.Button("Send", variant="primary", scale=1)
1279
- clear = gr.Button("Clear Chat", variant="secondary")
1280
-
1281
- # Event handlers
1282
- send_event = msg.submit(user_input, [msg, chatbot, session_id], [msg, chatbot]).then(
1283
- bot_response, [chatbot, session_id], chatbot
1284
- )
1285
- send.click(user_input, [msg, chatbot, session_id], [msg, chatbot]).then(
1286
- bot_response, [chatbot, session_id], chatbot
1287
- )
1288
- clear.click(lambda: [], outputs=[chatbot])
1289
-
1290
- return demo
1291
 
1292
  # Title with emojis
1293
- title = "🤖 GBVR Chatbot"
1294
 
1295
  # Custom CSS for styling the interface
1296
  custom_css = """
@@ -1314,7 +1248,15 @@ body {
1314
  }
1315
  """
1316
 
 
 
 
 
 
 
 
 
 
1317
  # Launch the app
1318
  if __name__ == "__main__":
1319
- demo = create_chat_interface()
1320
  demo.launch(share=True, inbrowser=True, debug=True)
 
928
  # if __name__ == "__main__":
929
  # demo.launch(share=True, inbrowser=True, debug=True)
930
 
931
+
932
  import os
933
  from langchain_groq import ChatGroq
934
  from langchain.prompts import ChatPromptTemplate, PromptTemplate
 
943
  import gradio as gr
944
  from PyPDF2 import PdfReader
945
  from langchain_huggingface import HuggingFaceEmbeddings
 
 
946
  from langchain_core.output_parsers import StrOutputParser
947
+ from langchain_core.runnables import RunnablePassthrough
948
+
949
# Lightweight per-process store of chat sessions.
class SessionManager:
    """Tracks user/AI exchanges for each chat session, keyed by session id."""

    def __init__(self):
        # session_id -> list of {"user": ..., "ai": ...} dicts, oldest first
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the interaction list for *session_id*, creating it if absent."""
        return self.sessions.setdefault(session_id, [])

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/AI exchange to the session's history."""
        self.get_or_create_session(session_id).append(
            {"user": user_message, "ai": ai_response}
        )

    def get_history(self, session_id, max_turns=5):
        """Render the most recent *max_turns* exchanges as prompt-ready text.

        Returns an empty string for a fresh session.
        """
        recent = self.get_or_create_session(session_id)[-max_turns:]
        blocks = [
            f"User: {turn['user']}\nAssistant: {turn['ai']}"
            for turn in recent
        ]
        return "\n\n".join(blocks)
973
 
974
# One shared session store for every chat in this process.
session_manager = SessionManager()

# Groq API key, read from the 'GBV' environment variable (None if unset).
groq_api_key = os.environ.get('GBV')

# Embedding model used both for indexing scraped content and for query-time retrieval.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
980
 
981
  def scrape_websites(base_urls):
 
1068
 
1069
def clean_body_content(html_content):
    """Return the visible text of an HTML document, one non-empty line per row.

    Script and style elements are removed before extraction so their
    contents never leak into the cleaned text.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Drop non-visible elements in place.
    for tag in soup(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text(separator="\n")
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
1082
 
1083
 
1084
+ if __name__ == "__main__":
1085
+ website = ["https://haguruka.org.rw/"
1086
+
1087
+ ]
1088
+ all_content = scrape_websites(website)
1089
 
1090
+ temp_list = []
1091
+ for url, content in all_content.items():
1092
+ temp_list.append((url, content))
 
 
1093
 
1094
+
1095
+ processed_texts = []
 
1096
 
1097
+
1098
+ for element in temp_list:
1099
+ if isinstance(element, tuple):
1100
+ url, content = element
1101
+ processed_texts.append(f"url: {url}, content: {content}")
1102
+ elif isinstance(element, str):
1103
+ processed_texts.append(element)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1104
  else:
1105
+ processed_texts.append(str(element))
 
 
 
 
 
 
1106
 
1107
def chunk_string(s, chunk_size=1000):
    """Split *s* into consecutive pieces of at most *chunk_size* characters.

    Returns an empty list for an empty string; the final piece may be shorter.
    """
    starts = range(0, len(s), chunk_size)
    return [s[start:start + chunk_size] for start in starts]
1109
 
1110
+ chunked_texts = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1111
 
1112
+ for text in processed_texts:
1113
+ chunked_texts.extend(chunk_string(text))
1114
 
 
 
1115
 
1116
+ vectorstore = Chroma(
1117
+ collection_name="GBVR_Dataset",
1118
+ embedding_function=embed_model,
1119
+ persist_directory="./",
1120
+ )
1121
+
1122
+ vectorstore.get().keys()
1123
+
1124
+ vectorstore.add_texts(chunked_texts)
1125
+
1126
+ # Updated template to include conversation history
1127
+ template = ("""
1128
+ You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
1129
 
1130
  1. **Warm & Natural Interaction**
1131
  - If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
 
1134
  - "Hello! What can I do for you? 🚀"
1135
 
1136
  2. **Precise Information Extraction**
1137
+ - Provide only the relevant details from the given context: {context}.
1138
  - Do not generate extra content or assumptions beyond the provided information.
1139
 
1140
  3. **Conversational & Engaging Tone**
 
1150
 
1151
  6. **Personalized Interaction**
1152
  - Use the conversation history to provide more personalized and contextually relevant responses.
1153
+ - Previous conversation history: {conversation_history}
1154
 
1155
  7. **Direct, Concise Responses**
1156
  - If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.
1157
 
1158
  8. **Extracting Relevant Links**
1159
+ - If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
1160
  - Example response:
1161
  - "Here is the link you requested: [URL]"
1162
 
1163
+ **Context:** {context}
1164
+ **User's Question:** {question}
1165
+ **Your Response:**
1166
+ """)
1167
+
 
 
 
 
1168
 
 
1169
  rag_prompt = PromptTemplate.from_template(template)
1170
 
1171
+ retriever = vectorstore.as_retriever()
1172
+
1173
  llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
1174
 
1175
+ # Dictionary to store user sessions with session IDs
1176
+ user_sessions = {}
1177
+
1178
  # Define the RAG chain with session history
1179
def rag_chain(question, session_id="default"):
    """Answer *question* via retrieval-augmented generation with session memory.

    Pulls relevant chunks from the vectorstore retriever, folds in the
    session's recent conversation history, prompts the LLM, and records
    the exchange before returning the answer text.
    """
    # Prior turns for this session ("" on first contact).
    history_text = session_manager.get_history(session_id)

    # Retrieve the most relevant documents and flatten them into one string.
    docs = retriever.invoke(question)
    context_text = "\n".join(doc.page_content for doc in docs)

    filled_prompt = rag_prompt.format(
        context=context_text,
        question=question,
        conversation_history=history_text,
    )

    answer = llm.invoke(filled_prompt).content

    # Persist this exchange so later turns can reference it.
    session_manager.add_interaction(session_id, question, answer)

    return answer
1201
+
1202
+ # Define the RAG memory stream function
1203
+ def rag_memory_stream(message, history):
1204
+ # Generate a session ID based on the first message if not exists
1205
+ session_id = None
1206
+ for msg in history:
1207
+ if msg[0]: # If there's a user message
1208
+ # Use first few characters of first message as simple session ID
1209
+ session_id = hash(msg[0][:20]) if session_id is None else session_id
1210
+ break
1211
+
1212
+ # Default session ID if history is empty
1213
  if session_id is None:
 
 
1214
  session_id = "default_session"
1215
 
1216
+ # Process the message and get response
1217
+ response = rag_chain(message, str(session_id))
1218
 
1219
  # Stream the response word by word
1220
+ partial_text = ""
1221
+ words = response.split(' ')
 
1222
  for word in words:
1223
+ partial_text += word + " "
1224
+ yield partial_text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1225
 
1226
  # Title with emojis
1227
+ title = "GBVR Chatbot"
1228
 
1229
  # Custom CSS for styling the interface
1230
  custom_css = """
 
1248
  }
1249
  """
1250
 
1251
# Wire the streaming RAG handler into a Gradio chat UI.
demo = gr.ChatInterface(
    fn=rag_memory_stream,
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css,  # apply the custom styling defined above
)

# Launch only when run as a script (share link, auto-open browser, debug tracebacks).
if __name__ == "__main__":
    demo.launch(share=True, inbrowser=True, debug=True)