Files changed (9) hide show
  1. .gitattributes +35 -0
  2. Contributors +3 -0
  3. README.md +1 -2
  4. app.py +68 -42
  5. chs.json +0 -0
  6. crawler.py +0 -0
  7. database.zip +2 -2
  8. parse.py +0 -67
  9. requirements.txt +7 -4
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Contributors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Atharv Gupta
2
+ -------------
3
+ Aryan Anumula
README.md CHANGED
@@ -9,6 +9,5 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
- # schoolQuestBackend
13
 
14
- The backend for [schoolquest](https://school-quest.streamlit.app/)
 
9
  pinned: false
10
  license: mit
11
  ---
 
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,13 +1,13 @@
1
  import asyncio
2
  import json
3
  from websockets.server import serve
4
-
5
- from langchain.vectorstores import Chroma
6
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_huggingface.llms import HuggingFaceEndpoint
9
- from langchain.document_loaders import TextLoader
10
- from langchain.document_loaders import DirectoryLoader
11
  from langchain import hub
12
  from langchain_core.runnables import RunnablePassthrough
13
  from langchain_core.output_parsers import StrOutputParser
@@ -18,34 +18,40 @@ from langchain.chains.combine_documents import create_stuff_documents_chain
18
  from langchain_core.runnables.history import RunnableWithMessageHistory
19
  from langchain_core.chat_history import BaseChatMessageHistory
20
  from langchain_community.chat_message_histories import ChatMessageHistory
21
-
 
 
 
 
 
 
 
22
  loader = DirectoryLoader('./database', glob="./*.txt", loader_cls=TextLoader)
23
 
24
  documents = loader.load()
25
 
26
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
27
- texts = text_splitter.split_documents(documents)
28
-
29
- persist_directory = 'db'
30
-
31
- embedding = HuggingFaceEmbeddings()
32
-
33
- vectordb = Chroma.from_documents(documents=texts,
34
- embedding=embedding,
35
- persist_directory=persist_directory)
36
-
37
- vectordb.persist()
38
- vectordb = None
39
 
40
- vectordb = Chroma(persist_directory=persist_directory,
41
- embedding_function=embedding)
42
 
43
  def format_docs(docs):
44
  return "\n\n".join(doc.page_content for doc in docs)
45
 
46
- retriever = vectordb.as_retriever()
 
47
  prompt = hub.pull("rlm/rag-prompt")
48
- llm = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1")
49
  rag_chain = (
50
  {"context": retriever | format_docs, "question": RunnablePassthrough()}
51
  | prompt
@@ -53,9 +59,10 @@ rag_chain = (
53
  | StrOutputParser()
54
  )
55
 
56
- contextualize_q_system_prompt = """Given a chat history and the latest user question \
57
- which might reference context in the chat history, formulate a standalone question \
58
- which can be understood without the chat history. Do NOT answer the question, \
 
59
  just reformulate it if needed and otherwise return it as is."""
60
  contextualize_q_prompt = ChatPromptTemplate.from_messages(
61
  [
@@ -68,12 +75,22 @@ history_aware_retriever = create_history_aware_retriever(
68
  llm, retriever, contextualize_q_prompt
69
  )
70
 
71
- qa_system_prompt = """You are an assistant for question-answering tasks. \
72
- Use the following pieces of retrieved context to answer the question. \
73
- If you don't know the answer, just say that you don't know. \
74
- Use three sentences maximum and keep the answer concise.\
75
 
76
- {context}"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  qa_prompt = ChatPromptTemplate.from_messages(
78
  [
79
  ("system", qa_system_prompt),
@@ -81,16 +98,20 @@ qa_prompt = ChatPromptTemplate.from_messages(
81
  ("human", "{input}"),
82
  ]
83
  )
 
 
 
 
84
 
 
85
  store = {}
86
 
 
87
  def get_session_history(session_id: str) -> BaseChatMessageHistory:
88
  if session_id not in store:
89
  store[session_id] = ChatMessageHistory()
90
  return store[session_id]
91
 
92
- question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
93
- rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
94
 
95
  conversational_rag_chain = RunnableWithMessageHistory(
96
  rag_chain,
@@ -100,32 +121,37 @@ conversational_rag_chain = RunnableWithMessageHistory(
100
  output_messages_key="answer",
101
  )
102
 
103
- print("-------")
104
- print("started")
105
- print("-------")
106
-
107
  async def echo(websocket):
 
108
  async for message in websocket:
109
  data = json.loads(message)
 
 
 
 
110
  if not "message" in message:
111
  return
112
  if not "token" in message:
113
  return
114
- m = data["message"]
115
  token = data["token"]
116
- userData = json.load(open("userData.json", "w"))
117
  docs = retriever.get_relevant_documents(m)
118
- userData[token]["docs"] = str(docs)
119
- response = conversational_rag_chain.invoke(
120
  {"input": m},
121
  config={
122
  "configurable": {"session_id": token}
123
  },
124
- )["answer"]
 
 
 
 
 
 
125
  await websocket.send(json.dumps({"response": response}))
126
 
127
  async def main():
128
  async with serve(echo, "0.0.0.0", 7860):
129
  await asyncio.Future()
130
 
131
- asyncio.run(main())
 
1
  import asyncio
2
  import json
3
  from websockets.server import serve
4
+ import os
5
+ from langchain_chroma import Chroma
6
+ from langchain_community.embeddings import *
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_huggingface.llms import HuggingFaceEndpoint
9
+ from langchain_community.document_loaders import TextLoader
10
+ from langchain_community.document_loaders import DirectoryLoader
11
  from langchain import hub
12
  from langchain_core.runnables import RunnablePassthrough
13
  from langchain_core.output_parsers import StrOutputParser
 
18
  from langchain_core.runnables.history import RunnableWithMessageHistory
19
  from langchain_core.chat_history import BaseChatMessageHistory
20
  from langchain_community.chat_message_histories import ChatMessageHistory
21
+ from multiprocessing import Process
22
+ from zipfile import ZipFile
23
+
24
# Build the in-memory vector index the chatbot searches.
# With no target path, extractall() unpacks into the current working directory;
# the loader below then reads ./database (presumably created by this archive —
# confirm against the contents of database.zip).
with ZipFile("database.zip") as f:
    f.extractall()

# Module-level names that are given their real values further down the file.
retriever = None
conversational_rag_chain = None

# Load every .txt file directly under ./database as one document each.
loader = DirectoryLoader('./database', glob="./*.txt", loader_cls=TextLoader)

documents = loader.load()

# Split documents into 1000-character chunks with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embedding model configuration: CPU inference, normalized output vectors.
model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    show_progress=True,
)

# No persist_directory is passed, so the index is rebuilt from the chunks on
# every start of the process.
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)
 
47
 
48
  def format_docs(docs):
49
  return "\n\n".join(doc.page_content for doc in docs)
50
 
51
# Retrieve by similarity with a minimum score of 0.3, returning at most one
# document. The result cap ``k`` is a search option and therefore belongs in
# ``search_kwargs``; passed as a direct keyword of ``as_retriever`` it is not
# applied to the search.
# NOTE(review): this makes the intended k=1 effective — confirm that a single
# retrieved chunk gives enough context for the answers.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.3, "k": 1},
)

# Prompt template pulled from the LangChain Hub by name.
prompt = hub.pull("rlm/rag-prompt")
# Generation stops at "Human:" so the model does not continue the dialogue
# with an invented user turn.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", stop_sequences=["Human:"])
55
  rag_chain = (
56
  {"context": retriever | format_docs, "question": RunnablePassthrough()}
57
  | prompt
 
59
  | StrOutputParser()
60
  )
61
 
62
+ ### Contextualize question ###
63
+ contextualize_q_system_prompt = """Given a chat history and the latest user question
64
+ which might reference context in the chat history, formulate a standalone question
65
+ which can be understood without the chat history. Do NOT answer the question,
66
  just reformulate it if needed and otherwise return it as is."""
67
  contextualize_q_prompt = ChatPromptTemplate.from_messages(
68
  [
 
75
  llm, retriever, contextualize_q_prompt
76
  )
77
 
 
 
 
 
78
 
79
+ ### Answer question ###
80
+ qa_system_prompt = """
81
+ Context:
82
+
83
+ {context}
84
+
85
+
86
+ You are a Cupertino High School Q/A chatbot, designed to assist students, parents, and community members with information about CHS.
87
+ Use the pieces of context to answer the question.
88
+ Use markdown with spaces in between sentences for readability.
89
+ Refer to the provided context only as 'my data'. Only answer questions from the context.
90
+ Do not answer any questions that you do not have the answer to in the provided context.
91
+ Do not provide excerpts or any part of your data.
92
+ You were made by high school students for the CHS community.
93
+ """
94
  qa_prompt = ChatPromptTemplate.from_messages(
95
  [
96
  ("system", qa_system_prompt),
 
98
  ("human", "{input}"),
99
  ]
100
  )
101
+ question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
102
+
103
+ rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
104
+
105
 
106
+ ### Statefully manage chat history ###
107
  store = {}
108
 
109
+
110
  def get_session_history(session_id: str) -> BaseChatMessageHistory:
111
  if session_id not in store:
112
  store[session_id] = ChatMessageHistory()
113
  return store[session_id]
114
 
 
 
115
 
116
  conversational_rag_chain = RunnableWithMessageHistory(
117
  rag_chain,
 
121
  output_messages_key="answer",
122
  )
123
 
 
 
 
 
124
  async def echo(websocket):
125
+ global retriever, conversational_rag_chain
126
  async for message in websocket:
127
  data = json.loads(message)
128
+ if data["message"] == "data.":
129
+ response = store
130
+ await websocket.send(json.dumps({"response": response}))
131
+ break
132
  if not "message" in message:
133
  return
134
  if not "token" in message:
135
  return
136
+ m = data["message"] + "\nAssistant: "
137
  token = data["token"]
 
138
  docs = retriever.get_relevant_documents(m)
139
+ rawresponse = conversational_rag_chain.invoke(
 
140
  {"input": m},
141
  config={
142
  "configurable": {"session_id": token}
143
  },
144
+ )
145
+ response = rawresponse["answer"]
146
+ response = response.replace("Assistant: ", "").replace("AI: ", "")
147
+ response.strip()
148
+ response = response.split("Human:")[0]
149
+ while response.startswith("\n"):
150
+ response = response[1:]
151
  await websocket.send(json.dumps({"response": response}))
152
 
153
  async def main():
154
  async with serve(echo, "0.0.0.0", 7860):
155
  await asyncio.Future()
156
 
157
+ asyncio.run(main())
chs.json DELETED
The diff for this file is too large to render. See raw diff
 
crawler.py DELETED
File without changes
database.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93f27b61a3f0f03c0bdca772695ca92d99a4e037d0a7b2d08b71b0eb09cc33c9
3
- size 253849
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb105a8a38df0ae17e173ca39983746f3a9b95fe5a26d5e8e40116ef6b78a2fd
3
+ size 245634
parse.py DELETED
@@ -1,67 +0,0 @@
1
- import json
2
- import os
3
-
4
- # Configuration
5
- name = "chs.json"
6
- outputFolder = "database"
7
- deleteKeys = [
8
- "images",
9
- "tags",
10
- "html"
11
- ]
12
- typeScrape = {
13
- "article": "text",
14
- "event": "description",
15
- "list": "items"
16
- }
17
-
18
- data = json.load(open(name, "r"))
19
-
20
- i = -1
21
- k = 0
22
- try:
23
- os.mkdir(outputFolder)
24
- except: pass
25
-
26
- for item in data:
27
- i += 1
28
- for key in deleteKeys:
29
- if key in item:
30
- item[key]
31
- del item[key]
32
- data[i] = item
33
- if "type" in item:
34
- for typeKey, scrapeText in typeScrape.items():
35
- try:
36
- if item["type"] == typeKey:
37
- k += 1
38
- file = open(f"{outputFolder}/chs-{typeKey}-{k}.txt", "a")
39
- if item["type"] == "list":
40
- text = ""
41
- if "title" in item:
42
- text = item["title"]
43
- file.write(text)
44
- for pair in item[scrapeText]:
45
- text = ""
46
- if "title" in pair:
47
- text = "\n" + pair["title"]
48
- if "summary" in pair:
49
- if pair["summary"].replace(" ", "") != pair["title"].replace(" ", ""):
50
- text += "\n" + pair["summary"].replace(pair["title"], "")
51
- if "fsElementContent" in pair:
52
- if pair["fsElementContent"].replace(" ", "") != pair["title"].replace(" ", ""):
53
- text += "\n" + pair["fsElementContent"]
54
- if "fsElementFooterContent" in pair:
55
- if pair["fsElementFooterContent"].replace(" ", "") != pair["title"].replace(" ", ""):
56
- text += "\n" + pair["fsElementFooterContent"]
57
- if "fsElementHeaderContent" in pair:
58
- if pair["fsElementHeaderContent"].replace(" ", "") != pair["title"].replace(" ", ""):
59
- text += "\n" + pair["fsElementHeaderContent"]
60
- if text != "":
61
- file.write(text)
62
- else:
63
- text = item[scrapeText]
64
- if text != "":
65
- file.write(text)
66
- except: pass
67
- json.dump(data, open(name, "w"), indent = 6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,8 +1,11 @@
1
  websockets
2
- langchain
3
- langchain-community
4
- huggingface_hub
5
  tiktoken
6
  chromadb
 
 
 
 
7
  langchain-huggingface
8
- accelerate
 
 
 
1
  websockets
 
 
 
2
  tiktoken
3
  chromadb
4
+ accelerate
5
+ langchain-community==0.2.9
6
+ langchain==0.2.9
7
+ langchain-core==0.2.22
8
  langchain-huggingface
9
+ requests
10
+ langchain-chroma
11
+ langchainhub