Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -164,9 +164,9 @@ def merge_markdown_contents(contents):
|
|
| 164 |
|
| 165 |
def upload_to_firebase(user_id, file):
|
| 166 |
"""
|
| 167 |
-
Upload document to Firebase
|
| 168 |
"""
|
| 169 |
-
content = convert_file_to_md(file) # Ensure this function
|
| 170 |
if not content:
|
| 171 |
return None, "Failed to extract content from the file."
|
| 172 |
|
|
@@ -176,15 +176,41 @@ def upload_to_firebase(user_id, file):
|
|
| 176 |
# Save document to Firebase
|
| 177 |
db.child("users").child(user_id).child("KnowledgeBase").child(doc_id).set(document_data)
|
| 178 |
|
| 179 |
-
#
|
| 180 |
-
if "
|
| 181 |
-
st.session_state["
|
| 182 |
-
st.session_state["
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
-
st.sidebar.success(f"Document '{file.name}' uploaded successfully!")
|
| 185 |
return content, None
|
| 186 |
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
def fetch_trustbuilders(user_id):
|
| 190 |
"""
|
|
@@ -1060,21 +1086,23 @@ def google_search(query):
|
|
| 1060 |
|
| 1061 |
def rag_response(query):
|
| 1062 |
"""
|
| 1063 |
-
Handle
|
| 1064 |
"""
|
| 1065 |
try:
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
|
|
|
|
|
|
| 1073 |
return "No relevant information found in the knowledge base."
|
| 1074 |
|
| 1075 |
-
|
| 1076 |
prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
|
| 1077 |
-
llm = ChatOpenAI(model="gpt-
|
| 1078 |
response = llm.invoke(prompt)
|
| 1079 |
|
| 1080 |
return response.content
|
|
@@ -1083,7 +1111,6 @@ def rag_response(query):
|
|
| 1083 |
return "An error occurred during the RAG response generation process."
|
| 1084 |
|
| 1085 |
|
| 1086 |
-
|
| 1087 |
# Define tools
|
| 1088 |
@tool
|
| 1089 |
def knowledge_base_tool(query: str):
|
|
@@ -1799,54 +1826,7 @@ def load_user_memory(user_id):
|
|
| 1799 |
st.session_state["documents"] = {}
|
| 1800 |
st.session_state["vector_store"] = {}
|
| 1801 |
|
| 1802 |
-
def get_document_content(doc_name=None):
    """
    Look up the content of an uploaded document kept in Streamlit session state.

    Args:
        doc_name: Optional document name. Matching ignores case and
            surrounding whitespace. When omitted (or falsy), the last entry
            of the ``documents`` mapping is used.

    Returns:
        A ``(content, error)`` pair. On success ``error`` is ``None``; on
        failure ``content`` is ``None`` and ``error`` is a message string.
    """
    stored_docs = st.session_state.get("documents", {})
    if not stored_docs:
        return None, "No documents have been uploaded."

    if not doc_name:
        # No name requested: fall back to the last entry in the mapping,
        # which is treated as the most recently uploaded document.
        newest_entry = list(stored_docs.values())[-1]
        return newest_entry.get("content"), None

    # Normalise once so the comparison below is case/whitespace-insensitive.
    doc_name = doc_name.strip().lower()
    for entry in stored_docs.values():
        entry_name = entry.get("name", "").strip().lower()
        if entry_name == doc_name:
            return entry.get("content"), None

    # Note: the message reports the normalised (lower-cased) name.
    return None, f"Document '{doc_name}' not found."
|
| 1820 |
-
|
| 1821 |
-
|
| 1822 |
-
def handle_document_query(query):
    """
    Answer a user query against a document named inside the query text.

    The query must mention a file name ending in .pdf, .docx, .doc or .txt
    (optionally quoted). That document's content is fetched through
    ``get_document_content`` and passed, together with the query, to the
    chat model.

    Args:
        query: Raw user query text.

    Returns:
        The model's answer as a string, or a human-readable message when no
        document name is found, the lookup fails, or the model call raises.
    """
    # NOTE(review): the capture group is greedy over any non-quote
    # characters, so unquoted text preceding the file name is captured as
    # part of the name - confirm this is intended.
    name_match = re.search(r"[\"']?([^\"']+\.(pdf|docx|doc|txt))[\"']?", query, re.IGNORECASE)
    requested_name = name_match.group(1) if name_match else None
    if not requested_name:
        return "Please specify a document name in your query."

    # Resolve the document; a lookup error message is returned to the caller as-is.
    doc_content, lookup_error = get_document_content(requested_name)
    if lookup_error:
        return lookup_error

    # Build the prompt from the document content and ask the model.
    full_prompt = f"Document Content:\n{doc_content}\n\nUser Query: {query}\n\nResponse:"
    try:
        chat_model = ChatOpenAI(model="gpt-4o", temperature=0.5, api_key=openai_api_key)
        return chat_model.invoke(full_prompt).content
    except Exception as e:
        # Log the failure and return the same text to the caller.
        failure_message = f"Error generating response using the document: {e}"
        logger.error(failure_message)
        return failure_message
|
| 1849 |
-
|
| 1850 |
|
| 1851 |
if "missing_trustbucket_content" not in st.session_state:
|
| 1852 |
st.session_state["missing_trustbucket_content"] = None
|
|
|
|
| 164 |
|
| 165 |
def upload_to_firebase(user_id, file):
|
| 166 |
"""
|
| 167 |
+
Upload document to Firebase, extract content, and add it to the knowledge base.
|
| 168 |
"""
|
| 169 |
+
content = convert_file_to_md(file) # Ensure this function extracts content correctly
|
| 170 |
if not content:
|
| 171 |
return None, "Failed to extract content from the file."
|
| 172 |
|
|
|
|
| 176 |
# Save document to Firebase
|
| 177 |
db.child("users").child(user_id).child("KnowledgeBase").child(doc_id).set(document_data)
|
| 178 |
|
| 179 |
+
# Add content to the knowledge base
|
| 180 |
+
if "knowledge_base" not in st.session_state:
|
| 181 |
+
st.session_state["knowledge_base"] = []
|
| 182 |
+
st.session_state["knowledge_base"].append({"doc_id": doc_id, "content": content})
|
| 183 |
+
|
| 184 |
+
# Index the document content for semantic search
|
| 185 |
+
index_document_content(content, doc_id)
|
| 186 |
|
| 187 |
+
st.sidebar.success(f"Document '{file.name}' uploaded successfully and added to the knowledge base!")
|
| 188 |
return content, None
|
| 189 |
|
| 190 |
|
| 191 |
+
def index_document_content(doc_content, doc_id):
    """
    Index a document's content for semantic search.

    Splits ``doc_content`` into overlapping chunks, embeds the chunks with
    OpenAI embeddings, builds a FAISS vector store from them, and saves that
    store in ``st.session_state["vector_store"]`` under ``doc_id``. An
    existing store under the same ``doc_id`` is replaced.

    Args:
        doc_content: Text of the document to index.
        doc_id: Key the vector store is saved under; also recorded in every
            chunk's metadata together with the chunk's position.

    Returns:
        None. Returns early, leaving session state untouched, when there is
        no text to index.
    """
    # Empty or whitespace-only content yields no chunks, and a FAISS index
    # cannot be built from an empty list of texts, so bail out up front.
    if not doc_content or not doc_content.strip():
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_text(doc_content)
    if not texts:
        # Defensive: the splitter produced nothing usable.
        return

    # Create embeddings for each chunk; metadata ties a chunk back to its
    # source document and its position within that document.
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    doc_metadata = [{"doc_id": doc_id, "chunk_id": i} for i in range(len(texts))]
    vector_store = FAISS.from_texts(texts, embeddings, metadatas=doc_metadata)

    # Save the vector store in session state, keyed by document id.
    if "vector_store" not in st.session_state:
        st.session_state["vector_store"] = {}
    st.session_state["vector_store"][doc_id] = vector_store
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
|
| 214 |
|
| 215 |
def fetch_trustbuilders(user_id):
|
| 216 |
"""
|
|
|
|
| 1086 |
|
| 1087 |
def rag_response(query):
|
| 1088 |
"""
|
| 1089 |
+
Handle queries by searching both static and dynamically uploaded knowledge base.
|
| 1090 |
"""
|
| 1091 |
try:
|
| 1092 |
+
# Retrieve relevant chunks from the vector store
|
| 1093 |
+
results = []
|
| 1094 |
+
if "vector_store" in st.session_state:
|
| 1095 |
+
for vector_store in st.session_state["vector_store"].values():
|
| 1096 |
+
results.extend(vector_store.similarity_search(query, k=3)) # Adjust `k` for the number of results
|
| 1097 |
+
|
| 1098 |
+
# Combine results into a context
|
| 1099 |
+
context = "\n".join([result.page_content for result in results])
|
| 1100 |
+
if not context:
|
| 1101 |
return "No relevant information found in the knowledge base."
|
| 1102 |
|
| 1103 |
+
# Generate AI response with the retrieved context
|
| 1104 |
prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
|
| 1105 |
+
llm = ChatOpenAI(model="gpt-4", temperature=0.3, api_key=openai_api_key)
|
| 1106 |
response = llm.invoke(prompt)
|
| 1107 |
|
| 1108 |
return response.content
|
|
|
|
| 1111 |
return "An error occurred during the RAG response generation process."
|
| 1112 |
|
| 1113 |
|
|
|
|
| 1114 |
# Define tools
|
| 1115 |
@tool
|
| 1116 |
def knowledge_base_tool(query: str):
|
|
|
|
| 1826 |
st.session_state["documents"] = {}
|
| 1827 |
st.session_state["vector_store"] = {}
|
| 1828 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1829 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1830 |
|
| 1831 |
if "missing_trustbucket_content" not in st.session_state:
|
| 1832 |
st.session_state["missing_trustbucket_content"] = None
|