Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -102,61 +102,92 @@ storage = firebase.storage()
|
|
| 102 |
|
| 103 |
backend_url = "https://backend-web-05122eab4e09.herokuapp.com"
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
"""
|
| 107 |
-
Convert a PDF file to
|
| 108 |
"""
|
| 109 |
try:
|
| 110 |
-
text = extract_text(file)
|
| 111 |
-
return
|
| 112 |
except Exception as e:
|
| 113 |
-
|
| 114 |
-
return
|
| 115 |
|
| 116 |
-
def
|
| 117 |
"""
|
| 118 |
-
Extract text from a .docx file
|
| 119 |
"""
|
| 120 |
try:
|
| 121 |
-
|
| 122 |
-
doc = docx.Document(file)
|
| 123 |
-
# Extract all text
|
| 124 |
text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
| 125 |
-
|
| 126 |
-
raise ValueError("The document has no content.")
|
| 127 |
-
return text
|
| 128 |
except Exception as e:
|
| 129 |
-
|
|
|
|
| 130 |
|
| 131 |
-
def
|
| 132 |
"""
|
| 133 |
-
|
| 134 |
"""
|
| 135 |
try:
|
| 136 |
text = file.read().decode("utf-8")
|
| 137 |
-
return
|
| 138 |
except Exception as e:
|
| 139 |
-
|
| 140 |
-
return
|
| 141 |
-
|
|
|
|
| 142 |
"""
|
| 143 |
-
|
| 144 |
"""
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
-
def
|
| 148 |
"""
|
| 149 |
-
|
| 150 |
"""
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
return
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
def merge_markdown_contents(contents):
|
| 162 |
"""
|
|
@@ -170,7 +201,7 @@ def upload_to_firebase(user_id, file):
|
|
| 170 |
"""
|
| 171 |
Upload document to Firebase, extract content, and add it to the knowledge base.
|
| 172 |
"""
|
| 173 |
-
content =
|
| 174 |
if not content:
|
| 175 |
return None, "Failed to extract content from the file."
|
| 176 |
|
|
@@ -200,7 +231,7 @@ def index_document_content(doc_content, doc_id):
|
|
| 200 |
"""
|
| 201 |
Indexes the document content by splitting it into chunks and creating embeddings.
|
| 202 |
"""
|
| 203 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=
|
| 204 |
texts = text_splitter.split_text(doc_content)
|
| 205 |
|
| 206 |
# Create embeddings for each chunk
|
|
@@ -758,7 +789,6 @@ def side():
|
|
| 758 |
|
| 759 |
# Fetch documents from Firebase
|
| 760 |
|
| 761 |
-
|
| 762 |
if "documents" not in st.session_state:
|
| 763 |
try:
|
| 764 |
docs = db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").get().val()
|
|
@@ -785,41 +815,64 @@ def side():
|
|
| 785 |
)
|
| 786 |
|
| 787 |
# File uploader
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 791 |
try:
|
| 792 |
-
|
| 793 |
-
st.
|
| 794 |
except Exception as e:
|
| 795 |
-
st.sidebar.error(f"Error
|
| 796 |
-
|
| 797 |
-
|
| 798 |
# Display and delete functionality for documents
|
| 799 |
-
if st.session_state
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
|
|
|
|
|
|
| 806 |
)
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
if st.sidebar.button("Delete ", key="delete_button"):
|
| 810 |
-
try:
|
| 811 |
-
# Remove the document from Firebase
|
| 812 |
-
db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").child(selected_doc_id).remove()
|
| 813 |
-
|
| 814 |
-
# Remove the document from session state
|
| 815 |
-
fetch_documents()
|
| 816 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
-
st.
|
|
|
|
| 820 |
except Exception as e:
|
| 821 |
-
st.
|
| 822 |
-
|
|
|
|
|
|
|
| 823 |
st.sidebar.markdown("</div>", unsafe_allow_html=True)
|
| 824 |
trust_buckets = ["Any","Stability", "Development", "Relationship", "Benefit", "Vision", "Competence"]
|
| 825 |
|
|
@@ -1074,31 +1127,37 @@ def google_search(query):
|
|
| 1074 |
return ["Error occurred during Google search"]
|
| 1075 |
|
| 1076 |
|
| 1077 |
-
|
| 1078 |
-
def rag_response(query):
|
| 1079 |
"""
|
| 1080 |
-
Handle queries by searching both
|
| 1081 |
"""
|
| 1082 |
try:
|
| 1083 |
-
# Initialize results list
|
| 1084 |
results = []
|
| 1085 |
|
| 1086 |
-
# Search FAISS database (
|
| 1087 |
if "faiss_db" in st.session_state:
|
| 1088 |
retrieved_docs = search_knowledge_base(query)
|
| 1089 |
results.extend(retrieved_docs)
|
| 1090 |
-
|
| 1091 |
-
#
|
| 1092 |
-
if
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1096 |
|
| 1097 |
# Combine results into a single context
|
| 1098 |
context = "\n".join([doc.page_content for doc in results])
|
| 1099 |
-
if not context.strip():
|
| 1100 |
-
return "No relevant information found in the knowledge base."
|
| 1101 |
|
|
|
|
|
|
|
| 1102 |
# Generate AI response with the retrieved context
|
| 1103 |
prompt = f"""
|
| 1104 |
Context:
|
|
@@ -1115,6 +1174,7 @@ def rag_response(query):
|
|
| 1115 |
llm = ChatOpenAI(model="gpt-4", temperature=0.2, api_key=openai_api_key)
|
| 1116 |
response = llm.invoke(prompt)
|
| 1117 |
|
|
|
|
| 1118 |
return response.content.strip()
|
| 1119 |
except Exception as e:
|
| 1120 |
logger.error(f"Error generating RAG response: {e}")
|
|
|
|
| 102 |
|
| 103 |
backend_url = "https://backend-web-05122eab4e09.herokuapp.com"
|
| 104 |
|
| 105 |
+
|
| 106 |
+
def convert_file_to_txt(file):
    """
    Route an uploaded file to the converter matching its MIME type.

    Args:
        file: Uploaded file object exposing a ``type`` attribute that holds
            its MIME type (presumably a Streamlit ``UploadedFile`` -- confirm
            against the caller).

    Returns:
        The value returned by the selected converter, or None when the MIME
        type has no converter (a sidebar warning is shown in that case).
    """
    # Built per call so the converter names are looked up at call time,
    # once every converter in this module has been defined.
    converters = {
        "application/pdf": convert_pdf_to_txt,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": convert_docx_to_txt,
        "text/plain": convert_txt_to_txt,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": convert_excel_to_txt,
        "text/csv": convert_csv_to_txt,
    }

    handler = converters.get(file.type)
    if handler is None:
        st.sidebar.warning(f"Unsupported file type: {file.type}")
        return None
    return handler(file)
|
| 123 |
+
|
| 124 |
+
def convert_pdf_to_txt(file):
    """
    Extract the text of a PDF upload.

    Args:
        file: PDF file object handed straight to ``extract_text``.

    Returns:
        The extracted text with surrounding whitespace removed, or None if
        extraction raised (the error is reported in the sidebar).
    """
    try:
        # NOTE(review): extract_text is imported elsewhere in the file
        # (presumably pdfminer) -- PyPDF2 or pdfplumber may be swapped in if
        # better accuracy is needed.
        return extract_text(file).strip()
    except Exception as e:
        st.sidebar.error(f"Error converting PDF to TXT: {e}")
        return None
|
| 134 |
|
| 135 |
+
def convert_docx_to_txt(file):
    """
    Pull the paragraph text out of a .docx upload.

    Args:
        file: .docx file object passed to ``docx.Document``.

    Returns:
        All paragraph texts joined by newlines and stripped of surrounding
        whitespace, or None if the document could not be read (the error is
        reported in the sidebar).
    """
    try:
        document = docx.Document(file)
        # Only document.paragraphs is read; nothing else in the document
        # contributes to the output.
        joined = "\n".join(para.text for para in document.paragraphs)
        return joined.strip()
    except Exception as e:
        st.sidebar.error(f"Error converting DOCX to TXT: {e}")
        return None
|
| 146 |
|
| 147 |
+
def convert_txt_to_txt(file):
    """
    Read a plain-text upload and return its content as a string.

    Args:
        file: Binary file-like object whose ``read()`` returns bytes.

    Returns:
        The decoded text with surrounding whitespace removed, or None if the
        bytes could not be read or decoded (the error is reported in the
        sidebar).
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 unchanged but also consumes a
        # leading byte-order mark. Plain "utf-8" would keep U+FEFF at the
        # start of the text, and str.strip() does not remove it because it
        # is not whitespace.
        text = file.read().decode("utf-8-sig")
        return text.strip()
    except Exception as e:
        st.sidebar.error(f"Error reading TXT file: {e}")
        return None
|
| 157 |
+
|
| 158 |
+
def convert_excel_to_txt(file):
    """
    Render an Excel upload as plain text.

    Args:
        file: Excel file object passed to ``pd.read_excel`` with default
            options.

    Returns:
        The resulting DataFrame printed without its index and stripped of
        surrounding whitespace, or None if reading or rendering raised (the
        error is reported in the sidebar).
    """
    try:
        return pd.read_excel(file).to_string(index=False).strip()
    except Exception as e:
        st.sidebar.error(f"Error converting Excel to TXT: {e}")
        return None
|
| 169 |
|
| 170 |
+
def convert_csv_to_txt(file):
    """
    Render a CSV upload as plain text.

    Args:
        file: CSV file object passed to ``pd.read_csv`` with default options.

    Returns:
        The parsed DataFrame printed without its index and stripped of
        surrounding whitespace, or None if parsing or rendering raised (the
        error is reported in the sidebar).
    """
    try:
        return pd.read_csv(file).to_string(index=False).strip()
    except Exception as e:
        st.sidebar.error(f"Error converting CSV to TXT: {e}")
        return None
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
|
| 191 |
|
| 192 |
def merge_markdown_contents(contents):
|
| 193 |
"""
|
|
|
|
| 201 |
"""
|
| 202 |
Upload document to Firebase, extract content, and add it to the knowledge base.
|
| 203 |
"""
|
| 204 |
+
content = convert_file_to_txt(file) # Ensure this function extracts content correctly
|
| 205 |
if not content:
|
| 206 |
return None, "Failed to extract content from the file."
|
| 207 |
|
|
|
|
| 231 |
"""
|
| 232 |
Indexes the document content by splitting it into chunks and creating embeddings.
|
| 233 |
"""
|
| 234 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
|
| 235 |
texts = text_splitter.split_text(doc_content)
|
| 236 |
|
| 237 |
# Create embeddings for each chunk
|
|
|
|
| 789 |
|
| 790 |
# Fetch documents from Firebase
|
| 791 |
|
|
|
|
| 792 |
if "documents" not in st.session_state:
|
| 793 |
try:
|
| 794 |
docs = db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").get().val()
|
|
|
|
| 815 |
)
|
| 816 |
|
| 817 |
# File uploader
|
| 818 |
+
uploaded_files = st.file_uploader(
|
| 819 |
+
"",
|
| 820 |
+
type=["pdf", "docx", "txt"],
|
| 821 |
+
accept_multiple_files=True,
|
| 822 |
+
key="file_uploader"
|
| 823 |
+
)
|
| 824 |
+
|
| 825 |
+
if uploaded_files:
|
| 826 |
+
for uploaded_file in uploaded_files:
|
| 827 |
try:
|
| 828 |
+
upload_to_firebase(st.session_state["wix_user_id"], uploaded_file)
|
| 829 |
+
st.sidebar.success(f"File '{uploaded_file.name}' uploaded and converted to TXT!")
|
| 830 |
except Exception as e:
|
| 831 |
+
st.sidebar.error(f"Error processing file '{uploaded_file.name}': {e}")
|
| 832 |
+
|
| 833 |
+
|
| 834 |
# Display and delete functionality for documents
|
| 835 |
+
if st.session_state.get("documents"):
|
| 836 |
+
doc_ids = list(st.session_state["documents"].keys())
|
| 837 |
+
doc_options = ["None (use only main knowledge base)"] + doc_ids
|
| 838 |
+
selected_options = st.multiselect(
|
| 839 |
+
"Select documents to include in your query:",
|
| 840 |
+
options=doc_options,
|
| 841 |
+
default="None (use only main knowledge base)",
|
| 842 |
+
format_func=lambda x: st.session_state["documents"][x].get("name", f"Document {x}") if x != "None (use only main knowledge base)" else x,
|
| 843 |
+
key="select_docs"
|
| 844 |
)
|
| 845 |
+
selected_doc_ids = [doc_id for doc_id in selected_options if doc_id != "None (use only main knowledge base)"]
|
| 846 |
+
st.session_state['selected_doc_ids'] = selected_doc_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 847 |
|
| 848 |
+
if selected_doc_ids:
|
| 849 |
+
selected_doc_names = [st.session_state['documents'][doc_id]['name'] for doc_id in selected_doc_ids]
|
| 850 |
+
st.info(f"Selected Documents: {', '.join(selected_doc_names)}")
|
| 851 |
+
else:
|
| 852 |
+
st.sidebar.info("Using only the main knowledge base.")
|
| 853 |
+
else:
|
| 854 |
+
|
| 855 |
+
selected_doc_ids = []
|
| 856 |
+
|
| 857 |
+
# Button to delete the selected documents
|
| 858 |
+
if selected_doc_ids:
|
| 859 |
+
if st.button("Delete Selected Documents", key="delete_button"):
|
| 860 |
+
try:
|
| 861 |
+
for doc_id in selected_doc_ids:
|
| 862 |
+
# Remove the document from Firebase
|
| 863 |
+
db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").child(doc_id).remove()
|
| 864 |
|
| 865 |
+
# Remove from session state
|
| 866 |
+
st.session_state["vector_store"].pop(doc_id, None)
|
| 867 |
+
st.session_state["documents"].pop(doc_id, None)
|
| 868 |
|
| 869 |
+
st.success("Selected documents deleted successfully!")
|
| 870 |
+
st.rerun()
|
| 871 |
except Exception as e:
|
| 872 |
+
st.error(f"Error deleting documents: {e}")
|
| 873 |
+
|
| 874 |
+
|
| 875 |
+
|
| 876 |
st.sidebar.markdown("</div>", unsafe_allow_html=True)
|
| 877 |
trust_buckets = ["Any","Stability", "Development", "Relationship", "Benefit", "Vision", "Competence"]
|
| 878 |
|
|
|
|
| 1127 |
return ["Error occurred during Google search"]
|
| 1128 |
|
| 1129 |
|
| 1130 |
+
def rag_response(query, selected_doc_ids=None):
|
|
|
|
| 1131 |
"""
|
| 1132 |
+
Handle queries by searching both the main knowledge base and the selected documents.
|
| 1133 |
"""
|
| 1134 |
try:
|
|
|
|
| 1135 |
results = []
|
| 1136 |
|
| 1137 |
+
# Search FAISS database (main knowledge base)
|
| 1138 |
if "faiss_db" in st.session_state:
|
| 1139 |
retrieved_docs = search_knowledge_base(query)
|
| 1140 |
results.extend(retrieved_docs)
|
| 1141 |
+
|
| 1142 |
+
# If selected_doc_ids is None, try to get it from session state
|
| 1143 |
+
if selected_doc_ids is None:
|
| 1144 |
+
selected_doc_ids = st.session_state.get('selected_doc_ids', [])
|
| 1145 |
+
|
| 1146 |
+
# Search vector stores of the selected documents
|
| 1147 |
+
if selected_doc_ids:
|
| 1148 |
+
for doc_id in selected_doc_ids:
|
| 1149 |
+
vector_store = st.session_state.get("vector_store", {}).get(doc_id)
|
| 1150 |
+
if vector_store:
|
| 1151 |
+
vector_store_results = vector_store.similarity_search(query, k=5)
|
| 1152 |
+
results.extend(vector_store_results)
|
| 1153 |
+
else:
|
| 1154 |
+
st.warning(f"Vector store for document '{st.session_state['documents'][doc_id]['name']}' not found.")
|
| 1155 |
|
| 1156 |
# Combine results into a single context
|
| 1157 |
context = "\n".join([doc.page_content for doc in results])
|
|
|
|
|
|
|
| 1158 |
|
| 1159 |
+
if not context.strip():
|
| 1160 |
+
return "No relevant information found in the knowledge bases."
|
| 1161 |
# Generate AI response with the retrieved context
|
| 1162 |
prompt = f"""
|
| 1163 |
Context:
|
|
|
|
| 1174 |
llm = ChatOpenAI(model="gpt-4", temperature=0.2, api_key=openai_api_key)
|
| 1175 |
response = llm.invoke(prompt)
|
| 1176 |
|
| 1177 |
+
|
| 1178 |
return response.content.strip()
|
| 1179 |
except Exception as e:
|
| 1180 |
logger.error(f"Error generating RAG response: {e}")
|