Wajahat698 committed on
Commit
5660872
·
verified ·
1 Parent(s): 570d233

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -77
app.py CHANGED
@@ -102,61 +102,92 @@ storage = firebase.storage()
102
 
103
  backend_url = "https://backend-web-05122eab4e09.herokuapp.com"
104
 
105
- def convert_pdf_to_md(file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  """
107
- Convert a PDF file to Markdown.
108
  """
109
  try:
110
- text = extract_text(file)
111
- return f"# PDF Document\n\n{text}"
112
  except Exception as e:
113
- logger.error(f"Error converting PDF to MD: {e}")
114
- return ""
115
 
116
- def convert_docx_to_md(file):
117
  """
118
- Extract text from a .docx file and return as a single string.
119
  """
120
  try:
121
- # Read the file
122
- doc = docx.Document(file)
123
- # Extract all text
124
  text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
125
- if not text.strip(): # Handle empty content
126
- raise ValueError("The document has no content.")
127
- return text
128
  except Exception as e:
129
- raise ValueError(f"Error reading .docx file: {e}")
 
130
 
131
- def convert_txt_to_md(file):
132
  """
133
- Convert a TXT file to Markdown.
134
  """
135
  try:
136
  text = file.read().decode("utf-8")
137
- return f"# Text Document\n\n{text}"
138
  except Exception as e:
139
- logger.error(f"Error converting TXT to MD: {e}")
140
- return ""
141
- def display_save_confirmation(type_saved):
 
142
  """
143
- Display a confirmation message when content is saved.
144
  """
145
- st.info(f"Content successfully saved as **{type_saved}**!")
 
 
 
 
 
 
146
 
147
- def convert_file_to_md(file):
148
  """
149
- Detect file type and convert to Markdown accordingly.
150
  """
151
- if file.type == "application/pdf":
152
- return convert_pdf_to_md(file)
153
- elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
154
- return convert_docx_to_md(file)
155
- elif file.type == "text/plain":
156
- return convert_txt_to_md(file)
157
- else:
158
- st.sidebar.warning(f"Unsupported file type: {file.type}")
159
- return ""
 
 
 
 
 
 
 
 
160
 
161
  def merge_markdown_contents(contents):
162
  """
@@ -170,7 +201,7 @@ def upload_to_firebase(user_id, file):
170
  """
171
  Upload document to Firebase, extract content, and add it to the knowledge base.
172
  """
173
- content = convert_file_to_md(file) # Ensure this function extracts content correctly
174
  if not content:
175
  return None, "Failed to extract content from the file."
176
 
@@ -200,7 +231,7 @@ def index_document_content(doc_content, doc_id):
200
  """
201
  Indexes the document content by splitting it into chunks and creating embeddings.
202
  """
203
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
204
  texts = text_splitter.split_text(doc_content)
205
 
206
  # Create embeddings for each chunk
@@ -758,7 +789,6 @@ def side():
758
 
759
  # Fetch documents from Firebase
760
 
761
-
762
  if "documents" not in st.session_state:
763
  try:
764
  docs = db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").get().val()
@@ -785,41 +815,64 @@ def side():
785
  )
786
 
787
  # File uploader
788
- uploaded_file = st.file_uploader("", type=["pdf", "docx", "txt"], key="file_uploader", label_visibility="collapsed")
789
- if st.sidebar.button("Upload", key="upload_button"):
790
- if uploaded_file:
 
 
 
 
 
 
791
  try:
792
- content, _= upload_to_firebase(st.session_state["wix_user_id"], uploaded_file)
793
- st.rerun()
794
  except Exception as e:
795
- st.sidebar.error(f"Error uploading document: {e}")
796
- else:
797
- st.sidebar.warning("Please select a file to upload.")
798
  # Display and delete functionality for documents
799
- if st.session_state["documents"]:
800
- # Select a document to view or delete
801
- selected_doc_id = st.selectbox(
802
- "Select document to view or delete",
803
- options=list(st.session_state["documents"].keys()),
804
- format_func=lambda x: st.session_state["documents"][x].get("name", f"Document {x}"),
805
- key="select_doc"
 
 
806
  )
807
-
808
- # Button to delete the selected document
809
- if st.sidebar.button("Delete ", key="delete_button"):
810
- try:
811
- # Remove the document from Firebase
812
- db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").child(selected_doc_id).remove()
813
-
814
- # Remove the document from session state
815
- fetch_documents()
816
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
817
 
 
 
 
818
 
819
- st.sidebar.success("Document deleted successfully!")
 
820
  except Exception as e:
821
- st.sidebar.error(f"Error deleting document: {e}")
822
-
 
 
823
  st.sidebar.markdown("</div>", unsafe_allow_html=True)
824
  trust_buckets = ["Any","Stability", "Development", "Relationship", "Benefit", "Vision", "Competence"]
825
 
@@ -1074,31 +1127,37 @@ def google_search(query):
1074
  return ["Error occurred during Google search"]
1075
 
1076
 
1077
-
1078
- def rag_response(query):
1079
  """
1080
- Handle queries by searching both static and dynamically uploaded knowledge bases.
1081
  """
1082
  try:
1083
- # Initialize results list
1084
  results = []
1085
 
1086
- # Search FAISS database (static knowledge base)
1087
  if "faiss_db" in st.session_state:
1088
  retrieved_docs = search_knowledge_base(query)
1089
  results.extend(retrieved_docs)
1090
-
1091
- # Search vector stores (dynamic knowledge base)
1092
- if "vector_store" in st.session_state:
1093
- for vector_store in st.session_state["vector_store"].values():
1094
- vector_store_results = vector_store.similarity_search(query, k=3) # Adjust `k` as needed
1095
- results.extend(vector_store_results)
 
 
 
 
 
 
 
 
1096
 
1097
  # Combine results into a single context
1098
  context = "\n".join([doc.page_content for doc in results])
1099
- if not context.strip():
1100
- return "No relevant information found in the knowledge base."
1101
 
 
 
1102
  # Generate AI response with the retrieved context
1103
  prompt = f"""
1104
  Context:
@@ -1115,6 +1174,7 @@ def rag_response(query):
1115
  llm = ChatOpenAI(model="gpt-4", temperature=0.2, api_key=openai_api_key)
1116
  response = llm.invoke(prompt)
1117
 
 
1118
  return response.content.strip()
1119
  except Exception as e:
1120
  logger.error(f"Error generating RAG response: {e}")
 
102
 
103
  backend_url = "https://backend-web-05122eab4e09.herokuapp.com"
104
 
105
+
106
def convert_file_to_txt(file):
    """
    Convert an uploaded file to plain text by dispatching to a type-specific converter.

    The converter is chosen from the MIME type reported in ``file.type``. That
    value comes from the uploading browser and is not reliable for every
    format (CSV files, for example, are commonly reported as
    ``application/vnd.ms-excel`` or ``application/octet-stream``), so when the
    MIME type is not recognised the extension of ``file.name`` is used as a
    fallback.

    Args:
        file: Uploaded file object exposing ``type`` (MIME string) and,
            optionally, ``name`` (original filename).

    Returns:
        Whatever the selected converter returns (extracted text, or None when
        that converter fails), or None when neither the MIME type nor the
        extension is supported. A sidebar warning is shown in the latter case.
    """
    import os  # local import: only needed for the extension fallback

    converters_by_mime = {
        "application/pdf": convert_pdf_to_txt,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": convert_docx_to_txt,
        "text/plain": convert_txt_to_txt,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": convert_excel_to_txt,
        "text/csv": convert_csv_to_txt,
    }
    converters_by_extension = {
        ".pdf": convert_pdf_to_txt,
        ".docx": convert_docx_to_txt,
        ".txt": convert_txt_to_txt,
        ".xlsx": convert_excel_to_txt,
        ".csv": convert_csv_to_txt,
    }

    mime_type = getattr(file, "type", None)
    converter = converters_by_mime.get(mime_type)

    if converter is None:
        # Recognised MIME types always win; the extension is consulted only
        # when the browser-reported type is missing or unknown.
        filename = getattr(file, "name", "") or ""
        extension = os.path.splitext(filename)[1].lower()
        converter = converters_by_extension.get(extension)

    if converter is None:
        st.sidebar.warning(f"Unsupported file type: {mime_type}")
        return None

    return converter(file)
123
+
124
def convert_pdf_to_txt(file):
    """
    Extract the text of a PDF and return it with surrounding whitespace removed.

    Extraction is delegated to ``extract_text``, which is imported elsewhere in
    this file (NOTE(review): presumably pdfminer's high-level API — confirm).

    Args:
        file: The PDF to read, passed straight through to ``extract_text``.

    Returns:
        The stripped text, or None if extraction raised. In that case the
        error is reported in the Streamlit sidebar.
    """
    try:
        extracted = extract_text(file)
        cleaned = extracted.strip()
        return cleaned
    except Exception as exc:
        # UI boundary: report the failure to the user instead of crashing the app.
        st.sidebar.error(f"Error converting PDF to TXT: {exc}")
        return None
134
 
135
def convert_docx_to_txt(file):
    """
    Extract the paragraph text of a .docx file as one newline-joined string.

    Only ``doc.paragraphs`` is read here.
    NOTE(review): text held in tables, headers or footers would need separate
    handling — confirm against python-docx if such documents are expected.

    Args:
        file: The .docx document, passed straight through to ``docx.Document``.

    Returns:
        The stripped text, or None if the document could not be read. In that
        case the error is reported in the Streamlit sidebar.
    """
    try:
        document = docx.Document(file)
        lines = [para.text for para in document.paragraphs]
        joined = "\n".join(lines)
        return joined.strip()
    except Exception as exc:
        # UI boundary: report the failure to the user instead of crashing the app.
        st.sidebar.error(f"Error converting DOCX to TXT: {exc}")
        return None
146
 
147
def convert_txt_to_txt(file):
    """
    Read a plain-text upload and return its content with surrounding whitespace removed.

    The stream is rewound first when it is seekable, so a file object that has
    already been read (for example on an earlier pass over the same upload)
    still yields its full content instead of an empty string. Bytes are decoded
    as UTF-8 with ``utf-8-sig`` so that a leading byte-order mark is dropped
    rather than leaking into the text; streams that already return ``str`` are
    used as-is.

    Args:
        file: File-like object with ``read()``; may return bytes or str.

    Returns:
        The stripped text (possibly ``""`` for an empty file), or None if the
        file could not be read or decoded. In that case the error is reported
        in the Streamlit sidebar.
    """
    try:
        seekable = getattr(file, "seekable", None)
        if callable(seekable) and seekable():
            file.seek(0)

        raw = file.read()
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" decodes plain UTF-8 identically but strips a BOM.
            text = raw.decode("utf-8-sig")
        else:
            text = raw
        return text.strip()
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        st.sidebar.error(f"Error reading TXT file: {e}")
        return None
157
+
158
def convert_excel_to_txt(file):
    """
    Render an Excel workbook as a plain-text table.

    The workbook is loaded with ``pd.read_excel`` using its default arguments
    and rendered with ``DataFrame.to_string`` without the index column.
    NOTE(review): with no ``sheet_name`` argument pandas is expected to load
    only the first sheet — confirm if multi-sheet workbooks matter.

    Args:
        file: The workbook, passed straight through to ``pd.read_excel``.

    Returns:
        The stripped table text, or None if loading or rendering raised. In
        that case the error is reported in the Streamlit sidebar.
    """
    try:
        return pd.read_excel(file).to_string(index=False).strip()
    except Exception as exc:
        # UI boundary: report the failure to the user instead of crashing the app.
        st.sidebar.error(f"Error converting Excel to TXT: {exc}")
        return None
169
 
170
def convert_csv_to_txt(file):
    """
    Render a CSV file as a plain-text table.

    The stream is rewound first when it is seekable: ``pd.read_csv`` reads
    from the current position, so a file object that was already consumed
    would otherwise fail with an "empty data" error. Inputs without a
    ``seekable()`` method (such as a path string) are passed through untouched.

    Args:
        file: File-like object or anything else ``pd.read_csv`` accepts.

    Returns:
        The table rendered by ``DataFrame.to_string(index=False)`` with
        surrounding whitespace removed, or None if parsing raised. In that
        case the error is reported in the Streamlit sidebar.
    """
    try:
        seekable = getattr(file, "seekable", None)
        if callable(seekable) and seekable():
            file.seek(0)

        df = pd.read_csv(file)
        return df.to_string(index=False).strip()
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        st.sidebar.error(f"Error converting CSV to TXT: {e}")
        return None
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
 
192
  def merge_markdown_contents(contents):
193
  """
 
201
  """
202
  Upload document to Firebase, extract content, and add it to the knowledge base.
203
  """
204
+ content = convert_file_to_txt(file) # Ensure this function extracts content correctly
205
  if not content:
206
  return None, "Failed to extract content from the file."
207
 
 
231
  """
232
  Indexes the document content by splitting it into chunks and creating embeddings.
233
  """
234
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
235
  texts = text_splitter.split_text(doc_content)
236
 
237
  # Create embeddings for each chunk
 
789
 
790
  # Fetch documents from Firebase
791
 
 
792
  if "documents" not in st.session_state:
793
  try:
794
  docs = db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").get().val()
 
815
  )
816
 
817
  # File uploader
818
+ uploaded_files = st.file_uploader(
819
+ "",
820
+ type=["pdf", "docx", "txt"],
821
+ accept_multiple_files=True,
822
+ key="file_uploader"
823
+ )
824
+
825
+ if uploaded_files:
826
+ for uploaded_file in uploaded_files:
827
  try:
828
+ upload_to_firebase(st.session_state["wix_user_id"], uploaded_file)
829
+ st.sidebar.success(f"File '{uploaded_file.name}' uploaded and converted to TXT!")
830
  except Exception as e:
831
+ st.sidebar.error(f"Error processing file '{uploaded_file.name}': {e}")
832
+
833
+
834
  # Display and delete functionality for documents
835
+ if st.session_state.get("documents"):
836
+ doc_ids = list(st.session_state["documents"].keys())
837
+ doc_options = ["None (use only main knowledge base)"] + doc_ids
838
+ selected_options = st.multiselect(
839
+ "Select documents to include in your query:",
840
+ options=doc_options,
841
+ default="None (use only main knowledge base)",
842
+ format_func=lambda x: st.session_state["documents"][x].get("name", f"Document {x}") if x != "None (use only main knowledge base)" else x,
843
+ key="select_docs"
844
  )
845
+ selected_doc_ids = [doc_id for doc_id in selected_options if doc_id != "None (use only main knowledge base)"]
846
+ st.session_state['selected_doc_ids'] = selected_doc_ids
 
 
 
 
 
 
 
847
 
848
+ if selected_doc_ids:
849
+ selected_doc_names = [st.session_state['documents'][doc_id]['name'] for doc_id in selected_doc_ids]
850
+ st.info(f"Selected Documents: {', '.join(selected_doc_names)}")
851
+ else:
852
+ st.sidebar.info("Using only the main knowledge base.")
853
+ else:
854
+
855
+ selected_doc_ids = []
856
+
857
+ # Button to delete the selected documents
858
+ if selected_doc_ids:
859
+ if st.button("Delete Selected Documents", key="delete_button"):
860
+ try:
861
+ for doc_id in selected_doc_ids:
862
+ # Remove the document from Firebase
863
+ db.child("users").child(st.session_state["wix_user_id"]).child("KnowledgeBase").child(doc_id).remove()
864
 
865
+ # Remove from session state
866
+ st.session_state["vector_store"].pop(doc_id, None)
867
+ st.session_state["documents"].pop(doc_id, None)
868
 
869
+ st.success("Selected documents deleted successfully!")
870
+ st.rerun()
871
  except Exception as e:
872
+ st.error(f"Error deleting documents: {e}")
873
+
874
+
875
+
876
  st.sidebar.markdown("</div>", unsafe_allow_html=True)
877
  trust_buckets = ["Any","Stability", "Development", "Relationship", "Benefit", "Vision", "Competence"]
878
 
 
1127
  return ["Error occurred during Google search"]
1128
 
1129
 
1130
+ def rag_response(query, selected_doc_ids=None):
 
1131
  """
1132
+ Handle queries by searching both the main knowledge base and the selected documents.
1133
  """
1134
  try:
 
1135
  results = []
1136
 
1137
+ # Search FAISS database (main knowledge base)
1138
  if "faiss_db" in st.session_state:
1139
  retrieved_docs = search_knowledge_base(query)
1140
  results.extend(retrieved_docs)
1141
+
1142
+ # If selected_doc_ids is None, try to get it from session state
1143
+ if selected_doc_ids is None:
1144
+ selected_doc_ids = st.session_state.get('selected_doc_ids', [])
1145
+
1146
+ # Search vector stores of the selected documents
1147
+ if selected_doc_ids:
1148
+ for doc_id in selected_doc_ids:
1149
+ vector_store = st.session_state.get("vector_store", {}).get(doc_id)
1150
+ if vector_store:
1151
+ vector_store_results = vector_store.similarity_search(query, k=5)
1152
+ results.extend(vector_store_results)
1153
+ else:
1154
+ st.warning(f"Vector store for document '{st.session_state['documents'][doc_id]['name']}' not found.")
1155
 
1156
  # Combine results into a single context
1157
  context = "\n".join([doc.page_content for doc in results])
 
 
1158
 
1159
+ if not context.strip():
1160
+ return "No relevant information found in the knowledge bases."
1161
  # Generate AI response with the retrieved context
1162
  prompt = f"""
1163
  Context:
 
1174
  llm = ChatOpenAI(model="gpt-4", temperature=0.2, api_key=openai_api_key)
1175
  response = llm.invoke(prompt)
1176
 
1177
+
1178
  return response.content.strip()
1179
  except Exception as e:
1180
  logger.error(f"Error generating RAG response: {e}")