Spaces:

Soha85
/

mydemoapp

Sleeping

App Files Files Community

Soha85 committed on Nov 17, 2025

Commit

91cc1cc

verified ·

1 Parent(s): c00b3ab

fixing loading files

Browse files

Files changed (1) hide show

src/streamlit_app.py +34 -31

src/streamlit_app.py CHANGED Viewed

@@ -11,12 +11,20 @@ from sentence_transformers import CrossEncoder
 import pickle
 import chromadb
 from chromadb.utils import embedding_functions
 # Global variables
-collected_file = "collected_data.txt"
-vector_db_file = "vector_db.faiss"
-embedding_file = "embeddings.npy"
-def bert_encode(texts, batch_size=300, device="cpu"):
     model.to(device)
     all_embeddings = []
     with torch.no_grad():
@@ -35,28 +43,20 @@ tab1, tab2, tab3 = st.tabs(["Collect Data", "DB Formation", "Inquiry Vector DB"]
 with tab1:
     st.header("Collect Data")
-    uploaded_files = st.file_uploader(
-        "Upload your .txt files",
-        type=["txt"],
-        accept_multiple_files=True
-    )
     if st.button("Collect") and uploaded_files:
         all_texts = []
-        # Hugging Face-safe path
-        collected_file_path = os.path.join("data", collected_file)
-        os.makedirs("data", exist_ok=True)
         for uploaded_file in uploaded_files:
             content = uploaded_file.read().decode("utf-8", errors="ignore")
             all_texts.append(content)
         with open(collected_file_path, "w", encoding="utf-8") as f:
             f.write("\n".join(all_texts))
         st.success(f"Collected {len(uploaded_files)} files successfully!")
 # Tab 2: DB Formation
 with tab2:
     st.header("Vector DB Formation")
@@ -66,8 +66,7 @@ with tab2:
     index_choice = st.selectbox("Vector DB", ["FAISS","ChromaDB"])
     embeddings = None
     if st.button("Create DB"):
-        with open("data/collected_data.txt", "r", encoding="utf-8") as f:
-        #with open(collected_file, "r", encoding="utf-8") as f:
             text_data = f.read()
         chunks = [text_data[i:i+chunk_size] for i in range(0, len(text_data), chunk_size-overlap)]
@@ -81,7 +80,7 @@ with tab2:
             model_name = "bert-base-uncased"
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             model = AutoModel.from_pretrained(model_name)
-            embeddings = bert_encode(chunks)
         if index_choice == "FAISS":
             dim = len(embeddings[0])
@@ -90,8 +89,12 @@ with tab2:
             faiss.write_index(index, vector_db_file)
             np.save(embedding_file, embeddings)
         else:  # ChromaDB
-            client = chromadb.PersistentClient(path="chroma_db")
-            client.delete_collection("rag_collection")
             collection = client.get_or_create_collection("rag_collection")
             collection.add(
                 documents=chunks,
@@ -100,11 +103,11 @@ with tab2:
             )
-        with open("chunks.pkl", "wb") as f:
             pickle.dump(chunks, f)
-        with open("embedding_choice.txt", "w") as f:
             f.write(embedding_choice)
-        with open("index_choice.txt", "w") as f:
             f.write(index_choice)
         st.write(f"Saved embeddings with shape: {embeddings.shape}")
@@ -120,11 +123,11 @@ with tab3:
     if st.button("Search"):
         # Load chunks and embedding choice and index choice
-        with open("chunks.pkl", "rb") as f:
             chunks = pickle.load(f)
-        with open("embedding_choice.txt", "r") as f:
             embedding_choice = f.read().strip()
-        with open("index_choice.txt", "r") as f:
             index_choice = f.read().strip()
         #display embedding choice and index choice
         st.header(f"Using Embedding: {embedding_choice}, Index: {index_choice}")
@@ -140,7 +143,7 @@ with tab3:
             model_name = "bert-base-uncased"
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             model = AutoModel.from_pretrained(model_name)
-            query_emb = bert_encode([user_query])
         if index_choice == "ChromaDB":
             # Display the similarity measure ChromaDB uses, the range of its scores, and which values indicate greater similarity
@@ -149,7 +152,7 @@ with tab3:
             "Cosine similarity scores range from -1 to 1, where 1 indicates perfect similarity, 0 indicates no similarity, and -1 indicates " \
             "perfect dissimilarity.")
-            client = chromadb.PersistentClient(path="chroma_db")
             collection = client.get_or_create_collection("rag_collection")
             results = collection.query(
                 query_embeddings=query_emb.tolist(),

 import pickle
 import chromadb
 from chromadb.utils import embedding_functions
+BASE_DIR = "/tmp/rag_app"
+os.makedirs(BASE_DIR, exist_ok=True)
 # Global variables
+collected_file = f"{BASE_DIR}/collected_data.txt"
+vector_db_file = f"{BASE_DIR}/vector_db.faiss"
+embedding_file = f"{BASE_DIR}/embeddings.npy"
+chunks_file = f"{BASE_DIR}/chunks.pkl"
+emb_choice_file = f"{BASE_DIR}/embedding_choice.txt"
+index_choice_file = f"{BASE_DIR}/index_choice.txt"
+chroma_dir = f"{BASE_DIR}/chroma_db"
+os.makedirs(chroma_dir, exist_ok=True)
+def bert_encode(model,tokenizer,texts, batch_size=300, device="cpu"):
     model.to(device)
     all_embeddings = []
     with torch.no_grad():
 with tab1:
     st.header("Collect Data")
+    uploaded_files = st.file_uploader("Upload your .txt files",type=["txt"], accept_multiple_files=True)
+    collected_file_path = collected_file
     if st.button("Collect") and uploaded_files:
         all_texts = []
         for uploaded_file in uploaded_files:
             content = uploaded_file.read().decode("utf-8", errors="ignore")
             all_texts.append(content)
         with open(collected_file_path, "w", encoding="utf-8") as f:
             f.write("\n".join(all_texts))
         st.success(f"Collected {len(uploaded_files)} files successfully!")
 # Tab 2: DB Formation
 with tab2:
     st.header("Vector DB Formation")
     index_choice = st.selectbox("Vector DB", ["FAISS","ChromaDB"])
     embeddings = None
     if st.button("Create DB"):
+        with open(collected_file, "r", encoding="utf-8") as f:
             text_data = f.read()
         chunks = [text_data[i:i+chunk_size] for i in range(0, len(text_data), chunk_size-overlap)]
             model_name = "bert-base-uncased"
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             model = AutoModel.from_pretrained(model_name)
+            embeddings = bert_encode(model,tokenizer,chunks)
         if index_choice == "FAISS":
             dim = len(embeddings[0])
             faiss.write_index(index, vector_db_file)
             np.save(embedding_file, embeddings)
         else:  # ChromaDB
+#            client = chromadb.PersistentClient(path="chroma_db")
+            client = chromadb.PersistentClient(path=chroma_dir)
+            try:
+                client.delete_collection("rag_collection")
+            except:
+                pass
             collection = client.get_or_create_collection("rag_collection")
             collection.add(
                 documents=chunks,
             )
+        with open(chunks_file, "wb") as f:
             pickle.dump(chunks, f)
+        with open(emb_choice_file, "w") as f:
             f.write(embedding_choice)
+        with open(index_choice_file, "w") as f:
             f.write(index_choice)
         st.write(f"Saved embeddings with shape: {embeddings.shape}")
     if st.button("Search"):
         # Load chunks and embedding choice and index choice
+        with open(chunks_file, "rb") as f:
             chunks = pickle.load(f)
+        with open(emb_choice_file, "r") as f:
             embedding_choice = f.read().strip()
+        with open(index_choice_file, "r") as f:
             index_choice = f.read().strip()
         #display embedding choice and index choice
         st.header(f"Using Embedding: {embedding_choice}, Index: {index_choice}")
             model_name = "bert-base-uncased"
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             model = AutoModel.from_pretrained(model_name)
+            query_emb = bert_encode(model,tokenizer,[user_query])
         if index_choice == "ChromaDB":
             # Display the similarity measure ChromaDB uses, the range of its scores, and which values indicate greater similarity
             "Cosine similarity scores range from -1 to 1, where 1 indicates perfect similarity, 0 indicates no similarity, and -1 indicates " \
             "perfect dissimilarity.")
+            client = chromadb.PersistentClient(path=chroma_dir)
             collection = client.get_or_create_collection("rag_collection")
             results = collection.query(
                 query_embeddings=query_emb.tolist(),