"""Build two Chroma vector stores from a CSV of patient reviews.

Reviews whose rating text is "good" go to ``CHROMA_PATH_GOOD``; every other
review goes to ``CHROMA_PATH_BAD``. Both stores are rebuilt from scratch on
each run.
"""

import os
import shutil

import pandas as pd
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Paths for Chroma databases
CHROMA_PATH_GOOD = "chroma_good"
CHROMA_PATH_BAD = "chroma_bad"
DATA_PATH = "./data/wikis/patient_reviews_with_symptoms_automated.csv"

# Initialize Hugging Face embeddings (shared by both stores)
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def main():
    """Script entry point: rebuild both review stores."""
    generate_data_store()


def generate_data_store():
    """Load the reviews, split them by rating, and write one store per group."""
    documents = load_documents_with_pandas()

    # Partition in a single pass so each document is classified only once.
    good_reviews = []
    bad_reviews = []
    for doc in documents:
        target = good_reviews if is_good_review(doc) else bad_reviews
        target.append(doc)

    save_to_chroma(good_reviews, CHROMA_PATH_GOOD)
    save_to_chroma(bad_reviews, CHROMA_PATH_BAD)


def load_documents_with_pandas():
    """Read ``DATA_PATH`` and return one ``Document`` per usable CSV row.

    The ``review`` column becomes the page content and the ``rating text``
    column is stored under the ``"rating"`` metadata key.

    Returns:
        A list of ``Document`` objects. Rows with no review text are skipped.

    Raises:
        KeyError: If the ``review`` or ``rating text`` column is missing.
    """
    df = pd.read_csv(DATA_PATH, encoding="utf-8")

    # pandas reads empty cells as NaN (a float); a row without review text
    # has nothing to embed, so drop it instead of passing NaN as content.
    df = df.dropna(subset=["review"])

    return [
        Document(
            page_content=str(review),
            # Keep metadata string-typed: a missing rating becomes "".
            metadata={"rating": "" if pd.isna(rating) else rating},
        )
        for review, rating in zip(df["review"], df["rating text"])
    ]


def is_good_review(doc: Document) -> bool:
    """Return True when the document's rating metadata is "good".

    The comparison ignores case and surrounding whitespace. A missing or
    non-string rating (for example NaN from an empty CSV cell) counts as
    not good instead of raising.
    """
    rating = doc.metadata.get("rating", "")
    if not isinstance(rating, str):
        return False
    return rating.strip().lower() == "good"


def save_to_chroma(documents: list[Document], chroma_path: str):
    """Replace the Chroma store at ``chroma_path`` with ``documents``.

    Args:
        documents: Documents to embed with ``hf_embeddings`` and store.
        chroma_path: Directory of the persisted store; deleted first if it
            already exists.
    """
    # Clear out the existing database, if any
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)

    # Nothing to embed: skip store creation rather than handing the vector
    # store an empty batch.
    if not documents:
        print(f"No documents to save to {chroma_path}; skipping.")
        return

    # Create a new Chroma DB from the documents
    db = Chroma.from_documents(
        documents,
        hf_embeddings,
        persist_directory=chroma_path,
    )

    # NOTE(review): explicit persist() is only needed by older Chroma
    # releases and is not present on every wrapper version — call it only
    # when the store object actually exposes it.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()

    print(f"Saved {len(documents)} documents to {chroma_path}.")


if __name__ == "__main__":
    main()