| import pandas as pd |
| from langchain.schema import Document |
| from langchain_community.vectorstores import Chroma |
| from langchain_huggingface import HuggingFaceEmbeddings |
| import os |
| import shutil |
|
|
|
|
| |
# Directories the two Chroma stores are persisted to: one for reviews
# classified as good, one for all remaining reviews.
CHROMA_PATH_GOOD = "chroma_good"
CHROMA_PATH_BAD = "chroma_bad"
# CSV file the reviews are loaded from.
DATA_PATH = "./data/wikis/patient_reviews_with_symptoms_automated.csv"


# Embedding model handed to Chroma when a store is built; it is constructed
# once, at import time, and shared by every store this script writes.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
|
def main():
    """Script entry point; delegates all work to ``generate_data_store``."""
    generate_data_store()
|
|
def generate_data_store():
    """Load every review, split the set by rating, and persist each half.

    Reviews for which ``is_good_review`` returns true are written to
    ``CHROMA_PATH_GOOD``; all other reviews are written to
    ``CHROMA_PATH_BAD``.  The good store is saved first.
    """
    reviews = load_documents_with_pandas()

    # Partition in one pass, keeping the original order inside each group.
    positive, negative = [], []
    for review in reviews:
        bucket = positive if is_good_review(review) else negative
        bucket.append(review)

    save_to_chroma(positive, CHROMA_PATH_GOOD)
    save_to_chroma(negative, CHROMA_PATH_BAD)
|
|
def load_documents_with_pandas():
    """Read the review CSV at ``DATA_PATH`` and wrap each usable row in a ``Document``.

    The ``review`` column becomes the page content and the ``rating text``
    column is stored under the ``"rating"`` metadata key.  Rows missing either
    value are skipped: pandas represents an empty CSV cell as a float NaN,
    which is neither text that can be embedded nor a rating that can be
    classified.

    Returns:
        list[Document]: one document per row that has both a review and a
        rating, in file order.

    Raises:
        KeyError: if the CSV has no ``review`` or ``rating text`` column.
    """
    df = pd.read_csv(DATA_PATH, encoding="utf-8")

    complete = df.dropna(subset=["review", "rating text"])
    skipped = len(df) - len(complete)
    if skipped:
        # Report dropped rows so the data loss is never silent.
        print(f"Skipped {skipped} rows with a missing review or rating.")

    # Zip the two columns directly instead of using iterrows(), which builds a
    # Series for every row.  str() guards against pandas inferring a non-text
    # dtype for a column (for example one that happens to be all numeric).
    return [
        Document(page_content=str(review), metadata={"rating": str(rating)})
        for review, rating in zip(complete["review"], complete["rating text"])
    ]
|
|
def is_good_review(doc: Document) -> bool:
    """Return True when the document's ``"rating"`` metadata is the label "good".

    The comparison ignores letter case and surrounding whitespace.  A missing
    rating, or one that is not a string (for example ``None`` or the float NaN
    pandas produces for an empty CSV cell), is treated as not good instead of
    raising ``AttributeError``.

    Args:
        doc: Any object exposing a ``metadata`` mapping.

    Returns:
        True only for a textual rating equal to "good" after normalisation.
    """
    rating = doc.metadata.get("rating")
    # .get()'s default only covers an absent key; a present-but-non-text value
    # still has to be rejected before calling string methods on it.
    if not isinstance(rating, str):
        return False
    return rating.strip().lower() == "good"
|
|
def save_to_chroma(documents: list[Document], chroma_path: str):
    """Rebuild the Chroma store at *chroma_path* from *documents*.

    Any existing directory at *chroma_path* is deleted first, so the store
    only ever reflects the current input.  When *documents* is empty no store
    is created: there is nothing to embed, and an empty batch may be rejected
    by the Chroma client (TODO confirm against the installed version).

    Args:
        documents: Documents to embed with ``hf_embeddings`` and index.
        chroma_path: Directory the store is persisted to.
    """
    # Start from a clean slate; a stale store must not survive a rebuild.
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)

    if not documents:
        print(f"No documents to save to {chroma_path}; skipping.")
        return

    db = Chroma.from_documents(
        documents, hf_embeddings, persist_directory=chroma_path
    )
    # NOTE(review): newer Chroma releases reportedly persist automatically and
    # deprecate this call - confirm before removing it.
    db.persist()
    print(f"Saved {len(documents)} documents to {chroma_path}.")
|
|
# Run the store build only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|
|