# File size: 2,070 Bytes | ad09253
import pandas as pd
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import os
import shutil
# Output directories for the two Chroma databases: one receives the reviews
# classified as good, the other receives every remaining review.
CHROMA_PATH_GOOD = "chroma_good"
CHROMA_PATH_BAD = "chroma_bad"
# Input CSV; the loader reads its "review" and "rating text" columns.
DATA_PATH = "./data/wikis/patient_reviews_with_symptoms_automated.csv"
# Shared Hugging Face embedding model used when writing both Chroma databases.
# NOTE: this is constructed at import time, not lazily.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
def main():
    """Script entry point: delegate to generate_data_store()."""
    generate_data_store()
def generate_data_store():
    """Load all reviews, split them by rating, and write one Chroma DB per group.

    Reviews for which is_good_review() is true go to CHROMA_PATH_GOOD;
    all other reviews go to CHROMA_PATH_BAD.
    """
    all_docs = load_documents_with_pandas()

    # Partition the documents into the two rating buckets, keeping the
    # original order inside each bucket.
    positive = []
    negative = []
    for review_doc in all_docs:
        bucket = positive if is_good_review(review_doc) else negative
        bucket.append(review_doc)

    # Persist each bucket to its own Chroma directory (good first, then bad).
    save_to_chroma(positive, CHROMA_PATH_GOOD)
    save_to_chroma(negative, CHROMA_PATH_BAD)
def load_documents_with_pandas() -> list[Document]:
    """Read the review CSV at DATA_PATH and wrap each row in a Document.

    Returns:
        One Document per CSV row that has review text. ``page_content`` is
        the value of the ``review`` column and ``metadata["rating"]`` is the
        value of the ``rating text`` column.

    Raises:
        KeyError: if the CSV has no ``review`` or ``rating text`` column.
    """
    df = pd.read_csv(DATA_PATH, encoding="utf-8")

    # pandas reads an empty CSV cell as NaN (a float). A review without text
    # has nothing to embed, so such rows are skipped rather than being passed
    # on as a non-string page_content.
    df = df.dropna(subset=["review"])

    # Iterate the two needed columns directly: iterrows() would build a full
    # Series for every row and can change value dtypes along the way.
    return [
        Document(page_content=review, metadata={"rating": rating})
        for review, rating in zip(df["review"], df["rating text"])
    ]
def is_good_review(doc: Document) -> bool:
    """Return True when the review's ``rating`` metadata is the label "good".

    The comparison is case-insensitive. A missing ``rating`` key, or a rating
    that is not a string, is treated as "not good".

    Args:
        doc: Document whose ``metadata`` mapping may contain a ``rating`` entry.

    Returns:
        True only if the rating is a string equal to "good" ignoring case.
    """
    rating = doc.metadata.get("rating", "")
    # A rating can be a non-string value, e.g. the float NaN pandas produces
    # for an empty CSV cell, or None. Those have no .lower(), so classify
    # them as not good instead of raising AttributeError.
    if not isinstance(rating, str):
        return False
    return rating.lower() == "good"
def save_to_chroma(documents: list[Document], chroma_path: str):
    """Replace whatever exists at ``chroma_path`` with a Chroma DB of ``documents``.

    Args:
        documents: Documents to embed (with the module-level ``hf_embeddings``)
            and store.
        chroma_path: Directory used as the Chroma persist directory; it is
            removed first if it already exists.
    """
    # Start from a clean slate so entries from a previous run do not linger.
    stale_store_present = os.path.exists(chroma_path)
    if stale_store_present:
        shutil.rmtree(chroma_path)

    # Embed the documents and build the new database in the target directory.
    vector_store = Chroma.from_documents(
        documents,
        hf_embeddings,
        persist_directory=chroma_path,
    )
    vector_store.persist()

    doc_count = len(documents)
    print(f"Saved {doc_count} documents to {chroma_path}.")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|