File size: 2,070 Bytes
ad09253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import os
import shutil


# Directories used as Chroma persist locations: one store receives the
# reviews classified as good, the other receives all remaining reviews.
CHROMA_PATH_GOOD = "chroma_good"
CHROMA_PATH_BAD = "chroma_bad"
# CSV file the reviews are read from; the loader reads its 'review' and
# 'rating text' columns.
DATA_PATH = "./data/wikis/patient_reviews_with_symptoms_automated.csv"

# Shared Hugging Face embedding function handed to Chroma when a store is
# built. NOTE: it is constructed at import time, as a module-level side effect.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def main():
    """Script entry point: rebuild the Chroma stores from the review CSV."""
    generate_data_store()

def generate_data_store():
    """Load every review, split by rating, and persist each group to Chroma.

    Reviews for which ``is_good_review`` returns True are written to
    ``CHROMA_PATH_GOOD``; all other reviews are written to ``CHROMA_PATH_BAD``.
    """
    all_docs = load_documents_with_pandas()

    # Partition in one pass, preserving the original document order
    # within each group.
    positive, negative = [], []
    for entry in all_docs:
        bucket = positive if is_good_review(entry) else negative
        bucket.append(entry)

    # Good reviews are stored first, then the remainder.
    save_to_chroma(positive, CHROMA_PATH_GOOD)
    save_to_chroma(negative, CHROMA_PATH_BAD)

def load_documents_with_pandas():
    """Read the review CSV at ``DATA_PATH`` and return one Document per review.

    Each Document carries the 'review' column as ``page_content`` and the
    'rating text' column under the ``"rating"`` metadata key.

    Rows whose review cell is empty are skipped because there is no text to
    embed. A missing rating is stored as an empty string so that downstream
    string handling never receives a float NaN.

    Returns:
        list[Document]: the documents, in CSV row order.

    Raises:
        FileNotFoundError: if ``DATA_PATH`` does not exist (from pandas).
        KeyError: if the 'review' or 'rating text' column is absent.
    """
    df = pd.read_csv(DATA_PATH, encoding="utf-8")

    # Iterate over the two needed columns directly; this avoids building a
    # Series per row the way ``iterrows`` does.
    return [
        Document(
            # Coerce to str so a numeric-looking cell still yields text content.
            page_content=str(review),
            metadata={"rating": "" if pd.isna(rating) else str(rating)},
        )
        for review, rating in zip(df["review"], df["rating text"])
        if not pd.isna(review)
    ]

def is_good_review(doc: Document) -> bool:
    """Return True when the document's ``"rating"`` metadata equals "good".

    The comparison ignores case and surrounding whitespace. A missing rating,
    or one that is not a string (for example ``None`` or the float NaN that
    pandas produces for an empty CSV cell), is treated as not good instead of
    raising ``AttributeError``.
    """
    rating = doc.metadata.get("rating", "")
    # Only strings can match; anything else (NaN, None, numbers) is not "good".
    if not isinstance(rating, str):
        return False
    return rating.strip().lower() == "good"

def save_to_chroma(documents: list[Document], chroma_path: str):
    """Rebuild the Chroma database at ``chroma_path`` from ``documents``.

    Any existing directory at ``chroma_path`` is deleted first, so the store
    only ever reflects the documents passed in this call. When ``documents``
    is empty, nothing is created and a message is printed instead.

    Args:
        documents: the documents to embed and store.
        chroma_path: directory used as the Chroma persist location.
    """
    # Clear out the existing database, if any, so stale entries never survive.
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)

    # Nothing to embed: skip store creation explicitly.
    # NOTE(review): some chromadb versions appear to reject an empty batch;
    # confirm against the installed version.
    if not documents:
        print(f"No documents to save to {chroma_path}; skipping.")
        return

    # Create a new Chroma DB from the documents using the shared embeddings.
    db = Chroma.from_documents(
        documents, hf_embeddings, persist_directory=chroma_path
    )
    # NOTE(review): newer Chroma releases are reported to persist
    # automatically, making this call redundant; kept for older versions.
    db.persist()
    print(f"Saved {len(documents)} documents to {chroma_path}.")

# Run the ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()