poc-app / create_database.py
xd2010's picture
Upload create_database.py
ad09253 verified
import pandas as pd
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import os
import shutil
# Output directories of the two Chroma stores (good vs. bad reviews)
CHROMA_PATH_GOOD = "chroma_good"
CHROMA_PATH_BAD = "chroma_bad"
# CSV file the reviews are loaded from
DATA_PATH = "./data/wikis/patient_reviews_with_symptoms_automated.csv"

# Embedding function shared by both stores; instantiated once at import time
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
def main():
    """Script entry point: delegate to ``generate_data_store``."""
    generate_data_store()
def generate_data_store():
    """Load the reviews, split them by rating and save each group to Chroma.

    Documents for which ``is_good_review`` is true are written to
    ``CHROMA_PATH_GOOD``; all remaining documents go to ``CHROMA_PATH_BAD``.
    """
    positive, negative = [], []
    for review_doc in load_documents_with_pandas():
        # Every document lands in exactly one of the two groups.
        bucket = positive if is_good_review(review_doc) else negative
        bucket.append(review_doc)

    # Good reviews are saved first, then the bad ones.
    save_to_chroma(positive, CHROMA_PATH_GOOD)
    save_to_chroma(negative, CHROMA_PATH_BAD)
def load_documents_with_pandas(data_path=None):
    """Load the review CSV and wrap each usable row in a ``Document``.

    Args:
        data_path: Optional path of the CSV file to read. When omitted, the
            module-level ``DATA_PATH`` is used, so existing callers are
            unaffected.

    Returns:
        A list of ``Document`` objects, one per row that has review text. The
        ``review`` column becomes ``page_content`` and the ``rating text``
        column is stored under the ``"rating"`` metadata key (``""`` when the
        cell is empty).

    Raises:
        KeyError: If the CSV has no ``review`` or ``rating text`` column.
    """
    csv_path = DATA_PATH if data_path is None else data_path
    df = pd.read_csv(csv_path, encoding="utf-8")

    documents = []
    skipped = 0
    # Iterate the two needed columns directly instead of building a Series
    # per row with iterrows().
    for review, rating in zip(df["review"], df["rating text"]):
        # pandas represents an empty cell as NaN (a float). Such a value is
        # not usable as page content, so the row is skipped.
        if pd.isna(review):
            skipped += 1
            continue
        documents.append(
            Document(
                page_content=str(review),
                # Keep the rating a string so later `.lower()` calls are safe.
                metadata={"rating": "" if pd.isna(rating) else rating},
            )
        )

    if skipped:
        print(f"Skipped {skipped} rows without review text.")
    return documents
def is_good_review(doc: Document) -> bool:
    """Return True when the document's ``rating`` metadata equals "good".

    The comparison ignores case. A missing rating, or one that is not a
    string (for example the float NaN pandas yields for an empty CSV cell, or
    ``None``), counts as not good instead of raising ``AttributeError``.

    Args:
        doc: Document whose ``metadata`` mapping may hold a ``"rating"`` entry.

    Returns:
        True only for a string rating equal to "good", ignoring case.
    """
    rating = doc.metadata.get("rating", "")
    # Only strings have .lower(); anything else cannot be the "good" label.
    if not isinstance(rating, str):
        return False
    return rating.lower() == "good"
def save_to_chroma(documents: list[Document], chroma_path: str):
    """Rebuild the Chroma database at ``chroma_path`` from ``documents``.

    Any existing directory at ``chroma_path`` is removed first, so the store
    only ever contains the documents passed in this call.

    Args:
        documents: Documents to embed with ``hf_embeddings`` and store.
        chroma_path: Directory in which the Chroma database is persisted.

    Returns:
        None. A summary line is printed to stdout.
    """
    # Clear out the existing database, if any
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)

    # With nothing to index there is no collection to build. Stop here
    # instead of handing an empty batch to Chroma.
    # NOTE(review): chromadb is expected to reject empty batches - confirm
    # against the installed version.
    if not documents:
        print(f"No documents to save to {chroma_path}; skipping database creation.")
        return

    # Create a new Chroma DB from the documents
    db = Chroma.from_documents(
        documents, hf_embeddings, persist_directory=chroma_path
    )
    db.persist()
    print(f"Saved {len(documents)} documents to {chroma_path}.")
# Build the stores only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()