DevAssist / smebuilder_vector.py
alaselababatunde's picture
Updated
5e51aba
raw
history blame
1.79 kB
# smebuilder_vector.py
import os
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
# ----------------- CONFIG -----------------
# CSV file that is read into the DataFrame below. The path is relative, so it
# is resolved against the current working directory of the process.
DATASET_PATH = "sme_builder_dataset.csv"
# Directory handed to Chroma as its persist directory (also relative to the
# current working directory).
DB_LOCATION = "./Dev_Assist_SME_Builder_DB"
# Name of the Chroma collection used by the vector store.
COLLECTION_NAME = "landing_page_generation_examples"
# Model name passed to HuggingFaceEmbeddings.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# ----------------- LOAD DATASET -----------------
# Read the CSV into a DataFrame; when the file is absent, fail immediately
# with a message that names the missing path.
if os.path.exists(DATASET_PATH):
    df = pd.read_csv(DATASET_PATH)
else:
    raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")
# ----------------- EMBEDDINGS -----------------
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# ----------------- VECTOR STORE -----------------
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_LOCATION,
    embedding_function=embeddings,
)

# Populate the store only when the collection holds no records yet. Asking the
# collection itself is more reliable than testing whether the persist
# directory exists: the directory can be present while the collection is still
# empty (for example after an interrupted first run), and a directory-based
# check would then leave the store empty forever.
add_documents = not vector_store.get(limit=1)["ids"]

# ----------------- CREATE DOCUMENTS -----------------
# Columns whose text is concatenated, in this order, into one document.
_TEXT_COLUMNS = ("prompt", "html_code", "css_code", "js_code", "sector")


def _row_to_text(row: pd.Series) -> str:
    """Return the searchable text for one dataset row.

    The values of ``_TEXT_COLUMNS`` are joined with single spaces. Columns
    that are absent, empty, or missing (NaN) are left out, so the literal
    string "nan" never ends up in the embedded text.
    """
    parts = []
    for column in _TEXT_COLUMNS:
        value = row.get(column, "")
        # Empty CSV cells are loaded as NaN; str(NaN) would yield "nan".
        if pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return " ".join(parts)


documents, ids = [], []
if add_documents:
    for i, row in df.iterrows():
        page_content = _row_to_text(row)
        if not page_content:
            # Nothing to embed for this row.
            continue
        doc_id = str(i)  # the DataFrame index label doubles as the record id
        documents.append(Document(page_content=page_content, id=doc_id))
        ids.append(doc_id)
    if documents:
        vector_store.add_documents(documents=documents, ids=ids)
# ----------------- RETRIEVER -----------------
# Retriever built on top of the vector store. {"k": 20} is forwarded as the
# search keyword arguments (in LangChain vector-store retrievers, k is the
# number of documents returned per query).
retriever = vector_store.as_retriever(search_kwargs={"k": 20})