Spaces:
Sleeping
Sleeping
File size: 1,911 Bytes
4d95531 66d0fd5 82fd433 410396e 66d0fd5 410396e 66d0fd5 c27fb7c 410396e 66d0fd5 82fd433 66d0fd5 4d95531 66d0fd5 4d95531 66d0fd5 82fd433 8ec62ff 82fd433 66d0fd5 8ec62ff 66d0fd5 82fd433 66d0fd5 82fd433 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
# ----------------- CONFIG -----------------
# Fixed settings: dataset file name and Chroma collection name.
DATASET_PATH = "sme_builder_dataset.csv"
COLLECTION_NAME = "landing_page_generation_examples"

# Settings that can be overridden through environment variables; the second
# argument is the fallback used when the variable is not set.
DB_LOCATION = os.environ.get("CHROMA_DB_DIR", "./Dev_Assist_SME_Builder_DB")
EMBEDDING_MODEL = os.environ.get("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
HF_CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/app/huggingface_cache")

# Make sure both working directories exist before anything tries to use them
# (cache directory first, then the vector-store directory).
for _required_dir in (HF_CACHE_DIR, DB_LOCATION):
    os.makedirs(_required_dir, exist_ok=True)
# ----------------- LOAD DATASET -----------------
# Read the CSV into a DataFrame, failing early with an explicit message that
# names the expected path when the file is not present.
if os.path.exists(DATASET_PATH):
    df = pd.read_csv(DATASET_PATH)
else:
    raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")
# ----------------- EMBEDDINGS -----------------
# Embedding model used by the vector store. HF_CACHE_DIR is created in the
# config section but was previously never handed to the model loader; pass it
# as cache_folder so downloaded model files are stored in that directory.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    cache_folder=HF_CACHE_DIR,
)
# ----------------- VECTOR STORE -----------------
# Dataset columns concatenated (in this order) into the text that is embedded.
_TEXT_COLUMNS = ("prompt", "html_code", "css_code", "js_code", "sector")


def _row_to_text(row):
    """Join the text columns of one dataset row into a single string.

    Columns that are absent or hold a missing value (NaN/None) contribute an
    empty string, so the literal text "nan" is never embedded.
    """
    parts = []
    for column in _TEXT_COLUMNS:
        value = row.get(column, "")
        parts.append("" if pd.isna(value) else str(value))
    return " ".join(parts).strip()


vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_LOCATION,
    embedding_function=embeddings,
)

# Only add documents if the collection is empty. Checking the collection
# itself (rather than whether the directory has any files) means a stray file
# or an interrupted first run does not leave the store permanently empty.
add_documents = vector_store._collection.count() == 0

if add_documents:
    # One Document per dataset row; the row index is kept as metadata "id".
    documents = [
        Document(page_content=_row_to_text(row), metadata={"id": str(i)})
        for i, row in df.iterrows()
    ]
    # Skip the call entirely when the dataset has no rows.
    if documents:
        vector_store.add_documents(documents=documents)
# ----------------- RETRIEVER -----------------
# Expose a retriever over the store, configured with k=20 in its search
# kwargs, then report the collection name and its current document count.
retriever = vector_store.as_retriever(search_kwargs=dict(k=20))
print(
    f"SME vector store initialized. collection={COLLECTION_NAME}, documents={vector_store._collection.count()}"
)
|