File size: 1,911 Bytes
4d95531
66d0fd5
82fd433
 
 
410396e
66d0fd5
 
 
 
 
 
410396e
66d0fd5
 
 
 
 
 
 
 
 
 
c27fb7c
410396e
66d0fd5
82fd433
 
66d0fd5
4d95531
66d0fd5
 
4d95531
 
 
66d0fd5
 
 
82fd433
 
 
 
 
 
 
8ec62ff
82fd433
66d0fd5
8ec62ff
66d0fd5
 
82fd433
66d0fd5
82fd433
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# ----------------- CONFIG -----------------
# Fixed settings.
DATASET_PATH = "sme_builder_dataset.csv"
COLLECTION_NAME = "landing_page_generation_examples"

# Settings that can be overridden through environment variables; the second
# argument is the fallback used when the variable is not set.
DB_LOCATION = os.environ.get("CHROMA_DB_DIR", "./Dev_Assist_SME_Builder_DB")
EMBEDDING_MODEL = os.environ.get("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
HF_CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/app/huggingface_cache")

# Create the cache and database directories up front (no error if present).
os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(DB_LOCATION, exist_ok=True)

# ----------------- LOAD DATASET -----------------
# Read the CSV into a DataFrame, failing early with an explicit message
# when the file is not present at DATASET_PATH.
if os.path.exists(DATASET_PATH):
    df = pd.read_csv(DATASET_PATH)
else:
    raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")

# ----------------- EMBEDDINGS -----------------
# Embedding model used both to index the dataset and to embed queries.
# HF_CACHE_DIR is configured and created above; pass it as cache_folder so
# the model files are stored there instead of the library's default location.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    cache_folder=HF_CACHE_DIR,
)

# ----------------- VECTOR STORE -----------------
# Open (or create) the persistent Chroma collection backed by DB_LOCATION.
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_LOCATION,
    embedding_function=embeddings,
)

# Only add documents if the collection itself is empty. Asking the collection
# (rather than listing the directory) still populates it when the directory
# holds unrelated files or leftovers from a run that stopped before indexing.
add_documents = vector_store._collection.count() == 0

if add_documents:
    # Columns concatenated, in this order, into each document's text.
    text_columns = ("prompt", "html_code", "css_code", "js_code", "sector")

    documents = []
    for i, row in df.iterrows():
        parts = []
        for column in text_columns:
            value = row.get(column, "")
            # Blank CSV cells are read as NaN; map them to "" so the literal
            # text "nan" does not end up in the embedded content.
            parts.append("" if pd.isna(value) else str(value))
        content = " ".join(parts).strip()
        # The DataFrame index is kept as the document's "id" metadata.
        documents.append(Document(page_content=content, metadata={"id": str(i)}))

    if documents:
        vector_store.add_documents(documents=documents)

# ----------------- RETRIEVER -----------------
# Retriever over the vector store, configured with k=20.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 20},
)

# Startup summary: collection name and the number of stored documents.
print(
    f"SME vector store initialized. collection={COLLECTION_NAME}, documents={vector_store._collection.count()}"
)