alaselababatunde committed on
Commit
66d0fd5
·
1 Parent(s): 9a8aec4
Files changed (1) hide show
  1. smebuilder_vector.py +45 -12
smebuilder_vector.py CHANGED
@@ -1,24 +1,57 @@
1
  import os
 
2
  from langchain.embeddings import HuggingFaceEmbeddings
3
  from langchain.vectorstores import Chroma
 
4
 
5
- # Make sure HF_HOME is set for cache
6
- os.environ["HF_HOME"] = os.getenv("HF_HOME", "/app/huggingface_cache")
7
- os.environ["TRANSFORMERS_CACHE"] = os.environ["HF_HOME"]
 
 
 
8
 
9
- # Initialize embeddings (without cache_dir argument)
 
 
 
 
 
 
 
 
 
10
  embeddings = HuggingFaceEmbeddings(
11
- model_name="sentence-transformers/all-MiniLM-L6-v2",
12
- model_kwargs={"device": "cpu"} # or "cuda" if GPU available
13
  )
14
 
15
- # Chroma vector store (writable directory)
16
- chroma_db_path = os.getenv("CHROMA_DB_DIR", "/app/Dev_Assist_SME_Builder_DB")
 
 
17
  vector_store = Chroma(
18
- persist_directory=chroma_db_path,
 
19
  embedding_function=embeddings,
20
- collection_name="sme_collection"
21
  )
22
 
23
- # retriever object for querying
24
- retriever = vector_store.as_retriever()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build (or reopen) the Chroma vector store for SME landing-page examples.

On import this module:

1. reads the CSV dataset at ``DATASET_PATH``,
2. creates the HuggingFace embedding model named by ``EMBEDDING_MODEL``,
3. opens the persistent Chroma collection at ``DB_LOCATION`` and, when that
   directory was empty beforehand, indexes one document per dataset row,
4. exposes ``retriever`` for similarity search (top 20 hits per query).

Raises:
    FileNotFoundError: if the dataset CSV does not exist.
"""
import os
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

# ----------------- CONFIG -----------------
DATASET_PATH = "sme_builder_dataset.csv"
DB_LOCATION = os.getenv("CHROMA_DB_DIR", "./Dev_Assist_SME_Builder_DB")
COLLECTION_NAME = "landing_page_generation_examples"
EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
HF_CACHE_DIR = os.getenv("HF_CACHE_DIR", "/app/huggingface_cache")

# Dataset columns concatenated, in this order, into each document's text.
_TEXT_COLUMNS = ("prompt", "html_code", "css_code", "js_code", "sector")

os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(DB_LOCATION, exist_ok=True)

# ----------------- LOAD DATASET -----------------
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")

df = pd.read_csv(DATASET_PATH)

# ----------------- EMBEDDINGS -----------------
# The wrapper's keyword for the model cache location is ``cache_folder``;
# ``cache_dir`` is not a recognised field and is rejected at construction.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    cache_folder=HF_CACHE_DIR,
)

# ----------------- VECTOR STORE -----------------
# Only add documents if DB is empty. This must be checked BEFORE the Chroma
# client is created, because opening the store may write files into the
# directory.
add_documents = not os.listdir(DB_LOCATION)

vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_LOCATION,
    embedding_function=embeddings,
)

if add_documents:
    documents = []
    ids = []
    for i, row in df.iterrows():
        # Empty CSV cells are read as NaN; str(NaN) would inject the literal
        # text "nan" into the embedded content, so drop missing values.
        parts = []
        for column in _TEXT_COLUMNS:
            value = row.get(column, "")
            if pd.isna(value):
                continue
            text = str(value).strip()
            if text:
                parts.append(text)

        documents.append(Document(page_content=" ".join(parts)))
        # Ids are tracked in a parallel list instead of being read back from
        # ``Document.id``, which is not available in every LangChain version.
        ids.append(str(i))

    if documents:
        vector_store.add_documents(documents=documents, ids=ids)

# ----------------- RETRIEVER -----------------
retriever = vector_store.as_retriever(search_kwargs={"k": 20})

print(f"Vector store ready with {len(df)} documents.")