alaselababatunde committed on
Commit
82fd433
·
1 Parent(s): c27fb7c
Files changed (2) hide show
  1. requirements.txt +5 -7
  2. smebuilder_vector.py +15 -28
requirements.txt CHANGED
@@ -2,14 +2,12 @@ fastapi
2
  uvicorn[standard]
3
  pydantic
4
  spitch
5
- langchain
6
  langchain-community
7
- langdetect
8
- httpx
 
9
  huggingface_hub
 
 
10
  python-multipart
11
- langchain-huggingface>=0.0.8
12
  pandas
13
- langchain_chroma
14
- langchain_core
15
- sentence-transformers
 
2
  uvicorn[standard]
3
  pydantic
4
  spitch
 
5
  langchain-community
6
+ langchain-core
7
+ langchain-huggingface>=0.0.8
8
+ langchain-chroma
9
  huggingface_hub
10
+ httpx
11
+ langdetect
12
  python-multipart
 
13
  pandas
 
 
 
smebuilder_vector.py CHANGED
@@ -1,8 +1,8 @@
1
  import os
2
  import pandas as pd
3
- from langchain.embeddings import HuggingFaceEmbeddings
4
- from langchain.vectorstores import Chroma
5
- from langchain.schema import Document
6
 
7
  # ----------------- CONFIG -----------------
8
  DATASET_PATH = "sme_builder_dataset.csv"
@@ -11,13 +11,11 @@ COLLECTION_NAME = "landing_page_generation_examples"
11
  EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
12
  HF_CACHE_DIR = os.getenv("HF_CACHE_DIR", "/app/huggingface_cache")
13
 
14
- # ensure directories exist
15
  os.makedirs(HF_CACHE_DIR, exist_ok=True)
16
  os.makedirs(DB_LOCATION, exist_ok=True)
17
 
18
  # ----------------- LOAD DATASET -----------------
19
  if not os.path.exists(DATASET_PATH):
20
- # If dataset is optional, consider returning an empty retriever. For now raise so developer notices.
21
  raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")
22
 
23
  df = pd.read_csv(DATASET_PATH)
@@ -26,8 +24,8 @@ df = pd.read_csv(DATASET_PATH)
26
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
27
 
28
  # ----------------- VECTOR STORE -----------------
29
- # if directory is empty then we should add documents; otherwise assume already persisted
30
- add_documents = not bool(os.listdir(DB_LOCATION))
31
 
32
  vector_store = Chroma(
33
  collection_name=COLLECTION_NAME,
@@ -38,30 +36,19 @@ vector_store = Chroma(
38
  if add_documents:
39
  documents = []
40
  for i, row in df.iterrows():
41
- # build a single text blob per row combining prompt + code + sector
42
- content_pieces = [
43
- str(row.get("prompt", "")).strip(),
44
- str(row.get("html_code", "")).strip(),
45
- str(row.get("css_code", "")).strip(),
46
- str(row.get("js_code", "")).strip(),
47
- str(row.get("sector", "")).strip(),
48
- ]
49
- content = " \n".join([p for p in content_pieces if p])
50
- if not content:
51
- continue
52
  documents.append(Document(page_content=content, metadata={"id": str(i)}))
53
-
54
  if documents:
55
  vector_store.add_documents(documents=documents)
56
 
57
  # ----------------- RETRIEVER -----------------
58
- retriever = vector_store.as_retriever(search_kwargs={"k": 8})
59
-
60
- # Helpful info (no heavy introspection)
61
- try:
62
- # avoid private attributes; just confirm connectivity
63
- count = len(vector_store._collection.get()["ids"]) if hasattr(vector_store, "_collection") else "unknown"
64
- except Exception:
65
- count = "unknown"
66
 
67
- print(f"SME vector store initialized. collection={COLLECTION_NAME}, documents={count}")
 
1
  import os
2
  import pandas as pd
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain_core.documents import Document
6
 
7
  # ----------------- CONFIG -----------------
8
  DATASET_PATH = "sme_builder_dataset.csv"
 
11
  EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
12
  HF_CACHE_DIR = os.getenv("HF_CACHE_DIR", "/app/huggingface_cache")
13
 
 
14
  os.makedirs(HF_CACHE_DIR, exist_ok=True)
15
  os.makedirs(DB_LOCATION, exist_ok=True)
16
 
17
  # ----------------- LOAD DATASET -----------------
18
  if not os.path.exists(DATASET_PATH):
 
19
  raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")
20
 
21
  df = pd.read_csv(DATASET_PATH)
 
24
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
25
 
26
  # ----------------- VECTOR STORE -----------------
27
+ # Only add documents if DB is empty
28
+ add_documents = not os.listdir(DB_LOCATION)
29
 
30
  vector_store = Chroma(
31
  collection_name=COLLECTION_NAME,
 
36
  if add_documents:
37
  documents = []
38
  for i, row in df.iterrows():
39
+ content = " ".join([
40
+ str(row.get("prompt", "")),
41
+ str(row.get("html_code", "")),
42
+ str(row.get("css_code", "")),
43
+ str(row.get("js_code", "")),
44
+ str(row.get("sector", ""))
45
+ ]).strip()
 
 
 
 
46
  documents.append(Document(page_content=content, metadata={"id": str(i)}))
47
+
48
  if documents:
49
  vector_store.add_documents(documents=documents)
50
 
51
  # ----------------- RETRIEVER -----------------
52
+ retriever = vector_store.as_retriever(search_kwargs={"k": 20})
 
 
 
 
 
 
 
53
 
54
+ print(f"SME vector store initialized. collection={COLLECTION_NAME}, documents={vector_store._collection.count()}")