Spaces:
Sleeping
Sleeping
# Build (or reopen) the persistent Chroma collection of landing-page
# generation examples and expose it as a module-level `retriever`.
#
# On import this module:
#   1. reads configuration from constants / environment variables,
#   2. loads the CSV dataset,
#   3. opens the Chroma collection stored under DB_LOCATION,
#   4. embeds and inserts the dataset rows if the collection is empty,
#   5. creates a top-20 retriever over the collection.
import os

import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# ----------------- CONFIG -----------------
DATASET_PATH = "sme_builder_dataset.csv"
DB_LOCATION = os.getenv("CHROMA_DB_DIR", "./Dev_Assist_SME_Builder_DB")
COLLECTION_NAME = "landing_page_generation_examples"
EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
HF_CACHE_DIR = os.getenv("HF_CACHE_DIR", "/app/huggingface_cache")

# Dataset columns concatenated (in this order) to form each document's text.
_TEXT_COLUMNS = ("prompt", "html_code", "css_code", "js_code", "sector")

os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(DB_LOCATION, exist_ok=True)

# ----------------- LOAD DATASET -----------------
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")

df = pd.read_csv(DATASET_PATH)

# ----------------- EMBEDDINGS -----------------
# Pass the cache directory explicitly; creating the directory alone does not
# make the embedding model use it.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    cache_folder=HF_CACHE_DIR,
)

# ----------------- VECTOR STORE -----------------
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_LOCATION,
    embedding_function=embeddings,
)

# Only add documents if the collection itself is empty. Checking the
# collection (rather than whether DB_LOCATION contains any files) means a
# directory left behind by an interrupted first run, or one holding other
# collections, does not prevent this collection from being populated.
# NOTE: `_collection` is a private attribute of the Chroma wrapper; it is
# used here because it is the same handle the status message below reads.
add_documents = vector_store._collection.count() == 0


def _row_to_text(row: pd.Series) -> str:
    """Join the row's non-missing, non-blank text columns with single spaces.

    Missing cells (NaN/None) are skipped so that the literal string "nan"
    never ends up in the embedded text.
    """
    parts = []
    for column in _TEXT_COLUMNS:
        value = row.get(column)
        if value is None or pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return " ".join(parts)


if add_documents:
    documents = []
    for i, row in df.iterrows():
        content = _row_to_text(row)
        # A row with no usable text has nothing meaningful to embed.
        if not content:
            continue
        documents.append(Document(page_content=content, metadata={"id": str(i)}))
    if documents:
        vector_store.add_documents(documents=documents)

# ----------------- RETRIEVER -----------------
retriever = vector_store.as_retriever(search_kwargs={"k": 20})

print(f"SME vector store initialized. collection={COLLECTION_NAME}, documents={vector_store._collection.count()}")