alaselababatunde committed on
Commit
9c18fc4
·
1 Parent(s): 9046ade
Files changed (2) hide show
  1. Dockerfile +16 -24
  2. smebuilder_vector.py +20 -23
Dockerfile CHANGED
@@ -1,46 +1,38 @@
1
- # --------------------------
2
- # DevAssist AI Dockerfile
3
- # --------------------------
4
-
5
  # Use lightweight Python image
6
  FROM python:3.10-slim
7
 
8
- # Set working directory
9
- WORKDIR /app
10
-
11
- # Install system dependencies (for some models/libraries)
12
  RUN apt-get update && apt-get install -y \
13
  build-essential \
14
  curl \
15
  git \
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
- # --------------------------
19
- # Hugging Face container-safe cache
20
- # --------------------------
21
- RUN mkdir -p /app/huggingface_cache && chmod -R 777 /app/huggingface_cache
22
-
23
- # Set environment variables for Hugging Face, Transformers, and Torch
24
  ENV HF_HOME=/app/huggingface_cache
25
  ENV TRANSFORMERS_CACHE=/app/huggingface_cache
26
  ENV TORCH_HOME=/app/huggingface_cache
27
- ENV HF_EMBEDDING_MODEL=intfloat/e5-large-v2
28
 
29
- # --------------------------
30
- # Copy and install Python dependencies
31
- # --------------------------
 
 
 
 
 
 
 
32
  COPY requirements.txt .
 
 
33
  RUN pip install --no-cache-dir -r requirements.txt
34
 
35
- # --------------------------
36
  # Copy project files
37
- # --------------------------
38
- COPY . .
39
 
40
  # Expose FastAPI default port
41
  EXPOSE 7860
42
 
43
- # --------------------------
44
- # Run FastAPI with Uvicorn
45
- # --------------------------
46
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
1
  # Use lightweight Python image
2
  FROM python:3.10-slim
3
 
4
+ # Install system dependencies
 
 
 
5
  RUN apt-get update && apt-get install -y \
6
  build-essential \
7
  curl \
8
  git \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
+ # Set environment variables for HF cache
 
 
 
 
 
12
  ENV HF_HOME=/app/huggingface_cache
13
  ENV TRANSFORMERS_CACHE=/app/huggingface_cache
14
  ENV TORCH_HOME=/app/huggingface_cache
 
15
 
16
+ # Create cache and DB folders with proper permissions
17
+ RUN mkdir -p /app/huggingface_cache /app/Dev_Assist_SME_Builder_DB \
18
+ && useradd -m appuser \
19
+ && chown -R appuser:appuser /app/huggingface_cache /app/Dev_Assist_SME_Builder_DB
20
+
21
+ # Switch to non-root user
22
+ USER appuser
23
+ WORKDIR /app
24
+
25
+ # Copy requirements first for caching
26
  COPY requirements.txt .
27
+
28
+ # Install Python dependencies
29
  RUN pip install --no-cache-dir -r requirements.txt
30
 
 
31
  # Copy project files
32
+ COPY --chown=appuser:appuser . .
 
33
 
34
  # Expose FastAPI default port
35
  EXPOSE 7860
36
 
37
+ # Command to run FastAPI with Uvicorn
 
 
38
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
smebuilder_vector.py CHANGED
@@ -1,5 +1,3 @@
1
- # smebuilder_vector.py
2
-
3
  import os
4
  import pandas as pd
5
  from langchain_huggingface import HuggingFaceEmbeddings
@@ -8,15 +6,10 @@ from langchain_core.documents import Document
8
 
9
  # ----------------- CONFIG -----------------
10
  DATASET_PATH = "sme_builder_dataset.csv"
11
- DB_LOCATION = "./Dev_Assist_SME_Builder_DB"
 
12
  COLLECTION_NAME = "landing_page_generation_examples"
13
-
14
- # Model name from environment, default to e5-large-v2
15
- EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
16
-
17
- # Ensure Hugging Face cache directory exists (env vars handle cache paths)
18
- HF_CACHE_DIR = os.getenv("HF_HOME", "/app/huggingface_cache")
19
- os.makedirs(HF_CACHE_DIR, exist_ok=True)
20
 
21
  # ----------------- LOAD DATASET -----------------
22
  if not os.path.exists(DATASET_PATH):
@@ -25,34 +18,38 @@ if not os.path.exists(DATASET_PATH):
25
  df = pd.read_csv(DATASET_PATH)
26
 
27
  # ----------------- EMBEDDINGS -----------------
28
- # Do NOT pass cache_dir; HF reads env vars automatically
29
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
 
 
30
 
31
- # ----------------- VECTOR STORE -----------------
32
  add_documents = not os.path.exists(DB_LOCATION)
33
 
 
34
  documents, ids = [], []
35
  if add_documents:
36
  for i, row in df.iterrows():
37
- # Combine all page content into a single string
38
- page_content = " ".join([
39
- str(row.get("prompt", "")),
40
- str(row.get("html_code", "")),
41
- str(row.get("css_code", "")),
42
- str(row.get("js_code", "")),
43
- str(row.get("sector", ""))
44
- ]).strip()
 
45
 
46
  documents.append(Document(page_content=page_content, id=str(i)))
47
  ids.append(str(i))
48
 
 
49
  vector_store = Chroma(
50
  collection_name=COLLECTION_NAME,
51
- persist_directory=DB_LOCATION,
52
  embedding_function=embeddings,
53
  )
54
 
55
- # Add documents if DB doesn't exist
56
  if add_documents and documents:
57
  vector_store.add_documents(documents=documents, ids=ids)
58
 
 
 
 
1
  import os
2
  import pandas as pd
3
  from langchain_huggingface import HuggingFaceEmbeddings
 
6
 
7
  # ----------------- CONFIG -----------------
8
  DATASET_PATH = "sme_builder_dataset.csv"
9
+ DB_LOCATION = "/app/Dev_Assist_SME_Builder_DB" # absolute path inside container
10
+ HF_CACHE = "/app/huggingface_cache" # absolute path for HF cache
11
  COLLECTION_NAME = "landing_page_generation_examples"
12
+ EMBEDDING_MODEL = "intfloat/e5-base-v2"
 
 
 
 
 
 
13
 
14
  # ----------------- LOAD DATASET -----------------
15
  if not os.path.exists(DATASET_PATH):
 
18
  df = pd.read_csv(DATASET_PATH)
19
 
20
  # ----------------- EMBEDDINGS -----------------
21
+ embeddings = HuggingFaceEmbeddings(
22
+ model_name=EMBEDDING_MODEL,
23
+ cache_dir=HF_CACHE # ensures HF uses a container-safe writable folder
24
+ )
25
 
26
+ # Check if vector store exists
27
  add_documents = not os.path.exists(DB_LOCATION)
28
 
29
+ # ----------------- CREATE DOCUMENTS -----------------
30
  documents, ids = [], []
31
  if add_documents:
32
  for i, row in df.iterrows():
33
+ prompt = row.get("prompt", "")
34
+ html_code = row.get("html_code", "")
35
+ css_code = row.get("css_code", "")
36
+ js_code = row.get("js_code", "")
37
+ sector = row.get("sector", "")
38
+
39
+ page_content = " ".join(
40
+ [str(prompt), str(html_code), str(css_code), str(js_code), str(sector)]
41
+ ).strip()
42
 
43
  documents.append(Document(page_content=page_content, id=str(i)))
44
  ids.append(str(i))
45
 
46
+ # ----------------- VECTOR STORE -----------------
47
  vector_store = Chroma(
48
  collection_name=COLLECTION_NAME,
49
+ persist_directory=DB_LOCATION, # absolute path
50
  embedding_function=embeddings,
51
  )
52
 
 
53
  if add_documents and documents:
54
  vector_store.add_documents(documents=documents, ids=ids)
55