Rivalcoder committed on
Commit
7e9a861
·
1 Parent(s): a2b02a5
Files changed (3) hide show
  1. Dockerfile +9 -3
  2. process_aware_rag.py +28 -3
  3. vector_store_builder.py +17 -6
Dockerfile CHANGED
@@ -4,7 +4,12 @@ FROM python:3.10-slim
4
  ENV PYTHONDONTWRITEBYTECODE=1 \
5
  PYTHONUNBUFFERED=1 \
6
  PIP_NO_CACHE_DIR=1 \
7
- CHROMA_DB_PATH=/data/chroma
 
 
 
 
 
8
 
9
  # System deps
10
  RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -14,8 +19,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
14
 
15
  WORKDIR /app
16
 
17
- # Ensure writable path for Chroma
18
- RUN mkdir -p "$CHROMA_DB_PATH" && chmod -R 777 "$CHROMA_DB_PATH"
 
19
 
20
  # Copy requirements first for better caching
21
  COPY requirements.txt ./
 
4
  ENV PYTHONDONTWRITEBYTECODE=1 \
5
  PYTHONUNBUFFERED=1 \
6
  PIP_NO_CACHE_DIR=1 \
7
+ CHROMA_DB_PATH=/data/chroma \
8
+ CACHE_ROOT=/data/cache \
9
+ HF_HOME=/data/cache/hf \
10
+ TRANSFORMERS_CACHE=/data/cache/transformers \
11
+ SENTENCE_TRANSFORMERS_HOME=/data/cache/sentence-transformers \
12
+ XDG_CACHE_HOME=/data/cache
13
 
14
  # System deps
15
  RUN apt-get update && apt-get install -y --no-install-recommends \
 
19
 
20
  WORKDIR /app
21
 
22
+ # Ensure writable paths for Chroma and model caches
23
+ RUN mkdir -p "$CHROMA_DB_PATH" "$CACHE_ROOT" "$HF_HOME" "$TRANSFORMERS_CACHE" "$SENTENCE_TRANSFORMERS_HOME" \
24
+ && chmod -R 777 "$CHROMA_DB_PATH" "$CACHE_ROOT"
25
 
26
  # Copy requirements first for better caching
27
  COPY requirements.txt ./
process_aware_rag.py CHANGED
@@ -1,5 +1,7 @@
1
  # process_aware_rag.py
2
  import chromadb
 
 
3
  import google.generativeai as genai
4
  from query_classifier import QueryClassifier
5
  from graph_builder import LegalProcessGraph
@@ -16,13 +18,36 @@ class ProcessAwareRAG:
16
  # Initialize vector store (use writable path by default)
17
  chroma_path = os.getenv('CHROMA_DB_PATH', '/tmp/legal_vector_db')
18
  os.makedirs(chroma_path, exist_ok=True)
19
- client = chromadb.PersistentClient(path=chroma_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Ensure collection exists
 
 
22
  try:
23
- self.vector_collection = client.get_collection("legal_context")
 
 
 
24
  except Exception:
25
- self.vector_collection = client.create_collection("legal_context")
 
 
 
26
 
27
  # Initialize LLM
28
  genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
 
1
  # process_aware_rag.py
2
  import chromadb
3
+ from chromadb.config import Settings
4
+ from chromadb.utils import embedding_functions
5
  import google.generativeai as genai
6
  from query_classifier import QueryClassifier
7
  from graph_builder import LegalProcessGraph
 
18
  # Initialize vector store (use writable path by default)
19
  chroma_path = os.getenv('CHROMA_DB_PATH', '/tmp/legal_vector_db')
20
  os.makedirs(chroma_path, exist_ok=True)
21
+
22
+ # Redirect model caches to writable directories
23
+ default_cache_root = os.getenv('CACHE_ROOT', '/data/cache')
24
+ os.makedirs(default_cache_root, exist_ok=True)
25
+ os.environ.setdefault('HF_HOME', os.path.join(default_cache_root, 'hf'))
26
+ os.environ.setdefault('TRANSFORMERS_CACHE', os.path.join(default_cache_root, 'transformers'))
27
+ os.environ.setdefault('SENTENCE_TRANSFORMERS_HOME', os.path.join(default_cache_root, 'sentence-transformers'))
28
+ os.environ.setdefault('XDG_CACHE_HOME', default_cache_root)
29
+ for env_key in ['HF_HOME', 'TRANSFORMERS_CACHE', 'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME']:
30
+ os.makedirs(os.environ[env_key], exist_ok=True)
31
+
32
+ # Disable Chroma anonymized telemetry and initialize client
33
+ client = chromadb.PersistentClient(
34
+ path=chroma_path,
35
+ settings=Settings(anonymized_telemetry=False)
36
+ )
37
 
38
  # Ensure collection exists
39
+ # Use explicit embedding function to ensure queries can compute embeddings
40
+ embedding_function = embedding_functions.DefaultEmbeddingFunction()
41
  try:
42
+ self.vector_collection = client.get_collection(
43
+ "legal_context",
44
+ embedding_function=embedding_function
45
+ )
46
  except Exception:
47
+ self.vector_collection = client.create_collection(
48
+ "legal_context",
49
+ embedding_function=embedding_function
50
+ )
51
 
52
  # Initialize LLM
53
  genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
vector_store_builder.py CHANGED
@@ -1,5 +1,7 @@
1
  # vector_store_builder.py
 
2
  import chromadb
 
3
  from chromadb.utils import embedding_functions
4
  from legal_aid_context import ALL_CONTEXT_DOCS
5
  import uuid
@@ -7,16 +9,25 @@ import uuid
7
  def build_vector_store():
8
  """Build ChromaDB vector store with legal context"""
9
 
10
- # Initialize ChromaDB client
11
- client = chromadb.PersistentClient(path="./legal_vector_db")
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Create collection with default embedding function
14
  embedding_function = embedding_functions.DefaultEmbeddingFunction()
15
 
16
- collection = client.get_or_create_collection(
17
- name="legal_context",
18
- embedding_function=embedding_function
19
- )
20
 
21
  # Prepare documents for vector store
22
  documents = []
 
1
  # vector_store_builder.py
2
+ import os
3
  import chromadb
4
+ from chromadb.config import Settings
5
  from chromadb.utils import embedding_functions
6
  from legal_aid_context import ALL_CONTEXT_DOCS
7
  import uuid
 
9
  def build_vector_store():
10
  """Build ChromaDB vector store with legal context"""
11
 
12
+ # Initialize ChromaDB client with telemetry disabled and writable paths
13
+ chroma_path = os.getenv('CHROMA_DB_PATH', './legal_vector_db')
14
+ os.makedirs(chroma_path, exist_ok=True)
15
+
16
+ default_cache_root = os.getenv('CACHE_ROOT', './cache')
17
+ os.makedirs(default_cache_root, exist_ok=True)
18
+ os.environ.setdefault('HF_HOME', os.path.join(default_cache_root, 'hf'))
19
+ os.environ.setdefault('TRANSFORMERS_CACHE', os.path.join(default_cache_root, 'transformers'))
20
+ os.environ.setdefault('SENTENCE_TRANSFORMERS_HOME', os.path.join(default_cache_root, 'sentence-transformers'))
21
+ os.environ.setdefault('XDG_CACHE_HOME', default_cache_root)
22
+ for env_key in ['HF_HOME', 'TRANSFORMERS_CACHE', 'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME']:
23
+ os.makedirs(os.environ[env_key], exist_ok=True)
24
+
25
+ client = chromadb.PersistentClient(path=chroma_path, settings=Settings(anonymized_telemetry=False))
26
 
27
  # Create collection with default embedding function
28
  embedding_function = embedding_functions.DefaultEmbeddingFunction()
29
 
30
+ collection = client.get_or_create_collection(name="legal_context", embedding_function=embedding_function)
 
 
 
31
 
32
  # Prepare documents for vector store
33
  documents = []