Zubaish committed on
Commit
40676fe
·
1 Parent(s): f09a853
Files changed (3) hide show
  1. Dockerfile +10 -7
  2. download_models.py +11 -0
  3. ingest.py +9 -34
Dockerfile CHANGED
@@ -2,21 +2,24 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies for git and PDF processing
6
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
7
 
8
  # Install Python requirements
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
- # Copy project files
13
- COPY app.py rag.py ingest.py config.py ./
14
- COPY frontend ./frontend
15
 
16
- # CRITICAL: Build the knowledge base during the Docker build process
 
 
 
 
 
17
  RUN python ingest.py
18
 
19
- # Hugging Face Spaces standard port
20
  EXPOSE 7860
21
-
22
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies
6
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
7
 
8
  # Install Python requirements
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
+ # Copy all project files
13
+ COPY . .
 
14
 
15
+ # ---------------------------------------------------------
16
+ # PRE-BUILD PHASE
17
+ # This downloads models and processes PDFs during the build.
18
+ # This prevents httpx.ReadTimeout errors at runtime.
19
+ # ---------------------------------------------------------
20
+ RUN python download_models.py
21
  RUN python ingest.py
22
 
23
+ # Hugging Face Space setup
24
  EXPOSE 7860
 
25
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
download_models.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # download_models.py
2
+ from transformers import pipeline
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from config import EMBEDDING_MODEL, LLM_MODEL
5
+
6
+ print("⏳ Pre-downloading models...")
7
+ # Download Embedding Model
8
+ HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
+ # Download LLM
10
+ pipeline("text-generation", model=LLM_MODEL)
11
+ print("✅ Models downloaded successfully")
ingest.py CHANGED
@@ -8,61 +8,36 @@ from langchain_chroma import Chroma
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
  def run_ingestion():
11
- # 1. Clean Environment
12
- if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
13
- if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
14
  os.makedirs(KB_DIR, exist_ok=True)
15
-
16
- print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
17
  dataset = load_dataset(HF_DATASET_REPO, split="train")
18
 
19
  pdf_paths = []
20
  for i, row in enumerate(dataset):
21
  pdf_feature = row.get("pdf")
22
-
23
- # Determine Source Path
24
- # HF PdfFolder datasets store the local path in the 'path' key of the feature
25
  src_path = None
26
- if isinstance(pdf_feature, dict) and "path" in pdf_feature:
27
- src_path = pdf_feature["path"]
28
- elif hasattr(pdf_feature, "filename"):
29
- src_path = pdf_feature.filename
30
 
31
  if src_path and os.path.exists(src_path):
32
  dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
33
  shutil.copy(src_path, dest_path)
34
  pdf_paths.append(dest_path)
35
- print(f"✅ Cached: doc_{i}.pdf")
36
- else:
37
- print(f"⚠️ Could not resolve path for doc_{i}, skipping.")
38
 
39
- # 2. Process Documents
40
- print(f"📄 Processing {len(pdf_paths)} documents...")
41
  docs = []
42
  for p in pdf_paths:
43
  try:
44
  loader = PyPDFLoader(p)
45
  docs.extend(loader.load())
46
- except Exception as e:
47
- print(f"❌ Error reading {p}: {e}")
48
-
49
- if not docs:
50
- print("❌ CRITICAL: No documents were successfully loaded.")
51
- return
52
 
53
- # 3. Chunk and Embed
54
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
55
- splits = splitter.split_documents(docs)
56
 
57
- print(f"🧠 Indexing {len(splits)} chunks...")
58
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
59
-
60
- Chroma.from_documents(
61
- documents=splits,
62
- embedding=embeddings,
63
- persist_directory=CHROMA_DIR
64
- )
65
- print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
66
 
67
  if __name__ == "__main__":
68
  run_ingestion()
 
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
  def run_ingestion():
 
 
 
11
  os.makedirs(KB_DIR, exist_ok=True)
12
+ # Load dataset - this will use the cached version from build phase
 
13
  dataset = load_dataset(HF_DATASET_REPO, split="train")
14
 
15
  pdf_paths = []
16
  for i, row in enumerate(dataset):
17
  pdf_feature = row.get("pdf")
18
+ # Access local path directly from HF cache
 
 
19
  src_path = None
20
+ if isinstance(pdf_feature, dict): src_path = pdf_feature.get("path")
21
+ elif hasattr(pdf_feature, 'filename'): src_path = pdf_feature.filename
 
 
22
 
23
  if src_path and os.path.exists(src_path):
24
  dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
25
  shutil.copy(src_path, dest_path)
26
  pdf_paths.append(dest_path)
 
 
 
27
 
 
 
28
  docs = []
29
  for p in pdf_paths:
30
  try:
31
  loader = PyPDFLoader(p)
32
  docs.extend(loader.load())
33
+ except Exception as e: print(f"❌ Error: {e}")
 
 
 
 
 
34
 
35
+ if not docs: return
 
 
36
 
37
+ splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
38
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
39
+ Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
40
+ print("✅ KB Initialized")
 
 
 
 
 
41
 
42
  if __name__ == "__main__":
43
  run_ingestion()