Zubaish committed on
Commit
19d8cbd
·
1 Parent(s): 9edda50
Files changed (4) hide show
  1. Dockerfile +0 -14
  2. config.py +0 -21
  3. ingest.py +17 -48
  4. rag.py +6 -15
Dockerfile CHANGED
@@ -1,24 +1,10 @@
1
  FROM python:3.10-slim
2
-
3
  WORKDIR /app
4
-
5
- # Install system dependencies
6
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
7
-
8
- # Install Python requirements
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
11
-
12
- # Copy everything (including your config and scripts)
13
  COPY . .
14
-
15
- # ---------------------------------------------------------
16
- # PRE-BUILD PHASE
17
- # We run these in the container so they are "baked into" the image.
18
- # ---------------------------------------------------------
19
  RUN python download_models.py
20
  RUN python ingest.py
21
-
22
- # Hugging Face Space setup
23
  EXPOSE 7860
24
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.10-slim
 
2
  WORKDIR /app
 
 
3
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
 
4
  COPY requirements.txt .
5
  RUN pip install --no-cache-dir -r requirements.txt
 
 
6
  COPY . .
 
 
 
 
 
7
  RUN python download_models.py
8
  RUN python ingest.py
 
 
9
  EXPOSE 7860
10
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
config.py CHANGED
@@ -1,36 +1,15 @@
1
- # config.py
2
- # Central configuration for HubRAG (HF Space safe)
3
  import os
4
 
5
- # -----------------------------
6
- # Path Configuration
7
- # -----------------------------
8
- # Using absolute paths ensures the app finds the DB built in Dockerfile
9
  BASE_DIR = "/app"
10
-
11
- # Hugging Face Dataset
12
  HF_DATASET_REPO = "Zubaish/hubrag-kb"
13
  HF_TOKEN = os.getenv("HF_TOKEN")
14
 
15
- # Vector Store Path
16
  CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")
17
-
18
- # Knowledge Base (Temp PDF storage)
19
  KB_DIR = os.path.join(BASE_DIR, "kb")
20
 
21
- # -----------------------------
22
- # Model Configuration
23
- # -----------------------------
24
- # Small, fast, CPU-safe for free-tier Spaces
25
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
26
  LLM_MODEL = "google/flan-t5-small"
27
-
28
- # LLM Task type: 'text-generation' is more universally supported
29
- # than 'text2text-generation' in some transformers versions.
30
  LLM_TASK = "text-generation"
31
 
32
- # -----------------------------
33
- # Text splitting
34
- # -----------------------------
35
  CHUNK_SIZE = 1000
36
  CHUNK_OVERLAP = 100
 
 
 
1
  import os
2
 
 
 
 
 
3
  BASE_DIR = "/app"
 
 
4
  HF_DATASET_REPO = "Zubaish/hubrag-kb"
5
  HF_TOKEN = os.getenv("HF_TOKEN")
6
 
 
7
  CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")
 
 
8
  KB_DIR = os.path.join(BASE_DIR, "kb")
9
 
 
 
 
 
10
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
11
  LLM_MODEL = "google/flan-t5-small"
 
 
 
12
  LLM_TASK = "text-generation"
13
 
 
 
 
14
  CHUNK_SIZE = 1000
15
  CHUNK_OVERLAP = 100
ingest.py CHANGED
@@ -1,6 +1,4 @@
1
- # ingest.py
2
- import os
3
- import shutil
4
  from datasets import load_dataset
5
  from langchain_community.document_loaders import Docx2txtLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -9,64 +7,35 @@ from langchain_chroma import Chroma
9
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
  def run_ingestion():
12
- # 1. Clean directories
13
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
14
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
15
  os.makedirs(KB_DIR, exist_ok=True)
16
 
17
- print(f"⬇️ Loading files from {HF_DATASET_REPO}...")
18
-
19
- # Use standard load without extra flags that cause ValueErrors
20
- dataset = load_dataset(HF_DATASET_REPO, split="train")
21
 
22
  docs = []
23
- # Loop through the rows to find paths to files
24
  for i, row in enumerate(dataset):
25
- # We check common keys used by HF for file paths
26
- file_info = row.get("docx") or row.get("file") or row.get("pdf")
27
-
28
- src_path = None
29
- if isinstance(file_info, dict):
30
- src_path = file_info.get("path")
31
- elif isinstance(file_info, str):
32
- src_path = file_info
33
 
34
- if src_path and os.path.exists(src_path):
35
- ext = os.path.splitext(src_path)[1].lower()
36
-
37
- # ONLY process .docx files to avoid the PDF error
38
- if ext == ".docx":
39
- dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
40
- shutil.copy(src_path, dest_path)
41
-
42
- try:
43
- loader = Docx2txtLoader(dest_path)
44
- docs.extend(loader.load())
45
- print(f"✅ Successfully loaded: doc_{i}.docx")
46
- except Exception as e:
47
- print(f"❌ Loader error on doc_{i}: {e}")
48
- else:
49
- print(f"⏭️ Skipping non-docx file: {src_path}")
50
 
51
  if not docs:
52
- print("❌ CRITICAL: No .docx documents found. Ensure your dataset has .docx files.")
53
  return
54
 
55
- # 2. Chunking
56
- splitter = RecursiveCharacterTextSplitter(
57
- chunk_size=CHUNK_SIZE,
58
- chunk_overlap=CHUNK_OVERLAP
59
- )
60
- splits = splitter.split_documents(docs)
61
-
62
- # 3. Embedding and Storage
63
- print(f"🧠 Indexing {len(splits)} chunks...")
64
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
65
- Chroma.from_documents(
66
- documents=splits,
67
- embedding=embeddings,
68
- persist_directory=CHROMA_DIR
69
- )
70
  print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
71
 
72
  if __name__ == "__main__":
 
1
+ import os, shutil
 
 
2
  from datasets import load_dataset
3
  from langchain_community.document_loaders import Docx2txtLoader
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
7
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
8
 
9
  def run_ingestion():
 
10
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
11
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
12
  os.makedirs(KB_DIR, exist_ok=True)
13
 
14
+ print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
15
+ dataset = load_dataset(HF_DATASET_REPO, split="train", decode=False)
 
 
16
 
17
  docs = []
 
18
  for i, row in enumerate(dataset):
19
+ file_item = row.get("docx") or row.get("file")
20
+ src_path = file_item.get("path") if isinstance(file_item, dict) else None
 
 
 
 
 
 
21
 
22
+ if src_path and src_path.lower().endswith(".docx"):
23
+ dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
24
+ shutil.copy(src_path, dest_path)
25
+ try:
26
+ loader = Docx2txtLoader(dest_path)
27
+ docs.extend(loader.load())
28
+ print(f"✅ Loaded: doc_{i}.docx")
29
+ except Exception as e:
30
+ print(f"❌ Error loading doc_{i}: {e}")
 
 
 
 
 
 
 
31
 
32
  if not docs:
33
+ print("❌ CRITICAL: No .docx documents found.")
34
  return
35
 
36
+ splits = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP).split_documents(docs)
 
 
 
 
 
 
 
 
37
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
38
+ Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
 
 
 
 
39
  print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
40
 
41
  if __name__ == "__main__":
rag.py CHANGED
@@ -1,35 +1,26 @@
1
- # rag.py
2
  import os
3
  from transformers import pipeline
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
- from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
7
 
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
10
- # Load database created in build phase
11
- if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
12
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
13
- print("✅ Vector DB ready")
14
  else:
15
  vectordb = None
16
- print("⚠️ Vector DB missing")
17
 
18
- # Use generic text-generation for broadest compatibility
19
- qa_pipeline = pipeline(
20
- task="text-generation",
21
- model=LLM_MODEL,
22
- max_new_tokens=256,
23
- trust_remote_code=True
24
- )
25
 
26
  def ask_rag_with_status(question: str):
27
  if vectordb is None:
28
- return "Knowledge base not initialized.", "ERROR"
29
 
30
  docs = vectordb.similarity_search(question, k=3)
31
  context = "\n\n".join(d.page_content for d in docs)
32
- prompt = f"Using the context, answer correctly.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"
33
 
34
  result = qa_pipeline(prompt)
35
  answer = result[0]["generated_text"].split("Answer:")[-1].strip()
 
 
1
  import os
2
  from transformers import pipeline
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
  from langchain_chroma import Chroma
5
+ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK
6
 
7
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
8
 
9
+ if os.path.exists(CHROMA_DIR) and os.path.isdir(CHROMA_DIR):
 
10
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
11
+ print("✅ Vector DB loaded")
12
  else:
13
  vectordb = None
 
14
 
15
+ qa_pipeline = pipeline(task=LLM_TASK, model=LLM_MODEL, max_new_tokens=256, trust_remote_code=True)
 
 
 
 
 
 
16
 
17
  def ask_rag_with_status(question: str):
18
  if vectordb is None:
19
+ return "Knowledge base not initialized. Check build logs.", "ERROR"
20
 
21
  docs = vectordb.similarity_search(question, k=3)
22
  context = "\n\n".join(d.page_content for d in docs)
23
+ prompt = f"Answer using the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
24
 
25
  result = qa_pipeline(prompt)
26
  answer = result[0]["generated_text"].split("Answer:")[-1].strip()