Zubaish committed
Commit 19be3af · 1 Parent(s): 19d8cbd
Files changed (2)
  1. ingest.py +45 -25
  2. rag.py +14 -6
ingest.py CHANGED
@@ -1,42 +1,62 @@
-import os, shutil
-from datasets import load_dataset
+# ingest.py
+import os
+import shutil
+from huggingface_hub import hf_hub_download, list_repo_files
 from langchain_community.document_loaders import Docx2txtLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_chroma import Chroma
-from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
+from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN
 
 def run_ingestion():
+    # 1. Clean Environment
     if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
     if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
     os.makedirs(KB_DIR, exist_ok=True)
 
-    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
-    dataset = load_dataset(HF_DATASET_REPO, split="train", decode=False)
+    print(f"⬇️ Downloading files from NEW repo: {HF_DATASET_REPO}...")
 
-    docs = []
-    for i, row in enumerate(dataset):
-        file_item = row.get("docx") or row.get("file")
-        src_path = file_item.get("path") if isinstance(file_item, dict) else None
+    try:
+        # List files using the hub API instead of load_dataset
+        all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
+        docx_files = [f for f in all_files if f.lower().endswith(".docx")]
 
-        if src_path and src_path.lower().endswith(".docx"):
-            dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
-            shutil.copy(src_path, dest_path)
-            try:
-                loader = Docx2txtLoader(dest_path)
-                docs.extend(loader.load())
-                print(f"✅ Loaded: doc_{i}.docx")
-            except Exception as e:
-                print(f"❌ Error loading doc_{i}: {e}")
+        docs = []
+        for i, file_name in enumerate(docx_files):
+            # Download file directly to local folder
+            local_path = hf_hub_download(
+                repo_id=HF_DATASET_REPO,
+                filename=file_name,
+                repo_type="dataset",
+                local_dir=KB_DIR,
+                token=HF_TOKEN
+            )
+
+            # Load the text from docx
+            loader = Docx2txtLoader(local_path)
+            docs.extend(loader.load())
+            print(f"✅ Loaded: {file_name}")
 
-    if not docs:
-        print("❌ CRITICAL: No .docx documents found.")
-        return
+        if not docs:
+            print("❌ No documents found. Check repo files.")
+            return
 
-    splits = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP).split_documents(docs)
-    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
-    Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
-    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
+        # 2. Chunking
+        splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+        splits = splitter.split_documents(docs)
+
+        # 3. Embedding and Storage
+        print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
+        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+        Chroma.from_documents(
+            documents=splits,
+            embedding=embeddings,
+            persist_directory=CHROMA_DIR
+        )
+        print(f"✅ Knowledge base initialized successfully at {CHROMA_DIR}")
+
+    except Exception as e:
+        print(f"❌ Ingestion failed: {e}")
 
 if __name__ == "__main__":
     run_ingestion()
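Both scripts import their settings from a config module that is not part of this commit. For reference, a minimal sketch of what config.py plausibly contains, reconstructed only from the names imported above and in rag.py below; every value shown is a placeholder assumption, not the repo's actual configuration:

# config.py -- hypothetical reconstruction; all values are assumed placeholders
import os

KB_DIR = "knowledge_base"              # assumed: local folder for downloaded .docx files
CHROMA_DIR = "chroma_db"               # assumed: ChromaDB persistence directory
HF_DATASET_REPO = "user/dataset-repo"  # placeholder dataset repo id
HF_TOKEN = os.getenv("HF_TOKEN")       # assumed: access token read from the environment
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # assumed embedding model
LLM_MODEL = "org/some-instruct-model"  # placeholder text-generation model (used by rag.py)
CHUNK_SIZE = 1000                      # assumed splitter settings
CHUNK_OVERLAP = 200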
rag.py CHANGED
@@ -1,26 +1,34 @@
+# rag.py
 import os
 from transformers import pipeline
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_chroma import Chroma
-from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK
+from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
 
 embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
 
-if os.path.exists(CHROMA_DIR) and os.path.isdir(CHROMA_DIR):
+# Check if directory exists AND has files
+if os.path.exists(CHROMA_DIR) and any(os.scandir(CHROMA_DIR)):
     vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
-    print("✅ Vector DB loaded")
+    print("✅ Vector DB ready")
 else:
     vectordb = None
+    print("⚠️ Vector DB not found or empty")
 
-qa_pipeline = pipeline(task=LLM_TASK, model=LLM_MODEL, max_new_tokens=256, trust_remote_code=True)
+qa_pipeline = pipeline(
+    task="text-generation",
+    model=LLM_MODEL,
+    max_new_tokens=256,
+    trust_remote_code=True
+)
 
 def ask_rag_with_status(question: str):
     if vectordb is None:
-        return "Knowledge base not initialized. Check build logs.", "ERROR"
+        return "The knowledge base is not initialized. Please check deployment logs.", "ERROR"
 
     docs = vectordb.similarity_search(question, k=3)
     context = "\n\n".join(d.page_content for d in docs)
-    prompt = f"Answer using the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
+    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
 
     result = qa_pipeline(prompt)
     answer = result[0]["generated_text"].split("Answer:")[-1].strip()
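The hunk ends before ask_rag_with_status returns. Assuming the function finishes by returning an (answer, status) tuple, mirroring its error branch, a minimal driver sketch of how the two scripts fit together; the question string is made up. Note that rag.py opens the Chroma store and loads the pipeline at import time, so ingestion must complete before the import:

# hypothetical driver -- assumes ask_rag_with_status returns (answer, status)
from ingest import run_ingestion

run_ingestion()  # download .docx files, chunk, embed, persist to CHROMA_DIR

from rag import ask_rag_with_status  # imported only after the vector DB exists on disk

answer, status = ask_rag_with_status("What topics does the knowledge base cover?")
print(f"[{status}] {answer}")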