Zubaish committed on
Commit
2194516
·
1 Parent(s): e8fa82e
Files changed (3) hide show
  1. download_models.py +9 -6
  2. ingest.py +54 -12
  3. requirements.txt +2 -1
download_models.py CHANGED
@@ -1,10 +1,13 @@
1
- from transformers import pipeline
 
2
  from langchain_huggingface import HuggingFaceEmbeddings
3
- from config import EMBEDDING_MODEL, LLM_MODEL, LLM_TASK
4
 
5
- print("⏳ Pre-downloading models...")
6
- # Cache Embedding Model
7
  HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
8
- # Cache Qwen LLM
9
- pipeline(LLM_TASK, model=LLM_MODEL, device_map="cpu", trust_remote_code=True)
 
 
 
10
  print("✅ Models cached successfully")
 
1
# download_models.py
# Build-time warm-up script: pre-downloads the embedding model and the LLM into
# the local Hugging Face cache so runtime startup needs no network fetches.
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain_huggingface import HuggingFaceEmbeddings
from config import EMBEDDING_MODEL, LLM_MODEL

print("⏳ Downloading Embedding Model...")
# Constructing the wrapper is enough to fetch and cache the embedding weights.
HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

print(f"⏳ Downloading LLM: {LLM_MODEL}...")
# Direct download to cache
# NOTE(review): this instantiates the full model in RAM just to populate the
# disk cache; huggingface_hub.snapshot_download would be lighter — confirm
# memory headroom before relying on this in a constrained build container.
AutoTokenizer.from_pretrained(LLM_MODEL)
AutoModelForCausalLM.from_pretrained(LLM_MODEL)
print("✅ Models cached successfully")
ingest.py CHANGED
@@ -1,4 +1,5 @@
1
- import os, shutil
 
2
  from huggingface_hub import hf_hub_download, list_repo_files
3
  from langchain_community.document_loaders import Docx2txtLoader
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -7,34 +8,75 @@ from langchain_chroma import Chroma
7
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN
8
 
9
  def run_ingestion():
10
- if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
11
- if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
 
 
 
 
 
12
  os.makedirs(KB_DIR, exist_ok=True)
 
13
 
14
- print(f"⬇️ Downloading files from: {HF_DATASET_REPO}...")
15
 
16
  try:
 
 
17
  all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
18
  docx_files = [f for f in all_files if f.lower().endswith(".docx")]
19
 
 
 
 
 
20
  docs = []
21
  for file_name in docx_files:
22
- local_path = hf_hub_download(repo_id=HF_DATASET_REPO, filename=file_name, repo_type="dataset", local_dir=KB_DIR, token=HF_TOKEN)
23
- loader = Docx2txtLoader(local_path)
 
 
 
 
 
 
 
 
 
 
 
 
24
  docs.extend(loader.load())
25
- print(f"✅ Loaded: {file_name}")
26
 
27
  if not docs:
28
- print("❌ No documents found.")
29
  return
30
 
31
- splits = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP).split_documents(docs)
 
 
 
 
 
 
 
 
 
 
32
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
33
- Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
34
- print(f"✅ Knowledge base initialized successfully.")
 
 
 
 
 
 
 
35
 
36
  except Exception as e:
37
- print(f"❌ Ingestion failed: {e}")
38
 
39
  if __name__ == "__main__":
40
  run_ingestion()
 
1
+ import os
2
+ import shutil
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from langchain_community.document_loaders import Docx2txtLoader
5
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN
9
 
10
def run_ingestion():
    """Rebuild the vector knowledge base from .docx files in the HF dataset repo.

    Lists the dataset repository, downloads every ``.docx`` file into
    ``KB_DIR``, extracts the text, splits it into overlapping chunks, embeds
    the chunks, and persists them to a Chroma store at ``CHROMA_DIR``.
    Failures are reported to stdout (best-effort script behavior); the
    function always returns ``None``.
    """
    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")

    try:
        # 1. Direct file listing (bypassing load_dataset avoids parser errors
        # on non-docx assets in the repo); only .docx files are ingested.
        all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        docx_files = [f for f in all_files if f.lower().endswith(".docx")]

        if not docx_files:
            print("❌ Error: No .docx files found in the dataset repository.")
            return

        # 2. Environment cleanup & setup — deliberately deferred until the
        # repo listing succeeded and yielded at least one file, so a network
        # failure or empty repo does NOT wipe an existing, working knowledge
        # base. Paths come from config (e.g. /app/kb and /app/chroma_db).
        if os.path.exists(KB_DIR):
            shutil.rmtree(KB_DIR)
        if os.path.exists(CHROMA_DIR):
            shutil.rmtree(CHROMA_DIR)
        os.makedirs(KB_DIR, exist_ok=True)
        os.makedirs(CHROMA_DIR, exist_ok=True)

        docs = []
        for file_name in docx_files:
            print(f"📂 Downloading {file_name}...")
            # Download into the HF cache first ...
            temp_path = hf_hub_download(
                repo_id=HF_DATASET_REPO,
                filename=file_name,
                repo_type="dataset",
                token=HF_TOKEN
            )
            # ... then copy into the predictable KB_DIR location.
            local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
            shutil.copy(temp_path, local_docx)

            # 3. Extract text from the .docx (docx2txt skips embedded images).
            loader = Docx2txtLoader(local_docx)
            docs.extend(loader.load())
            print(f"✅ Text extracted from: {file_name}")

        if not docs:
            print("❌ Error: Extracted document list is empty.")
            return

        # 4. Text splitting; chunk size/overlap come from config and are
        # sized for RAG context windows. add_start_index records each chunk's
        # character offset in its source document (useful for citations).
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            add_start_index=True
        )
        splits = text_splitter.split_documents(docs)
        print(f"✂️ Split into {len(splits)} text chunks.")

        # 5. Embedding & vector store creation, persisted to CHROMA_DIR.
        print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

        print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
        Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory=CHROMA_DIR
        )
        # was: f-string with no placeholder (F541)
        print("✨ Knowledge base fully initialized and saved.")

    except Exception as e:
        # Best-effort script: report and return rather than crash the caller.
        # (str() inside the f-string was redundant.)
        print(f"❌ CRITICAL INGESTION ERROR: {e}")
80
 
81
# Allow running this module directly as a one-shot ingestion script.
if __name__ == "__main__":
    run_ingestion()
requirements.txt CHANGED
@@ -10,8 +10,9 @@ langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
  docx2txt
13
- pdfplumber
14
  transformers>=4.39.0
 
 
15
  huggingface_hub
16
  datasets
17
  torch
 
10
  chromadb==0.5.5
11
  sentence-transformers
12
  docx2txt
 
13
  transformers>=4.39.0
14
+ accelerate # Added for Qwen support
15
+ bitsandbytes # Added for memory efficiency
16
  huggingface_hub
17
  datasets
18
  torch