Zubaish committed on
Commit
e8fa82e
·
1 Parent(s): 1afe1ea
Files changed (5) hide show
  1. Dockerfile +1 -0
  2. config.py +2 -2
  3. download_models.py +5 -6
  4. ingest.py +8 -30
  5. rag.py +20 -30
Dockerfile CHANGED
@@ -4,6 +4,7 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
4
  COPY requirements.txt .
5
  RUN pip install --no-cache-dir -r requirements.txt
6
  COPY . .
 
7
  RUN python download_models.py
8
  RUN python ingest.py
9
  EXPOSE 7860
 
4
  COPY requirements.txt .
5
  RUN pip install --no-cache-dir -r requirements.txt
6
  COPY . .
7
+ # These run during the 'Building' phase on Hugging Face
8
  RUN python download_models.py
9
  RUN python ingest.py
10
  EXPOSE 7860
config.py CHANGED
@@ -1,14 +1,14 @@
1
  import os
2
 
3
  BASE_DIR = "/app"
4
- HF_DATASET_REPO = "Zubaish/hubrag-kb"
5
  HF_TOKEN = os.getenv("HF_TOKEN")
6
 
7
  CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")
8
  KB_DIR = os.path.join(BASE_DIR, "kb")
9
 
10
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
11
- LLM_MODEL = "google/flan-t5-small"
12
  LLM_TASK = "text-generation"
13
 
14
  CHUNK_SIZE = 1000
 
1
  import os
2
 
3
  BASE_DIR = "/app"
4
+ HF_DATASET_REPO = "Zubaish/gandhi-kb-docx" # Ensure this points to your NEW docx repo
5
  HF_TOKEN = os.getenv("HF_TOKEN")
6
 
7
  CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")
8
  KB_DIR = os.path.join(BASE_DIR, "kb")
9
 
10
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
11
+ LLM_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
12
  LLM_TASK = "text-generation"
13
 
14
  CHUNK_SIZE = 1000
download_models.py CHANGED
@@ -1,11 +1,10 @@
1
- # download_models.py
2
  from transformers import pipeline
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
- from config import EMBEDDING_MODEL, LLM_MODEL
5
 
6
  print("⏳ Pre-downloading models...")
7
- # Download Embedding Model
8
  HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
- # Download LLM
10
- pipeline("text-generation", model=LLM_MODEL, trust_remote_code=True)
11
- print("✅ Models downloaded successfully")
 
 
1
  from transformers import pipeline
2
  from langchain_huggingface import HuggingFaceEmbeddings
3
+ from config import EMBEDDING_MODEL, LLM_MODEL, LLM_TASK
4
 
5
  print("⏳ Pre-downloading models...")
6
+ # Cache Embedding Model
7
  HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
8
+ # Cache Qwen LLM
9
+ pipeline(LLM_TASK, model=LLM_MODEL, device_map="cpu", trust_remote_code=True)
10
+ print("✅ Models cached successfully")
ingest.py CHANGED
@@ -1,6 +1,4 @@
1
- # ingest.py
2
- import os
3
- import shutil
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from langchain_community.document_loaders import Docx2txtLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -9,51 +7,31 @@ from langchain_chroma import Chroma
9
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN
10
 
11
  def run_ingestion():
12
- # 1. Clean Environment
13
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
14
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
15
  os.makedirs(KB_DIR, exist_ok=True)
16
 
17
- print(f"⬇️ Downloading files from NEW repo: {HF_DATASET_REPO}...")
18
 
19
  try:
20
- # List files using the hub API instead of load_dataset
21
  all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
22
  docx_files = [f for f in all_files if f.lower().endswith(".docx")]
23
 
24
  docs = []
25
- for i, file_name in enumerate(docx_files):
26
- # Download file directly to local folder
27
- local_path = hf_hub_download(
28
- repo_id=HF_DATASET_REPO,
29
- filename=file_name,
30
- repo_type="dataset",
31
- local_dir=KB_DIR,
32
- token=HF_TOKEN
33
- )
34
-
35
- # Load the text from docx
36
  loader = Docx2txtLoader(local_path)
37
  docs.extend(loader.load())
38
  print(f"✅ Loaded: {file_name}")
39
 
40
  if not docs:
41
- print("❌ No documents found. Check repo files.")
42
  return
43
 
44
- # 2. Chunking
45
- splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
46
- splits = splitter.split_documents(docs)
47
-
48
- # 3. Embedding and Storage
49
- print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
50
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
51
- Chroma.from_documents(
52
- documents=splits,
53
- embedding=embeddings,
54
- persist_directory=CHROMA_DIR
55
- )
56
- print(f"✅ Knowledge base initialized successfully at {CHROMA_DIR}")
57
 
58
  except Exception as e:
59
  print(f"❌ Ingestion failed: {e}")
 
1
+ import os, shutil
 
 
2
  from huggingface_hub import hf_hub_download, list_repo_files
3
  from langchain_community.document_loaders import Docx2txtLoader
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
7
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN
8
 
9
  def run_ingestion():
 
10
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
11
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
12
  os.makedirs(KB_DIR, exist_ok=True)
13
 
14
+ print(f"⬇️ Downloading files from: {HF_DATASET_REPO}...")
15
 
16
  try:
 
17
  all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
18
  docx_files = [f for f in all_files if f.lower().endswith(".docx")]
19
 
20
  docs = []
21
+ for file_name in docx_files:
22
+ local_path = hf_hub_download(repo_id=HF_DATASET_REPO, filename=file_name, repo_type="dataset", local_dir=KB_DIR, token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
23
  loader = Docx2txtLoader(local_path)
24
  docs.extend(loader.load())
25
  print(f"✅ Loaded: {file_name}")
26
 
27
  if not docs:
28
+ print("❌ No documents found.")
29
  return
30
 
31
+ splits = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP).split_documents(docs)
 
 
 
 
 
32
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
33
+ Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
34
+ print(f"✅ Knowledge base initialized successfully.")
 
 
 
 
35
 
36
  except Exception as e:
37
  print(f"❌ Ingestion failed: {e}")
rag.py CHANGED
@@ -2,47 +2,37 @@ import os
2
  from transformers import pipeline
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
  from langchain_chroma import Chroma
5
- from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
6
 
7
- # 1. Initialize Embeddings
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
10
- # 2. Load Vector DB
11
- if os.path.exists(CHROMA_DIR) and os.path.isdir(CHROMA_DIR):
12
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
13
- print("✅ Vector DB loaded successfully")
14
  else:
15
  vectordb = None
16
- print("⚠️ Vector DB folder missing")
17
 
18
- # 3. LLM Pipeline - Using the explicit class to avoid task errors
19
- qa_pipeline = pipeline(
20
- "text2text-generation", # T5 specifically needs this task name
21
- model=LLM_MODEL,
22
- max_new_tokens=128, # Reduced to keep responses concise
23
- model_kwargs={"torch_dtype": "auto"}
24
- )
25
 
26
  def ask_rag_with_status(question: str):
27
  if vectordb is None:
28
- return "The knowledge base is not initialized properly.", "ERROR"
29
 
30
- # Search for only 2 docs (k=2) to stay under the 512 token limit
31
- docs = vectordb.similarity_search(question, k=2)
32
 
33
- # Extract text and keep it short
34
- context = " ".join([d.page_content[:400] for d in docs])
 
 
 
35
 
36
- # Specific T5 Prompt Format: "question: ... context: ..."
37
- prompt = f"question: {question} context: {context}"
38
 
39
- try:
40
- result = qa_pipeline(prompt)
41
- answer = result[0]["generated_text"].strip()
42
-
43
- if not answer:
44
- answer = "I couldn't find a specific answer in the documents provided."
45
-
46
- return answer, ["Context retrieved", "T5 generating"]
47
- except Exception as e:
48
- return f"Error generating answer: {str(e)}", "ERROR"
 
2
  from transformers import pipeline
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
  from langchain_chroma import Chroma
5
+ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK
6
 
 
7
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
8
 
9
+ if os.path.exists(CHROMA_DIR) and any(os.scandir(CHROMA_DIR)):
 
10
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
11
+ print("✅ Vector DB loaded")
12
  else:
13
  vectordb = None
14
+ print("⚠️ Vector DB missing")
15
 
16
+ qa_pipeline = pipeline(task=LLM_TASK, model=LLM_MODEL, device_map="cpu", max_new_tokens=512, trust_remote_code=True)
 
 
 
 
 
 
17
 
18
  def ask_rag_with_status(question: str):
19
  if vectordb is None:
20
+ return "Knowledge base not ready.", "ERROR"
21
 
22
+ docs = vectordb.similarity_search(question, k=3)
23
+ context = "\n\n".join(d.page_content for d in docs)
24
 
25
+ # Qwen Chat Template
26
+ messages = [
27
+ {"role": "system", "content": "You are a Gandhi ji expert. Answer the question using ONLY the provided context."},
28
+ {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
29
+ ]
30
 
31
+ prompt = qa_pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
32
+ result = qa_pipeline(prompt, pad_token_id=qa_pipeline.tokenizer.eos_token_id)
33
 
34
+ # Extract Qwen answer
35
+ full_text = result[0]["generated_text"]
36
+ answer = full_text.split("<|im_start|>assistant")[-1].strip().replace("<|im_end|>", "")
37
+
38
+ return answer, ["Success"]