Zubaish committed on
Commit
2ae1f2d
·
1 Parent(s): a42513a

Fix: remove CHROMA_DIR, HF-dataset-based RAG

Browse files
Files changed (2) hide show
  1. config.py +7 -7
  2. rag.py +66 -59
config.py CHANGED
@@ -1,10 +1,10 @@
1
- import os
2
-
3
- KB_DIR = "kb"
4
- VECTOR_DB_DIR = "vector_db"
5
 
 
6
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
7
- LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
8
 
9
- os.makedirs(VECTOR_DB_DIR, exist_ok=True)
10
- os.makedirs(KB_DIR, exist_ok=True)
 
 
 
 
1
# config.py

# Instruction-tuned LLM used to generate answers.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Sentence-transformer that embeds document chunks for similarity search.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Hugging Face Dataset repo where PDFs live
HF_DATASET_REPO = "Zubaish/HubRAG-docs"

# Retrieval: number of chunks returned per query.
TOP_K = 3
rag.py CHANGED
@@ -1,69 +1,82 @@
1
  # rag.py
2
- from typing import List, Tuple
 
3
  from langchain_community.vectorstores import Chroma
4
- from langchain_community.embeddings import HuggingFaceEmbeddings
5
- from transformers import AutoTokenizer, AutoModelForCausalLM
6
- from config import (
7
- EMBEDDING_MODEL,
8
- LLM_MODEL,
9
- CHROMA_DIR,
10
- TOP_K,
11
- )
12
- import torch
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # --- Embeddings ---
16
- embeddings = HuggingFaceEmbeddings(
17
- model_name=EMBEDDING_MODEL
18
- )
19
 
20
- # --- Vector DB (safe load) ---
21
- try:
22
- vectordb = Chroma(
23
- persist_directory=CHROMA_DIR,
24
- embedding_function=embeddings,
25
- )
26
- except Exception:
27
- vectordb = None
28
 
 
 
29
 
30
- # --- LLM ---
31
- tokenizer = AutoTokenizer.from_pretrained(
32
- LLM_MODEL,
33
- trust_remote_code=True
 
34
  )
35
 
 
 
 
 
 
 
36
  model = AutoModelForCausalLM.from_pretrained(
37
- LLM_MODEL,
38
- trust_remote_code=True,
39
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
40
- device_map="auto",
41
  )
42
 
 
 
 
 
 
 
 
43
 
44
- def ask_rag_with_status(question: str) -> Tuple[str, List[str]]:
45
- status = []
46
 
47
- if not vectordb:
48
- return (
49
- "⚠️ Knowledge base is not loaded yet. Upload documents first.",
50
- ["Vector DB not initialized"],
51
- )
52
 
 
53
  docs = vectordb.similarity_search(question, k=TOP_K)
54
 
55
  if not docs:
56
- return (
57
- "⚠️ I could not find relevant information in the knowledge base.",
58
- ["No documents retrieved"],
59
- )
60
 
61
  context = "\n\n".join(d.page_content for d in docs)
62
- status.append(f"Retrieved {len(docs)} chunks")
63
 
64
- prompt = f"""
65
- You are a helpful assistant.
66
- Answer ONLY using the context below.
67
 
68
  Context:
69
  {context}
@@ -71,20 +84,14 @@ Context:
71
  Question:
72
  {question}
73
 
74
- Answer:
75
- """
76
-
77
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
78
 
79
- with torch.no_grad():
80
- output = model.generate(
81
- **inputs,
82
- max_new_tokens=256,
83
- do_sample=True,
84
- temperature=0.7,
85
- )
86
 
87
- answer = tokenizer.decode(output[0], skip_special_tokens=True)
88
- answer = answer.split("Answer:")[-1].strip()
89
 
90
- return answer, status
 
 
 
 
1
  # rag.py
2
+
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
  from langchain_community.vectorstores import Chroma
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from langchain.schema import Document
7
+ from datasets import load_dataset
 
 
 
 
 
 
8
 
9
+ from config import MODEL_ID, EMBEDDING_MODEL, HF_DATASET_REPO, TOP_K
10
+
11
+
12
# ----------------------------
# Load PDFs from HF Dataset
# ----------------------------
def load_documents():
    """Load pre-extracted text rows from the HF dataset as LangChain Documents.

    Reads the ``train`` split of ``HF_DATASET_REPO`` and wraps every
    non-empty ``text`` field in a ``Document``.

    Returns:
        list[Document]: one Document per non-empty text row; empty list if
        no row carries usable text.
    """
    ds = load_dataset(HF_DATASET_REPO, split="train")

    docs = []
    for row in ds:
        # ``or ""`` guards against a ``text`` column that exists but holds
        # None — ``row.get("text", "")`` would return None there and the
        # original ``.strip()`` call would raise AttributeError.
        text = (row.get("text") or "").strip()
        if text:
            docs.append(Document(page_content=text))

    return docs
25
 
 
 
 
 
26
 
27
# ----------------------------
# Build vector store (in-memory)
# ----------------------------
# Built once at import time; ask_rag_with_status() queries ``vectordb``.
documents = load_documents()
if not documents:
    # Fail fast at import: a RAG service without documents cannot answer.
    raise RuntimeError("No documents loaded from HF Dataset")

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
vectordb = Chroma.from_documents(documents=documents, embedding=embeddings)
41
 
42
+
43
# ----------------------------
# Load LLM (NO device_map)
# ----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto"
)

# NOTE: ``do_sample=True`` is required for ``temperature`` to take effect.
# Without it, transformers decodes greedily and silently ignores (or, in
# recent versions, warns about) the temperature setting.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2
)
60
 
 
 
61
 
62
# ----------------------------
# Public API
# ----------------------------
def ask_rag_with_status(question: str):
    """Answer *question* from the vector store and report pipeline status.

    Retrieves the ``TOP_K`` most similar chunks from ``vectordb``, builds a
    context-grounded prompt, and generates an answer with ``llm``.

    Args:
        question: the user's natural-language question.

    Returns:
        dict: ``{"answer": str, "status": list[str]}`` — ``status`` records
        the steps taken, for display in the UI.
    """
    status = []

    status.append("Retrieving relevant documents…")
    docs = vectordb.similarity_search(question, k=TOP_K)

    if not docs:
        return {
            "answer": "No relevant documents found.",
            "status": status
        }

    context = "\n\n".join(d.page_content for d in docs)

    prompt = f"""Use the context below to answer the question.

Context:
{context}

Question:
{question}

Answer:"""

    status.append("Generating answer…")
    result = llm(prompt)[0]["generated_text"]

    # The pipeline echoes the prompt in ``generated_text``; slice it off by
    # length rather than ``split("Answer:")[-1]``, which truncates the answer
    # whenever the retrieved context or the generation itself contains the
    # marker "Answer:". Keep the old split as a fallback in case the
    # pipeline is configured with return_full_text=False.
    if result.startswith(prompt):
        answer = result[len(prompt):].strip()
    else:
        answer = result.split("Answer:")[-1].strip()

    return {
        "answer": answer,
        "status": status
    }