Zubaish committed
Commit f85dcaa · 1 Parent(s): adf8857

Fix: remove device_map; CPU-safe Phi-3 load

Files changed (1)
  1. rag.py +39 -50
rag.py CHANGED
@@ -1,31 +1,35 @@
-from langchain_community.vectorstores import Chroma
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+import os
 
-from ingest import load_and_split_docs
-
-print("⏳ Loading documents...")
-
-documents = load_and_split_docs()
+MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
 
+print("⏳ Loading embeddings...")
 embeddings = HuggingFaceEmbeddings(
     model_name="sentence-transformers/all-MiniLM-L6-v2"
 )
 
-if documents:
-    vectorstore = Chroma.from_documents(
-        documents=documents,
-        embedding=embeddings
-    )
-    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
-else:
-    retriever = None
+print("⏳ Loading documents...")
+docs = []
+if os.path.exists("kb_docs"):
+    for f in os.listdir("kb_docs"):
+        if f.endswith(".pdf"):
+            loader = PyPDFLoader(os.path.join("kb_docs", f))
+            docs.extend(loader.load())
+
+splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
+splits = splitter.split_documents(docs)
+
+vectorstore = Chroma.from_documents(
+    splits,
+    embedding=embeddings,
+    persist_directory="./chroma_db"
+)
 
 print("⏳ Loading LLM...")
-
-MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
-
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_ID,
     trust_remote_code=True
@@ -33,37 +37,16 @@ tokenizer = AutoTokenizer.from_pretrained(
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float32,
-    device_map="cpu"
-)
-
-print("✅ RAG initialized.")
-
-def generate(prompt: str) -> str:
-    inputs = tokenizer(prompt, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=300,
-            temperature=0.2,
-            do_sample=True
-        )
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+    trust_remote_code=True
+)  # 👈 NO device_map, NO low_cpu_mem_usage
 
 def ask_rag_with_status(question: str):
-    if not retriever:
-        return {
-            "status": ["⚠️ No documents uploaded yet"],
-            "answer": "Please upload PDF files to the kb_docs folder and restart the Space."
-        }
-
+    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
     docs = retriever.get_relevant_documents(question)
+
     context = "\n\n".join(d.page_content for d in docs)
 
-    prompt = f"""
-You are a helpful assistant.
-Answer ONLY using the context below.
+    prompt = f"""Use the context below to answer the question.
 
 Context:
 {context}
@@ -71,12 +54,18 @@ Context:
 Question:
 {question}
 
-Answer:
-"""
+Answer:"""
 
-    answer = generate(prompt)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=256,
+        do_sample=True,
+        temperature=0.7
+    )
 
+    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return {
-        "answer": answer,
-        "status": ["✅ Answer generated"]
+        "answer": answer,
+        "status": ["✅ Answer generated"]
     }
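
For quick verification, a minimal smoke test of the updated module might look like the sketch below (hypothetical, not part of this commit). Importing rag executes the module-level code, so it assumes rag.py and a kb_docs/ folder containing at least one PDF sit next to the script.

# smoke_test.py — hypothetical helper for manual testing of rag.py
from rag import ask_rag_with_status  # importing rag builds the Chroma index and loads Phi-3 on CPU

result = ask_rag_with_status("What do the uploaded documents cover?")  # example question
print(result["status"])  # ["✅ Answer generated"]
print(result["answer"])

Because tokenizer.decode is applied to the full generated sequence, result["answer"] still contains the prompt text; the UI layer may want to strip everything up to the final "Answer:" marker before displaying it.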