Dinesh310 committed on
Commit
8ce6ed1
·
verified ·
1 Parent(s): 4e52498

Update src/RAG_builder.py

Browse files
Files changed (1) hide show
  1. src/RAG_builder.py +27 -5
src/RAG_builder.py CHANGED
@@ -34,14 +34,36 @@ class ProjectRAGGraph:
34
  self.memory = MemorySaver()
35
  self.workflow = self._build_graph()
36
 
37
- def process_documents(self, pdf_paths):
 
 
 
38
  all_docs = []
39
- for path in pdf_paths:
40
- loader = PyPDFLoader(path)
41
- all_docs.extend(loader.load())
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
44
  self.vector_store = FAISS.from_documents(splits, self.embeddings)
 
 
 
 
 
 
 
 
 
45
 
46
  # --- GRAPH NODES ---
47
  def retrieve(self, state: GraphState):
 
34
  self.memory = MemorySaver()
35
  self.workflow = self._build_graph()
36
 
37
def process_documents(self, pdf_paths_with_names: list[tuple[str, str]]):
    """Load PDFs, split them into chunks, and build the FAISS vector store.

    Every loaded document has its ``metadata["source"]`` overwritten with
    the original filename, replacing whatever value the loader assigned
    for the temporary path.

    Args:
        pdf_paths_with_names: ``(temp_path, original_name)`` pairs, one
            per PDF to ingest.

    Raises:
        ValueError: If no documents were loaded, or if splitting the
            loaded documents produced no chunks to index.
    """
    all_docs = []
    for temp_path, original_name in pdf_paths_with_names:
        docs = PyPDFLoader(temp_path).load()

        # Override the metadata source with the original filename.
        for doc in docs:
            doc.metadata["source"] = original_name

        all_docs.extend(docs)

    # Fail early with a clear message instead of passing an empty list
    # down to the splitter / vector-store constructor.
    if not all_docs:
        raise ValueError("No documents were loaded from the supplied PDFs.")

    splits = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
    ).split_documents(all_docs)

    # PDFs without extractable text can load as pages yet yield no chunks.
    if not splits:
        raise ValueError("The supplied PDFs produced no text chunks to index.")

    self.vector_store = FAISS.from_documents(splits, self.embeddings)
58
+
59
+ # def process_documents(self, pdf_paths):
60
+ # all_docs = []
61
+ # for path in pdf_paths:
62
+ # loader = PyPDFLoader(path)
63
+ # all_docs.extend(loader.load())
64
+
65
+ # splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
66
+ # self.vector_store = FAISS.from_documents(splits, self.embeddings)
67
 
68
  # --- GRAPH NODES ---
69
  def retrieve(self, state: GraphState):