Dinesh310 committed on
Commit
af62a2a
·
verified ·
1 Parent(s): 58316f5

Update src/RAG_builder.py

Browse files
Files changed (1) hide show
  1. src/RAG_builder.py +29 -5
src/RAG_builder.py CHANGED
@@ -34,16 +34,40 @@ class ProjectRAGGraph:
34
  self.memory = MemorySaver()
35
  self.workflow = self._build_graph()
36
 
37
- def process_documents(self, pdf_paths):
38
- self.pdf_count = len(pdf_paths) # Track how many PDFs were uploaded
39
  all_docs = []
40
- for path in pdf_paths:
 
 
41
  loader = PyPDFLoader(path)
42
- all_docs.extend(loader.load())
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
45
  self.vector_store = FAISS.from_documents(splits, self.embeddings)
46
 
 
 
 
 
 
 
 
 
 
 
47
  # --- GRAPH NODES ---
48
  def retrieve(self, state: GraphState):
49
  print("--- RETRIEVING ---")
 
34
  self.memory = MemorySaver()
35
  self.workflow = self._build_graph()
36
 
37
+ def process_documents(self, pdf_paths, original_names=None):
38
+ self.pdf_count = len(pdf_paths)
39
  all_docs = []
40
+
41
+ # Iterate through paths and original names simultaneously
42
+ for i, path in enumerate(pdf_paths):
43
  loader = PyPDFLoader(path)
44
+ docs = loader.load()
45
+
46
+ # If original names are provided, overwrite the 'source' metadata
47
+ if original_names and i < len(original_names):
48
+ for doc in docs:
49
+ doc.metadata["source"] = original_names[i]
50
+
51
+ all_docs.extend(docs)
52
+
53
+ # Split documents after metadata has been corrected
54
+ splits = RecursiveCharacterTextSplitter(
55
+ chunk_size=500,
56
+ chunk_overlap=100
57
+ ).split_documents(all_docs)
58
 
 
59
  self.vector_store = FAISS.from_documents(splits, self.embeddings)
60
 
61
+ # def process_documents(self, pdf_paths):
62
+ # self.pdf_count = len(pdf_paths) # Track how many PDFs were uploaded
63
+ # all_docs = []
64
+ # for path in pdf_paths:
65
+ # loader = PyPDFLoader(path)
66
+ # all_docs.extend(loader.load())
67
+
68
+ # splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
69
+ # self.vector_store = FAISS.from_documents(splits, self.embeddings)
70
+
71
  # --- GRAPH NODES ---
72
  def retrieve(self, state: GraphState):
73
  print("--- RETRIEVING ---")