Update src/RAG_builder.py
Browse files- src/RAG_builder.py +29 -5
src/RAG_builder.py
CHANGED
|
@@ -34,16 +34,40 @@ class ProjectRAGGraph:
|
|
| 34 |
self.memory = MemorySaver()
|
| 35 |
self.workflow = self._build_graph()
|
| 36 |
|
| 37 |
-
def process_documents(self, pdf_paths):
|
| 38 |
-
self.pdf_count = len(pdf_paths)
|
| 39 |
all_docs = []
|
| 40 |
-
|
|
|
|
|
|
|
| 41 |
loader = PyPDFLoader(path)
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
|
| 45 |
self.vector_store = FAISS.from_documents(splits, self.embeddings)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
# --- GRAPH NODES ---
|
| 48 |
def retrieve(self, state: GraphState):
|
| 49 |
print("--- RETRIEVING ---")
|
|
|
|
| 34 |
self.memory = MemorySaver()
|
| 35 |
self.workflow = self._build_graph()
|
| 36 |
|
| 37 |
+
def process_documents(self, pdf_paths, original_names=None):
    """Load PDFs, chunk their pages, and index the chunks in FAISS.

    Records the number of paths in ``self.pdf_count`` and stores the
    resulting index in ``self.vector_store``.

    Args:
        pdf_paths: Paths handed to ``PyPDFLoader``, one per PDF.
        original_names: Optional names, matched to ``pdf_paths`` by
            position. When an entry exists for a path, it replaces the
            ``"source"`` metadata of every document loaded from that
            path. Paths without a matching entry keep the metadata the
            loader produced.
    """
    self.pdf_count = len(pdf_paths)

    loaded_docs = []
    for index, pdf_path in enumerate(pdf_paths):
        pages = PyPDFLoader(pdf_path).load()

        # Relabel the pages only when a name exists for this position;
        # a shorter (or missing) name list is tolerated.
        if original_names and index < len(original_names):
            display_name = original_names[index]
            for page in pages:
                page.metadata["source"] = display_name

        loaded_docs.extend(pages)

    # Chunking happens after relabelling, so it runs on documents whose
    # "source" metadata has already been overwritten.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    splits = splitter.split_documents(loaded_docs)

    self.vector_store = FAISS.from_documents(splits, self.embeddings)
|
| 60 |
|
| 61 |
+
# def process_documents(self, pdf_paths):
|
| 62 |
+
# self.pdf_count = len(pdf_paths) # Track how many PDFs were uploaded
|
| 63 |
+
# all_docs = []
|
| 64 |
+
# for path in pdf_paths:
|
| 65 |
+
# loader = PyPDFLoader(path)
|
| 66 |
+
# all_docs.extend(loader.load())
|
| 67 |
+
|
| 68 |
+
# splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
|
| 69 |
+
# self.vector_store = FAISS.from_documents(splits, self.embeddings)
|
| 70 |
+
|
| 71 |
# --- GRAPH NODES ---
|
| 72 |
def retrieve(self, state: GraphState):
|
| 73 |
print("--- RETRIEVING ---")
|