Update src/RAG_builder.py
Browse files- src/RAG_builder.py +27 -5
src/RAG_builder.py
CHANGED
|
@@ -34,14 +34,36 @@ class ProjectRAGGraph:
|
|
| 34 |
self.memory = MemorySaver()
|
| 35 |
self.workflow = self._build_graph()
|
| 36 |
|
| 37 |
-
def process_documents(self, pdf_paths):
|
|
|
|
|
|
|
|
|
|
| 38 |
all_docs = []
|
| 39 |
-
for path in pdf_paths:
|
| 40 |
-
loader = PyPDFLoader(path)
|
| 41 |
-
all_docs.extend(loader.load())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
|
| 44 |
self.vector_store = FAISS.from_documents(splits, self.embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# --- GRAPH NODES ---
|
| 47 |
def retrieve(self, state: GraphState):
|
|
|
|
| 34 |
self.memory = MemorySaver()
|
| 35 |
self.workflow = self._build_graph()
|
| 36 |
|
| 37 |
+
def process_documents(self, pdf_paths_with_names: list[tuple[str, str]]):
|
| 38 |
+
"""
|
| 39 |
+
Expects a list of tuples: [(temp_path, original_name), ...]
|
| 40 |
+
"""
|
| 41 |
all_docs = []
|
| 42 |
+
for temp_path, original_name in pdf_paths_with_names:
|
| 43 |
+
loader = PyPDFLoader(temp_path)
|
| 44 |
+
docs = loader.load()
|
| 45 |
+
|
| 46 |
+
# Override the metadata source with the original filename
|
| 47 |
+
for doc in docs:
|
| 48 |
+
doc.metadata["source"] = original_name
|
| 49 |
+
|
| 50 |
+
all_docs.extend(docs)
|
| 51 |
+
|
| 52 |
+
splits = RecursiveCharacterTextSplitter(
|
| 53 |
+
chunk_size=500,
|
| 54 |
+
chunk_overlap=100
|
| 55 |
+
).split_documents(all_docs)
|
| 56 |
|
|
|
|
| 57 |
self.vector_store = FAISS.from_documents(splits, self.embeddings)
|
| 58 |
+
|
| 59 |
+
# def process_documents(self, pdf_paths):
|
| 60 |
+
# all_docs = []
|
| 61 |
+
# for path in pdf_paths:
|
| 62 |
+
# loader = PyPDFLoader(path)
|
| 63 |
+
# all_docs.extend(loader.load())
|
| 64 |
+
|
| 65 |
+
# splits = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100).split_documents(all_docs)
|
| 66 |
+
# self.vector_store = FAISS.from_documents(splits, self.embeddings)
|
| 67 |
|
| 68 |
# --- GRAPH NODES ---
|
| 69 |
def retrieve(self, state: GraphState):
|