Dinesh310 committed on
Commit
8d56dc3
·
verified ·
1 Parent(s): c0db5fc

Upload 29 files

Browse files
src/config/__init__.py ADDED
File without changes
src/config/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (141 Bytes). View file
 
src/config/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (127 Bytes). View file
 
src/config/__pycache__/config.cpython-313.pyc ADDED
Binary file (1.31 kB). View file
 
src/config/__pycache__/config.cpython-38.pyc ADDED
Binary file (1 kB). View file
 
src/config/config.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration module for Agentic RAG system"""
2
+
3
+ import os
4
+ from dotenv import load_dotenv
5
+ from langchain.chat_models import init_chat_model
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
class Config:
    """Central configuration for the Agentic RAG system.

    Holds API keys, model selection, chunking parameters, and default
    source URLs as class-level constants.
    """

    # API Keys (read from the environment; load_dotenv() at module
    # level pulls in a .env file first)
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

    # Model Configuration (provider-prefixed name for init_chat_model)
    LLM_MODEL = "openai:gpt-4o"

    # Document Processing
    CHUNK_SIZE = 500
    CHUNK_OVERLAP = 50

    # Default URLs indexed when the caller supplies none
    DEFAULT_URLS = [
        "https://lilianweng.github.io/posts/2023-06-23-agent/",
        "https://lilianweng.github.io/posts/2024-04-12-diffusion-video/"
    ]

    @classmethod
    def get_llm(cls):
        """Initialize and return the LLM model.

        Returns:
            A chat model instance created via ``init_chat_model``.

        Raises:
            ValueError: If OPENAI_API_KEY is not set. Previously a
                missing key crashed with a cryptic TypeError when the
                ``None`` value was assigned into ``os.environ``.
        """
        if not cls.OPENAI_API_KEY:
            raise ValueError(
                "OPENAI_API_KEY is not set. Add it to your environment or .env file."
            )
        os.environ["OPENAI_API_KEY"] = cls.OPENAI_API_KEY
        return init_chat_model(cls.LLM_MODEL)
src/document_ingestion/__init__.py ADDED
File without changes
src/document_ingestion/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (153 Bytes). View file
 
src/document_ingestion/__pycache__/document_processor.cpython-313.pyc ADDED
Binary file (4.9 kB). View file
 
src/document_ingestion/document_processor.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Document processing module for loading and splitting documents"""
2
+
3
+ from typing import List
4
+ from langchain_community.document_loaders import WebBaseLoader
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain.schema import Document
7
+
8
+ from typing import List, Union
9
+ from pathlib import Path
10
+ from langchain_community.document_loaders import (
11
+ WebBaseLoader,
12
+ PyPDFLoader,
13
+ TextLoader,
14
+ PyPDFDirectoryLoader
15
+ )
16
+
17
class DocumentProcessor:
    """Loads documents from URLs, PDFs, and TXT files and splits them into chunks."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize document processor.

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def load_from_url(self, url: str) -> List["Document"]:
        """Load document(s) from a URL."""
        return WebBaseLoader(url).load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List["Document"]:
        """Load documents from all PDFs inside a directory."""
        return PyPDFDirectoryLoader(str(directory)).load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List["Document"]:
        """Load document(s) from a TXT file."""
        return TextLoader(str(file_path), encoding="utf-8").load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List["Document"]:
        """Load document(s) from a single PDF file.

        Bug fix: this previously ignored ``file_path`` entirely and
        loaded the hard-coded "data" directory via PyPDFDirectoryLoader.
        """
        return PyPDFLoader(str(file_path)).load()

    def load_documents(self, sources: List[str]) -> List["Document"]:
        """
        Load documents from URLs, PDF files/directories, or TXT files.

        Args:
            sources: List of URLs, PDF folder paths, .pdf paths, or .txt paths

        Returns:
            List of loaded documents

        Raises:
            ValueError: If a source is not a URL, directory, .txt, or .pdf.
        """
        docs: List["Document"] = []
        for src in sources:
            # URLs are handled here and skip the filesystem dispatch below.
            # Bug fix: previously there was no `continue`, so URL sources
            # also fell through into the path branch.
            if src.startswith(("http://", "https://")):
                docs.extend(self.load_from_url(src))
                continue

            # Bug fix: previously every non-URL source was resolved to the
            # hard-coded Path("data") instead of the source itself.
            path = Path(src)
            if path.is_dir():  # directory of PDFs
                docs.extend(self.load_from_pdf_dir(path))
            elif path.suffix.lower() == ".txt":
                docs.extend(self.load_from_txt(path))
            elif path.suffix.lower() == ".pdf":
                docs.extend(self.load_from_pdf(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use URL, .txt file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List["Document"]) -> List["Document"]:
        """
        Split documents into chunks.

        Args:
            documents: List of documents to split

        Returns:
            List of split documents
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List["Document"]:
        """
        Complete pipeline to load and split documents.

        Args:
            urls: List of URLs (or paths) to process

        Returns:
            List of processed document chunks
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)
src/graph_builder/__init__.py ADDED
File without changes
src/graph_builder/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (148 Bytes). View file
 
src/graph_builder/__pycache__/graph_builder.cpython-313.pyc ADDED
Binary file (2.31 kB). View file
 
src/graph_builder/graph_builder.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Graph builder for LangGraph workflow"""
2
+
3
+ from langgraph.graph import StateGraph, END
4
+ from src.state.rag_state import RAGState
5
+ from src.node.reactnode import RAGNodes
6
+
7
class GraphBuilder:
    """Assembles and runs the two-node retrieve -> respond LangGraph workflow."""

    def __init__(self, retriever, llm):
        """
        Create a builder around the given retriever and LLM.

        Args:
            retriever: Document retriever instance
            llm: Language model instance
        """
        self.nodes = RAGNodes(retriever, llm)
        self.graph = None

    def build(self):
        """
        Wire up and compile the RAG workflow graph.

        Returns:
            Compiled graph instance
        """
        workflow = StateGraph(RAGState)

        # Two nodes: retrieval feeds generation, which ends the run.
        workflow.add_node("retriever", self.nodes.retrieve_docs)
        workflow.add_node("responder", self.nodes.generate_answer)

        workflow.set_entry_point("retriever")
        workflow.add_edge("retriever", "responder")
        workflow.add_edge("responder", END)

        self.graph = workflow.compile()
        return self.graph

    def run(self, question: str) -> dict:
        """
        Execute the workflow for a single question.

        Args:
            question: User question

        Returns:
            Final state with answer
        """
        # Compile lazily on first use; build() stores and returns the graph.
        graph = self.graph or self.build()
        return graph.invoke(RAGState(question=question))
src/node/__init__.py ADDED
File without changes
src/node/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (139 Bytes). View file
 
src/node/__pycache__/modesex.cpython-313.pyc ADDED
Binary file (4.57 kB). View file
 
src/node/__pycache__/nodes.cpython-313.pyc ADDED
Binary file (2.28 kB). View file
 
src/node/__pycache__/reactnode.cpython-313.pyc ADDED
Binary file (4.57 kB). View file
 
src/node/nodes.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangGraph nodes for RAG workflow"""
2
+
3
+ from src.state.rag_state import RAGState
4
+
5
class RAGNodes:
    """Node functions (retrieval and generation) for the RAG workflow."""

    def __init__(self, retriever, llm):
        """
        Initialize RAG nodes.

        Args:
            retriever: Document retriever instance
            llm: Language model instance
        """
        self.retriever = retriever
        self.llm = llm

    def retrieve_docs(self, state: RAGState) -> RAGState:
        """
        Fetch documents relevant to the current question.

        Args:
            state: Current RAG state

        Returns:
            New RAG state carrying the retrieved documents
        """
        retrieved = self.retriever.invoke(state.question)
        return RAGState(question=state.question, retrieved_docs=retrieved)

    def generate_answer(self, state: RAGState) -> RAGState:
        """
        Produce an answer grounded in the retrieved documents.

        Args:
            state: Current RAG state with retrieved documents

        Returns:
            New RAG state carrying the generated answer
        """
        # Stitch all retrieved passages into one context string.
        context = "\n\n".join(doc.page_content for doc in state.retrieved_docs)

        prompt = f"""Answer the question based on the context.

Context:
{context}

Question: {state.question}"""

        reply = self.llm.invoke(prompt)

        return RAGState(
            question=state.question,
            retrieved_docs=state.retrieved_docs,
            answer=reply.content,
        )
src/node/reactnode.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangGraph nodes for RAG workflow + ReAct Agent inside generate_content"""
2
+
3
+ from typing import List, Optional
4
+ from src.state.rag_state import RAGState
5
+
6
+ from langchain_core.documents import Document
7
+ from langchain_core.tools import Tool
8
+ from langchain_core.messages import HumanMessage
9
+ from langgraph.prebuilt import create_react_agent
10
+
11
+ # Wikipedia tool
12
+ from langchain_community.utilities import WikipediaAPIWrapper
13
+ from langchain_community.tools.wikipedia.tool import WikipediaQueryRun
14
+
15
+
16
class RAGNodes:
    """Node functions for the RAG workflow; answer generation runs a ReAct agent."""

    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self._agent = None  # built lazily on first generate_answer call

    def retrieve_docs(self, state: RAGState) -> RAGState:
        """Classic retriever node"""
        retrieved = self.retriever.invoke(state.question)
        return RAGState(question=state.question, retrieved_docs=retrieved)

    def _build_tools(self) -> List[Tool]:
        """Build retriever + wikipedia tools"""

        def search_corpus(query: str) -> str:
            hits: List[Document] = self.retriever.invoke(query)
            if not hits:
                return "No documents found."
            # Number the top passages, labelling each with its best-known title.
            sections = []
            for idx, doc in enumerate(hits[:8], start=1):
                meta = doc.metadata if hasattr(doc, "metadata") else {}
                label = meta.get("title") or meta.get("source") or f"doc_{idx}"
                sections.append(f"[{idx}] {label}\n{doc.page_content}")
            return "\n\n".join(sections)

        corpus_tool = Tool(
            name="retriever",
            description="Fetch passages from indexed corpus.",
            func=search_corpus,
        )

        wiki_runner = WikipediaQueryRun(
            api_wrapper=WikipediaAPIWrapper(top_k_results=3, lang="en")
        )
        wiki_tool = Tool(
            name="wikipedia",
            description="Search Wikipedia for general knowledge.",
            func=wiki_runner.run,
        )

        return [corpus_tool, wiki_tool]

    def _build_agent(self):
        """ReAct agent with tools"""
        system_prompt = (
            "You are a helpful RAG agent. "
            "Prefer 'retriever' for user-provided docs; use 'wikipedia' for general knowledge. "
            "Return only the final useful answer."
        )
        self._agent = create_react_agent(
            self.llm, tools=self._build_tools(), prompt=system_prompt
        )

    def generate_answer(self, state: RAGState) -> RAGState:
        """
        Generate answer using ReAct agent with retriever + wikipedia.
        """
        if self._agent is None:
            self._build_agent()

        outcome = self._agent.invoke(
            {"messages": [HumanMessage(content=state.question)]}
        )

        # The agent's final message (if any) carries the answer text.
        final_text: Optional[str] = None
        history = outcome.get("messages", [])
        if history:
            final_text = getattr(history[-1], "content", None)

        return RAGState(
            question=state.question,
            retrieved_docs=state.retrieved_docs,
            answer=final_text or "Could not generate answer.",
        )
src/state/__init__.py ADDED
File without changes
src/state/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (140 Bytes). View file
 
src/state/__pycache__/rag_state.cpython-313.pyc ADDED
Binary file (752 Bytes). View file
 
src/state/rag_state.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAG state definition for LangGraph"""
2
+
3
+ from typing import List
4
+ from pydantic import BaseModel
5
+ from langchain.schema import Document
6
+
7
class RAGState(BaseModel):
    """State object for RAG workflow.

    Flows through the LangGraph pipeline: the retriever node fills
    ``retrieved_docs`` and the responder node fills ``answer``.
    """

    # The user's question driving the workflow.
    question: str
    # Documents fetched by the retriever node (pydantic copies field
    # defaults per instance, so the shared-mutable-default pitfall of
    # plain classes does not apply here).
    retrieved_docs: List[Document] = []
    # Final generated answer; empty until the responder node runs.
    answer: str = ""
src/vectorstore/__init__.py ADDED
File without changes
src/vectorstore/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (146 Bytes). View file
 
src/vectorstore/__pycache__/vectorstore.cpython-313.pyc ADDED
Binary file (2.43 kB). View file
 
src/vectorstore/vectorstore.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Vector store module for document embedding and retrieval"""
2
+
3
+ from typing import List
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_openai import OpenAIEmbeddings
6
+ from langchain.schema import Document
7
+
8
class VectorStore:
    """Manages FAISS vector store creation and retrieval."""

    def __init__(self):
        """Initialize vector store with OpenAI embeddings."""
        self.embedding = OpenAIEmbeddings()
        self.vectorstore = None
        self.retriever = None

    def create_vectorstore(self, documents: List["Document"]):
        """
        Create vector store from documents.

        Args:
            documents: List of documents to embed
        """
        self.vectorstore = FAISS.from_documents(documents, self.embedding)
        self.retriever = self.vectorstore.as_retriever()

    def get_retriever(self):
        """
        Get the retriever instance.

        Returns:
            Retriever instance

        Raises:
            ValueError: If the store has not been created yet.
        """
        if self.retriever is None:
            raise ValueError("Vector store not initialized. Call create_vectorstore first.")
        return self.retriever

    def retrieve(self, query: str, k: int = 4) -> List["Document"]:
        """
        Retrieve the k most relevant documents for a query.

        Args:
            query: Search query
            k: Number of documents to retrieve

        Returns:
            List of relevant documents

        Raises:
            ValueError: If the store has not been created yet.
        """
        if self.vectorstore is None:
            raise ValueError("Vector store not initialized. Call create_vectorstore first.")
        # Bug fix: ``k`` was previously accepted but silently ignored
        # because the call went through the default retriever; query
        # FAISS directly so the requested result count is honored.
        return self.vectorstore.similarity_search(query, k=k)