Dinesh310 commited on
Commit
0fe6232
·
verified ·
1 Parent(s): baa3fcb

Upload 16 files

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [server]
2
+ enableCORS = false
3
+ enableXsrfProtection = false
4
+ maxUploadSize = 200
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# System packages needed to build wheels and to run the health check (curl).
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies BEFORE copying the application code so that
# this (slow) layer is reused from the build cache whenever only source
# files change.
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the application (streamlit_app.py, src/, .streamlit/, ...).
COPY . .

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Demo 1
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: docker
7
+ app_port: 8501
8
+ tags:
9
+ - streamlit
10
+ pinned: false
11
+ short_description: for learning
12
+ license: mit
13
+ ---
14
+
15
+ # Welcome to Streamlit!
16
+
17
+ Edit `streamlit_app.py` to customize this app to your heart's desire. :heart:
18
+
19
+ If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
20
+ forums](https://discuss.streamlit.io).
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ sentence-transformers
4
+ langchain-huggingface
5
+ langchain-openai
6
+ langgraph
7
+ openai
8
+ faiss-cpu
9
+ pydantic
10
+ python-dotenv
11
+ requests
12
+ streamlit
13
+ pypdf
src/__init__.py ADDED
File without changes
src/config/__init__.py ADDED
File without changes
src/config/config.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/config/config.py
2
+ import os
3
+
4
# --- Embeddings -----------------------------------------------------------
# Sentence-transformers model name handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Device passed through as model_kwargs["device"].
EMBEDDING_DEVICE = "cpu"
# Passed through as encode_kwargs["normalize_embeddings"].
NORMALIZE_EMBEDDINGS = True

# --- LLM ------------------------------------------------------------------
# Model identifier and endpoint used when constructing the chat client.
LLM_MODEL = "openai/gpt-oss-120b:free"
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
# Read once at import time; None when the environment variable is unset.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# --- Text splitter --------------------------------------------------------
# chunk_size / chunk_overlap for RecursiveCharacterTextSplitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# --- Retriever ------------------------------------------------------------
# lambda_mult for the MMR search.
MMR_LAMBDA = 0.25
# Added to the number of indexed PDFs to obtain the retriever's k.
K_OFFSET = 2
src/core/__init__.py ADDED
File without changes
src/core/embeddings.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_huggingface import HuggingFaceEmbeddings
2
+ from src.config.config import (
3
+ EMBEDDING_MODEL,
4
+ EMBEDDING_DEVICE,
5
+ NORMALIZE_EMBEDDINGS
6
+ )
7
+
8
def load_embeddings():
    """Build the HuggingFace embedding model configured in ``src.config.config``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance created with ``EMBEDDING_MODEL``,
        ``EMBEDDING_DEVICE`` and ``NORMALIZE_EMBEDDINGS``.

    Raises:
        RuntimeError: If the model cannot be constructed. The original
            exception is attached as ``__cause__``.
    """
    try:
        return HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={"device": EMBEDDING_DEVICE},
            encode_kwargs={"normalize_embeddings": NORMALIZE_EMBEDDINGS},
        )
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise RuntimeError(f"Failed to load embeddings: {e}") from e
17
+
src/core/graph_state.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from typing import List, TypedDict
2
+ from langchain_core.documents import Document
3
+
4
class GraphState(TypedDict):
    """State dictionary passed between the nodes of the RAG graph."""

    # The user's question; supplied as the graph input.
    question: str
    # Documents returned by the retrieve node.
    context: List[Document]
    # Answer text returned by the generate node.
    answer: str
8
+
src/core/llm.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/core/llm.py
2
+ from langchain_openai import ChatOpenAI
3
+ from src.config.config import (
4
+ LLM_MODEL,
5
+ OPENROUTER_BASE_URL,
6
+ OPENROUTER_API_KEY
7
+ )
8
+
9
def load_llm():
    """Create the chat model client pointed at the OpenRouter endpoint.

    Returns:
        A ``ChatOpenAI`` instance configured with ``LLM_MODEL``,
        ``OPENROUTER_BASE_URL`` and ``OPENROUTER_API_KEY``.

    Raises:
        EnvironmentError: If ``OPENROUTER_API_KEY`` is empty or unset.
    """
    if OPENROUTER_API_KEY:
        client_options = {
            "model": LLM_MODEL,
            "base_url": OPENROUTER_BASE_URL,
            "api_key": OPENROUTER_API_KEY,
        }
        return ChatOpenAI(**client_options)

    raise EnvironmentError("OPENROUTER_API_KEY not set")
src/exceptions.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
class DocumentProcessingError(Exception):
    """Raised when loading, splitting or indexing PDF documents fails."""
3
+
4
class VectorStoreNotInitializedError(Exception):
    """Raised when retrieval is attempted before a vector store exists."""
6
+
7
class LLMInvocationError(Exception):
    """Raised when the answer-generation step fails."""
src/rag_graph.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/rag_graph.py
2
+ from langgraph.graph import StateGraph, END
3
+ from langgraph.checkpoint.memory import MemorySaver
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+
6
+ from src.core.graph_state import GraphState
7
+ from src.core.embeddings import load_embeddings
8
+ from src.core.llm import load_llm
9
+ from src.vector_store.vector_store import build_vector_store
10
+ from src.config.config import K_OFFSET, MMR_LAMBDA
11
+ from src.exceptions import VectorStoreNotInitializedError, LLMInvocationError
12
+
13
+
14
class ProjectRAGGraph:
    """Two-step retrieval-augmented QA pipeline built with LangGraph.

    The compiled workflow runs ``retrieve`` (MMR search over the vector
    store) followed by ``generate`` (LLM answer from the retrieved chunks).
    The vector store is empty until ``process_documents`` has been called.
    """

    # Number of candidates the MMR search fetches when ``fetch_k`` is not
    # given; kept as the floor so small ``k`` values behave as before.
    _DEFAULT_FETCH_K = 20

    def __init__(self):
        """Load the embedding model and LLM, then compile the graph."""
        self.embeddings = load_embeddings()
        self.llm = load_llm()
        self.vector_store = None
        self.pdf_count = 0
        self.memory = MemorySaver()
        self.workflow = self._build_graph()

    def process_documents(self, pdf_paths, original_names=None):
        """Index the given PDFs, replacing any previously built vector store.

        Args:
            pdf_paths: Paths of the PDF files to load.
            original_names: Optional display names, parallel to ``pdf_paths``,
                forwarded to ``build_vector_store``.

        Raises:
            Whatever ``build_vector_store`` raises; in that case the previous
            ``vector_store`` and ``pdf_count`` are left untouched.
        """
        pdf_count = len(pdf_paths)
        vector_store = build_vector_store(
            pdf_paths,
            self.embeddings,
            original_names,
        )
        # Commit both attributes only after a successful build so that
        # pdf_count never describes a store that was not actually created.
        self.vector_store = vector_store
        self.pdf_count = pdf_count

    # ---------- Graph Nodes ----------

    def retrieve(self, state: GraphState):
        """Graph node: fetch context documents for ``state["question"]``.

        Returns:
            ``{"context": documents}`` with the MMR search results.

        Raises:
            VectorStoreNotInitializedError: If no documents were processed yet.
        """
        if self.vector_store is None:
            raise VectorStoreNotInitializedError("Vector store not initialized")

        k_value = max(1, self.pdf_count + K_OFFSET)

        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={
                "k": k_value,
                # MMR selects k results out of fetch_k candidates; keep the
                # candidate pool at least as large as k so that many PDFs
                # do not silently cap the number of returned documents.
                "fetch_k": max(self._DEFAULT_FETCH_K, k_value),
                "lambda_mult": MMR_LAMBDA,
            },
        )

        documents = retriever.invoke(state["question"])
        return {"context": documents}

    def generate(self, state: GraphState):
        """Graph node: answer the question using only the retrieved context.

        Returns:
            ``{"answer": text}`` with the content of the LLM response.

        Raises:
            LLMInvocationError: If prompt construction or the LLM call fails.
                The original exception is attached as ``__cause__``.
        """
        try:
            prompt = ChatPromptTemplate.from_template(
                """
                You are an expert Project Analyst.
                Answer ONLY using the provided context.
                If the answer is not present, say "I don't know".

                Context:
                {context}

                Question:
                {question}
                """
            )

            formatted_context = "\n\n".join(
                doc.page_content for doc in state["context"]
            )

            chain = prompt | self.llm
            response = chain.invoke({
                "context": formatted_context,
                "question": state["question"],
            })

            return {"answer": response.content}

        except Exception as e:
            # Chain the cause so the provider's traceback is preserved.
            raise LLMInvocationError(f"LLM failed: {e}") from e

    # ---------- Graph Build ----------

    def _build_graph(self):
        """Wire retrieve -> generate -> END and compile with the checkpointer."""
        workflow = StateGraph(GraphState)
        workflow.add_node("retrieve", self.retrieve)
        workflow.add_node("generate", self.generate)
        workflow.set_entry_point("retrieve")
        workflow.add_edge("retrieve", "generate")
        workflow.add_edge("generate", END)
        return workflow.compile(checkpointer=self.memory)

    def query(self, question: str, thread_id: str):
        """Run the graph for ``question`` on ``thread_id`` and return the answer text."""
        config = {"configurable": {"thread_id": thread_id}}
        result = self.workflow.invoke({"question": question}, config=config)
        return result["answer"]
src/vector_store/vector_store.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vector_store/vector_store.py
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
+ from langchain_community.vectorstores import FAISS
5
+ from src.config.config import CHUNK_SIZE, CHUNK_OVERLAP
6
+ from src.exceptions import DocumentProcessingError
7
+
8
def build_vector_store(pdf_paths, embeddings, original_names=None):
    """Load PDFs, split them into chunks and index the chunks in FAISS.

    Args:
        pdf_paths: Iterable of PDF file paths to load with ``PyPDFLoader``.
        embeddings: Embedding model passed to ``FAISS.from_documents``.
        original_names: Optional names parallel to ``pdf_paths``. When an
            entry exists for a path, it replaces the ``source`` metadata of
            every page loaded from that path.

    Returns:
        The FAISS vector store built from the split documents.

    Raises:
        DocumentProcessingError: If no text chunks could be produced (no
            paths given, or the PDFs yielded no text), or if loading,
            splitting or indexing fails. In the latter case the original
            exception is attached as ``__cause__``.
    """
    try:
        all_docs = []

        for i, path in enumerate(pdf_paths):
            docs = PyPDFLoader(path).load()

            if original_names and i < len(original_names):
                for doc in docs:
                    doc.metadata["source"] = original_names[i]

            all_docs.extend(docs)

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        )

        splits = splitter.split_documents(all_docs)
        if not splits:
            # An index cannot be built from zero chunks; report that clearly
            # instead of surfacing an opaque error from the indexing call.
            raise DocumentProcessingError(
                "No extractable text found in the provided PDFs"
            )

        return FAISS.from_documents(splits, embeddings)

    except DocumentProcessingError:
        # Already the right type and message; do not re-wrap.
        raise
    except Exception as e:
        raise DocumentProcessingError(f"PDF processing failed: {e}") from e
streamlit_app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ # from src.RAG_builder import ProjectRAGGraph # Ensure your graph class is in your_filename.py
5
+
6
+ from src.rag_graph import ProjectRAGGraph
7
+
8
+ # from src.graph.rag_graph import ProjectRAGGraph
9
# --- Page Config ---
st.set_page_config(page_title="Project Analyst RAG", layout="wide")
st.title("📄 Professional Project Analyst Chat")


def render_citations(docs, preview_chars=300):
    """Show retrieved source chunks in a collapsible "View Sources" expander.

    Used for both the live answer and the replayed chat history so that the
    two are rendered identically (file basename, 1-based page number).
    Does nothing when ``docs`` is empty or None.
    """
    if not docs:
        return
    with st.expander("View Sources"):
        for doc in docs:
            source_name = os.path.basename(doc.metadata.get("source", "Unknown"))
            page = doc.metadata.get("page")
            # Stored page indices are shown 1-based; fall back to "N/A" when
            # the metadata carries no integer page.
            page_label = page + 1 if isinstance(page, int) else "N/A"
            st.caption(f"📄 {source_name} (Page {page_label})")
            st.write(f"_{doc.page_content[:preview_chars]}..._")


# --- Initialize Session State ---
if "rag_graph" not in st.session_state:
    st.session_state.rag_graph = ProjectRAGGraph()
if "messages" not in st.session_state:
    st.session_state.messages = []
if "thread_id" not in st.session_state:
    st.session_state.thread_id = "default_user_1"  # Hardcoded for demo, could be unique per session

# --- Sidebar: File Upload ---
with st.sidebar:
    st.header("Upload Documents")
    uploaded_files = st.file_uploader(
        "Upload Project PDFs",
        type="pdf",
        accept_multiple_files=True,
    )

    process_button = st.button("Process Documents")

    if process_button and not uploaded_files:
        st.warning("Please upload at least one PDF first.")
    elif process_button:
        pdf_paths = []
        original_names = []
        try:
            with st.spinner("Processing PDFs..."):
                # The loader needs real file paths, so spill each upload to a
                # temporary file and remember its original name for citations.
                for uploaded_file in uploaded_files:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                        tmp.write(uploaded_file.getvalue())
                    pdf_paths.append(tmp.name)
                    original_names.append(uploaded_file.name)

                st.session_state.rag_graph.process_documents(
                    pdf_paths,
                    original_names=original_names,
                )
        except Exception as exc:
            # UI boundary: report the failure instead of crashing the page.
            st.error(f"Failed to process documents: {exc}")
        else:
            st.success("Documents Indexed Successfully!")
        finally:
            # Always remove the temporary copies, even when indexing failed.
            for path in pdf_paths:
                try:
                    os.remove(path)
                except OSError:
                    # Best-effort cleanup; a leftover temp file is harmless.
                    pass

# --- Chat Interface ---
# Display existing messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
        render_citations(message.get("citations"))

# User Input
if prompt := st.chat_input("Ask a question about your projects..."):
    # Check if vector store is ready
    if st.session_state.rag_graph.vector_store is None:
        st.error("Please upload and process documents first!")
    else:
        # Add user message to state
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate Response using the Graph
        with st.chat_message("assistant"):
            config = {"configurable": {"thread_id": st.session_state.thread_id}}
            inputs = {"question": prompt}

            try:
                with st.spinner("Analyzing..."):
                    # Invoke the graph directly (rather than .query()) so the
                    # retrieved documents are available for citations.
                    result = st.session_state.rag_graph.workflow.invoke(inputs, config=config)
            except Exception as exc:
                # UI boundary: show the error in the chat instead of a traceback.
                st.error(f"Could not generate an answer: {exc}")
            else:
                answer = result["answer"]
                context = result["context"]  # These are the retrieved Document objects

                st.markdown(answer)
                render_citations(context)

                # Add assistant response to state
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer,
                    "citations": context,
                })
+ })