Dinesh310 committed on
Commit
baa3fcb
·
verified ·
1 Parent(s): 638efb2

Delete Repo

Browse files
Repo/Demo_1/.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/.streamlit/config.toml DELETED
@@ -1,4 +0,0 @@
1
- [server]
2
- enableCORS = false
3
- enableXsrfProtection = false
4
- maxUploadSize = 200
 
 
 
 
 
Repo/Demo_1/Dockerfile DELETED
@@ -1,21 +0,0 @@
1
FROM python:3.11-slim

WORKDIR /app

# Build tools for wheels that compile from source; curl for the HEALTHCHECK;
# git for any VCS-pinned requirements. Clean the apt cache to keep the image small.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies BEFORE copying the application code so this
# (slow) layer stays cached until requirements.txt itself changes.
COPY requirements.txt ./
RUN pip3 install -r requirements.txt

# Copy the application code (this already includes src/, so no separate
# COPY src/ line is needed).
COPY . .

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/README.md DELETED
@@ -1,20 +0,0 @@
1
- ---
2
- title: Demo 1
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: for learning
12
- license: mit
13
- ---
14
-
15
- # Welcome to Streamlit!
16
-
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/requirements.txt DELETED
@@ -1,13 +0,0 @@
1
- langchain
2
- langchain-community
3
- sentence-transformers
4
- langchain-huggingface
5
- langchain-openai
6
- langgraph
7
- openai
8
- faiss-cpu
9
- pydantic
10
- python-dotenv
11
- requests
12
- streamlit
13
- pypdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/src/__init__.py DELETED
File without changes
Repo/Demo_1/src/config/__init__.py DELETED
File without changes
Repo/Demo_1/src/config/config.py DELETED
@@ -1,20 +0,0 @@
1
# src/config.py
"""Central configuration constants for the RAG application."""
import os

# --- Embedding model ---
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DEVICE = "cpu"  # CPU-only deployment target
NORMALIZE_EMBEDDINGS = True

# --- LLM (served via OpenRouter) ---
LLM_MODEL = "openai/gpt-oss-120b:free"
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
# Read once at import time; None when the env var is unset — the LLM
# loader is responsible for rejecting a missing key.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# --- Text splitter ---
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# --- Retriever ---
MMR_LAMBDA = 0.25  # MMR lambda_mult; lower values favour diversity
K_OFFSET = 2       # retriever k is computed as pdf_count + K_OFFSET
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/src/core/__init__.py DELETED
File without changes
Repo/Demo_1/src/core/embeddings.py DELETED
@@ -1,17 +0,0 @@
1
from langchain_huggingface import HuggingFaceEmbeddings
from src.config.config import (
    EMBEDDING_MODEL,
    EMBEDDING_DEVICE,
    NORMALIZE_EMBEDDINGS,
)


def load_embeddings():
    """Build the HuggingFace sentence-embedding model used for indexing.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured from
        ``src.config.config``.

    Raises:
        RuntimeError: if the underlying model cannot be loaded; the
            original exception is attached as ``__cause__``.
    """
    try:
        return HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL,
            model_kwargs={"device": EMBEDDING_DEVICE},
            encode_kwargs={"normalize_embeddings": NORMALIZE_EMBEDDINGS},
        )
    except Exception as e:
        # Chain the cause so the real model-load failure survives in tracebacks.
        raise RuntimeError(f"Failed to load embeddings: {e}") from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/src/core/graph_state.py DELETED
@@ -1,8 +0,0 @@
1
from typing import List, TypedDict
from langchain_core.documents import Document


class GraphState(TypedDict):
    """State dictionary threaded through the LangGraph workflow nodes."""

    question: str            # the user's current question
    context: List[Document]  # documents retrieved for the question
    answer: str              # the generated answer
 
 
 
 
 
 
 
 
 
Repo/Demo_1/src/core/llm.py DELETED
@@ -1,17 +0,0 @@
1
# src/llm.py
from langchain_openai import ChatOpenAI
from src.config.config import (
    LLM_MODEL,
    OPENROUTER_BASE_URL,
    OPENROUTER_API_KEY
)


def load_llm():
    """Create the chat-model client pointed at the OpenRouter API.

    Raises:
        EnvironmentError: when ``OPENROUTER_API_KEY`` is not set —
            checked up front so misconfiguration fails immediately
            rather than on the first request.
    """
    # Guard clause: without a key every downstream call would fail anyway.
    if not OPENROUTER_API_KEY:
        raise EnvironmentError("OPENROUTER_API_KEY not set")

    client = ChatOpenAI(
        model=LLM_MODEL,
        base_url=OPENROUTER_BASE_URL,
        api_key=OPENROUTER_API_KEY
    )
    return client
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/src/exceptions.py DELETED
@@ -1,8 +0,0 @@
1
class RAGError(Exception):
    """Base class for all application-specific errors.

    Catching ``RAGError`` handles any failure raised by this package
    while still letting unrelated exceptions propagate.
    """


class DocumentProcessingError(RAGError):
    """PDF loading, splitting, or indexing failed."""


class VectorStoreNotInitializedError(RAGError):
    """A query was attempted before any documents were processed."""


class LLMInvocationError(RAGError):
    """The language-model call failed."""
 
 
 
 
 
 
 
 
 
Repo/Demo_1/src/rag_graph.py DELETED
@@ -1,92 +0,0 @@
1
# src/rag_graph.py
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.prompts import ChatPromptTemplate

from src.core.graph_state import GraphState
from src.core.embeddings import load_embeddings
from src.core.llm import load_llm
from src.vector_store.vector_store import build_vector_store
from src.config.config import K_OFFSET, MMR_LAMBDA
from src.exceptions import VectorStoreNotInitializedError, LLMInvocationError


class ProjectRAGGraph:
    """Two-node LangGraph RAG pipeline: retrieve (FAISS/MMR) then generate."""

    def __init__(self):
        self.embeddings = load_embeddings()
        self.llm = load_llm()
        self.vector_store = None   # set by process_documents()
        self.pdf_count = 0         # used to scale the retriever's k
        self.memory = MemorySaver()  # per-thread conversation checkpoints
        self.workflow = self._build_graph()

    def process_documents(self, pdf_paths, original_names=None):
        """Index the given PDFs into a fresh FAISS vector store.

        Args:
            pdf_paths: filesystem paths of the PDFs to index.
            original_names: optional display names recorded as each
                document's ``source`` metadata (used for citations).
        """
        self.pdf_count = len(pdf_paths)
        self.vector_store = build_vector_store(
            pdf_paths,
            self.embeddings,
            original_names
        )

    # ---------- Graph Nodes ----------

    def retrieve(self, state: GraphState):
        """Graph node: fetch context documents for the current question.

        Raises:
            VectorStoreNotInitializedError: if called before any
                documents were indexed via process_documents().
        """
        if not self.vector_store:
            raise VectorStoreNotInitializedError("Vector store not initialized")

        # Scale k with the number of indexed PDFs so each file has a chance
        # to contribute a chunk; clamp to at least 1.
        k_value = max(1, self.pdf_count + K_OFFSET)

        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": k_value, "lambda_mult": MMR_LAMBDA}
        )

        documents = retriever.invoke(state["question"])
        return {"context": documents}

    def generate(self, state: GraphState):
        """Graph node: answer the question using only the retrieved context.

        Raises:
            LLMInvocationError: if the model call fails; the original
                exception is chained as ``__cause__``.
        """
        try:
            prompt = ChatPromptTemplate.from_template(
                """
                You are an expert Project Analyst.
                Answer ONLY using the provided context.
                If the answer is not present, say "I don't know".

                Context:
                {context}

                Question:
                {question}
                """
            )

            formatted_context = "\n\n".join(
                doc.page_content for doc in state["context"]
            )

            chain = prompt | self.llm
            response = chain.invoke({
                "context": formatted_context,
                "question": state["question"]
            })

            return {"answer": response.content}

        except Exception as e:
            # Chain the cause so the underlying API failure is preserved.
            raise LLMInvocationError(f"LLM failed: {e}") from e

    # ---------- Graph Build ----------

    def _build_graph(self):
        """Compile the retrieve -> generate workflow with checkpointing."""
        workflow = StateGraph(GraphState)
        workflow.add_node("retrieve", self.retrieve)
        workflow.add_node("generate", self.generate)
        workflow.set_entry_point("retrieve")
        workflow.add_edge("retrieve", "generate")
        workflow.add_edge("generate", END)
        return workflow.compile(checkpointer=self.memory)

    def query(self, question: str, thread_id: str):
        """Run the full pipeline for one question on the given thread."""
        config = {"configurable": {"thread_id": thread_id}}
        result = self.workflow.invoke({"question": question}, config=config)
        return result["answer"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/src/vector_store/vector_store.py DELETED
@@ -1,31 +0,0 @@
1
# src/vector_store.py
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from src.config.config import CHUNK_SIZE, CHUNK_OVERLAP
from src.exceptions import DocumentProcessingError


def build_vector_store(pdf_paths, embeddings, original_names=None):
    """Load PDFs, split them into chunks, and index them in FAISS.

    Args:
        pdf_paths: filesystem paths of the PDFs to load.
        embeddings: the embedding model used to vectorize chunks.
        original_names: optional display names; when given, entry *i*
            overrides the ``source`` metadata of every page from
            ``pdf_paths[i]`` (so citations show the uploaded filename,
            not a temp path).

    Returns:
        A FAISS vector store built from all split documents.

    Raises:
        DocumentProcessingError: if any load/split/index step fails;
            the original exception is chained as ``__cause__``.
    """
    try:
        all_docs = []

        for i, path in enumerate(pdf_paths):
            loader = PyPDFLoader(path)
            docs = loader.load()

            if original_names and i < len(original_names):
                for doc in docs:
                    doc.metadata["source"] = original_names[i]

            all_docs.extend(docs)

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )

        splits = splitter.split_documents(all_docs)
        return FAISS.from_documents(splits, embeddings)

    except Exception as e:
        # Chain the cause so the original parsing/indexing error survives.
        raise DocumentProcessingError(f"PDF processing failed: {e}") from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Repo/Demo_1/streamlit_app.py DELETED
@@ -1,121 +0,0 @@
1
"""Streamlit front-end for the Project Analyst RAG pipeline.

Flow: upload PDFs in the sidebar -> index them into the RAG graph ->
chat; each assistant answer shows its source citations in an expander.
"""
import streamlit as st
import os
import tempfile

from src.rag_graph import ProjectRAGGraph

# --- Page Config ---
st.set_page_config(page_title="Project Analyst RAG", layout="wide")
st.title("📄 Professional Project Analyst Chat")

# --- Initialize Session State ---
if "rag_graph" not in st.session_state:
    st.session_state.rag_graph = ProjectRAGGraph()
if "messages" not in st.session_state:
    st.session_state.messages = []
if "thread_id" not in st.session_state:
    # Hardcoded for demo, could be unique per session.
    st.session_state.thread_id = "default_user_1"

# --- Sidebar: File Upload ---
with st.sidebar:
    st.header("Upload Documents")
    uploaded_files = st.file_uploader(
        "Upload Project PDFs",
        type="pdf",
        accept_multiple_files=True
    )

    process_button = st.button("Process Documents")

    if process_button and uploaded_files:
        with st.spinner("Processing PDFs..."):
            pdf_paths = []
            original_names = []  # real filenames, preserved for citations
            try:
                for uploaded_file in uploaded_files:
                    original_names.append(uploaded_file.name)
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                        tmp.write(uploaded_file.getvalue())
                        pdf_paths.append(tmp.name)

                # Pass BOTH the temp paths and the original display names.
                st.session_state.rag_graph.process_documents(
                    pdf_paths,
                    original_names=original_names
                )
            finally:
                # Always remove temp files, even if indexing raised.
                for path in pdf_paths:
                    try:
                        os.remove(path)
                    except OSError:
                        pass  # best-effort cleanup; file may already be gone
            st.success("Documents Indexed Successfully!")

# --- Chat Interface ---
# Replay the stored conversation, including per-message citations.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
        if "citations" in message and message["citations"]:
            with st.expander("View Sources"):
                for doc in message["citations"]:
                    st.caption(f"Source: {doc.metadata.get('source', 'Unknown')} - Page: {doc.metadata.get('page', 'N/A')}")
                    st.write(f"_{doc.page_content[:200]}..._")

# User Input
if prompt := st.chat_input("Ask a question about your projects..."):
    # Guard: the graph cannot retrieve before documents are indexed.
    if st.session_state.rag_graph.vector_store is None:
        st.error("Please upload and process documents first!")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate Response using the Graph
        with st.chat_message("assistant"):
            with st.spinner("Analyzing..."):
                # Invoke the compiled workflow directly (not .query()) so we
                # also get the retrieved context back for citations.
                config = {"configurable": {"thread_id": st.session_state.thread_id}}
                inputs = {"question": prompt}
                result = st.session_state.rag_graph.workflow.invoke(inputs, config=config)

                answer = result["answer"]
                context = result["context"]  # retrieved Document objects

                st.markdown(answer)

                # Citations section
                if context:
                    with st.expander("View Sources"):
                        for doc in context:
                            source_name = os.path.basename(doc.metadata.get('source', 'Unknown'))
                            # NOTE(review): assumes 'page' metadata is 0-based
                            # (PyPDFLoader convention) — displayed 1-based.
                            page_num = doc.metadata.get('page', 0) + 1
                            st.caption(f"📄 {source_name} (Page {page_num})")
                            st.write(f"_{doc.page_content[:300]}..._")

                # Persist the assistant turn so citations survive reruns.
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer,
                    "citations": context
                })