Spaces:
Runtime error
Runtime error
Upload 8 files
Browse filesadded necessary files.
- .env +5 -0
- .gitignore +0 -0
- app.py +167 -0
- chain.py +111 -0
- ingest.py +89 -0
- kg_builder.py +145 -0
- requirements.txt +16 -0
- retriever.py +128 -0
.env
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SECURITY: live credentials were committed and published here. Every one of
# these secrets (HuggingFace token, Neo4j password, Groq key) must be rotated
# immediately. Load real values from deployment secrets / environment
# configuration — never commit a populated .env file.
HUGGINGFACEHUB_API_TOKEN=<set-via-deployment-secrets>
NEO4J_URI=neo4j+s://b5a2cd23.databases.neo4j.io
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=<set-via-deployment-secrets>
GROQ_API=<set-via-deployment-secrets>
|
.gitignore
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import streamlit as st

from ingest import load_document, chunk_documents, store_in_chromadb, get_vectorstore
from kg_builder import build_kg
from chain import RAGChain

# ---------------------------------------------------------------------------
# Streamlit front-end for the KG-RAG chatbot.
# Flow: upload PDF -> ingest into ChromaDB -> build Neo4j knowledge graph ->
# chat against the hybrid (vector + graph) retriever via the Groq LLM.
# ---------------------------------------------------------------------------

# Page config
st.set_page_config(
    page_title="KG-RAG Chatbot",
    page_icon="🧠",
    layout="wide",
)

st.title("🧠 KG-RAG Chatbot")
st.caption("Upload a PDF → Build Knowledge Graph → Chat with your document")

# Sidebar: upload, processing, and display settings
with st.sidebar:
    st.header("Upload Document")

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

    if uploaded_file:
        # Persist the upload under ./data so ingest.py / kg_builder.py can find it.
        os.makedirs("data", exist_ok=True)
        save_path = os.path.join("data", uploaded_file.name)

        with open(save_path, "wb") as f:
            f.write(uploaded_file.read())

        st.success(f"Saved: {uploaded_file.name}")

        # Process button
        if st.button("Process PDF", type="primary", use_container_width=True):
            # Reset any state left over from a previously processed document.
            st.session_state.messages = []
            st.session_state.chain_ready = False
            st.session_state.pop("chain", None)

            # Step 1: Ingest -> ChromaDB
            with st.status("Ingesting document...", expanded=True) as status:
                try:
                    st.write("Loading PDF pages...")
                    docs = load_document(uploaded_file.name)
                    st.write(f"{len(docs)} pages loaded")

                    st.write("Chunking text...")
                    chunks = chunk_documents(docs)
                    st.write(f"{len(chunks)} chunks created")

                    st.write("Embedding + saving to ChromaDB...")
                    store_in_chromadb(chunks)
                    st.write("ChromaDB ready")

                    # Step 2: Build KG -> Neo4j
                    st.write("Extracting triples → Neo4j Knowledge Graph...")
                    build_kg(uploaded_file.name)
                    st.write("Knowledge Graph built")

                    status.update(label="PDF processed! You can now chat.", state="complete")
                    st.session_state.chain_ready = True
                    st.session_state.current_doc = uploaded_file.name

                except Exception as e:
                    status.update(label="Processing failed", state="error")
                    st.error(str(e))

    st.divider()

    # Settings
    st.header("Settings")
    show_kg = st.toggle("Show KG facts", value=True)
    show_chunks = st.toggle("Show source chunks", value=False)

    st.divider()
    st.markdown("""
    **How it works:**
    1. Upload + Process your PDF
    2. **ChromaDB** stores semantic chunks
    3. **Neo4j** stores entity relationships
    4. **LLaMA via Groq** answers your questions
    """)

    if st.button("🗑️ Clear chat history", use_container_width=True):
        st.session_state.messages = []
        st.rerun()


def load_chain():
    """Build a fresh RAGChain (hybrid retriever + Groq client).

    BUG FIX: this was decorated with @st.cache_resource, which caches ONE
    RAGChain globally — across sessions and across newly processed PDFs.
    Popping st.session_state["chain"] during "Process PDF" did not clear
    that global cache, so the chain kept serving the previous document's
    vectorstore. Per-session caching is already done below via
    st.session_state, so the global cache is removed.
    """
    return RAGChain()


# Main area
if not st.session_state.get("chain_ready"):
    # Friendly landing state before any document has been processed.
    st.info("Upload a PDF from the sidebar and click **Process PDF** to get started.")

    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("### Step 1\nUpload any PDF document from the sidebar")
    with col2:
        st.markdown("### Step 2\nClick **Process PDF** to build the knowledge graph")
    with col3:
        st.markdown("### Step 3\nAsk questions → get answers with KG + RAG context")

else:
    # Show which doc is loaded
    st.success(f"Active document: **{st.session_state.get('current_doc', 'Unknown')}**")

    # Load chain once per session (kept in session_state).
    try:
        if "chain" not in st.session_state:
            with st.spinner("Loading RAG chain..."):
                st.session_state.chain = load_chain()
        chain = st.session_state.chain
    except Exception as e:
        st.error(f"Failed to load chain: {e}")
        st.stop()

    # ── Chat history ──────────────────────────────────────────────────────────
    if "messages" not in st.session_state:
        st.session_state.messages = []

    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])
            if msg.get("kg_facts") and show_kg:
                with st.expander("Knowledge Graph Facts"):
                    st.code(msg["kg_facts"])
            if msg.get("sources") and show_chunks:
                with st.expander("Source Chunks"):
                    for s in msg["sources"]:
                        st.markdown(f"**Page {s['page']}:** {s['snippet']}...")

    # Chat input
    if question := st.chat_input("Ask anything about your document..."):
        st.session_state.messages.append({"role": "user", "content": question})
        with st.chat_message("user"):
            st.markdown(question)

        with st.chat_message("assistant"):
            with st.spinner("Retrieving from ChromaDB + Neo4j → asking LLM..."):
                try:
                    result = chain.ask(question)
                    answer = result["answer"]
                    st.markdown(answer)

                    if result["kg_facts"] and show_kg:
                        with st.expander("Knowledge Graph Facts used"):
                            st.code(result["kg_facts"])

                    if show_chunks:
                        with st.expander("Source Chunks"):
                            for s in result["sources"]:
                                st.markdown(f"**Page {s['page']}:** {s['snippet']}...")

                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": answer,
                        "kg_facts": result["kg_facts"],
                        "sources": result["sources"],
                    })

                except Exception as e:
                    st.error(f"Error: {e}")
|
chain.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from huggingface_hub import InferenceClient
|
| 4 |
+
from retriever import HybridRetriever
|
| 5 |
+
from groq import Groq
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 10 |
+
HF_MODEL = "llama-3.1-8b-instant"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Prompt builder
# Assembles the final prompt sent to the LLM: the retrieved context
# (vector + graph facts) followed by the user's question, with strict
# instructions to answer ONLY from the provided context.
def build_prompt(context: str, question: str) -> str:
    """Return the grounded-QA prompt for the given context and question."""
    instructions = (
        "\n"
        "You are a helpful research assistant. Answer the question using ONLY the context provided.\n"
        'If the answer is not in the context, say "I don\'t have enough information to answer that."\n'
        "Be concise and factual.\n"
    )
    return f"{instructions}\nContext:\n{context}\n\nQuestion: {question}\n"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Main QA chain
# Runs the complete RAG pipeline:
#   1. Retrieve hybrid context (ChromaDB vectors + Neo4j graph facts)
#   2. Send the assembled prompt to llama-3.1-8b-instant via the Groq API
#   3. Return a structured answer with source attribution
class RAGChain:
    def __init__(self):
        """Wire up the hybrid retriever and the Groq chat client."""
        self.retriever = HybridRetriever()
        self.client = Groq(api_key=os.getenv("GROQ_API"))
        # FIX: the log line previously said "Mistral chain ready", but the
        # model actually served is llama-3.1 via Groq — misleading in logs.
        print("RAG chain ready")

    def ask(self, question: str, verbose: bool = False) -> dict:
        """
        Full RAG pipeline:
        question → hybrid retrieval → llama-3.1 (Groq) → answer

        Returns a dict with the answer plus KG facts and page-level
        sources for transparency.
        """
        # Step 1: Retrieve hybrid context
        retrieval = self.retriever.retrieve(question, k=4)
        context = retrieval["combined_context"]

        if verbose:
            print("\n Retrieved Context ")
            print(context[:800], "..." if len(context) > 800 else "")

        # Step 2: Generate
        prompt = build_prompt(context, question)

        response = self.client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=HF_MODEL,
            temperature=0.2,   # low temperature: factual, grounded answers
            max_tokens=300,
        )

        # Extract and tidy the text of the first (only) choice
        answer = response.choices[0].message.content.strip()

        return {
            "question": question,
            "answer": answer,
            "kg_facts": retrieval["kg_facts"],
            "sources": [
                {
                    "page": d.metadata.get("page", "?"),
                    "snippet": d.page_content[:150],
                }
                for d in retrieval["semantic_chunks"]
            ],
        }

    def close(self):
        """Release the retriever's Neo4j connection."""
        self.retriever.close()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# CLI test harness: interactive REPL over the RAG chain.
if __name__ == "__main__":
    chain = RAGChain()
    print("\n KG-RAG Chatbot (type 'exit' to quit)\n")

    while True:
        user_q = input("You: ").strip()
        if user_q.lower() in ("exit", "quit", "q"):
            break
        if not user_q:
            continue

        result = chain.ask(user_q, verbose=False)
        print(f"\n🤖 Answer:\n{result['answer']}")

        if result["kg_facts"]:
            print(f"\n📊 KG Facts used:\n{result['kg_facts']}")

        print(f"\n📄 Sources: pages {[s['page'] for s in result['sources']]}")
        print("─" * 50)

    chain.close()
|
ingest.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
| 4 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 5 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 6 |
+
from langchain_community.vectorstores import Chroma
|
| 7 |
+
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
# Config
|
| 11 |
+
DATA_DIR = "data"
|
| 12 |
+
CHROMA_DB_DIR = "chroma_db"
|
| 13 |
+
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 14 |
+
|
| 15 |
+
# Step 1: Load PDF
def load_document(filename: str):
    """Load `filename` from ./data with PyMuPDF; returns one Document per page.

    Raises FileNotFoundError when the file is not present under DATA_DIR.
    """
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        raise FileNotFoundError(f"No file found at {path}. Drop your PDF inside the /data folder.")

    # BUG FIX: the original printed a literal placeholder ("(unknown)")
    # instead of interpolating the filename.
    print(f"Loading document: {filename}")
    loader = PyMuPDFLoader(path)
    docs = loader.load()
    print(f"Loaded {len(docs)} pages")
    return docs
|
| 26 |
+
|
| 27 |
+
# Step 2: Chunk the document
def chunk_documents(docs):
    """Split page documents into overlapping ~500-character chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,       # characters per chunk
        chunk_overlap=50,     # overlap so context isn't lost at boundaries
        separators=["\n\n", "\n", ".", " "],
    )
    pieces = text_splitter.split_documents(docs)
    print(f"Split into {len(pieces)} chunks")
    return pieces
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Step 3: Embed + Store in ChromaDB
def store_in_chromadb(chunks):
    """Embed `chunks` with MiniLM and persist them into a local Chroma store."""
    print(f"Loading embedding model: {EMBED_MODEL}")
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    print(f"Storing chunks in ChromaDB at ./{CHROMA_DB_DIR} ...")
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=CHROMA_DB_DIR,
    )
    print(f"ChromaDB ready → {len(chunks)} chunks stored")
    return store
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Step 4: Load existing ChromaDB
def get_vectorstore():
    """Load an already-persisted ChromaDB (used by retriever.py)."""
    return Chroma(
        persist_directory=CHROMA_DB_DIR,
        embedding_function=HuggingFaceEmbeddings(model_name=EMBED_MODEL),
    )
|
| 61 |
+
|
| 62 |
+
# Step 4: Test retrieval
|
| 63 |
+
def test_retrieval(vectorstore, query: str = "What is this document about?"):
|
| 64 |
+
print(f"\nTest query: '{query}'")
|
| 65 |
+
results = vectorstore.similarity_search(query, k=3)
|
| 66 |
+
for i, r in enumerate(results):
|
| 67 |
+
print(f"\n--- Chunk {i+1} (page {r.metadata.get('page', '?')}) ---")
|
| 68 |
+
print(r.page_content[:300])
|
| 69 |
+
|
| 70 |
+
# Main: `python ingest.py mypaper.pdf`, or auto-detect the first PDF in ./data
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        pdfs = [f for f in os.listdir(DATA_DIR) if f.endswith(".pdf")]
        if not pdfs:
            print("No PDF found in /data. Add one and retry.")
            sys.exit(1)
        filename = pdfs[0]
        # BUG FIX: the original printed a literal placeholder ("(unknown)")
        # instead of the detected filename.
        print(f"Auto-detected: {filename}")

    docs = load_document(filename)
    chunks = chunk_documents(docs)
    vs = store_in_chromadb(chunks)
    test_retrieval(vs)
|
kg_builder.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, re, json
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from neo4j import GraphDatabase
|
| 4 |
+
from ingest import load_document, chunk_documents
|
| 5 |
+
from groq import Groq
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
NEO4J_URI = os.getenv("NEO4J_URI")
|
| 10 |
+
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
|
| 11 |
+
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
|
| 12 |
+
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 13 |
+
GROK_API = os.getenv("GROQ_API")
|
| 14 |
+
|
| 15 |
+
# use llama-3.1-8b-instant via the Groq API
# BUG FIX: the original line ended with a trailing comma
# (`HF_MODEL = "llama-3.1-8b-instant",`), which made this a 1-tuple
# instead of a string.
HF_MODEL = "llama-3.1-8b-instant"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Knowledge graph builder: thin wrapper around the Neo4j driver
# for storing and querying (subject, relation, object) triples.
class KnowledgeGraph:
    def __init__(self):
        # The driver manages all communication between this Python process
        # and the Neo4j graph database.
        self.driver = GraphDatabase.driver(
            NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
        )
        print("Connected to Neo4j")

    def close(self):
        """Release the driver and its connection pool."""
        self.driver.close()

    def clear(self):
        """Reset the graph: delete every node and relationship."""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
        print("Graph cleared")

    def insert_triple(self, subject: str, relation: str, obj: str):
        """Insert one triple (two entities + one relation).

        Uses MERGE so repeated inserts do not create duplicates.
        """
        cypher = """
        MERGE (a:Entity {name: $subject})
        MERGE (b:Entity {name: $obj})
        MERGE (a)-[r:RELATION {type: $relation}]->(b)
        """
        params = {
            "subject": subject.strip(),
            "relation": relation.strip(),
            "obj": obj.strip(),
        }
        with self.driver.session() as session:
            session.run(cypher, params)

    def query_entity(self, entity: str) -> list[dict]:
        """Return up to 10 triples touching any entity whose name contains `entity`."""
        cypher = """
        MATCH (a:Entity)-[r]->(b:Entity)
        WHERE toLower(a.name) CONTAINS toLower($entity)
           OR toLower(b.name) CONTAINS toLower($entity)
        RETURN a.name AS subject, r.type AS relation, b.name AS object
        LIMIT 10
        """
        with self.driver.session() as session:
            return [dict(record) for record in session.run(cypher, entity=entity)]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# Extract up to 8 factual (subject, relation, object) triples from `text`
# by asking llama-3.1 (via Groq) to emit a JSON array.
# NOTE: earlier comment said "Mistral 7B" — the model used is llama-3.1.
def extract_triples(text: str, client: "Groq") -> list[tuple]:
    """Return a list of (subject, relation, object) tuples, or [] on failure.

    The LLM is asked for a bare JSON array; the first bracketed span of the
    reply is parsed, and malformed entries (missing keys) are dropped.
    """
    prompt = f"""Extract factual (subject, relation, object) triples from the text below.
Return ONLY a JSON array like: [{{"subject":"X","relation":"Y","object":"Z"}}]
Do not add explanation. Max 8 triples.

Text:
{text}

JSON:"""

    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama-3.1-8b-instant",
        temperature=0.2,
        max_tokens=300,
    )

    # Extract text from chat response
    response_text = chat_completion.choices[0].message.content

    # Parse the first JSON array in the response; be tolerant of chatter
    # around it and of malformed JSON.
    try:
        match = re.search(r'\[.*?\]', response_text, re.DOTALL)
        if match:
            triples_raw = json.loads(match.group())
            return [
                (t["subject"], t["relation"], t["object"])
                for t in triples_raw
                if all(k in t for k in ("subject", "relation", "object"))
            ]
    except Exception as e:
        print(f"Parse error: {e}")
    # No JSON array found, or parsing failed -> no triples.
    return []
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ── Main ──────────────────────────────────────────────────────────────────────
def build_kg(filename: str):
    """Build the Neo4j knowledge graph from a PDF under ./data.

    Loads and chunks the PDF, extracts triples from the first 20 chunks via
    the Groq LLM, and MERGEs them into Neo4j (after clearing the old graph).
    """
    # (removed: unused `import sys, os` and dead local DATA_DIR)
    docs = load_document(filename)
    chunks = chunk_documents(docs)

    # Only process the first 20 chunks to cap LLM cost/latency.
    chunks = chunks[:20]
    print(f"\n Extracting triples from {len(chunks)} chunks via llama-3.1 (Groq)")

    client = Groq(api_key=GROK_API)
    kg = KnowledgeGraph()
    try:
        kg.clear()

        total_triples = 0
        for i, chunk in enumerate(chunks):
            print(f" Chunk {i+1}/{len(chunks)} ...", end=" ")
            triples = extract_triples(chunk.page_content, client)
            for s, r, o in triples:
                kg.insert_triple(s, r, o)
            total_triples += len(triples)
            print(f"{len(triples)} triples")

        print(f"\n Knowledge Graph built — {total_triples} triples stored in Neo4j")
    finally:
        # Always release the Neo4j driver, even if extraction/insert raises.
        kg.close()
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# CLI entry: `python kg_builder.py mypaper.pdf`, or auto-pick the first
# PDF found under ./data.
if __name__ == "__main__":
    import sys, os

    DATA_DIR = "data"
    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        found = [name for name in os.listdir(DATA_DIR) if name.endswith(".pdf")]
        if not found:
            print(" No PDF in /data.")
            sys.exit(1)
        target = found[0]

    build_kg(target)
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
langchain
langchain-community
langchain-text-splitters
langchain-huggingface
chromadb
sentence-transformers
neo4j
python-dotenv
pymupdf
streamlit
huggingface_hub
transformers
accelerate
spacy
# `groq` is imported by chain.py and kg_builder.py but was missing here,
# so a clean install crashed on `from groq import Groq`.
groq
|
| 16 |
+
|
retriever.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from neo4j import GraphDatabase
|
| 4 |
+
from ingest import get_vectorstore
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
NEO4J_URI = os.getenv("NEO4J_URI")
|
| 9 |
+
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
|
| 10 |
+
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Neo4j retrieval
# Connects to Neo4j and retrieves structured knowledge (triples) based on
# entity keywords extracted from the user query. Returns formatted facts.
class Neo4jRetriever:
    def __init__(self):
        # Open a driver for the configured Neo4j instance.
        self.driver = GraphDatabase.driver(
            NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
        )

    def close(self):
        """Cleanup utility: close the database connection when done."""
        self.driver.close()

    def query(self, entity_keywords: list[str]) -> str:
        """
        Given a list of keywords, find triples whose subject OR object name
        contains any keyword (case-insensitive). Returns a formatted string
        of facts, or "" when nothing matches.
        """
        cypher = """
        MATCH (a:Entity)-[r]->(b:Entity)
        WHERE ANY(kw IN $keywords WHERE
              toLower(a.name) CONTAINS toLower(kw) OR
              toLower(b.name) CONTAINS toLower(kw))
        RETURN a.name AS subject, r.type AS relation, b.name AS object
        LIMIT 15
        """
        with self.driver.session() as session:
            rows = session.run(cypher, keywords=entity_keywords)
            facts = [
                f"{row['subject']} → {row['relation']} → {row['object']}"
                for row in rows
            ]

        if facts:
            return "Knowledge Graph Facts:\n" + "\n".join(facts)
        return ""
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Keyword extractor (simple, no extra model needed)
def extract_keywords(query: str) -> list[str]:
    """
    Naive keyword extractor: strips punctuation, filters stopwords, and
    returns meaningful tokens. Good enough for KG lookup without spaCy/NER.
    """
    stopwords = {
        "what","is","are","the","a","an","of","in","on","at","to","for",
        "how","why","who","when","where","does","do","was","were","has",
        "have","had","be","been","being","and","or","but","with","from",
        "this","that","these","those","it","its","their","there","about",
        "can","could","would","should","will","tell","me","explain","give"
    }
    keywords = []
    for token in query.lower().split():
        word = token.strip("?.!,")
        # BUG FIX: strip punctuation BEFORE the stopword/length tests.
        # Previously "the?" slipped past the stopword filter and "is?"
        # passed the length check, polluting the KG lookup with noise.
        if word not in stopwords and len(word) > 2:
            keywords.append(word)
    return keywords
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Hybrid Retriever
# Combines Vector Search (ChromaDB, semantic similarity) with Graph Search
# (Neo4j, logical connections) to give the LLM comprehensive context.
# This is the main retrieval engine of the RAG system.
class HybridRetriever:
    def __init__(self):
        print("Loading ChromaDB vectorstore...")
        self.vectorstore = get_vectorstore()
        self.neo4j = Neo4jRetriever()
        print("Hybrid retriever ready")

    def retrieve(self, query: str, k: int = 4) -> dict:
        """
        Returns a dict:
            semantic_chunks  — Documents from ChromaDB
            kg_facts         — formatted triples from Neo4j ("" if none)
            combined_context — merged string handed to the LLM
        """
        # 1. Semantic retrieval from ChromaDB
        docs = self.vectorstore.similarity_search(query, k=k)
        excerpt_text = "\n\n".join(doc.page_content for doc in docs)

        # 2. KG retrieval from Neo4j (keywords pulled from the query)
        kg_facts = self.neo4j.query(extract_keywords(query))

        # 3. Merge: graph facts first (when present), then document excerpts
        parts = []
        if kg_facts:
            parts.append(f"{kg_facts}\n\n")
        parts.append(f"Document Excerpts:\n{excerpt_text}")

        return {
            "semantic_chunks": docs,
            "kg_facts": kg_facts,
            "combined_context": "".join(parts),
        }

    def close(self):
        """Release the Neo4j connection held by the graph retriever."""
        self.neo4j.close()
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ── Quick test ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    hybrid = HybridRetriever()
    user_query = input("Enter a test query: ")
    outcome = hybrid.retrieve(user_query)

    print("\n── KG Facts ──────────────────────────────")
    print(outcome["kg_facts"] or "(none found)")
    print("\n── Semantic Chunks ───────────────────────")
    for idx, doc in enumerate(outcome["semantic_chunks"]):
        print(f"\nChunk {idx+1}: {doc.page_content[:200]}")
    hybrid.close()
|