import os
from typing import List, Tuple, Optional
import gradio as gr
from dotenv import load_dotenv

# --- Web scraping ---
import requests
from bs4 import BeautifulSoup

# --- LangChain core ---
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Loaders (files) ---
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    TextLoader,
    CSVLoader,
    UnstructuredExcelLoader,
)

# --- Neo4j + Vector store + Graph ---
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain.chains.graph_qa.cypher import GraphCypherQAChain

# --- LLMs & Embeddings: Cohere or Gemini ---
# Cohere
from langchain_community.embeddings import CohereEmbeddings
from langchain_community.llms import Cohere

# Gemini (Google Generative AI API key)
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# Global state management
class AppState:
    def __init__(self):
        self.graph: Optional[Neo4jGraph] = None
        self.vs: Optional[Neo4jVector] = None
        self.llm = None
        self.embeddings = None
        self.chat_history = []

app_state = AppState()
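# NOTE: this module-level state is shared by every Gradio session in the
# process; fine for a single-user notebook, but concurrent users would
# overwrite each other's connections and models.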

# ===============================
# Helpers
# ===============================

def clean_chunks(docs: List[Document], chunk_size=800, chunk_overlap=120) -> List[Document]:
    """Split to moderately large chunks for better retrieval and context quality."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)
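
# Usage sketch (hypothetical input): one long Document comes back as a list of
# overlapping ~800-character chunks that retain the original metadata.
#   chunks = clean_chunks([Document(page_content="lorem ipsum " * 500)])
#   assert all(len(c.page_content) <= 800 for c in chunks)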


def load_and_split_file(file_path: str) -> List[Document]:
    """Load a single file (by extension) and return split docs."""
    filename = os.path.basename(file_path)
    name = filename.lower()
    _, ext = os.path.splitext(name)
    ext = ext.lstrip(".")

    try:
        if ext == "pdf":
            loader = PyPDFLoader(file_path)
        elif ext in ("docx", "doc"):
            loader = UnstructuredWordDocumentLoader(file_path)
        elif ext == "txt":
            loader = TextLoader(file_path, autodetect_encoding=True)
        elif ext == "csv":
            loader = CSVLoader(file_path, csv_args={"delimiter": ","})
        elif ext in ("xlsx", "xls"):
            loader = UnstructuredExcelLoader(file_path, mode="elements")
        else:
            print(f"Unsupported file type: {ext}")
            return []

        docs = loader.load()
        # Attach source metadata
        for d in docs:
            d.metadata = d.metadata or {}
            d.metadata["source"] = filename

        return clean_chunks(docs)
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return []
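
# Example result shape (illustrative): load_and_split_file("report.pdf") yields
# chunks whose metadata includes {"source": "report.pdf"}; for PDFs, PyPDFLoader
# also records a zero-based "page" index per chunk.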


def scrape_webpage(url: str) -> List[Document]:
    """Scrape a single URL (no crawling), extract visible text, split into chunks."""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        r = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Remove scripts, styles, and page chrome (nav/header/footer)
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()

        # Restrict to likely content areas to reduce nav noise
        main_candidates = soup.select("main, article, section, .content, .post, .entry") or [soup.body or soup]
        texts = []
        for node in main_candidates:
            if node:
                text = node.get_text(separator=" ", strip=True)
                if text and len(text) > 50:  # Only keep substantial text
                    texts.append(text)
        
        joined = " ".join(texts).strip()
        if not joined or len(joined) < 100:
            return []

        base_doc = Document(page_content=joined, metadata={"source": url, "type": "web"})
        return clean_chunks([base_doc], chunk_size=800, chunk_overlap=120)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []
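
# Note: this fetches a single page only (no crawling or pagination) and will
# miss content rendered client-side with JavaScript; a headless browser such
# as Playwright would be needed for those pages.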


def init_models(provider: str, api_key: str):
    """Initialize LLM and embeddings for the chosen provider."""
    if provider == "Cohere":
        if not api_key:
            raise ValueError("Please provide a Cohere API key.")
        
        # Initialize Cohere LLM
        llm = Cohere(model="command", temperature=0.2, cohere_api_key=api_key)
        
        # Initialize Cohere Embeddings with user_agent parameter
        embeddings = CohereEmbeddings(
            model="embed-english-v3.0", 
            cohere_api_key=api_key,
            user_agent="langchain-knowledge-graph-chatbot"
        )
        return llm, embeddings

    elif provider == "Gemini":
        if not api_key:
            raise ValueError("Please provide a Gemini API key.")
        
        # Chat + Embeddings via Google Generative AI (no GCP project required)
        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-pro", 
            temperature=0.2, 
            google_api_key=api_key
        )
        embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001", 
            google_api_key=api_key
        )
        return llm, embeddings

    else:
        raise ValueError(f"Unsupported provider: {provider}")


def upsert_chunks_vector_index(
    docs: List[Document],
    embeddings,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str,
    database: str = "neo4j",
    node_label: str = "Chunk",
    text_prop: str = "text",
    embed_prop: str = "embedding",
    index_name: str = "chunk_vector_index",
    keyword_index_name: str = "chunk_keyword_index",
):
    """Create/update a Neo4j vector index with chunk nodes for retrieval."""
    # Ensure text property exists per Document
    prepared_docs = []
    for d in docs:
        content = d.page_content.strip()
        if not content:
            continue
        d.metadata = d.metadata or {}
        # Neo4jVector expects text under a known property; it will write it.
        prepared_docs.append(Document(page_content=content, metadata=d.metadata))

    if not prepared_docs:
        return None

    vs = Neo4jVector.from_documents(
        documents=prepared_docs,
        embedding=embeddings,
        url=neo4j_url,
        username=neo4j_user,
        password=neo4j_password,
        database=database,
        node_label=node_label,
        text_node_property=text_prop,
        embedding_node_property=embed_prop,
        index_name=index_name,
        keyword_index_name=keyword_index_name,
        # search_type is left at its default ("vector"); pass search_type="hybrid"
        # to combine vector similarity with the keyword index.
    )
    return vs
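
# What from_documents writes, given the settings above: one (:Chunk) node per
# chunk with its text in `text`, the vector in `embedding`, and Document
# metadata (e.g. `source`) as extra properties; the index `chunk_vector_index`
# is created if it does not already exist.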


def build_kg_with_llm(
    docs: List[Document],
    graph: Neo4jGraph,
    llm,
    allowed_nodes: List[str],
    allowed_rels: List[str],
):
    """Extract a lean, controllable KG from your documents and persist in Neo4j."""
    try:
        # Try to import json_repair, install if missing
        try:
            import json_repair
        except ImportError:
            print("Installing json-repair package...")
            import subprocess
            import sys
            subprocess.check_call([sys.executable, "-m", "pip", "install", "json-repair"])
            import json_repair
        
        transformer = LLMGraphTransformer(
            llm=llm,
            allowed_nodes=allowed_nodes,
            allowed_relationships=allowed_rels,
            node_properties=False,
            relationship_properties=False,
        )
        
        # Process documents in smaller batches to avoid token limits
        batch_size = 3
        total_batches = (len(docs) + batch_size - 1) // batch_size
        
        for i in range(0, len(docs), batch_size):
            batch = docs[i:i + batch_size]
            batch_num = (i // batch_size) + 1
            print(f"Processing batch {batch_num}/{total_batches} ({len(batch)} documents)")
            
            try:
                graph_docs = transformer.convert_to_graph_documents(batch)
                if graph_docs:
                    graph.add_graph_documents(graph_docs, include_source=True)
                    print(f"Successfully processed batch {batch_num}")
                else:
                    print(f"No graph documents generated for batch {batch_num}")
            except Exception as e:
                print(f"Error processing batch {batch_num}: {e}")
                continue
                
    except Exception as e:
        print(f"Knowledge graph extraction error: {e}")
        raise e
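
# With include_source=True, each extracted entity node is linked back to a
# Document node representing its source chunk (via a MENTIONS relationship),
# so provenance is preserved in the graph.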


def query_knowledge_graph(graph: Neo4jGraph, question: str, llm) -> str:
    """Query the knowledge graph using natural language and return results."""
    try:
        # Create a GraphCypherQAChain to query the knowledge graph
        cypher_chain = GraphCypherQAChain.from_llm(
            llm=llm,
            graph=graph,
            verbose=True,
            return_intermediate_steps=True,
            allow_dangerous_requests=True  # Allow complex queries
        )
        
        # Query the knowledge graph
        result = cypher_chain.invoke({"query": question})  # .invoke replaces the deprecated Chain.__call__
        
        # Extract the answer and query information
        answer = result.get("result", "")
        intermediate_steps = result.get("intermediate_steps", [])
        
        # Format the response with graph context
        formatted_answer = f"**Knowledge Graph Answer:**\n{answer}"
        
        # Add query information if available
        if intermediate_steps:
            for step in intermediate_steps:
                if "query" in step:
                    formatted_answer += f"\n\n*Graph Query Used:* `{step['query']}`"
        
        return formatted_answer
        
    except Exception as e:
        return f"Error querying knowledge graph: {e}"


def hybrid_retrieval_answer(
    question: str,
    graph: Neo4jGraph,
    vs: Neo4jVector,
    llm,
) -> str:
    """Combine knowledge graph querying with vector search for comprehensive answers."""
    
    # 1. Query the Knowledge Graph first
    kg_answer = query_knowledge_graph(graph, question, llm)
    
    # 2. Get vector search results
    try:
        retriever = vs.as_retriever(search_type="similarity", search_kwargs={"k": 6})
        relevant_docs = retriever.invoke(question)  # .invoke replaces the deprecated get_relevant_documents
        
        context_texts = []
        for d in relevant_docs:
            src = d.metadata.get("source", "unknown")
            snippet = d.page_content[:1200]
            context_texts.append(f"[Source: {src}] {snippet}")
            
        vector_context = "\n\n---\n\n".join(context_texts)
    except Exception as e:
        vector_context = f"Vector search error: {e}"
    
    # 3. Combine both approaches for a comprehensive answer
    combined_prompt = f"""

You are a helpful assistant that must provide comprehensive answers using BOTH knowledge graph data and document context.



KNOWLEDGE GRAPH RESULTS:

{kg_answer}



DOCUMENT CONTEXT:

{vector_context}



USER QUESTION: {question}



Instructions:

- Synthesize information from BOTH the knowledge graph and document context

- If the knowledge graph provides structured relationships, highlight those

- If the documents provide additional details, include those

- Always cite sources when possible

- If information conflicts, note the discrepancy

- If neither source has sufficient information, say so clearly



Provide a comprehensive answer that leverages both structured knowledge and document content:

"""
    
    try:
        response = llm.invoke(combined_prompt)
        if hasattr(response, "content"):
            return response.content
        return str(response)
    except Exception as e:
        return f"Error generating combined answer: {e}"


# ===============================
# Gradio Interface Functions
# ===============================

def connect_neo4j(neo4j_url: str, neo4j_user: str, neo4j_password: str) -> str:
    """Connect to Neo4j database and check for existing data."""
    try:
        app_state.graph = Neo4jGraph(url=neo4j_url, username=neo4j_user, password=neo4j_password)
        
        # Check if there's existing data in the database
        try:
            # Check for existing chunks/nodes
            chunk_count = app_state.graph.query("MATCH (n:Chunk) RETURN count(n) as count")[0]["count"]
            entity_count = app_state.graph.query("MATCH (n) WHERE NOT n:Chunk RETURN count(n) as count")[0]["count"]
            
            status_msg = f"βœ… Successfully connected to Neo4j!\n"
            status_msg += f"πŸ“Š Found {chunk_count} document chunks and {entity_count} knowledge graph entities"
            
            # Reconnecting the vector store needs embeddings (and thus an API
            # key), so just flag that existing data was found.
            if chunk_count > 0:
                status_msg += "\nπŸ’‘ Existing data detected! Please set up your LLM provider and click 'Reconnect to Existing Data' to restore full functionality."
            
            return status_msg
            
        except Exception:
            # The stats queries failed, most likely because the database is empty
            return "βœ… Successfully connected to Neo4j! (Empty database)"
            
    except Exception as e:
        return f"❌ Neo4j connection failed: {e}"


def reconnect_existing_data(
    provider: str,
    api_key: str,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str,
) -> str:
    """Reconnect to existing vector store and LLM models."""
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."
    
    try:
        # Initialize models
        llm, embeddings = init_models(provider, api_key)
        app_state.llm = llm
        app_state.embeddings = embeddings
        
        # Check if chunks exist
        chunk_count = app_state.graph.query("MATCH (n:Chunk) RETURN count(n) as count")[0]["count"]
        
        if chunk_count == 0:
            return "❌ No existing data found. Please ingest new data first."
        
        # Reconnect to existing vector store
        try:
            app_state.vs = Neo4jVector(
                embedding=embeddings,
                url=neo4j_url,
                username=neo4j_user,
                password=neo4j_password,
                database="neo4j",
                node_label="Chunk",
                text_node_property="text",
                embedding_node_property="embedding",
                index_name="chunk_vector_index",
                keyword_index_name="chunk_keyword_index",
            )
            
            # Smoke test: raises if the index or embeddings are unusable
            app_state.vs.similarity_search("test", k=1)
            
            return f"βœ… Successfully reconnected to existing data! Found {chunk_count} chunks. Vector store is ready for chat."
            
        except Exception as vs_error:
            # If vector store connection fails, try to rebuild it
            return f"⚠️ Vector store connection failed: {vs_error}. You may need to re-ingest your data."
            
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Reconnection error: {error_details}")
        return f"❌ Reconnection failed: {str(e)}"


def wipe_database() -> str:
    """Wipe the Neo4j database."""
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."
    
    try:
        app_state.graph.query("MATCH (n) DETACH DELETE n;")
        return "βœ… Database successfully wiped!"
    except Exception as e:
        return f"❌ Failed to wipe database: {e}"


def process_knowledge(
    provider: str,
    api_key: str,
    files: List[str],
    urls: str,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str,
) -> str:
    """Process files and URLs to build knowledge graph."""
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."
    
    try:
        # Initialize models
        llm, embeddings = init_models(provider, api_key)
        app_state.llm = llm
        app_state.embeddings = embeddings
        
        all_docs: List[Document] = []
        processed_files = 0
        processed_urls = 0
        
        # Process uploaded files
        if files:
            for file_path in files:
                if file_path:  # Check if file_path is not None/empty
                    try:
                        print(f"Processing file: {file_path}")
                        file_docs = load_and_split_file(file_path)
                        all_docs.extend(file_docs)
                        processed_files += 1
                        print(f"Successfully processed {file_path}: {len(file_docs)} chunks")
                    except Exception as e:
                        print(f"Failed to process file {file_path}: {e}")
                        continue
        
        # Process URLs
        if urls and urls.strip():
            url_list = [u.strip() for u in urls.splitlines() if u.strip()]
            for url in url_list:
                try:
                    print(f"Processing URL: {url}")
                    url_docs = scrape_webpage(url)
                    all_docs.extend(url_docs)
                    processed_urls += 1
                    print(f"Successfully processed {url}: {len(url_docs)} chunks")
                except Exception as e:
                    print(f"Failed to process URL {url}: {e}")
                    continue
        
        if not all_docs:
            return f"⚠️ No data extracted. Processed {processed_files} files and {processed_urls} URLs, but no usable content found."
        
        print(f"Total documents to process: {len(all_docs)}")
        
        # Build Knowledge Graph
        allowed_nodes = ["Entity", "Concept", "Person", "Organization", "Location", "Event", "Fact"]
        allowed_rels = ["RELATED_TO", "MENTIONS", "PART_OF", "CAUSES", "ASSOCIATED_WITH"]
        
        try:
            print("Building knowledge graph...")
            build_kg_with_llm(all_docs, app_state.graph, llm, allowed_nodes, allowed_rels)
            print("Knowledge graph built successfully")
        except Exception as e:
            print(f"KG extraction error: {e}")
            return f"❌ KG extraction failed: {e}"
        
        # Build Vector Index
        try:
            print("Building vector index...")
            vs = upsert_chunks_vector_index(
                docs=all_docs,
                embeddings=embeddings,
                neo4j_url=neo4j_url,
                neo4j_user=neo4j_user,
                neo4j_password=neo4j_password,
                node_label="Chunk",
                text_prop="text",
                embed_prop="embedding",
                index_name="chunk_vector_index",
                keyword_index_name="chunk_keyword_index",
            )
            app_state.vs = vs
            print("Vector index built successfully")
        except Exception as e:
            print(f"Vector indexing error: {e}")
            return f"❌ Vector indexing failed: {e}"
        
        return f"βœ… Successfully processed {processed_files} files and {processed_urls} URLs ({len(all_docs)} total chunks)! Knowledge graph and vector index are ready."
        
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Full error details: {error_details}")
        return f"❌ Processing failed: {str(e)}"


def chat_with_knowledge(message: str, history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
    """Chat function using both knowledge graph and vector search."""
    if app_state.graph is None or app_state.vs is None:
        response = "❌ Please connect to Neo4j and ingest data first."
        history.append((message, response))
        return "", history
    
    if app_state.llm is None:
        response = "❌ Model not initialized. Please process some data first."
        history.append((message, response))
        return "", history
    
    try:
        # Use hybrid approach: Knowledge Graph + Vector Search
        answer = hybrid_retrieval_answer(
            question=message,
            graph=app_state.graph,
            vs=app_state.vs,
            llm=app_state.llm
        )
        
        if not answer or not answer.strip():
            answer = "I don't have enough information to answer that based on the ingested data."
        
        history.append((message, answer))
        return "", history
        
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Chat error details: {error_details}")
        response = f"❌ Error during chat: {str(e)}"
        history.append((message, response))
        return "", history


def clear_chat_history():
    """Clear the chat history."""
    app_state.chat_history = []
    return []


# ===============================
# Gradio Interface
# ===============================

def create_interface():
    """Create the Gradio interface."""
    load_dotenv()
    
    with gr.Blocks(title="Knowledge Graph Chatbot", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# πŸ“š Knowledge Graph Chatbot (Strict)")
        gr.Markdown("Upload documents, scrape URLs, and chat with your knowledge using Neo4j and vector search!")
        
        with gr.Tab("πŸ”§ Setup & Configuration"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Model Settings")
                    provider = gr.Dropdown(
                        choices=["Cohere", "Gemini"],
                        value="Cohere",
                        label="LLM Provider"
                    )
                    api_key = gr.Textbox(
                        label="API Key",
                        type="password",
                        value=os.getenv("COHERE_API_KEY", ""),
                        placeholder="Enter your API key"
                    )
                
                with gr.Column(scale=1):
                    gr.Markdown("### Neo4j Configuration")
                    neo4j_url = gr.Textbox(
                        label="Neo4j URL",
                        value=os.getenv("NEO4J_URI", "neo4j+s://your-neo4j-url"),
                        placeholder="neo4j+s://your-neo4j-url"
                    )
                    neo4j_user = gr.Textbox(
                        label="Username",
                        value=os.getenv("NEO4J_USER", "neo4j")
                    )
                    neo4j_password = gr.Textbox(
                        label="Password",
                        type="password",
                        value=os.getenv("NEO4J_PASSWORD", "")
                    )
            
            with gr.Row():
                connect_btn = gr.Button("πŸ”— Connect to Neo4j", variant="primary")
                wipe_btn = gr.Button("πŸ—‘οΈ Wipe Database", variant="stop")
                reconnect_btn = gr.Button("πŸ”„ Reconnect to Existing Data", variant="secondary")
            
            connection_status = gr.Textbox(
                label="Connection Status",
                interactive=False,
                placeholder="Click 'Connect to Neo4j' to establish connection"
            )
            
            connect_btn.click(
                fn=connect_neo4j,
                inputs=[neo4j_url, neo4j_user, neo4j_password],
                outputs=[connection_status]
            )
            
            wipe_btn.click(
                fn=wipe_database,
                outputs=[connection_status]
            )
            
            reconnect_btn.click(
                fn=reconnect_existing_data,
                inputs=[provider, api_key, neo4j_url, neo4j_user, neo4j_password],
                outputs=[connection_status]
            )
        
        with gr.Tab("πŸ“ Data Ingestion"):
            gr.Markdown("### Upload Knowledge Sources")
            
            files = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".docx", ".doc", ".txt", ".csv", ".xls", ".xlsx"],
                file_count="multiple"
            )
            
            urls = gr.Textbox(
                label="URLs to Scrape",
                placeholder="Enter URLs, one per line",
                lines=5
            )
            
            process_btn = gr.Button("πŸš€ Process & Build Knowledge Graph", variant="primary")
            
            processing_status = gr.Textbox(
                label="Processing Status",
                interactive=False,
                placeholder="Click 'Process & Build Knowledge Graph' to start"
            )
            
            process_btn.click(
                fn=process_knowledge,
                inputs=[provider, api_key, files, urls, neo4j_url, neo4j_user, neo4j_password],
                outputs=[processing_status]
            )
        
        with gr.Tab("πŸ’¬ Chat"):
            gr.Markdown("### Chat with Your Knowledge Graph")
            gr.Markdown("Ask questions about your ingested data. The system uses **both knowledge graph queries and vector search** for comprehensive answers.")
            
            chatbot = gr.Chatbot(
                label="Knowledge Graph Chat",
                height=500,
                placeholder="Your conversation will appear here..."
            )
            
            with gr.Row():
                msg_box = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask about entities, relationships, or any content from your data...",
                    scale=4
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            
            clear_btn = gr.Button("πŸ—‘οΈ Clear Chat History", variant="secondary")
            
            # Example questions
            with gr.Accordion("πŸ’‘ Example Questions", open=False):
                gr.Markdown("""

                **Entity-based questions:**

                - "What organizations are mentioned in the documents?"

                - "Tell me about [person name] and their relationships"

                - "What events are connected to [organization]?"

                

                **Relationship queries:**

                - "How are [entity1] and [entity2] related?"

                - "What causes [concept] according to the documents?"

                - "Show me all connections to [topic]"

                

                **Content questions:**

                - "Summarize the main concepts in the documents"

                - "What are the key findings about [topic]?"

                - "Explain [concept] based on the ingested data"

                """)
            
            # Chat functionality
            msg_box.submit(
                fn=chat_with_knowledge,
                inputs=[msg_box, chatbot],
                outputs=[msg_box, chatbot]
            )
            
            send_btn.click(
                fn=chat_with_knowledge,
                inputs=[msg_box, chatbot],
                outputs=[msg_box, chatbot]
            )
            
            clear_btn.click(
                fn=clear_chat_history,
                outputs=[chatbot]
            )
        
        with gr.Tab("ℹ️ Instructions"):
            gr.Markdown("""

            ## How to Use This Knowledge Graph Chatbot

            

            ### 1. Setup & Configuration

            - Choose your LLM provider (Cohere or Gemini)

            - Enter your API key for the chosen provider

            - Configure your Neo4j connection details

            - Click "Connect to Neo4j" to establish the database connection

            

            ### 1.5. Reconnecting to Existing Data

            **If you already have data in Neo4j from a previous session:**

            - After connecting to Neo4j, if you see existing data detected

            - Set up your LLM provider and API key

            - Click "πŸ”„ Reconnect to Existing Data" instead of re-ingesting

            - This will restore your vector store and enable chat without re-processing documents

            

            ### 2. Data Ingestion

            - Upload files (PDF, DOCX, TXT, CSV, XLS, XLSX) or enter URLs to scrape

            - Click "Process & Build Knowledge Graph" to:

              - Extract text from your sources

              - Build a knowledge graph using LLM-based entity extraction

              - Create a vector index for semantic search

            

            ### 3. Chat

            - Ask questions about your ingested data

            - The chatbot will provide **strict** answers only based on your uploaded content

            - If the answer isn't in your data, it will explicitly say so

            

            ### Features

            - **Knowledge Graph Queries**: Direct Cypher queries to find entities and relationships

            - **Vector Semantic Search**: Dense vector similarity search for relevant content

            - **Hybrid Intelligence**: Combines structured graph data with unstructured document content

            - **Source Attribution**: Answers include references to source files/URLs

            - **Strict Mode**: Only answers from your ingested data, no hallucination

            - **Entity Extraction**: Automatically identifies people, organizations, locations, events

            - **Relationship Mapping**: Discovers and queries connections between entities

            - **Batch Processing**: Handles large document collections efficiently

            

            ### Requirements

            - Neo4j database (Neo4j Aura or self-hosted)

            - API key for Cohere or Google Gemini

            - Documents or URLs to process

            

            ### Required Packages for Kaggle

            Run this in a Kaggle cell before using the interface:

            ```python

            !pip install gradio langchain neo4j beautifulsoup4 requests python-dotenv

            !pip install langchain-community langchain-experimental

            !pip install langchain-google-genai cohere

            !pip install json-repair  # Required for knowledge graph extraction

            !pip install unstructured[all-docs]  # For better document parsing

            ```

            

            ### For Kaggle Notebooks

            This interface is optimized for Kaggle notebooks. Make sure to:

            1. Install required packages in your notebook

            2. Set up your API keys as environment variables or enter them in the interface

            3. Use a cloud-hosted Neo4j instance (like Neo4j Aura)
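
            Example `.env` file (placeholder values; the variable names match what this app reads via `os.getenv`):
            ```bash
            NEO4J_URI=neo4j+s://<your-instance>.databases.neo4j.io
            NEO4J_USER=neo4j
            NEO4J_PASSWORD=<your-password>
            COHERE_API_KEY=<your-cohere-key>
            ```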

            """)
    
    return demo


# ===============================
# Main Function
# ===============================

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        share=True,  # Create a public link for sharing
        debug=True,
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860
    )