| import os |
| import io |
| import tempfile |
| from typing import List, Tuple, Optional |
| import gradio as gr |
| from dotenv import load_dotenv |
|
|
| |
| import requests |
| from bs4 import BeautifulSoup |
|
|
| |
| from langchain_core.documents import Document |
| from langchain_text_splitters import RecursiveCharacterTextSplitter |
| from langchain_core.prompts import PromptTemplate |
|
|
| |
| from langchain_community.document_loaders import ( |
| PyPDFLoader, |
| UnstructuredWordDocumentLoader, |
| TextLoader, |
| CSVLoader, |
| UnstructuredExcelLoader, |
| ) |
|
|
| |
| from langchain_community.graphs import Neo4jGraph |
| from langchain_community.vectorstores import Neo4jVector |
| from langchain_experimental.graph_transformers import LLMGraphTransformer |
| from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain |
|
|
| |
| |
| from langchain_community.embeddings import CohereEmbeddings |
| from langchain_community.llms import Cohere |
|
|
| |
| from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings |
|
|
| |
class AppState:
    """Mutable container for the objects shared between the UI callbacks.

    Every slot starts unset: the Neo4j graph connection, the Neo4j vector
    store, the LLM, the embedding model, and an (initially empty) chat history.
    """

    def __init__(self):
        # Model handles and history first, then the Neo4j-backed resources.
        self.chat_history = []
        self.embeddings = None
        self.llm = None
        self.vs: Optional[Neo4jVector] = None
        self.graph: Optional[Neo4jGraph] = None
|
|
# Single module-level AppState instance read and written by the callback functions in this file.
app_state = AppState()
|
|
| |
| |
| |
|
|
def clean_chunks(docs: List[Document], chunk_size=800, chunk_overlap=120) -> List[Document]:
    """Break documents into overlapping chunks for retrieval.

    Args:
        docs: Documents to split.
        chunk_size: Chunk size handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` produces
        for ``docs`` with the given settings.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
|
|
|
|
def load_and_split_file(file_path: str) -> List[Document]:
    """Read one file with the loader matching its extension and chunk it.

    The extension is taken from the lower-cased base name. Each loaded
    document gets ``metadata["source"]`` set to the file's base name before
    being split with ``clean_chunks``.

    Args:
        file_path: Path of the file to ingest.

    Returns:
        The chunked documents, or an empty list when the extension is not
        supported or anything raises while loading/splitting (the problem is
        printed, not raised).
    """
    filename = os.path.basename(file_path)
    suffix = os.path.splitext(filename.lower())[1].lstrip(".")

    # Loader factories keyed by extension; they are lambdas so that only the
    # loader actually selected is constructed (and only inside the try below).
    loader_factories = {
        "pdf": lambda: PyPDFLoader(file_path),
        "docx": lambda: UnstructuredWordDocumentLoader(file_path),
        "doc": lambda: UnstructuredWordDocumentLoader(file_path),
        "txt": lambda: TextLoader(file_path, autodetect_encoding=True),
        "csv": lambda: CSVLoader(file_path, csv_args={"delimiter": ","}),
        "xlsx": lambda: UnstructuredExcelLoader(file_path, mode="elements"),
        "xls": lambda: UnstructuredExcelLoader(file_path, mode="elements"),
    }

    try:
        make_loader = loader_factories.get(suffix)
        if make_loader is None:
            print(f"Unsupported file type: {suffix}")
            return []

        loaded = make_loader().load()

        # Tag every document with the originating file name.
        for doc in loaded:
            doc.metadata = doc.metadata or {}
            doc.metadata["source"] = filename

        return clean_chunks(loaded)
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return []
|
|
|
|
def scrape_webpage(url: str) -> List[Document]:
    """Fetch one URL (no crawling), extract its visible text and chunk it.

    ``script``, ``style``, ``nav``, ``header`` and ``footer`` elements are
    removed first. Text is then taken from content-like containers
    (``main``, ``article``, ``section``, ``.content``, ``.post``, ``.entry``),
    falling back to the body (or the whole document) when none match.
    Containers nested inside another matched container are skipped so their
    text is not emitted once per matching ancestor.

    Args:
        url: Address of the page to download.

    Returns:
        Chunked documents whose metadata carries ``source`` (the URL) and
        ``type`` (``"web"``). An empty list is returned when the request or
        parsing raises (the error is printed) or when fewer than 100
        characters of text are extracted.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        r = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Drop non-content elements before extracting text.
        for tag in soup(["script", "style", "nav", "header", "footer"]):
            tag.decompose()

        matched = soup.select("main, article, section, .content, .post, .entry")
        # Compare by identity: a <section> inside an <article> inside <main>
        # matches the selector three times, and each match's get_text()
        # already contains the text of everything nested within it.
        matched_ids = {id(node) for node in matched}
        outermost = [
            node
            for node in matched
            if not any(id(parent) in matched_ids for parent in node.parents)
        ]
        main_candidates = outermost or [soup.body or soup]

        texts = []
        for node in main_candidates:
            if node:
                text = node.get_text(separator=" ", strip=True)
                # Ignore tiny fragments.
                if text and len(text) > 50:
                    texts.append(text)

        joined = " ".join(texts).strip()
        if not joined or len(joined) < 100:
            return []

        base_doc = Document(page_content=joined, metadata={"source": url, "type": "web"})
        return clean_chunks([base_doc], chunk_size=800, chunk_overlap=120)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []
|
|
|
|
def init_models(provider: str, api_key: str):
    """Build the LLM and embedding model for the selected provider.

    Args:
        provider: ``"Cohere"`` or ``"Gemini"``.
        api_key: API key passed to both the LLM and the embeddings client.

    Returns:
        A ``(llm, embeddings)`` tuple.

    Raises:
        ValueError: If ``provider`` is not supported, or if it is supported
            but ``api_key`` is empty.
    """
    # An unknown provider is reported before the API key is looked at.
    if provider not in ("Cohere", "Gemini"):
        raise ValueError(f"Unsupported provider: {provider}")
    if not api_key:
        raise ValueError(f"Please provide a {provider} API key.")

    if provider == "Cohere":
        cohere_llm = Cohere(model="command", temperature=0.2, cohere_api_key=api_key)
        cohere_embeddings = CohereEmbeddings(
            model="embed-english-v3.0",
            cohere_api_key=api_key,
            user_agent="langchain-knowledge-graph-chatbot"
        )
        return cohere_llm, cohere_embeddings

    gemini_llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        temperature=0.2,
        google_api_key=api_key
    )
    gemini_embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=api_key
    )
    return gemini_llm, gemini_embeddings
|
|
|
|
def upsert_chunks_vector_index(
    docs: List[Document],
    embeddings,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str,
    database: str = "neo4j",
    node_label: str = "Chunk",
    text_prop: str = "text",
    embed_prop: str = "embedding",
    index_name: str = "chunk_vector_index",
    keyword_index_name: str = "chunk_keyword_index",
):
    """Store chunk documents in Neo4j through ``Neo4jVector.from_documents``.

    Documents whose stripped content is empty are skipped. For the rest, a
    missing/empty ``metadata`` on the input document is replaced by ``{}``
    and a new ``Document`` with the stripped text and that metadata is built.

    Args:
        docs: Chunk documents to store.
        embeddings: Embedding model handed to the vector store.
        neo4j_url: Neo4j connection URL.
        neo4j_user: Neo4j user name.
        neo4j_password: Neo4j password.
        database: Database name passed to the vector store.
        node_label: Label for the chunk nodes.
        text_prop: Node property holding the chunk text.
        embed_prop: Node property holding the embedding.
        index_name: Name of the vector index.
        keyword_index_name: Name of the keyword index.

    Returns:
        The ``Neo4jVector`` instance, or ``None`` when no document has any
        non-blank content.
    """
    usable = []
    for doc in docs:
        text = doc.page_content.strip()
        if text:
            doc.metadata = doc.metadata or {}
            usable.append(Document(page_content=text, metadata=doc.metadata))

    if len(usable) == 0:
        return None

    return Neo4jVector.from_documents(
        documents=usable,
        embedding=embeddings,
        url=neo4j_url,
        username=neo4j_user,
        password=neo4j_password,
        database=database,
        node_label=node_label,
        text_node_property=text_prop,
        embedding_node_property=embed_prop,
        index_name=index_name,
        keyword_index_name=keyword_index_name,
    )
|
|
|
|
def build_kg_with_llm(
    docs: List[Document],
    graph: Neo4jGraph,
    llm,
    allowed_nodes: List[str],
    allowed_rels: List[str],
):
    """Extract graph documents from ``docs`` with an LLM and store them in Neo4j.

    Documents are converted in batches of three. A batch that raises is
    reported on stdout and skipped; the remaining batches are still processed.

    Args:
        docs: Documents to convert.
        graph: Neo4j graph that receives the extracted graph documents.
        llm: Model handed to ``LLMGraphTransformer``.
        allowed_nodes: Node types the transformer may emit.
        allowed_rels: Relationship types the transformer may emit.

    Raises:
        Exception: Anything raised outside the per-batch handling (for
            example while installing ``json-repair`` or constructing the
            transformer) is printed and re-raised.
    """
    try:
        # json_repair is not referenced below; the import only checks that the
        # package is present and pip-installs it into the running interpreter
        # when it is not (presumably needed by the transformer - verify).
        try:
            import json_repair
        except ImportError:
            print("Installing json-repair package...")
            import subprocess
            import sys
            subprocess.check_call([sys.executable, "-m", "pip", "install", "json-repair"])
            import json_repair

        transformer = LLMGraphTransformer(
            llm=llm,
            allowed_nodes=allowed_nodes,
            allowed_relationships=allowed_rels,
            node_properties=False,
            relationship_properties=False,
        )

        batch_size = 3
        # Ceiling division without importing math.
        total_batches = -(-len(docs) // batch_size)

        for batch_num, start in enumerate(range(0, len(docs), batch_size), start=1):
            batch = docs[start:start + batch_size]
            print(f"Processing batch {batch_num}/{total_batches} ({len(batch)} documents)")

            try:
                graph_docs = transformer.convert_to_graph_documents(batch)
                if not graph_docs:
                    print(f"No graph documents generated for batch {batch_num}")
                else:
                    graph.add_graph_documents(graph_docs, include_source=True)
                    print(f"Successfully processed batch {batch_num}")
            except Exception as e:
                # Best effort: one failing batch must not abort the rest.
                print(f"Error processing batch {batch_num}: {e}")

    except Exception as e:
        print(f"Knowledge graph extraction error: {e}")
        raise e
|
|
|
|
def query_knowledge_graph(graph: Neo4jGraph, question: str, llm) -> str:
    """Answer a natural-language question through a ``GraphCypherQAChain``.

    A new chain is created on every call with ``allow_dangerous_requests=True``
    and ``return_intermediate_steps=True``.

    Args:
        graph: Neo4j graph the chain queries.
        question: The user's question.
        llm: Model handed to ``GraphCypherQAChain.from_llm``.

    Returns:
        A Markdown string starting with ``**Knowledge Graph Answer:**``
        followed by the chain's result and, for every intermediate step that
        has a ``"query"`` key, the query text. If anything raises, a string
        starting with ``Error querying knowledge graph:`` is returned instead.
    """
    try:
        cypher_chain = GraphCypherQAChain.from_llm(
            llm=llm,
            graph=graph,
            verbose=True,
            return_intermediate_steps=True,
            allow_dangerous_requests=True
        )

        # Use the Runnable entry point (as the rest of the file does with
        # llm.invoke) rather than the deprecated Chain.__call__; the chain's
        # input key is "query".
        result = cypher_chain.invoke({"query": question})

        answer = result.get("result", "")
        intermediate_steps = result.get("intermediate_steps", [])

        formatted_answer = f"**Knowledge Graph Answer:**\n{answer}"

        # Append each query found in the intermediate steps.
        if intermediate_steps:
            for step in intermediate_steps:
                if "query" in step:
                    formatted_answer += f"\n\n*Graph Query Used:* `{step['query']}`"

        return formatted_answer

    except Exception as e:
        return f"Error querying knowledge graph: {e}"
|
|
|
|
def hybrid_retrieval_answer(
    question: str,
    graph: Neo4jGraph,
    vs: Neo4jVector,
    llm
) -> str:
    """Answer a question from both the knowledge graph and the vector index.

    The graph answer comes from ``query_knowledge_graph``; the document
    context is the top 6 similarity hits from ``vs``, each truncated to 1200
    characters and prefixed with its ``source`` metadata. Both are placed in
    one prompt that is sent to ``llm``.

    Args:
        question: The user's question.
        graph: Neo4j graph used for the graph query.
        vs: Vector store used for similarity retrieval.
        llm: Model that generates the final answer.

    Returns:
        The model's answer (its ``content`` attribute when present, otherwise
        ``str(response)``). If the vector search raises, its error text is
        used as the document context; if the final model call raises, a string
        starting with ``Error generating combined answer:`` is returned.
    """
    # Graph side: returns formatted text, or an error string on failure.
    kg_answer = query_knowledge_graph(graph, question, llm)

    # Vector side.
    try:
        retriever = vs.as_retriever(search_type="similarity", search_kwargs={"k": 6})
        # invoke() is the Runnable entry point; get_relevant_documents() is deprecated.
        relevant_docs = retriever.invoke(question)

        context_texts = []
        for d in relevant_docs:
            src = d.metadata.get("source", "unknown")
            snippet = d.page_content[:1200]
            context_texts.append(f"[Source: {src}] {snippet}")

        vector_context = "\n\n---\n\n".join(context_texts)
    except Exception as e:
        vector_context = f"Vector search error: {e}"

    combined_prompt = f"""
You are a helpful assistant that must provide comprehensive answers using BOTH knowledge graph data and document context.

KNOWLEDGE GRAPH RESULTS:
{kg_answer}

DOCUMENT CONTEXT:
{vector_context}

USER QUESTION: {question}

Instructions:
- Synthesize information from BOTH the knowledge graph and document context
- If the knowledge graph provides structured relationships, highlight those
- If the documents provide additional details, include those
- Always cite sources when possible
- If information conflicts, note the discrepancy
- If neither source has sufficient information, say so clearly

Provide a comprehensive answer that leverages both structured knowledge and document content:
"""

    try:
        response = llm.invoke(combined_prompt)
        # Some models return an object with a .content attribute; others do not.
        if hasattr(response, "content"):
            return response.content
        return str(response)
    except Exception as e:
        return f"Error generating combined answer: {e}"
|
|
|
|
| |
| |
| |
|
|
def connect_neo4j(neo4j_url: str, neo4j_user: str, neo4j_password: str) -> str:
    """Connect to Neo4j, keep the connection in ``app_state`` and report what is stored.

    Args:
        neo4j_url: Neo4j connection URL.
        neo4j_user: Neo4j user name.
        neo4j_password: Neo4j password.

    Returns:
        A status message: the connection error if ``Neo4jGraph`` cannot be
        created; otherwise a success message with the number of ``Chunk``
        nodes and of all other nodes, plus a reconnect hint when chunks
        exist. If the connection works but the count queries fail, the
        success message carries that error.
    """
    try:
        app_state.graph = Neo4jGraph(url=neo4j_url, username=neo4j_user, password=neo4j_password)
    except Exception as e:
        return f"❌ Neo4j connection failed: {e}"

    try:
        chunk_count = app_state.graph.query("MATCH (n:Chunk) RETURN count(n) as count")[0]["count"]
        entity_count = app_state.graph.query("MATCH (n) WHERE NOT n:Chunk RETURN count(n) as count")[0]["count"]
    except Exception as e:
        # An empty database yields counts of 0 rather than an error, so a
        # failure here is a real problem and is surfaced instead of being
        # reported as "empty database".
        return f"✅ Successfully connected to Neo4j! (Could not inspect existing data: {e})"

    status_msg = "✅ Successfully connected to Neo4j!\n"
    status_msg += f"📊 Found {chunk_count} document chunks and {entity_count} knowledge graph entities"

    if chunk_count > 0:
        status_msg += "\n💡 Existing data detected! Please set up your LLM provider and click 'Reconnect to Existing Data' to restore full functionality."

    return status_msg
|
|
|
|
def reconnect_existing_data(
    provider: str,
    api_key: str,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str
) -> str:
    """Re-create the models and attach to the chunk index already in Neo4j.

    Requires ``app_state.graph`` to be set. On success ``app_state.llm``,
    ``app_state.embeddings`` and ``app_state.vs`` are populated; the vector
    store is probed with one similarity search before success is reported.

    Args:
        provider: LLM provider name passed to ``init_models``.
        api_key: API key passed to ``init_models``.
        neo4j_url: Neo4j connection URL for the vector store.
        neo4j_user: Neo4j user name for the vector store.
        neo4j_password: Neo4j password for the vector store.

    Returns:
        A status message describing success or the reason for failure.
    """
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."

    try:
        # Models first: they are stored even if the steps below fail.
        app_state.llm, app_state.embeddings = init_models(provider, api_key)

        rows = app_state.graph.query("MATCH (n:Chunk) RETURN count(n) as count")
        chunk_count = rows[0]["count"]
        if chunk_count == 0:
            return "❌ No existing data found. Please ingest new data first."

        try:
            app_state.vs = Neo4jVector(
                embedding=app_state.embeddings,
                url=neo4j_url,
                username=neo4j_user,
                password=neo4j_password,
                database="neo4j",
                node_label="Chunk",
                text_node_property="text",
                embedding_node_property="embedding",
                index_name="chunk_vector_index",
                keyword_index_name="chunk_keyword_index",
            )

            # Probe query; only whether it raises matters, not its result.
            app_state.vs.similarity_search("test", k=1)

            return f"✅ Successfully reconnected to existing data! Found {chunk_count} chunks. Vector store is ready for chat."
        except Exception as vs_error:
            return f"⚠️ Vector store connection failed: {vs_error}. You may need to re-ingest your data."

    except Exception as e:
        import traceback
        print(f"Reconnection error: {traceback.format_exc()}")
        return f"❌ Reconnection failed: {str(e)}"
|
|
|
|
def wipe_database() -> str:
    """Delete every node (and its relationships) from the connected Neo4j database.

    Returns:
        A status message: a prompt to connect first when no graph is stored
        in ``app_state``, otherwise success or the error raised by the query.
    """
    graph = app_state.graph
    if graph is None:
        return "❌ Please connect to Neo4j first."

    try:
        graph.query("MATCH (n) DETACH DELETE n;")
    except Exception as e:
        return f"❌ Failed to wipe database: {e}"
    return "✅ Database successfully wiped!"
|
|
|
|
def process_knowledge(
    provider: str,
    api_key: str,
    files: List[str],
    urls: str,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str
) -> str:
    """Ingest files and URLs, then build the knowledge graph and the vector index.

    Requires ``app_state.graph`` to be set. The models created here are
    stored in ``app_state.llm`` / ``app_state.embeddings`` and the resulting
    vector store in ``app_state.vs``.

    Args:
        provider: LLM provider name passed to ``init_models``.
        api_key: API key passed to ``init_models``.
        files: Paths of the uploaded files (may be empty or ``None``).
        urls: URLs to scrape, one per line (may be empty or ``None``).
        neo4j_url: Neo4j connection URL for the vector index.
        neo4j_user: Neo4j user name for the vector index.
        neo4j_password: Neo4j password for the vector index.

    Returns:
        A status message describing success or the stage that failed.
    """
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."

    try:
        llm, embeddings = init_models(provider, api_key)
        app_state.llm = llm
        app_state.embeddings = embeddings

        all_docs: List[Document] = []
        processed_files = 0
        processed_urls = 0

        # --- files ---
        for file_path in files or []:
            if not file_path:
                continue
            try:
                print(f"Processing file: {file_path}")
                file_docs = load_and_split_file(file_path)
                all_docs.extend(file_docs)
                processed_files += 1
                print(f"Successfully processed {file_path}: {len(file_docs)} chunks")
            except Exception as e:
                print(f"Failed to process file {file_path}: {e}")

        # --- URLs: one per line, blank lines ignored ---
        url_list = [line.strip() for line in (urls or "").splitlines() if line.strip()]
        for url in url_list:
            try:
                print(f"Processing URL: {url}")
                url_docs = scrape_webpage(url)
                all_docs.extend(url_docs)
                processed_urls += 1
                print(f"Successfully processed {url}: {len(url_docs)} chunks")
            except Exception as e:
                print(f"Failed to process URL {url}: {e}")

        if len(all_docs) == 0:
            return f"⚠️ No data extracted. Processed {processed_files} files and {processed_urls} URLs, but no usable content found."

        print(f"Total documents to process: {len(all_docs)}")

        # Schema the LLM extraction is restricted to.
        allowed_nodes = ["Entity", "Concept", "Person", "Organization", "Location", "Event", "Fact"]
        allowed_rels = ["RELATED_TO", "MENTIONS", "PART_OF", "CAUSES", "ASSOCIATED_WITH"]

        # --- knowledge graph ---
        try:
            print("Building knowledge graph...")
            build_kg_with_llm(all_docs, app_state.graph, llm, allowed_nodes, allowed_rels)
            print("Knowledge graph built successfully")
        except Exception as e:
            print(f"KG extraction error: {e}")
            return f"❌ KG extraction failed: {e}"

        # --- vector index ---
        try:
            print("Building vector index...")
            app_state.vs = upsert_chunks_vector_index(
                docs=all_docs,
                embeddings=embeddings,
                neo4j_url=neo4j_url,
                neo4j_user=neo4j_user,
                neo4j_password=neo4j_password,
                node_label="Chunk",
                text_prop="text",
                embed_prop="embedding",
                index_name="chunk_vector_index",
                keyword_index_name="chunk_keyword_index",
            )
            print("Vector index built successfully")
        except Exception as e:
            print(f"Vector indexing error: {e}")
            return f"❌ Vector indexing failed: {e}"

        return f"✅ Successfully processed {processed_files} files and {processed_urls} URLs ({len(all_docs)} total chunks)! Knowledge graph and vector index are ready."

    except Exception as e:
        import traceback
        print(f"Full error details: {traceback.format_exc()}")
        return f"❌ Processing failed: {str(e)}"
|
|
|
|
def chat_with_knowledge(message: str, history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
    """Handle one chat turn using the knowledge graph and the vector index.

    Args:
        message: The user's question. A missing or blank message is ignored.
        history: Prior ``(user, assistant)`` turns; ``None`` is treated as an
            empty history.

    Returns:
        ``("", history)``: an empty string for the input box, and the history
        with the new ``(message, response)`` turn appended. When the message
        is blank the history is returned without a new turn. Setup problems
        and exceptions are reported as the assistant's response, not raised.
    """
    if history is None:
        history = []

    # Nothing to ask: skip the graph query, vector search and model call.
    if not message or not message.strip():
        return "", history

    if app_state.graph is None or app_state.vs is None:
        response = "❌ Please connect to Neo4j and ingest data first."
    elif app_state.llm is None:
        response = "❌ Model not initialized. Please process some data first."
    else:
        try:
            response = hybrid_retrieval_answer(
                question=message,
                graph=app_state.graph,
                vs=app_state.vs,
                llm=app_state.llm
            )
            if not response or response.strip() == "":
                response = "I don't have enough information to answer that based on the ingested data."
        except Exception as e:
            import traceback
            print(f"Chat error details: {traceback.format_exc()}")
            response = f"❌ Error during chat: {str(e)}"

    history.append((message, response))
    return "", history
|
|
|
|
def clear_chat_history():
    """Reset the history stored in ``app_state`` and return an empty list.

    The two lists are separate objects.
    """
    app_state.chat_history = list()
    return list()
|
|
|
|
| |
| |
| |
|
|
def create_interface():
    """Create the Gradio interface.

    Loads environment variables with ``load_dotenv`` (they provide default
    values for the API key and Neo4j fields), builds a ``gr.Blocks`` app with
    four tabs (setup, data ingestion, chat, instructions) and wires the
    buttons to the callback functions defined in this module.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    load_dotenv()

    with gr.Blocks(title="Knowledge Graph Chatbot", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 📚 Knowledge Graph Chatbot (Strict)")
        gr.Markdown("Upload documents, scrape URLs, and chat with your knowledge using Neo4j and vector search!")

        # ---- Tab 1: model provider, API key and Neo4j connection ----
        with gr.Tab("🔧 Setup & Configuration"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Model Settings")
                    provider = gr.Dropdown(
                        choices=["Cohere", "Gemini"],
                        value="Cohere",
                        label="LLM Provider"
                    )
                    # Pre-filled from COHERE_API_KEY whichever provider is selected.
                    api_key = gr.Textbox(
                        label="API Key",
                        type="password",
                        value=os.getenv("COHERE_API_KEY", ""),
                        placeholder="Enter your API key"
                    )

                with gr.Column(scale=1):
                    gr.Markdown("### Neo4j Configuration")
                    neo4j_url = gr.Textbox(
                        label="Neo4j URL",
                        value=os.getenv("NEO4J_URI", "neo4j+s://your-neo4j-url"),
                        placeholder="neo4j+s://your-neo4j-url"
                    )
                    neo4j_user = gr.Textbox(
                        label="Username",
                        value=os.getenv("NEO4J_USER", "neo4j")
                    )
                    neo4j_password = gr.Textbox(
                        label="Password",
                        type="password",
                        value=os.getenv("NEO4J_PASSWORD", "")
                    )

            with gr.Row():
                connect_btn = gr.Button("🔗 Connect to Neo4j", variant="primary")
                wipe_btn = gr.Button("🗑️ Wipe Database", variant="stop")
                reconnect_btn = gr.Button("🔄 Reconnect to Existing Data", variant="secondary")

            # All three buttons report into this one status box.
            connection_status = gr.Textbox(
                label="Connection Status",
                interactive=False,
                placeholder="Click 'Connect to Neo4j' to establish connection"
            )

            connect_btn.click(
                fn=connect_neo4j,
                inputs=[neo4j_url, neo4j_user, neo4j_password],
                outputs=[connection_status]
            )

            # wipe_database takes no inputs; it uses the connection held in app_state.
            wipe_btn.click(
                fn=wipe_database,
                outputs=[connection_status]
            )

            reconnect_btn.click(
                fn=reconnect_existing_data,
                inputs=[provider, api_key, neo4j_url, neo4j_user, neo4j_password],
                outputs=[connection_status]
            )

        # ---- Tab 2: upload files / list URLs and run ingestion ----
        with gr.Tab("📁 Data Ingestion"):
            gr.Markdown("### Upload Knowledge Sources")

            # Same extensions that load_and_split_file knows how to load.
            files = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".docx", ".doc", ".txt", ".csv", ".xls", ".xlsx"],
                file_count="multiple"
            )

            urls = gr.Textbox(
                label="URLs to Scrape",
                placeholder="Enter URLs, one per line",
                lines=5
            )

            process_btn = gr.Button("🚀 Process & Build Knowledge Graph", variant="primary")

            processing_status = gr.Textbox(
                label="Processing Status",
                interactive=False,
                placeholder="Click 'Process & Build Knowledge Graph' to start"
            )

            # Input order must match the parameter order of process_knowledge.
            process_btn.click(
                fn=process_knowledge,
                inputs=[provider, api_key, files, urls, neo4j_url, neo4j_user, neo4j_password],
                outputs=[processing_status]
            )

        # ---- Tab 3: chat over the ingested data ----
        with gr.Tab("💬 Chat"):
            gr.Markdown("### Chat with Your Knowledge Graph")
            gr.Markdown("Ask questions about your ingested data. The system uses **both knowledge graph queries and vector search** for comprehensive answers.")

            chatbot = gr.Chatbot(
                label="Knowledge Graph Chat",
                height=500,
                placeholder="Your conversation will appear here..."
            )

            with gr.Row():
                msg_box = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask about entities, relationships, or any content from your data...",
                    scale=4
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            clear_btn = gr.Button("🗑️ Clear Chat History", variant="secondary")

            # Static help text; the bracketed names are placeholders for the user to replace.
            with gr.Accordion("💡 Example Questions", open=False):
                gr.Markdown("""
**Entity-based questions:**
- "What organizations are mentioned in the documents?"
- "Tell me about [person name] and their relationships"
- "What events are connected to [organization]?"

**Relationship queries:**
- "How are [entity1] and [entity2] related?"
- "What causes [concept] according to the documents?"
- "Show me all connections to [topic]"

**Content questions:**
- "Summarize the main concepts in the documents"
- "What are the key findings about [topic]?"
- "Explain [concept] based on the ingested data"
""")

            # Enter in the text box and the Send button run the same handler;
            # its first return value ("") clears the box, the second updates the chat.
            msg_box.submit(
                fn=chat_with_knowledge,
                inputs=[msg_box, chatbot],
                outputs=[msg_box, chatbot]
            )

            send_btn.click(
                fn=chat_with_knowledge,
                inputs=[msg_box, chatbot],
                outputs=[msg_box, chatbot]
            )

            # clear_chat_history returns [] which empties the chatbot component.
            clear_btn.click(
                fn=clear_chat_history,
                outputs=[chatbot]
            )

        # ---- Tab 4: static usage instructions ----
        with gr.Tab("ℹ️ Instructions"):
            gr.Markdown("""
## How to Use This Knowledge Graph Chatbot

### 1. Setup & Configuration
- Choose your LLM provider (Cohere or Gemini)
- Enter your API key for the chosen provider
- Configure your Neo4j connection details
- Click "Connect to Neo4j" to establish the database connection

### 1.5. Reconnecting to Existing Data
**If you already have data in Neo4j from a previous session:**
- After connecting to Neo4j, if you see existing data detected
- Set up your LLM provider and API key
- Click "🔄 Reconnect to Existing Data" instead of re-ingesting
- This will restore your vector store and enable chat without re-processing documents

### 2. Data Ingestion
- Upload files (PDF, DOCX, TXT, CSV, XLS, XLSX) or enter URLs to scrape
- Click "Process & Build Knowledge Graph" to:
- Extract text from your sources
- Build a knowledge graph using LLM-based entity extraction
- Create a vector index for semantic search

### 3. Chat
- Ask questions about your ingested data
- The chatbot will provide **strict** answers only based on your uploaded content
- If the answer isn't in your data, it will explicitly say so

### Features
- **Knowledge Graph Queries**: Direct Cypher queries to find entities and relationships
- **Vector Semantic Search**: Dense vector similarity search for relevant content
- **Hybrid Intelligence**: Combines structured graph data with unstructured document content
- **Source Attribution**: Answers include references to source files/URLs
- **Strict Mode**: Only answers from your ingested data, no hallucination
- **Entity Extraction**: Automatically identifies people, organizations, locations, events
- **Relationship Mapping**: Discovers and queries connections between entities
- **Batch Processing**: Handles large document collections efficiently

### Requirements
- Neo4j database (Neo4j Aura or self-hosted)
- API key for Cohere or Google Gemini
- Documents or URLs to process

### Required Packages for Kaggle
Run this in a Kaggle cell before using the interface:
```python
!pip install gradio langchain neo4j beautifulsoup4 requests python-dotenv
!pip install langchain-community langchain-experimental
!pip install langchain-google-genai cohere
!pip install json-repair # Required for knowledge graph extraction
!pip install unstructured[all-docs] # For better document parsing
```

### For Kaggle Notebooks
This interface is optimized for Kaggle notebooks. Make sure to:
1. Install required packages in your notebook
2. Set up your API keys as environment variables or enter them in the interface
3. Use a cloud-hosted Neo4j instance (like Neo4j Aura)
""")

    return demo
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Launch settings: request a share link, enable debug mode and listen on
    # all interfaces (0.0.0.0) at port 7860.
    launch_options = {
        "share": True,
        "debug": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo = create_interface()
    demo.launch(**launch_options)