samuelolubukun committed on
Commit
6acb6dd
·
verified ·
1 Parent(s): ecc5643

Full commit

Browse files
Files changed (2) hide show
  1. app.py +830 -0
  2. requirements.txt +16 -0
app.py ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import tempfile
4
+ from typing import List, Tuple, Optional
5
+ import gradio as gr
6
+ from dotenv import load_dotenv
7
+
8
+ # --- Web scraping ---
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+
12
+ # --- LangChain core ---
13
+ from langchain.schema import Document
14
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
15
+ from langchain.prompts import PromptTemplate
16
+
17
+ # --- Loaders (files) ---
18
+ from langchain_community.document_loaders import (
19
+ PyPDFLoader,
20
+ UnstructuredWordDocumentLoader,
21
+ TextLoader,
22
+ CSVLoader,
23
+ UnstructuredExcelLoader,
24
+ )
25
+
26
+ # --- Neo4j + Vector store + Graph ---
27
+ from langchain_community.graphs import Neo4jGraph
28
+ from langchain_community.vectorstores import Neo4jVector
29
+ from langchain_experimental.graph_transformers import LLMGraphTransformer
30
+ from langchain.chains.graph_qa.cypher import GraphCypherQAChain
31
+
32
+ # --- LLMs & Embeddings: Cohere or Gemini ---
33
+ # Cohere
34
+ from langchain_community.embeddings import CohereEmbeddings
35
+ from langchain_community.llms import Cohere
36
+
37
+ # Gemini (Google Generative AI API key)
38
+ from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
39
+
40
+ # Global state management
41
class AppState:
    """Mutable container for the resources shared by the Gradio callbacks.

    Every handle starts unset; the callbacks in this module fill them in
    as the user connects to Neo4j and ingests (or reconnects to) data.
    """

    def __init__(self):
        # Chat transcript slot; clear_chat_history() resets it to [].
        self.chat_history = []
        # Model handles, assigned after init_models() succeeds.
        self.embeddings = None
        self.llm = None
        # Neo4j handles: vector store and graph connection.
        self.vs: Optional[Neo4jVector] = None
        self.graph: Optional[Neo4jGraph] = None


# Single module-level instance read and written by the callbacks.
app_state = AppState()
50
+
51
+ # ===============================
52
+ # Helpers
53
+ # ===============================
54
+
55
def clean_chunks(docs: List[Document], chunk_size=800, chunk_overlap=120) -> List[Document]:
    """Break documents into overlapping chunks for retrieval.

    Args:
        docs: Documents to split.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` yields
        for ``docs``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
59
+
60
+
61
def load_and_split_file(file_path: str) -> List[Document]:
    """Load one file with the loader matching its extension and chunk it.

    Args:
        file_path: Path of the file to ingest.

    Returns:
        The chunked documents, each tagged with the file's base name under
        ``metadata["source"]``. An empty list is returned for unsupported
        extensions and whenever loading or splitting raises (the error is
        printed, not propagated).
    """
    filename = os.path.basename(file_path)
    ext = os.path.splitext(filename.lower())[1].lstrip(".")

    # Factories are lazy so that loader construction happens inside the
    # try-block below, exactly where failures are caught.
    loader_factories = {
        "pdf": lambda: PyPDFLoader(file_path),
        "docx": lambda: UnstructuredWordDocumentLoader(file_path),
        "doc": lambda: UnstructuredWordDocumentLoader(file_path),
        "txt": lambda: TextLoader(file_path, autodetect_encoding=True),
        "csv": lambda: CSVLoader(file_path, csv_args={"delimiter": ","}),
        "xlsx": lambda: UnstructuredExcelLoader(file_path, mode="elements"),
        "xls": lambda: UnstructuredExcelLoader(file_path, mode="elements"),
    }

    try:
        make_loader = loader_factories.get(ext)
        if make_loader is None:
            print(f"Unsupported file type: {ext}")
            return []

        loaded = make_loader().load()
        # Attach source metadata
        for doc in loaded:
            doc.metadata = doc.metadata or {}
            doc.metadata["source"] = filename

        return clean_chunks(loaded)
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return []
93
+
94
+
95
def scrape_webpage(url: str) -> List[Document]:
    """Scrape a single URL (no crawling), extract visible text, split into chunks.

    Args:
        url: Address of the page to fetch.

    Returns:
        Chunked documents tagged with ``{"source": url, "type": "web"}``.
        An empty list is returned when fewer than 100 characters of text
        are found or when fetching/parsing raises (the error is printed).
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        r = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Remove non-content elements before extracting text
        for unwanted in soup(["script", "style", "nav", "header", "footer"]):
            unwanted.decompose()

        # Restrict to likely content areas to reduce nav noise
        main_candidates = soup.select("main, article, section, .content, .post, .entry") or [soup.body or soup]

        # The selector also matches nested elements (e.g. a <section> inside
        # <main>). An ancestor's text already contains its descendants' text,
        # so keep only the outermost matches to avoid duplicated content.
        # Identity (id) is used because bs4 compares tags by content.
        candidate_ids = {id(node) for node in main_candidates}
        outermost = [
            node
            for node in main_candidates
            if not any(id(parent) in candidate_ids for parent in node.parents)
        ]

        texts = []
        for node in outermost:
            text = node.get_text(separator=" ", strip=True)
            if text and len(text) > 50:  # Only keep substantial text
                texts.append(text)

        joined = " ".join(texts).strip()
        if not joined or len(joined) < 100:
            return []

        base_doc = Document(page_content=joined, metadata={"source": url, "type": "web"})
        return clean_chunks([base_doc], chunk_size=800, chunk_overlap=120)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []
127
+
128
+
129
def init_models(provider: str, api_key: str):
    """Build the LLM and embedding model for the selected provider.

    Args:
        provider: Either ``"Cohere"`` or ``"Gemini"``.
        api_key: API key for that provider.

    Returns:
        A ``(llm, embeddings)`` tuple.

    Raises:
        ValueError: If the provider is not supported, or the key is empty.
    """
    if provider not in ("Cohere", "Gemini"):
        raise ValueError(f"Unsupported provider: {provider}")
    if not api_key:
        raise ValueError(f"Please provide a {provider} API key.")

    if provider == "Cohere":
        cohere_llm = Cohere(model="command", temperature=0.2, cohere_api_key=api_key)
        # Embeddings are created with an explicit user_agent parameter.
        cohere_embeddings = CohereEmbeddings(
            model="embed-english-v3.0",
            cohere_api_key=api_key,
            user_agent="langchain-knowledge-graph-chatbot",
        )
        return cohere_llm, cohere_embeddings

    # Gemini: chat + embeddings via the Google Generative AI API key.
    gemini_llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        temperature=0.2,
        google_api_key=api_key,
    )
    gemini_embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=api_key,
    )
    return gemini_llm, gemini_embeddings
164
+
165
+
166
def upsert_chunks_vector_index(
    docs: List[Document],
    embeddings,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str,
    database: str = "neo4j",
    node_label: str = "Chunk",
    text_prop: str = "text",
    embed_prop: str = "embedding",
    index_name: str = "chunk_vector_index",
    keyword_index_name: str = "chunk_keyword_index",
):
    """Store the given chunks in Neo4j through ``Neo4jVector.from_documents``.

    Args:
        docs: Chunks to index; entries whose stripped text is empty are skipped.
        embeddings: Embedding model handed to ``Neo4jVector``.
        neo4j_url, neo4j_user, neo4j_password: Connection settings.
        database: Neo4j database name.
        node_label: Label used for the chunk nodes.
        text_prop: Node property holding the chunk text.
        embed_prop: Node property holding the embedding.
        index_name: Name of the vector index.
        keyword_index_name: Name of the keyword index.

    Returns:
        The ``Neo4jVector`` store, or ``None`` when no chunk had any text.
    """
    prepared_docs = []
    for doc in docs:
        text = doc.page_content.strip()
        if text:
            doc.metadata = doc.metadata or {}
            # Re-wrap so the stored text is the stripped content.
            prepared_docs.append(Document(page_content=text, metadata=doc.metadata))

    if not prepared_docs:
        return None

    # search_type is intentionally not passed, so the library default applies.
    return Neo4jVector.from_documents(
        documents=prepared_docs,
        embedding=embeddings,
        url=neo4j_url,
        username=neo4j_user,
        password=neo4j_password,
        database=database,
        node_label=node_label,
        text_node_property=text_prop,
        embedding_node_property=embed_prop,
        index_name=index_name,
        keyword_index_name=keyword_index_name,
    )
208
+
209
+
210
def build_kg_with_llm(
    docs: List[Document],
    graph: Neo4jGraph,
    llm,
    allowed_nodes: List[str],
    allowed_rels: List[str],
):
    """Extract graph documents from ``docs`` with an LLM and store them in Neo4j.

    Documents are converted in batches of three. A batch that fails is
    reported with ``print`` and skipped; the remaining batches still run.

    Args:
        docs: Chunks to convert.
        graph: Graph connection that receives the extracted graph documents.
        llm: Model driving ``LLMGraphTransformer``.
        allowed_nodes: Node types the transformer may emit.
        allowed_rels: Relationship types the transformer may emit.

    Raises:
        Exception: Anything raised outside the per-batch handling (for
            example while building the transformer) is printed and re-raised.
    """
    try:
        # json_repair is not used directly here; it is imported only to make
        # sure it is installed (presumably required by the transformer —
        # TODO confirm). If missing, it is installed with pip on the fly.
        try:
            import json_repair  # noqa: F401
        except ImportError:
            print("Installing json-repair package...")
            import subprocess
            import sys
            subprocess.check_call([sys.executable, "-m", "pip", "install", "json-repair"])
            import json_repair  # noqa: F401

        transformer = LLMGraphTransformer(
            llm=llm,
            allowed_nodes=allowed_nodes,
            allowed_relationships=allowed_rels,
            node_properties=False,
            relationship_properties=False,
        )

        # Small batches keep each LLM call within token limits.
        batch_size = 3
        total_batches = (len(docs) + batch_size - 1) // batch_size

        for batch_num, start in enumerate(range(0, len(docs), batch_size), start=1):
            batch = docs[start:start + batch_size]
            print(f"Processing batch {batch_num}/{total_batches} ({len(batch)} documents)")

            try:
                graph_docs = transformer.convert_to_graph_documents(batch)
                if not graph_docs:
                    print(f"No graph documents generated for batch {batch_num}")
                    continue
                graph.add_graph_documents(graph_docs, include_source=True)
                print(f"Successfully processed batch {batch_num}")
            except Exception as e:
                print(f"Error processing batch {batch_num}: {e}")

    except Exception as e:
        print(f"Knowledge graph extraction error: {e}")
        raise
260
+
261
+
262
def query_knowledge_graph(graph: Neo4jGraph, question: str, llm) -> str:
    """Answer a natural-language question through a ``GraphCypherQAChain``.

    Args:
        graph: Graph connection the chain runs against.
        question: The user's question.
        llm: Model used by the chain.

    Returns:
        A Markdown string containing the chain's answer, followed by every
        query reported in the chain's intermediate steps. If anything
        raises, an ``"Error querying knowledge graph: ..."`` string is
        returned instead of propagating the exception.
    """
    try:
        cypher_chain = GraphCypherQAChain.from_llm(
            llm=llm,
            graph=graph,
            verbose=True,
            return_intermediate_steps=True,
            allow_dangerous_requests=True,  # Allow complex queries
        )

        # Use the Runnable API; calling the chain object directly is the
        # deprecated Chain.__call__ path. "query" is the chain's input key.
        result = cypher_chain.invoke({"query": question})

        answer = result.get("result", "")
        intermediate_steps = result.get("intermediate_steps", [])

        formatted_answer = f"**Knowledge Graph Answer:**\n{answer}"

        # Append each query recorded in the intermediate steps.
        for step in intermediate_steps or []:
            if "query" in step:
                formatted_answer += f"\n\n*Graph Query Used:* `{step['query']}`"

        return formatted_answer

    except Exception as e:
        return f"Error querying knowledge graph: {e}"
294
+
295
+
296
def hybrid_retrieval_answer(
    question: str,
    graph: Neo4jGraph,
    vs: Neo4jVector,
    llm
) -> str:
    """Answer a question from both the knowledge graph and the vector store.

    The graph answer and the top six similar chunks are placed in one
    prompt, which the LLM is asked to synthesise into a final answer.

    Args:
        question: The user's question.
        graph: Graph connection used for the Cypher-based answer.
        vs: Vector store used for similarity search.
        llm: Model that writes the combined answer.

    Returns:
        The model's answer text. Failures are not raised: a vector-search
        error is embedded in the prompt as context, and an LLM error is
        returned as an ``"Error generating combined answer: ..."`` string.
    """
    # 1. Query the Knowledge Graph first
    kg_answer = query_knowledge_graph(graph, question, llm)

    # 2. Get vector search results
    try:
        retriever = vs.as_retriever(search_type="similarity", search_kwargs={"k": 6})
        # invoke() replaces the deprecated get_relevant_documents().
        relevant_docs = retriever.invoke(question)

        # Each snippet is capped at 1200 characters and labelled with its source.
        context_texts = [
            f"[Source: {d.metadata.get('source', 'unknown')}] {d.page_content[:1200]}"
            for d in relevant_docs
        ]
        vector_context = "\n\n---\n\n".join(context_texts)
    except Exception as e:
        vector_context = f"Vector search error: {e}"

    # 3. Combine both approaches for a comprehensive answer
    combined_prompt = f"""
You are a helpful assistant that must provide comprehensive answers using BOTH knowledge graph data and document context.

KNOWLEDGE GRAPH RESULTS:
{kg_answer}

DOCUMENT CONTEXT:
{vector_context}

USER QUESTION: {question}

Instructions:
- Synthesize information from BOTH the knowledge graph and document context
- If the knowledge graph provides structured relationships, highlight those
- If the documents provide additional details, include those
- Always cite sources when possible
- If information conflicts, note the discrepancy
- If neither source has sufficient information, say so clearly

Provide a comprehensive answer that leverages both structured knowledge and document content:
"""

    try:
        response = llm.invoke(combined_prompt)
        # Responses exposing .content return that; anything else is
        # returned as str(response).
        if hasattr(response, "content"):
            return response.content
        return str(response)
    except Exception as e:
        return f"Error generating combined answer: {e}"
352
+
353
+
354
+ # ===============================
355
+ # Gradio Interface Functions
356
+ # ===============================
357
+
358
def connect_neo4j(neo4j_url: str, neo4j_user: str, neo4j_password: str) -> str:
    """Connect to Neo4j, store the handle in ``app_state`` and report contents.

    Args:
        neo4j_url: Connection URL.
        neo4j_user: Username.
        neo4j_password: Password.

    Returns:
        A status message for the UI: a failure message if the connection
        cannot be opened, otherwise a success message with the number of
        ``Chunk`` nodes and of all other nodes (or a warning if those
        counts could not be read).
    """
    try:
        graph = Neo4jGraph(url=neo4j_url, username=neo4j_user, password=neo4j_password)
    except Exception as e:
        return f"❌ Neo4j connection failed: {e}"

    app_state.graph = graph

    try:
        chunk_count = graph.query("MATCH (n:Chunk) RETURN count(n) as count")[0]["count"]
        entity_count = graph.query("MATCH (n) WHERE NOT n:Chunk RETURN count(n) as count")[0]["count"]
    except Exception as e:
        # An empty database yields counts of 0 rather than an exception, so a
        # failure here is a real query problem and must not be reported as
        # "empty database".
        return f"✅ Successfully connected to Neo4j!\n⚠️ Could not inspect existing data: {e}"

    status_msg = "✅ Successfully connected to Neo4j!\n"
    status_msg += f"📊 Found {chunk_count} document chunks and {entity_count} knowledge graph entities"

    if chunk_count > 0:
        # Restoring the vector store needs embeddings, which are only set up
        # once a provider is chosen, so just point the user at the next step.
        status_msg += "\n💡 Existing data detected! Please set up your LLM provider and click 'Reconnect to Existing Data' to restore full functionality."

    return status_msg
388
+
389
+
390
def reconnect_existing_data(
    provider: str,
    api_key: str,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str
) -> str:
    """Re-attach to chunks already stored in Neo4j without re-ingesting.

    Initialises the models for ``provider``, checks that ``Chunk`` nodes
    exist, opens a ``Neo4jVector`` over the existing index and runs one
    probe search. ``app_state.vs`` is updated only if the probe succeeds.

    Args:
        provider: LLM provider name passed to ``init_models``.
        api_key: API key for that provider.
        neo4j_url: Connection URL.
        neo4j_user: Username.
        neo4j_password: Password.

    Returns:
        A status message for the UI; errors are reported in the message
        rather than raised.
    """
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."

    try:
        llm, embeddings = init_models(provider, api_key)
        app_state.llm = llm
        app_state.embeddings = embeddings

        chunk_count = app_state.graph.query("MATCH (n:Chunk) RETURN count(n) as count")[0]["count"]

        if chunk_count == 0:
            return "❌ No existing data found. Please ingest new data first."

        try:
            vector_store = Neo4jVector(
                embedding=embeddings,
                url=neo4j_url,
                username=neo4j_user,
                password=neo4j_password,
                database="neo4j",
                node_label="Chunk",
                text_node_property="text",
                embedding_node_property="embedding",
                index_name="chunk_vector_index",
                keyword_index_name="chunk_keyword_index",
            )
            # Probe query: only its success matters, the result is discarded.
            vector_store.similarity_search("test", k=1)
        except Exception as vs_error:
            return f"⚠️ Vector store connection failed: {vs_error}. You may need to re-ingest your data."

        # Publish the store only after the probe succeeded, so a failed
        # reconnect never leaves an unusable store in the shared state.
        app_state.vs = vector_store
        return f"✅ Successfully reconnected to existing data! Found {chunk_count} chunks. Vector store is ready for chat."

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Reconnection error: {error_details}")
        return f"❌ Reconnection failed: {str(e)}"
442
+
443
+
444
def wipe_database() -> str:
    """Delete every node (and its relationships) from the connected database.

    Returns:
        A status message for the UI; a failure is reported in the message
        rather than raised.
    """
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."

    try:
        app_state.graph.query("MATCH (n) DETACH DELETE n;")
    except Exception as e:
        return f"❌ Failed to wipe database: {e}"

    # The chunks behind the vector store are gone; drop the handle so the
    # chat asks for fresh ingestion instead of searching an empty store.
    app_state.vs = None
    return "✅ Database successfully wiped!"
454
+
455
+
456
def process_knowledge(
    provider: str,
    api_key: str,
    files: List[str],
    urls: str,
    neo4j_url: str,
    neo4j_user: str,
    neo4j_password: str
) -> str:
    """Ingest files and URLs, then build the knowledge graph and vector index.

    Args:
        provider: LLM provider name passed to ``init_models``.
        api_key: API key for that provider.
        files: Uploaded files; each entry is a path string or an object
            exposing the path as ``.name``. May be ``None``.
        urls: Newline-separated URLs to scrape. May be empty.
        neo4j_url: Connection URL for the vector index.
        neo4j_user: Username.
        neo4j_password: Password.

    Returns:
        A status message for the UI. The success message counts only the
        sources that actually produced chunks; errors are reported in the
        message rather than raised.
    """
    if app_state.graph is None:
        return "❌ Please connect to Neo4j first."

    try:
        llm, embeddings = init_models(provider, api_key)
        app_state.llm = llm
        app_state.embeddings = embeddings

        all_docs: List[Document] = []
        # "attempted" = sources we tried; "processed" = sources that yielded
        # chunks. The loaders swallow their own errors and return [], so an
        # empty result is the only failure signal available here.
        attempted_files = attempted_urls = 0
        processed_files = processed_urls = 0

        for entry in files or []:
            file_path = entry if isinstance(entry, str) else getattr(entry, "name", None)
            if not file_path:
                continue
            attempted_files += 1
            try:
                print(f"Processing file: {file_path}")
                file_docs = load_and_split_file(file_path)
            except Exception as e:
                print(f"Failed to process file {file_path}: {e}")
                continue
            if not file_docs:
                print(f"Failed to process file {file_path}: no content extracted")
                continue
            all_docs.extend(file_docs)
            processed_files += 1
            print(f"Successfully processed {file_path}: {len(file_docs)} chunks")

        if urls and urls.strip():
            for url in (u.strip() for u in urls.splitlines() if u.strip()):
                attempted_urls += 1
                try:
                    print(f"Processing URL: {url}")
                    url_docs = scrape_webpage(url)
                except Exception as e:
                    print(f"Failed to process URL {url}: {e}")
                    continue
                if not url_docs:
                    print(f"Failed to process URL {url}: no content extracted")
                    continue
                all_docs.extend(url_docs)
                processed_urls += 1
                print(f"Successfully processed {url}: {len(url_docs)} chunks")

        if not all_docs:
            return f"⚠️ No data extracted. Processed {attempted_files} files and {attempted_urls} URLs, but no usable content found."

        print(f"Total documents to process: {len(all_docs)}")

        # Build Knowledge Graph
        allowed_nodes = ["Entity", "Concept", "Person", "Organization", "Location", "Event", "Fact"]
        allowed_rels = ["RELATED_TO", "MENTIONS", "PART_OF", "CAUSES", "ASSOCIATED_WITH"]

        try:
            print("Building knowledge graph...")
            build_kg_with_llm(all_docs, app_state.graph, llm, allowed_nodes, allowed_rels)
            print("Knowledge graph built successfully")
        except Exception as e:
            print(f"KG extraction error: {e}")
            return f"❌ KG extraction failed: {e}"

        # Build Vector Index
        try:
            print("Building vector index...")
            vs = upsert_chunks_vector_index(
                docs=all_docs,
                embeddings=embeddings,
                neo4j_url=neo4j_url,
                neo4j_user=neo4j_user,
                neo4j_password=neo4j_password,
                node_label="Chunk",
                text_prop="text",
                embed_prop="embedding",
                index_name="chunk_vector_index",
                keyword_index_name="chunk_keyword_index",
            )
        except Exception as e:
            print(f"Vector indexing error: {e}")
            return f"❌ Vector indexing failed: {e}"

        # upsert_chunks_vector_index returns None when every chunk was blank.
        if vs is None:
            return "❌ Vector indexing failed: no non-empty chunks to index"

        app_state.vs = vs
        print("Vector index built successfully")

        return f"✅ Successfully processed {processed_files} files and {processed_urls} URLs ({len(all_docs)} total chunks)! Knowledge graph and vector index are ready."

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Full error details: {error_details}")
        return f"❌ Processing failed: {str(e)}"
552
+
553
+
554
def chat_with_knowledge(message: str, history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
    """Answer one chat message and append the exchange to the history.

    Args:
        message: The user's question.
        history: Previous ``(user, assistant)`` pairs; ``None`` is treated
            as an empty history.

    Returns:
        ``("", history)``: an empty string (wired to the input box in the
        interface, which clears it) and the history with the new exchange
        appended. Errors are reported as the assistant reply, not raised.
    """
    if history is None:
        history = []

    if app_state.graph is None or app_state.vs is None:
        response = "❌ Please connect to Neo4j and ingest data first."
    elif app_state.llm is None:
        response = "❌ Model not initialized. Please process some data first."
    else:
        try:
            # Use hybrid approach: Knowledge Graph + Vector Search
            response = hybrid_retrieval_answer(
                question=message,
                graph=app_state.graph,
                vs=app_state.vs,
                llm=app_state.llm
            )
            if not response or response.strip() == "":
                response = "I don't have enough information to answer that based on the ingested data."
        except Exception as e:
            import traceback
            error_details = traceback.format_exc()
            print(f"Chat error details: {error_details}")
            response = f"❌ Error during chat: {str(e)}"

    history.append((message, response))
    return "", history
588
+
589
+
590
def clear_chat_history():
    """Reset the stored chat history and return a fresh empty list.

    The returned list is wired to the chatbot component in the interface.
    """
    app_state.chat_history = list()
    return list()
594
+
595
+
596
+ # ===============================
597
+ # Gradio Interface
598
+ # ===============================
599
+
600
def create_interface():
    """Build the Gradio Blocks app and return it (without launching).

    Environment variables are loaded with ``load_dotenv`` first and used as
    initial values for the API key and Neo4j fields. The app has four tabs:
    setup, data ingestion, chat and instructions.
    """
    load_dotenv()

    # Initial field values taken from the environment.
    default_api_key = os.getenv("COHERE_API_KEY", "")
    default_neo4j_url = os.getenv("NEO4J_URI", "neo4j+s://your-neo4j-url")
    default_neo4j_user = os.getenv("NEO4J_USER", "neo4j")
    default_neo4j_password = os.getenv("NEO4J_PASSWORD", "")

    # Long Markdown texts are kept out of the layout code for readability.
    example_questions_md = """
    **Entity-based questions:**
    - "What organizations are mentioned in the documents?"
    - "Tell me about [person name] and their relationships"
    - "What events are connected to [organization]?"

    **Relationship queries:**
    - "How are [entity1] and [entity2] related?"
    - "What causes [concept] according to the documents?"
    - "Show me all connections to [topic]"

    **Content questions:**
    - "Summarize the main concepts in the documents"
    - "What are the key findings about [topic]?"
    - "Explain [concept] based on the ingested data"
    """

    instructions_md = """
    ## How to Use This Knowledge Graph Chatbot

    ### 1. Setup & Configuration
    - Choose your LLM provider (Cohere or Gemini)
    - Enter your API key for the chosen provider
    - Configure your Neo4j connection details
    - Click "Connect to Neo4j" to establish the database connection

    ### 1.5. Reconnecting to Existing Data
    **If you already have data in Neo4j from a previous session:**
    - After connecting to Neo4j, if you see existing data detected
    - Set up your LLM provider and API key
    - Click "🔄 Reconnect to Existing Data" instead of re-ingesting
    - This will restore your vector store and enable chat without re-processing documents

    ### 2. Data Ingestion
    - Upload files (PDF, DOCX, TXT, CSV, XLS, XLSX) or enter URLs to scrape
    - Click "Process & Build Knowledge Graph" to:
    - Extract text from your sources
    - Build a knowledge graph using LLM-based entity extraction
    - Create a vector index for semantic search

    ### 3. Chat
    - Ask questions about your ingested data
    - The chatbot will provide **strict** answers only based on your uploaded content
    - If the answer isn't in your data, it will explicitly say so

    ### Features
    - **Knowledge Graph Queries**: Direct Cypher queries to find entities and relationships
    - **Vector Semantic Search**: Dense vector similarity search for relevant content
    - **Hybrid Intelligence**: Combines structured graph data with unstructured document content
    - **Source Attribution**: Answers include references to source files/URLs
    - **Strict Mode**: Only answers from your ingested data, no hallucination
    - **Entity Extraction**: Automatically identifies people, organizations, locations, events
    - **Relationship Mapping**: Discovers and queries connections between entities
    - **Batch Processing**: Handles large document collections efficiently

    ### Requirements
    - Neo4j database (Neo4j Aura or self-hosted)
    - API key for Cohere or Google Gemini
    - Documents or URLs to process

    ### Required Packages for Kaggle
    Run this in a Kaggle cell before using the interface:
    ```python
    !pip install gradio langchain neo4j beautifulsoup4 requests python-dotenv
    !pip install langchain-community langchain-experimental
    !pip install langchain-google-genai cohere
    !pip install json-repair # Required for knowledge graph extraction
    !pip install unstructured[all-docs] # For better document parsing
    ```

    ### For Kaggle Notebooks
    This interface is optimized for Kaggle notebooks. Make sure to:
    1. Install required packages in your notebook
    2. Set up your API keys as environment variables or enter them in the interface
    3. Use a cloud-hosted Neo4j instance (like Neo4j Aura)
    """

    with gr.Blocks(title="Knowledge Graph Chatbot", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 📚 Knowledge Graph Chatbot (Strict)")
        gr.Markdown("Upload documents, scrape URLs, and chat with your knowledge using Neo4j and vector search!")

        # ---- Tab 1: provider + database settings ----
        with gr.Tab("🔧 Setup & Configuration"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Model Settings")
                    provider = gr.Dropdown(
                        choices=["Cohere", "Gemini"],
                        value="Cohere",
                        label="LLM Provider",
                    )
                    api_key = gr.Textbox(
                        label="API Key",
                        type="password",
                        value=default_api_key,
                        placeholder="Enter your API key",
                    )

                with gr.Column(scale=1):
                    gr.Markdown("### Neo4j Configuration")
                    neo4j_url = gr.Textbox(
                        label="Neo4j URL",
                        value=default_neo4j_url,
                        placeholder="neo4j+s://your-neo4j-url",
                    )
                    neo4j_user = gr.Textbox(
                        label="Username",
                        value=default_neo4j_user,
                    )
                    neo4j_password = gr.Textbox(
                        label="Password",
                        type="password",
                        value=default_neo4j_password,
                    )

            with gr.Row():
                connect_btn = gr.Button("🔗 Connect to Neo4j", variant="primary")
                wipe_btn = gr.Button("🗑️ Wipe Database", variant="stop")
                reconnect_btn = gr.Button("🔄 Reconnect to Existing Data", variant="secondary")

            connection_status = gr.Textbox(
                label="Connection Status",
                interactive=False,
                placeholder="Click 'Connect to Neo4j' to establish connection",
            )

            connect_btn.click(
                fn=connect_neo4j,
                inputs=[neo4j_url, neo4j_user, neo4j_password],
                outputs=[connection_status],
            )
            wipe_btn.click(fn=wipe_database, outputs=[connection_status])
            reconnect_btn.click(
                fn=reconnect_existing_data,
                inputs=[provider, api_key, neo4j_url, neo4j_user, neo4j_password],
                outputs=[connection_status],
            )

        # ---- Tab 2: file upload / URL scraping ----
        with gr.Tab("📁 Data Ingestion"):
            gr.Markdown("### Upload Knowledge Sources")

            files = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".docx", ".doc", ".txt", ".csv", ".xls", ".xlsx"],
                file_count="multiple",
            )
            urls = gr.Textbox(
                label="URLs to Scrape",
                placeholder="Enter URLs, one per line",
                lines=5,
            )
            process_btn = gr.Button("🚀 Process & Build Knowledge Graph", variant="primary")
            processing_status = gr.Textbox(
                label="Processing Status",
                interactive=False,
                placeholder="Click 'Process & Build Knowledge Graph' to start",
            )

            process_btn.click(
                fn=process_knowledge,
                inputs=[provider, api_key, files, urls, neo4j_url, neo4j_user, neo4j_password],
                outputs=[processing_status],
            )

        # ---- Tab 3: chat ----
        with gr.Tab("💬 Chat"):
            gr.Markdown("### Chat with Your Knowledge Graph")
            gr.Markdown("Ask questions about your ingested data. The system uses **both knowledge graph queries and vector search** for comprehensive answers.")

            chatbot = gr.Chatbot(
                label="Knowledge Graph Chat",
                height=500,
                placeholder="Your conversation will appear here...",
            )

            with gr.Row():
                msg_box = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask about entities, relationships, or any content from your data...",
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            clear_btn = gr.Button("🗑️ Clear Chat History", variant="secondary")

            with gr.Accordion("💡 Example Questions", open=False):
                gr.Markdown(example_questions_md)

            # Pressing Enter in the box and clicking Send do the same thing.
            for register in (msg_box.submit, send_btn.click):
                register(
                    fn=chat_with_knowledge,
                    inputs=[msg_box, chatbot],
                    outputs=[msg_box, chatbot],
                )

            clear_btn.click(fn=clear_chat_history, outputs=[chatbot])

        # ---- Tab 4: static help text ----
        with gr.Tab("ℹ️ Instructions"):
            gr.Markdown(instructions_md)

    return demo
817
+
818
+
819
+ # ===============================
820
+ # Main Function
821
+ # ===============================
822
+
823
if __name__ == "__main__":
    # Script entry point: build the interface and start the Gradio server.
    launch_options = {
        "share": True,  # Create a public link for sharing
        "debug": True,
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,
    }
    demo = create_interface()
    demo.launch(**launch_options)
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ langchain-community
4
+ langchain-experimental
5
+ langchain-google-genai
6
+ neo4j
7
+ beautifulsoup4
8
+ requests
9
+ python-dotenv
10
+ cohere
11
+ json-repair
12
+ unstructured[all-docs]
13
+ pypdf
14
+ python-docx
15
+ openpyxl
16
+ xlrd