"""
Knowledge Graph Builder Script

This script extends the FAISS index building process to create knowledge graphs
from document analysis. It extracts entities and relationships from processed
documents and builds NetworkX graphs that can be stored in the repo and loaded
efficiently in Streamlit Cloud.

The graph building process:
1. Load existing FAISS indices and document chunks
2. Extract entities (companies, people, contracts, etc.) using NER
3. Identify relationships between entities using pattern matching and AI
4. Build NetworkX graphs with rich metadata
5. Serialize graphs to files for fast loading in Streamlit

Run this after build_indexes.py to generate knowledge graphs.
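
Usage:
    python scripts/build_knowledge_graphs.py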
"""

import sys
import json
import pickle
from pathlib import Path
from typing import Dict, List, Any, Optional
from collections import defaultdict
from datetime import datetime

from tqdm import tqdm

import networkx as nx

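# ANSI color codes for readable console output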
RED = '\033[0;31m'
GREEN = '\033[0;32m'
YELLOW = '\033[1;33m'
BLUE = '\033[0;34m'
NC = '\033[0m'

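# Make the project root importable so the app and scripts packages resolve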
sys.path.insert(0, str(Path(__file__).parent.parent))

from app.core.config import get_config
from app.core.logging import setup_logging
from app.core.utils import create_document_processor
from app.core.entity_resolution import EntityResolver
from app.core.legal_coreference import LegalCoreferenceResolver
from scripts.transformer_extractors import TransformerEntityExtractor

logger = setup_logging("build_knowledge_graphs", log_level="INFO")


class KnowledgeGraphBuilder:
    """Build NetworkX knowledge graphs from extracted entities and relationships."""

    def __init__(self, store_name: str):
        self.store_name = store_name
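        # A MultiDiGraph allows parallel edges, so one entity pair can carry
        # multiple distinct relationships from different source documents.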
        self.graph = nx.MultiDiGraph()

    def build_graph(self, entities: Dict[str, List[Dict]], relationships: List[Dict]) -> nx.MultiDiGraph:
        """Build knowledge graph from entities and relationships."""
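        # Add one node per entity; ids are namespaced as "type:name" so that
        # entities of different types with the same name do not collide.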
print(f"{BLUE}Adding entity nodes...{NC}") |
|
|
for entity_type, entity_list in entities.items(): |
|
|
for entity in tqdm(entity_list, desc=f"Adding {entity_type}"): |
|
|
|
|
|
if 'name' not in entity or not entity['name']: |
|
|
continue |
|
|
|
|
|
node_id = f"{entity_type}:{entity['name']}" |
|
|
|
|
|
|
|
|
self.graph.add_node(node_id, |
|
|
name=entity['name'], |
|
|
type=entity_type, |
|
|
sources=entity.get('source', ''), |
|
|
document_type=entity.get('document_type', 'unknown'), |
|
|
context_samples=[entity.get('context', '')], |
|
|
first_seen=datetime.now().isoformat() |
|
|
) |
|
|
|
|
|
|
|
|
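        # Match relationship endpoints to nodes by case-insensitive substring,
        # since extracted mentions rarely match resolved node ids exactly.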
print(f"{BLUE}Adding relationship edges...{NC}") |
|
|
for rel in tqdm(relationships, desc="Adding relationships"): |
|
|
|
|
|
source_nodes = [n for n in self.graph.nodes() if rel['source_entity'].lower() in n.lower()] |
|
|
target_nodes = [n for n in self.graph.nodes() if rel['target_entity'].lower() in n.lower()] |
|
|
|
|
|
for source_node in source_nodes: |
|
|
for target_node in target_nodes: |
|
|
if source_node != target_node: |
|
|
self.graph.add_edge( |
|
|
source_node, |
|
|
target_node, |
|
|
relationship=rel['relationship_type'], |
|
|
source_document=rel['source_document'], |
|
|
context=rel['context'], |
|
|
confidence=rel['confidence'] |
|
|
) |
|
|
|
|
|
|
|
|
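        # Attach store-level metadata so downstream consumers can inspect
        # graph provenance without loading anything else.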
        self.graph.graph.update({
            'store_name': self.store_name,
            'created_at': datetime.now().isoformat(),
            'num_entities': len(self.graph.nodes()),
            'num_relationships': len(self.graph.edges()),
            'entity_types': list(entities.keys())
        })

        return self.graph

    def compute_graph_metrics(self) -> Dict[str, Any]:
        """Compute useful graph metrics for analysis."""
        metrics = {
            'num_nodes': len(self.graph.nodes()),
            'num_edges': len(self.graph.edges()),
            'density': nx.density(self.graph),
            # is_weakly_connected raises on an empty graph, so guard for that
            'is_connected': nx.is_weakly_connected(self.graph) if len(self.graph) > 0 else False,
        }

        if len(self.graph.nodes()) > 1:
            try:
                centrality = nx.degree_centrality(self.graph)
                metrics['top_central_entities'] = sorted(
                    centrality.items(), key=lambda x: x[1], reverse=True
                )[:10]
            except Exception:
                metrics['top_central_entities'] = []

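        # Tally how many nodes exist for each entity type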
        entity_types = defaultdict(int)
        for node in self.graph.nodes():
            node_type = self.graph.nodes[node].get('type', 'unknown')
            entity_types[node_type] += 1
        metrics['entity_distribution'] = dict(entity_types)

        return metrics


def process_company_knowledge_graph(store_name: str, config) -> Optional[Dict[str, Any]]:
    """Process a single company's knowledge graph."""
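    # Classify the store from its naming convention for clearer log output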
store_type = "unknown" |
|
|
if "summit-digital-solutions" in store_name or "deepshield-systems" in store_name: |
|
|
store_type = "company data room" |
|
|
elif "questions" in store_name: |
|
|
store_type = "due diligence questions" |
|
|
elif "checklist" in store_name: |
|
|
store_type = "due diligence checklist" |
|
|
|
|
|
print(f"\n{GREEN}Processing knowledge graph for: {store_name} ({store_type}){NC}") |

    try:
        document_processor = create_document_processor(store_name=store_name)

        if not document_processor.vector_store:
            print(f"{YELLOW}⚠️ No FAISS index found for {store_name}, skipping...{NC}")
            return None

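        # Prefer chunks cached on the processor; otherwise rebuild them from
        # the FAISS docstore (note: `_dict` is a private LangChain attribute)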
        chunks = []
        if hasattr(document_processor, 'chunks') and document_processor.chunks:
            chunks = document_processor.chunks
        else:
            for i, doc in enumerate(document_processor.vector_store.docstore._dict.values()):
                chunks.append({
                    'text': doc.page_content,
                    'source': doc.metadata.get('name', f'doc_{i}'),
                    'metadata': doc.metadata
                })

        if not chunks:
            print(f"{YELLOW}⚠️ No chunks found for {store_name}, skipping...{NC}")
            return None

        print(f"📄 Processing {len(chunks)} document chunks")

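        # Resolve legal coreferences (defined terms, aliases) before extraction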
print(f"{BLUE}Applying legal coreference resolution...{NC}") |
|
|
coreference_resolver = LegalCoreferenceResolver() |
|
|
processed_chunks, legal_definitions = coreference_resolver.process_document_chunks( |
|
|
chunks, use_preprocessing=True |
|
|
) |
|
|
|
|
|
total_definitions = sum(len(defs) for defs in legal_definitions.values()) |
|
|
if total_definitions > 0: |
|
|
print(f"π Found {total_definitions} legal keyword definitions across {len(legal_definitions)} documents") |
|
|
|
|
|
|
|
|
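        # Run transformer-based NER over the preprocessed chunks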
print(f"{BLUE}Initializing transformer-based entity extraction...{NC}") |
|
|
entity_extractor = TransformerEntityExtractor() |
|
|
raw_entities = entity_extractor.extract_entities(processed_chunks) |
|
|
|
|
|
total_raw_entities = sum(len(entity_list) for entity_list in raw_entities.values()) |
|
|
print(f"π·οΈ Extracted {total_raw_entities} raw entities") |
|
|
|
|
|
|
|
|
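        # Merge legal keyword definitions into the extracted entity set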
print(f"{BLUE}Adding legal keyword entities to knowledge graph...{NC}") |
|
|
entities_with_keywords = coreference_resolver.enhance_entities_with_keywords(raw_entities, legal_definitions) |
|
|
|
|
|
|
|
|
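        # Deduplicate entities using semantic-embedding similarity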
print(f"{BLUE}Resolving duplicate entities using semantic embeddings...{NC}") |
|
|
entity_resolver = EntityResolver() |
|
|
entities = entity_resolver.resolve_entities(entities_with_keywords) |
|
|
|
|
|
|
|
|
resolution_stats = entity_resolver.get_resolution_stats(raw_entities, entities) |
|
|
total_entities = sum(len(entity_list) for entity_list in entities.values()) |
|
|
print(f"β¨ Entity resolution complete: {total_raw_entities} β {total_entities} entities " |
|
|
f"({resolution_stats['overall_reduction_percentage']:.1f}% reduction)") |
|
|
|
|
|
|
|
|
for entity_type, stats in resolution_stats['by_type'].items(): |
|
|
if stats['duplicates_removed'] > 0: |
|
|
print(f" β’ {entity_type}: {stats['before']} β {stats['after']} " |
|
|
f"({stats['duplicates_removed']} duplicates removed)") |
|
|
|
|
|
|
|
|
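        # Extract relationships grounded in the legal keyword definitions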
print(f"{BLUE}Extracting legal keyword relationships...{NC}") |
|
|
relationships = coreference_resolver.create_all_keyword_relationships(legal_definitions) |
|
|
|
|
|
print(f"π Extracted {len(relationships)} high-quality legal relationships") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
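        # Assemble the NetworkX graph and compute summary metrics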
        graph_builder = KnowledgeGraphBuilder(store_name)
        knowledge_graph = graph_builder.build_graph(entities, relationships)

        metrics = graph_builder.compute_graph_metrics()
        print(f"📊 Graph metrics: {metrics['num_nodes']} nodes, {metrics['num_edges']} edges")

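        # Persist the graph, its metadata, and the resolved entities alongside
        # the FAISS indices so Streamlit can load them quickly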
|
graphs_dir = config.paths['faiss_dir'] / 'knowledge_graphs' |
|
|
graphs_dir.mkdir(exist_ok=True) |
|
|
|
|
|
|
|
|
graph_file = graphs_dir / f"{store_name}_knowledge_graph.pkl" |
|
|
with open(graph_file, 'wb') as f: |
|
|
pickle.dump(knowledge_graph, f) |
|
|
|
|
|
|
|
|
metadata_file = graphs_dir / f"{store_name}_graph_metadata.json" |
|
|
with open(metadata_file, 'w') as f: |
|
|
json.dump({ |
|
|
'store_name': store_name, |
|
|
'metrics': metrics, |
|
|
'entities': {k: len(v) for k, v in entities.items()}, |
|
|
'relationships_count': len(relationships), |
|
|
'created_at': datetime.now().isoformat() |
|
|
}, f, indent=2) |
|
|
|
|
|
|
|
|
entities_file = graphs_dir / f"{store_name}_entities.json" |
|
|
with open(entities_file, 'w') as f: |
|
|
json.dump(entities, f, indent=2) |
|
|
|
|
|
print(f"β
Knowledge graph saved to {graph_file}") |

        return {
            'store_name': store_name,
            'success': True,
            'metrics': metrics,
            'files_created': [str(graph_file), str(metadata_file), str(entities_file)]
        }

    except Exception as e:
        print(f"{RED}❌ Error processing {store_name}: {str(e)}{NC}")
        logger.error(f"Knowledge graph processing failed for {store_name}: {e}")
        return {
            'store_name': store_name,
            'success': False,
            'error': str(e)
        }


def main():
    """Main function to build knowledge graphs for all companies."""
    print(f"{GREEN}🧠 Building Knowledge Graphs for Due Diligence Analysis{NC}")
    print(f"{GREEN}Using transformer-based entity and relationship extraction{NC}")
    print("=" * 60)

    config = get_config()

    faiss_dir = config.paths['faiss_dir']
    if not faiss_dir.exists():
        print(f"{RED}❌ FAISS directory not found: {faiss_dir}{NC}")
        print("Please run scripts/build_indexes.py first")
        return

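    # Each *.faiss file corresponds to one vector store built by build_indexes.py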
    faiss_files = list(faiss_dir.glob("*.faiss"))
    if not faiss_files:
        print(f"{RED}❌ No FAISS indices found in {faiss_dir}{NC}")
        print("Please run scripts/build_indexes.py first")
        return

    store_names = [f.stem for f in faiss_files]
    print(f"Found {len(store_names)} FAISS indices: {', '.join(store_names)}")

    results = []
    for store_name in store_names:
        result = process_company_knowledge_graph(store_name, config)
        if result:
            results.append(result)

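    # Summarize successes and failures across all stores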
    print(f"\n{GREEN}📊 Knowledge Graph Building Summary{NC}")
    print("=" * 40)

    successful = [r for r in results if r.get('success', False)]
    failed = [r for r in results if not r.get('success', False)]

    print(f"✅ Successfully processed: {len(successful)} data stores")
    for result in successful:
        metrics = result.get('metrics', {})
        store_name = result['store_name']

        if "summit-digital-solutions" in store_name or "deepshield-systems" in store_name:
            store_type = "company"
        elif "questions" in store_name:
            store_type = "questions"
        elif "checklist" in store_name:
            store_type = "checklist"
        else:
            store_type = "unknown"

        print(f"  • {store_name} ({store_type}): {metrics.get('num_nodes', 0)} entities, {metrics.get('num_edges', 0)} relationships")

    if failed:
        print(f"❌ Failed to process: {len(failed)} data stores")
        for result in failed:
            print(f"  • {result['store_name']}: {result.get('error', 'Unknown error')}")

    print(f"\n🎉 Knowledge graph building complete!")
    print(f"📁 Files saved in: {config.paths['faiss_dir'] / 'knowledge_graphs'}")


if __name__ == "__main__":
    main()