Spaces:
Sleeping
Sleeping
File size: 13,923 Bytes
78e8dd4 0cfa3a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
"""
Knowledge Graph Visualization Module
Creates knowledge maps and similarity heatmaps from document relationships
"""
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import re
import json
from typing import Tuple, Optional, Dict, List
from openai import OpenAI
from pathlib import Path
class KnowledgeGraphGenerator:
    """Generates knowledge graphs and similarity visualizations.

    Uses an OpenAI client with the file_search tool to discover documents
    and topics in a vector store, then renders a knowledge map and a
    document-similarity heatmap as PNG files in ``output_dir``.
    """

    def __init__(self, client: "OpenAI", vector_store_id: str, output_dir: str = "output"):
        """
        Args:
            client: Configured OpenAI API client.
            vector_store_id: ID of the vector store holding the documents.
            output_dir: Directory where the PNG visualizations are written.
                Created (including missing parent directories) if absent.
        """
        self.client = client
        self.vector_store_id = vector_store_id
        self.output_dir = Path(output_dir)
        # parents=True: create missing intermediate directories too, instead
        # of raising FileNotFoundError when output_dir is a nested path.
        self.output_dir.mkdir(parents=True, exist_ok=True)
    def get_files_from_vector_store(self) -> List[str]:
        """Get list of files from vector store.

        Issues a file_search-backed query so the response carries citation
        annotations, then recovers the distinct cited filenames from them.

        Returns:
            Sorted list of document names with any '.pdf' suffix stripped;
            empty list when the query fails or yields no annotations.
        """
        try:
            query = "List all documents in the manual"
            response = self.client.responses.create(
                input=query,
                model="gpt-4o-mini",
                tools=[{
                    "type": "file_search",
                    "vector_store_ids": [self.vector_store_id],
                    "max_num_results": 25
                }]
            )
            file_list = []
            # NOTE(review): assumes output[0] is the file_search tool call and
            # output[1] the assistant message carrying annotations -- TODO
            # confirm against the Responses API; an IndexError here is
            # swallowed by the except below and yields [].
            if response and hasattr(response.output[1].content[0], 'annotations'):
                annotations = response.output[1].content[0].annotations
                # Deduplicate: the same file may be cited by several chunks.
                file_list = list(set([annotation.filename for annotation in annotations]))
                # Strip '.pdf' so names double as display labels downstream.
                file_list = [f.replace('.pdf', '') for f in file_list]
                file_list.sort()
            return file_list
        except Exception as e:
            # Best-effort: report and degrade to an empty list rather than raise.
            print(f"β Error getting files: {str(e)}")
            return []
def extract_topics_from_content(self, file_list: List[str]) -> Tuple[Dict[str, List[str]], List[str]]:
"""Extract topics from document content using GPT"""
all_topics = set()
file_topics = {}
file_descriptions = {}
print("π Getting content descriptions for each file...")
# Get descriptions for each file
for file in file_list:
try:
query = f"What is the main purpose and key concepts covered in the document titled '{file}'? Be brief and focused on technical concepts."
response = self.client.responses.create(
input=query,
model="gpt-4o-mini",
tools=[{
"type": "file_search",
"vector_store_ids": [self.vector_store_id]
}]
)
if response and hasattr(response.output[1], 'content'):
description = response.output[1].content[0].text
file_descriptions[file] = description
print(f" β Got description for {file}")
else:
file_descriptions[file] = f"Information about {file}"
except Exception as e:
print(f" β οΈ Error getting description for {file}: {e}")
file_descriptions[file] = f"Information about {file}"
# Extract topics from descriptions
prompt = "Extract key technical concepts (single words or short phrases) from these document descriptions. Focus on functional concepts, components, and technologies.\n\n"
for file, desc in file_descriptions.items():
prompt += f"Document: {file}\nDescription: {desc}\n\n"
prompt += "\nFor each document, list 3-5 key technical concepts. Format as a JSON object where keys are document names and values are arrays of concepts."
try:
response = self.client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You extract key technical concepts from document descriptions in a structured way."},
{"role": "user", "content": prompt}
],
temperature=0.3
)
topics_text = response.choices[0].message.content
json_match = re.search(r'\{.*\}', topics_text, re.DOTALL)
if json_match:
try:
file_topics = json.loads(json_match.group(0))
for topics in file_topics.values():
all_topics.update(topics)
print(f"β
Successfully extracted topics for {len(file_topics)} documents")
except json.JSONDecodeError:
print("β οΈ Error parsing JSON response, using fallback")
file_topics = self._create_fallback_topics(file_list)
else:
file_topics = self._create_fallback_topics(file_list)
except Exception as e:
print(f"β οΈ Error extracting topics: {e}, using fallback")
file_topics = self._create_fallback_topics(file_list)
# Ensure all files have topics
for file in file_list:
if file not in file_topics or not file_topics[file]:
words = [word for word in re.findall(r'\b[A-Za-z]{3,}\b', file)
if word.lower() not in ['the', 'and', 'for', 'with', 'function', 'of']]
file_topics[file] = words if words else ["Topic"]
return file_topics, list(all_topics)
def _create_fallback_topics(self, file_list: List[str]) -> Dict[str, List[str]]:
"""Create fallback topics from filenames"""
file_topics = {}
for file in file_list:
words = [word for word in re.findall(r'\b[A-Za-z]{3,}\b', file)
if word.lower() not in ['the', 'and', 'for', 'with', 'function', 'of']]
file_topics[file] = words if words else ["Topic"]
return file_topics
def analyze_document_relationships(self, file_list: List[str],
file_topics: Dict[str, List[str]]) -> np.ndarray:
"""Analyze relationships between documents based on topics"""
n = len(file_list)
similarity_matrix = np.zeros((n, n))
# Create topic vectors
all_topics = set()
for topics in file_topics.values():
all_topics.update(topics)
topic_list = list(all_topics)
# Create binary vectors for each document
topic_vectors = {}
for file in file_list:
vector = np.zeros(len(topic_list))
for i, topic in enumerate(topic_list):
if topic in file_topics.get(file, []):
vector[i] = 1
topic_vectors[file] = vector
# Calculate cosine similarity
for i, file1 in enumerate(file_list):
for j, file2 in enumerate(file_list):
if i == j:
similarity_matrix[i][j] = 1.0
else:
vec1 = topic_vectors.get(file1, np.zeros(len(topic_list)))
vec2 = topic_vectors.get(file2, np.zeros(len(topic_list)))
dot_product = np.dot(vec1, vec2)
norm1 = np.linalg.norm(vec1)
norm2 = np.linalg.norm(vec2)
if norm1 > 0 and norm2 > 0:
similarity_matrix[i][j] = dot_product / (norm1 * norm2)
return similarity_matrix
def create_knowledge_graph(self, file_list: List[str], file_topics: Dict[str, List[str]],
similarity_matrix: np.ndarray) -> nx.Graph:
"""Create knowledge graph from documents and topics"""
G = nx.Graph()
# Add document nodes
for file in file_list:
G.add_node(file, type='document', size=700)
# Add topic nodes and connections
for file, topics in file_topics.items():
for topic in topics:
if topic not in G:
G.add_node(topic, type='topic', size=500)
G.add_edge(file, topic, weight=3)
# Add edges between similar documents
for i, file1 in enumerate(file_list):
for j, file2 in enumerate(file_list):
if i < j:
sim = similarity_matrix[i][j]
if sim > 0.25:
G.add_edge(file1, file2, weight=sim * 5)
return G
def save_knowledge_graph(self, G: nx.Graph) -> str:
"""Save knowledge graph visualization"""
plt.figure(figsize=(16, 12))
pos = nx.kamada_kawai_layout(G)
document_nodes = [n for n, attr in G.nodes(data=True) if attr.get('type') == 'document']
topic_nodes = [n for n, attr in G.nodes(data=True) if attr.get('type') == 'topic']
edge_widths = [G[u][v].get('weight', 1) * 0.6 for u, v in G.edges()]
nx.draw_networkx_nodes(G, pos, nodelist=document_nodes, node_color='#5B9BD5',
node_size=800, alpha=0.8)
nx.draw_networkx_nodes(G, pos, nodelist=topic_nodes, node_color='#70AD47',
node_size=600, alpha=0.8)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.7, edge_color='#A5A5A5')
# Create labels
doc_labels = {}
for node in document_nodes:
if len(node) > 20:
shortened = re.sub(r'(?:Function|Operating|Setting|Activating|Deactivating) of ', '', node)
shortened = re.sub(r' Assist', '', shortened)
if len(shortened) > 20:
shortened = shortened[:18] + '...'
doc_labels[node] = shortened
else:
doc_labels[node] = node
# Draw labels
for node, label in doc_labels.items():
x, y = pos[node]
plt.text(x, y, label, fontsize=9, fontweight='bold',
ha='center', va='center',
bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', boxstyle='round,pad=0.3'))
for node in topic_nodes:
x, y = pos[node]
plt.text(x, y, node, fontsize=8, ha='center', va='center',
bbox=dict(facecolor='#E8F4E5', alpha=0.9, edgecolor='none', boxstyle='round,pad=0.2'))
plt.title("System Knowledge Map", fontsize=18)
plt.axis('off')
plt.tight_layout()
output_path = self.output_dir / "knowledge_graph.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.close()
print(f"β
Knowledge graph saved to {output_path}")
return str(output_path)
def save_similarity_heatmap(self, matrix: np.ndarray, labels: List[str]) -> str:
"""Save similarity heatmap"""
plt.figure(figsize=(12, 10))
plt.imshow(matrix, cmap='Blues')
plt.colorbar(label='Similarity')
# Shorten labels
shortened_labels = []
for label in labels:
if len(label) > 15:
shortened = re.sub(r'(?:Function|Operating|Setting|Activating|Deactivating) of ', '', label)
shortened = re.sub(r' Assist', '', shortened)
if len(shortened) > 15:
shortened = shortened[:13] + '...'
shortened_labels.append(shortened)
else:
shortened_labels.append(label)
plt.xticks(range(len(labels)), shortened_labels, rotation=45, ha='right')
plt.yticks(range(len(labels)), shortened_labels)
# Add similarity values
for i in range(len(labels)):
for j in range(len(labels)):
if i != j:
plt.text(j, i, f'{matrix[i, j]:.2f}',
ha="center", va="center",
color="white" if matrix[i, j] > 0.5 else "black")
plt.title("Document Similarity Heatmap", fontsize=16)
plt.tight_layout()
output_path = self.output_dir / "similarity_heatmap.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.close()
print(f"β
Similarity heatmap saved to {output_path}")
return str(output_path)
def generate_visualizations(self) -> Tuple[Optional[str], Optional[str]]:
"""Generate both knowledge graph and heatmap visualizations"""
print("π Generating knowledge graph visualizations...")
file_list = self.get_files_from_vector_store()
if not file_list:
print("β οΈ No files found. Cannot create knowledge map.")
return None, None
print("π Extracting topics from content...")
file_topics, all_topics = self.extract_topics_from_content(file_list)
print("π Analyzing document relationships...")
similarity_matrix = self.analyze_document_relationships(file_list, file_topics)
print("π¨ Creating knowledge graph...")
G = self.create_knowledge_graph(file_list, file_topics, similarity_matrix)
print("πΎ Saving visualizations...")
graph_path = self.save_knowledge_graph(G)
heatmap_path = self.save_similarity_heatmap(similarity_matrix, file_list)
print("β
Dynamic visualizations complete!")
return graph_path, heatmap_path
|