File size: 13,923 Bytes
78e8dd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cfa3a6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
"""

Knowledge Graph Visualization Module

Creates knowledge maps and similarity heatmaps from document relationships

"""
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import re
import json
from typing import Tuple, Optional, Dict, List
from openai import OpenAI
from pathlib import Path


class KnowledgeGraphGenerator:
    """Generates knowledge graphs and similarity visualizations from a vector store.

    Workflow: query an OpenAI vector store for document names, ask GPT to
    extract key topics per document, compute pairwise cosine similarity over
    binary topic vectors, then render a knowledge graph and a similarity
    heatmap as PNG files in ``output_dir``.
    """

    def __init__(self, client: OpenAI, vector_store_id: str, output_dir: str = "output"):
        """
        Args:
            client: Configured OpenAI client.
            vector_store_id: ID of the vector store holding the documents.
            output_dir: Directory for the generated PNGs (created if missing).
        """
        self.client = client
        self.vector_store_id = vector_store_id
        self.output_dir = Path(output_dir)
        # parents=True so nested paths like "out/run1" work too.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def get_files_from_vector_store(self) -> List[str]:
        """Return sorted, deduplicated document names cited by a file_search query.

        Filenames come from the annotations attached to the response output;
        the trailing ``.pdf`` extension is stripped. Returns an empty list on
        any API error.
        """
        try:
            query = "List all documents in the manual"
            response = self.client.responses.create(
                input=query,
                model="gpt-4o-mini",
                tools=[{
                    "type": "file_search",
                    "vector_store_ids": [self.vector_store_id],
                    "max_num_results": 25
                }]
            )

            # Scan every output item instead of hard-coding output[1]:
            # the position of the annotated message in the output list is
            # not guaranteed, and a short output would raise IndexError.
            filenames = set()
            if response:
                for item in response.output:
                    for part in getattr(item, 'content', None) or []:
                        for annotation in getattr(part, 'annotations', None) or []:
                            if hasattr(annotation, 'filename'):
                                filenames.add(annotation.filename)

            return sorted(name.replace('.pdf', '') for name in filenames)
        except Exception as e:
            print(f"❌ Error getting files: {str(e)}")
            return []

    def extract_topics_from_content(self, file_list: List[str]) -> Tuple[Dict[str, List[str]], List[str]]:
        """Extract 3-5 key technical concepts per document using GPT.

        First fetches a short description of each file via file_search, then
        asks a chat model to condense the descriptions into a JSON mapping of
        document name -> concept list. Falls back to filename-derived topics
        on any failure.

        Returns:
            Tuple of (document -> topics mapping, list of all distinct topics).
        """
        file_topics: Dict[str, List[str]] = {}
        file_descriptions: Dict[str, str] = {}

        print("📖 Getting content descriptions for each file...")

        # Get descriptions for each file
        for file in file_list:
            try:
                query = f"What is the main purpose and key concepts covered in the document titled '{file}'? Be brief and focused on technical concepts."
                response = self.client.responses.create(
                    input=query,
                    model="gpt-4o-mini",
                    tools=[{
                        "type": "file_search",
                        "vector_store_ids": [self.vector_store_id]
                    }]
                )

                # NOTE(review): assumes the answer message is output[1]
                # (tool call at [0]); any IndexError lands in the except
                # below and yields the generic fallback description.
                if response and hasattr(response.output[1], 'content'):
                    file_descriptions[file] = response.output[1].content[0].text
                    print(f"  ✓ Got description for {file}")
                else:
                    file_descriptions[file] = f"Information about {file}"
            except Exception as e:
                print(f"  ⚠️ Error getting description for {file}: {e}")
                file_descriptions[file] = f"Information about {file}"

        # Build one prompt covering all documents so the model can produce
        # a single consistent JSON object.
        prompt = "Extract key technical concepts (single words or short phrases) from these document descriptions. Focus on functional concepts, components, and technologies.\n\n"
        for file, desc in file_descriptions.items():
            prompt += f"Document: {file}\nDescription: {desc}\n\n"
        prompt += "\nFor each document, list 3-5 key technical concepts. Format as a JSON object where keys are document names and values are arrays of concepts."

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You extract key technical concepts from document descriptions in a structured way."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )

            topics_text = response.choices[0].message.content
            # Model output may wrap the JSON in prose; grab the outermost braces.
            json_match = re.search(r'\{.*\}', topics_text, re.DOTALL)

            if json_match:
                try:
                    file_topics = json.loads(json_match.group(0))
                    print(f"✅ Successfully extracted topics for {len(file_topics)} documents")
                except json.JSONDecodeError:
                    print("⚠️ Error parsing JSON response, using fallback")
                    file_topics = self._create_fallback_topics(file_list)
            else:
                file_topics = self._create_fallback_topics(file_list)
        except Exception as e:
            print(f"⚠️ Error extracting topics: {e}, using fallback")
            file_topics = self._create_fallback_topics(file_list)

        # Fill in any file the model skipped, reusing the single fallback
        # implementation instead of duplicating the word-extraction logic.
        fallback = self._create_fallback_topics(file_list)
        for file in file_list:
            if not file_topics.get(file):
                file_topics[file] = fallback[file]

        # Rebuild the global topic set AFTER all fallbacks are applied:
        # previously fallback topics were silently missing from this list.
        all_topics = set()
        for topics in file_topics.values():
            all_topics.update(topics)

        return file_topics, list(all_topics)

    def _create_fallback_topics(self, file_list: List[str]) -> Dict[str, List[str]]:
        """Derive topics from filename words (>=3 letters, minus stopwords)."""
        stopwords = {'the', 'and', 'for', 'with', 'function', 'of'}
        file_topics = {}
        for file in file_list:
            words = [word for word in re.findall(r'\b[A-Za-z]{3,}\b', file)
                     if word.lower() not in stopwords]
            file_topics[file] = words if words else ["Topic"]
        return file_topics

    def analyze_document_relationships(self, file_list: List[str],
                                       file_topics: Dict[str, List[str]]) -> np.ndarray:
        """Return an n x n cosine-similarity matrix over binary topic vectors.

        Each document is encoded as a 0/1 vector over the union of all topics;
        the diagonal is fixed at 1.0. Documents with no topics get 0 similarity
        to everything else.
        """
        n = len(file_list)
        similarity_matrix = np.zeros((n, n))

        # Deterministic topic ordering (order does not affect cosine values).
        topic_list = sorted({t for topics in file_topics.values() for t in topics})
        topic_index = {topic: i for i, topic in enumerate(topic_list)}

        # Binary membership vector per document.
        vectors = []
        for file in file_list:
            vec = np.zeros(len(topic_list))
            for topic in file_topics.get(file, []):
                if topic in topic_index:
                    vec[topic_index[topic]] = 1
            vectors.append(vec)

        norms = [np.linalg.norm(v) for v in vectors]

        # The matrix is symmetric: compute the upper triangle once and mirror,
        # instead of recomputing every (i, j) and (j, i) pair.
        for i in range(n):
            similarity_matrix[i][i] = 1.0
            for j in range(i + 1, n):
                if norms[i] > 0 and norms[j] > 0:
                    sim = np.dot(vectors[i], vectors[j]) / (norms[i] * norms[j])
                    similarity_matrix[i][j] = sim
                    similarity_matrix[j][i] = sim

        return similarity_matrix

    def create_knowledge_graph(self, file_list: List[str], file_topics: Dict[str, List[str]],
                               similarity_matrix: np.ndarray) -> nx.Graph:
        """Build a graph of document nodes, topic nodes, and similarity edges.

        Documents link to each of their topics (weight 3); document pairs with
        cosine similarity above 0.25 are linked with weight ``sim * 5``.
        """
        G = nx.Graph()

        # Document nodes first so a topic that happens to share a document's
        # name keeps the document attributes.
        for file in file_list:
            G.add_node(file, type='document', size=700)

        # Topic nodes plus document->topic edges.
        for file, topics in file_topics.items():
            for topic in topics:
                if topic not in G:
                    G.add_node(topic, type='topic', size=500)
                G.add_edge(file, topic, weight=3)

        # Document-to-document edges for sufficiently similar pairs
        # (upper triangle only; the graph is undirected).
        for i, file1 in enumerate(file_list):
            for j, file2 in enumerate(file_list):
                if i < j:
                    sim = similarity_matrix[i][j]
                    if sim > 0.25:
                        G.add_edge(file1, file2, weight=sim * 5)

        return G

    @staticmethod
    def _shorten_label(label: str, max_len: int) -> str:
        """Shorten a label for display: strip boilerplate prefixes, then
        truncate with an ellipsis if still longer than ``max_len``."""
        if len(label) <= max_len:
            return label
        shortened = re.sub(r'(?:Function|Operating|Setting|Activating|Deactivating) of ', '', label)
        shortened = re.sub(r' Assist', '', shortened)
        if len(shortened) > max_len:
            shortened = shortened[:max_len - 2] + '...'
        return shortened

    def save_knowledge_graph(self, G: nx.Graph) -> str:
        """Render the knowledge graph to ``knowledge_graph.png``.

        Returns:
            Path of the saved image as a string.
        """
        plt.figure(figsize=(16, 12))

        pos = nx.kamada_kawai_layout(G)

        document_nodes = [n for n, attr in G.nodes(data=True) if attr.get('type') == 'document']
        topic_nodes = [n for n, attr in G.nodes(data=True) if attr.get('type') == 'topic']

        edge_widths = [G[u][v].get('weight', 1) * 0.6 for u, v in G.edges()]

        nx.draw_networkx_nodes(G, pos, nodelist=document_nodes, node_color='#5B9BD5',
                               node_size=800, alpha=0.8)
        nx.draw_networkx_nodes(G, pos, nodelist=topic_nodes, node_color='#70AD47',
                               node_size=600, alpha=0.8)
        nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.7, edge_color='#A5A5A5')

        # Document labels: bold, shortened, on a white backdrop.
        for node in document_nodes:
            x, y = pos[node]
            plt.text(x, y, self._shorten_label(node, 20), fontsize=9, fontweight='bold',
                     ha='center', va='center',
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', boxstyle='round,pad=0.3'))

        # Topic labels: smaller, on a pale green backdrop.
        for node in topic_nodes:
            x, y = pos[node]
            plt.text(x, y, node, fontsize=8, ha='center', va='center',
                     bbox=dict(facecolor='#E8F4E5', alpha=0.9, edgecolor='none', boxstyle='round,pad=0.2'))

        plt.title("System Knowledge Map", fontsize=18)
        plt.axis('off')
        plt.tight_layout()

        output_path = self.output_dir / "knowledge_graph.png"
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"✅ Knowledge graph saved to {output_path}")
        return str(output_path)

    def save_similarity_heatmap(self, matrix: np.ndarray, labels: List[str]) -> str:
        """Render the similarity matrix to ``similarity_heatmap.png``.

        Args:
            matrix: Square similarity matrix (values in [0, 1]).
            labels: Row/column labels, same order as the matrix.

        Returns:
            Path of the saved image as a string.
        """
        plt.figure(figsize=(12, 10))

        plt.imshow(matrix, cmap='Blues')
        plt.colorbar(label='Similarity')

        shortened_labels = [self._shorten_label(label, 15) for label in labels]

        plt.xticks(range(len(labels)), shortened_labels, rotation=45, ha='right')
        plt.yticks(range(len(labels)), shortened_labels)

        # Annotate off-diagonal cells with the similarity value; switch to
        # white text on dark cells for readability.
        for i in range(len(labels)):
            for j in range(len(labels)):
                if i != j:
                    plt.text(j, i, f'{matrix[i, j]:.2f}',
                             ha="center", va="center",
                             color="white" if matrix[i, j] > 0.5 else "black")

        plt.title("Document Similarity Heatmap", fontsize=16)
        plt.tight_layout()

        output_path = self.output_dir / "similarity_heatmap.png"
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"✅ Similarity heatmap saved to {output_path}")
        return str(output_path)

    def generate_visualizations(self) -> Tuple[Optional[str], Optional[str]]:
        """Run the full pipeline: files -> topics -> similarity -> two PNGs.

        Returns:
            ``(graph_path, heatmap_path)``, or ``(None, None)`` when the
            vector store yields no files.
        """
        print("🔄 Generating knowledge graph visualizations...")

        file_list = self.get_files_from_vector_store()
        if not file_list:
            print("⚠️ No files found. Cannot create knowledge map.")
            return None, None

        print("📊 Extracting topics from content...")
        file_topics, all_topics = self.extract_topics_from_content(file_list)

        print("🔗 Analyzing document relationships...")
        similarity_matrix = self.analyze_document_relationships(file_list, file_topics)

        print("🎨 Creating knowledge graph...")
        G = self.create_knowledge_graph(file_list, file_topics, similarity_matrix)

        print("💾 Saving visualizations...")
        graph_path = self.save_knowledge_graph(G)
        heatmap_path = self.save_similarity_heatmap(similarity_matrix, file_list)

        print("✅ Dynamic visualizations complete!")
        return graph_path, heatmap_path