File size: 15,653 Bytes
14357d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300f165
c22fc23
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
# aggile.py
class Aggile:
    """
    Graph generator for plain text.

    Queries a chat-completion client to extract subjects (key concepts),
    objects (related collocations) and predicates from a text, and combines
    them into "subject-predicate-object" triplets.
    """

    def __init__(self, client):
        """
        :client: object exposing client.chat.completions.create(...)
        """
        self.client = client

        # "{n}" is a placeholder; _get_subj substitutes the requested number
        # of concepts right before the prompt is sent.
        self.subj_prompt = """
        extract {n} collocations describing key concepts, keywords, named entities from the provided source
        """

        self.obj_prompt = """
        extract 5-10 most representative collocations from the provided source that are related to the provided concept
        """

        self.pred_prompt = """
        define the relationship between two words: generate a verb or a phrase decribing a relationship between two entities; return a predicate for a knowledge graph triplet
        """

    @staticmethod
    def _parse_reply(content):
        """
        Parse the text content of a completion into a Python object.

        :content: raw reply text (str)

        :return: the decoded object (a dict for the schemas used in this class)
        """
        import ast
        import json

        try:
            # The reply is requested as JSON, so JSON decoding is tried first;
            # it accepts true/false/null, which ast.literal_eval rejects.
            return json.loads(content)
        except ValueError:
            # Fallback for replies written as a Python literal
            # (e.g. single-quoted keys), which is what was parsed originally.
            return ast.literal_eval(content)

    def _complete_json(self, system_prompt, user_content, properties, max_tokens):
        """
        Run one system + user exchange and return the parsed reply.

        :system_prompt: instruction sent with the "system" role (str)
        :user_content: payload sent with the "user" role (str)
        :properties: "properties" mapping placed into response_format (dict)
        :max_tokens: token limit forwarded to the client (int)

        :return: parsed reply content (dict)
        """
        response = self.client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            response_format={
                "type": "json",
                "value": {"properties": properties},
            },
            stream=False,
            max_tokens=max_tokens,
            temperature=0.5,
            top_p=0.1,
        )
        content = response.choices[0].get('message')['content']
        return self._parse_reply(content)

    def _get_subj(self, text, n=10):
        """
        Extract entities from the text:
            - named entities
            - keywords
            - concepts

        :text: input text (str)
        :n: the number of generated entities (int)

        :return: {core_concepts: list of extracted keywords (subjects that will form triplets)} (dict)
        """
        # str.replace (not str.format) so a customised prompt containing other
        # braces cannot raise while the placeholder is filled in.
        prompt = self.subj_prompt.replace("{n}", str(n))
        return self._complete_json(
            prompt,
            text,
            {"core_concepts": {"type": "array", "items": {"type": "string"}}},
            max_tokens=1024,
        )

    def __extract_relations(self, word, text):
        """
        Extract collocations related to one concept (subject) based on the text.

        :word: one concept extracted with _get_subj (str)
        :text: input text (str)

        :return: {related_concepts: list of related words and collocations (objects that will form triplets)} (dict)
        """
        return self._complete_json(
            self.obj_prompt,
            f"concept = {word}, source = {text}",
            {"related_concepts": {"type": "array", "items": {"type": "string"}}},
            max_tokens=512,
        )

    def _get_obj(self, text):
        """
        Extract related concepts for every keyword found in the text.

        :text: input text (str)

        :return: {subject: {related_concepts: list of related words and collocations}} (dict)
        """
        # Generate list of subjects
        core_concepts = self._get_subj(text, n=10)
        # Get objects for each subject
        return {
            word: self.__extract_relations(word, text)
            for word in core_concepts['core_concepts']
        }

    def __generate_predicates(self, subj, obj):
        """
        Generate a predicate between a subject and an object.

        :subj: one generated subject from core_concepts (str)
        :obj: one generated object from relations (str)

        :return: one relevant predicate to form triplets (str)
        """
        reply = self._complete_json(
            self.pred_prompt,
            f"what is the relationship between {subj} and {obj}? return a predicate only",
            {"predicate": {"type": "string"}},
            max_tokens=512,
        )
        # Return predicate only, not the whole dictionary
        return reply['predicate']

    def form_triples(self, text):
        """
        Build "subject-predicate-object" triplets from the text.

        :text: input text (str)

        :return: {subject: list of {'subject', 'predicate', 'object'} dicts} (dict)
        """
        # Generate objects from text
        relations = self._get_obj(text)
        triplets = dict()
        for subj, related in relations.items():
            # Every subject gets an entry, even if no triplet survives the check below
            triplets[subj] = list()
            for obj in related['related_concepts']:
                # Hallucination check: an object identical to its subject forms
                # no triplet, so skip it before spending a request on a predicate.
                if subj == obj:
                    continue
                triplets[subj].append({
                    'subject': subj,
                    'predicate': self.__generate_predicates(subj, obj),
                    'object': obj,
                })

        return triplets

class Graph:
    """
    Plotly visualisation of "subject-predicate-object" triplets.
    """

    def __init__(self, triplets):
        """
        :triplets: {key: list of {'subject', 'predicate', 'object'} dicts} (dict)
        """
        self.triplets = triplets

    def _collect_edges(self):
        """
        Flatten the stored triplets into edge tuples.

        :return: list of (subject, object, predicate) tuples, in triplet order
        """
        return [
            (rel['subject'], rel['object'], rel['predicate'])
            for relations in self.triplets.values()
            for rel in relations
        ]

    def build_graph(self):
        """
        Lay out the triplets as a graph and draw it with Plotly.

        Nodes are subjects and objects; every triplet contributes one edge
        whose predicate is printed at the midpoint of the edge.

        :return: the assembled figure (plotly.graph_objects.Figure)
        """
        import plotly.graph_objects as go
        import networkx as nx
        from collections import Counter

        edges = self._collect_edges()

        # Create a networkx graph; nodes are added implicitly with their edges
        G = nx.Graph()
        for subj, obj, predicate in edges:
            G.add_edge(subj, obj, label=predicate)

        # Generate positions for nodes; the seed is fixed at 42
        pos = nx.spring_layout(G, seed=42)

        # Extract node data for Plotly
        node_labels = list(G.nodes())
        node_x = [pos[node][0] for node in node_labels]
        node_y = [pos[node][1] for node in node_labels]

        # Count connections per node (taken from the triplet list, so a
        # repeated subject/object pair is counted once per triplet)
        node_degrees = Counter(
            node for subj, obj, _ in edges for node in (subj, obj)
        )

        # Plotly data for edges: a None entry follows each segment so that
        # separate edges are not joined into one continuous line
        edge_x = []
        edge_y = []
        for subj, obj, _ in edges:
            x0, y0 = pos[subj]
            x1, y1 = pos[obj]
            edge_x += [x0, x1, None]
            edge_y += [y0, y1, None]

        # Create the figure
        fig = go.Figure()

        # Add edges
        fig.add_trace(go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=0.5, color='#888'),
            hoverinfo='text',
            mode='lines'
        ))

        # Add nodes with uniform size and labels; colour encodes the connection count
        fig.add_trace(go.Scatter(
            x=node_x, y=node_y,
            mode='markers+text',
            marker=dict(
                size=25,  # Uniform node size for all nodes
                color=[node_degrees[node] for node in node_labels],
                colorbar=dict(title='Connections')
            ),
            text=node_labels,
            hoverinfo='text',
            textposition='top center',
            textfont=dict(size=13, weight="bold")
        ))

        # Add one text trace per edge, placing the predicate at the edge midpoint
        for subj, obj, predicate_label in edges:
            x0, y0 = pos[subj]
            x1, y1 = pos[obj]
            mid_x = (x0 + x1) / 2
            mid_y = (y0 + y1) / 2

            fig.add_trace(go.Scatter(
                x=[mid_x], y=[mid_y],
                mode='text',
                text=[predicate_label],
                textposition='middle center',
                showlegend=False,
                textfont=dict(size=10)
            ))

        # Update layout
        fig.update_layout(
            showlegend=False,
            margin=dict(l=0, r=0, t=0, b=0),
            xaxis=dict(showgrid=False, zeroline=False),
            yaxis=dict(showgrid=False, zeroline=False),
            title="Force-Directed Graph with Predicate Labels on Nodes"
        )

        return fig