Spaces:
Running
on
Zero
Running
on
Zero
Fixing weird behaviors
Browse files- app.py +60 -48
- knowledge_graph.html +2 -2
- llm_graph.py +28 -12
- sample/kv_store_doc_status.json +35 -0
- visualize.py +37 -95
app.py
CHANGED
|
@@ -31,11 +31,14 @@ CUSTOM_CSS = """
|
|
| 31 |
"""
|
| 32 |
|
| 33 |
# Cache directory and file paths
|
| 34 |
-
CACHE_DIR = "cache"
|
|
|
|
| 35 |
EXAMPLE_CACHE_FILE = os.path.join(CACHE_DIR, "first_example_cache.pkl")
|
|
|
|
| 36 |
|
| 37 |
# Create cache directory if it doesn't exist
|
| 38 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
|
|
| 39 |
|
| 40 |
# Initialize the LLMGraph model
|
| 41 |
model = LLMGraph()
|
|
@@ -62,7 +65,7 @@ def handle_text(text=""):
|
|
| 62 |
|
| 63 |
return " ".join(text.split())
|
| 64 |
|
| 65 |
-
def extract_kg(text="", model_name=
|
| 66 |
"""
|
| 67 |
Extract knowledge graph from text
|
| 68 |
"""
|
|
@@ -73,7 +76,10 @@ def extract_kg(text="", model_name=None):
|
|
| 73 |
try:
|
| 74 |
result = model.extract(text, model_name)
|
| 75 |
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
| 77 |
except Exception as e:
|
| 78 |
raise gr.Error(f"❌ Extraction error: {str(e)}")
|
| 79 |
|
|
@@ -108,7 +114,7 @@ def find_token_indices(doc, substring, text):
|
|
| 108 |
|
| 109 |
return result
|
| 110 |
|
| 111 |
-
def create_custom_entity_viz(data, full_text):
|
| 112 |
"""
|
| 113 |
Create custom entity visualization using spaCy's displacy
|
| 114 |
"""
|
|
@@ -130,7 +136,7 @@ def create_custom_entity_viz(data, full_text):
|
|
| 130 |
overlapping = any(s.start < end and start < s.end for s in spans)
|
| 131 |
|
| 132 |
if not overlapping:
|
| 133 |
-
node_type = node.get(
|
| 134 |
span = Span(doc, start, end, label=node_type)
|
| 135 |
spans.append(span)
|
| 136 |
|
|
@@ -156,30 +162,33 @@ def create_custom_entity_viz(data, full_text):
|
|
| 156 |
|
| 157 |
return styled_html
|
| 158 |
|
| 159 |
-
def create_graph(json_data):
|
| 160 |
"""
|
| 161 |
Create interactive knowledge graph using pyvis
|
| 162 |
"""
|
| 163 |
|
| 164 |
-
|
|
|
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
| 183 |
|
| 184 |
# Create network visualization
|
| 185 |
network = Network(
|
|
@@ -193,17 +202,20 @@ def create_graph(json_data):
|
|
| 193 |
|
| 194 |
# Configure network display
|
| 195 |
network.from_nx(G)
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
|
| 205 |
# Customize node appearance
|
| 206 |
for node in network.nodes:
|
|
|
|
|
|
|
|
|
|
| 207 |
node['color'] = {'background': '#e0e7ff', 'border': '#6366f1', 'highlight': {'background': '#c7d2fe', 'border': '#4f46e5'}}
|
| 208 |
node['font'] = {'size': 14, 'color': '#1e293b'}
|
| 209 |
node['shape'] = 'dot'
|
|
@@ -211,6 +223,9 @@ def create_graph(json_data):
|
|
| 211 |
|
| 212 |
# Customize edge appearance
|
| 213 |
for edge in network.edges:
|
|
|
|
|
|
|
|
|
|
| 214 |
edge['width'] = 4
|
| 215 |
# edge['arrows'] = {'to': {'enabled': False, 'type': 'arrow'}}
|
| 216 |
edge['color'] = {'color': '#6366f1', 'highlight': '#4f46e5'}
|
|
@@ -236,20 +251,20 @@ def process_and_visualize(text, model_name, progress=gr.Progress()):
|
|
| 236 |
|
| 237 |
# Check if we're processing the first example for caching
|
| 238 |
is_first_example = text == EXAMPLES[0][0]
|
| 239 |
-
|
| 240 |
-
|
|
|
|
| 241 |
|
| 242 |
# Try to load from cache if it's the first example
|
| 243 |
-
if is_first_example and os.path.exists(EXAMPLE_CACHE_FILE):
|
| 244 |
try:
|
| 245 |
progress(0.3, desc="Loading from cache...")
|
| 246 |
with open(EXAMPLE_CACHE_FILE, 'rb') as f:
|
| 247 |
-
|
| 248 |
|
| 249 |
progress(1.0, desc="Loaded from cache!")
|
| 250 |
-
return
|
| 251 |
except Exception as e:
|
| 252 |
-
# print(f"Cache loading error: {str(e)}")
|
| 253 |
logging.error(f"Cache loading error: {str(e)}")
|
| 254 |
|
| 255 |
# Continue with normal processing if cache fails
|
|
@@ -257,28 +272,30 @@ def process_and_visualize(text, model_name, progress=gr.Progress()):
|
|
| 257 |
json_data = extract_kg(text, model_name)
|
| 258 |
|
| 259 |
progress(0.5, desc="Creating entity visualization...")
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
progress(0.8, desc="Building knowledge graph...")
|
| 263 |
-
graph_html = create_graph(json_data)
|
| 264 |
|
| 265 |
node_count = len(json_data["nodes"])
|
| 266 |
edge_count = len(json_data["edges"])
|
| 267 |
stats = f"📊 Extracted {node_count} entities and {edge_count} relationships"
|
| 268 |
|
| 269 |
# Save to cache if it's the first example
|
| 270 |
-
if is_first_example:
|
| 271 |
try:
|
| 272 |
-
|
| 273 |
"graph_html": graph_html,
|
| 274 |
"entities_viz": entities_viz,
|
| 275 |
"json_data": json_data,
|
| 276 |
"stats": stats
|
| 277 |
}
|
| 278 |
with open(EXAMPLE_CACHE_FILE, 'wb') as f:
|
| 279 |
-
pickle.dump(
|
| 280 |
except Exception as e:
|
| 281 |
-
# print(f"Cache saving error: {str(e)}")
|
| 282 |
logging.error(f"Cache saving error: {str(e)}")
|
| 283 |
|
| 284 |
progress(1.0, desc="Complete!")
|
|
@@ -312,7 +329,6 @@ def generate_first_example():
|
|
| 312 |
"""
|
| 313 |
|
| 314 |
if not os.path.exists(EXAMPLE_CACHE_FILE):
|
| 315 |
-
# print("Generating cache for first example...")
|
| 316 |
logging.info("Generating cache for first example...")
|
| 317 |
|
| 318 |
try:
|
|
@@ -338,15 +354,12 @@ def generate_first_example():
|
|
| 338 |
|
| 339 |
with open(EXAMPLE_CACHE_FILE, 'wb') as f:
|
| 340 |
pickle.dump(cached_data, f)
|
| 341 |
-
# print("First example cache generated successfully")
|
| 342 |
logging.info("First example cache generated successfully")
|
| 343 |
|
| 344 |
return cached_data
|
| 345 |
except Exception as e:
|
| 346 |
-
# print(f"Error generating first example cache: {str(e)}")
|
| 347 |
logging.error(f"Error generating first example cache: {str(e)}")
|
| 348 |
else:
|
| 349 |
-
# print("First example cache already exists")
|
| 350 |
logging.info("First example cache already exists")
|
| 351 |
|
| 352 |
# Load existing cache
|
|
@@ -354,7 +367,6 @@ def generate_first_example():
|
|
| 354 |
with open(EXAMPLE_CACHE_FILE, 'rb') as f:
|
| 355 |
return pickle.load(f)
|
| 356 |
except Exception as e:
|
| 357 |
-
# print(f"Error loading existing cache: {str(e)}")
|
| 358 |
logging.error(f"Error loading existing cache: {str(e)}")
|
| 359 |
|
| 360 |
return None
|
|
|
|
| 31 |
"""
|
| 32 |
|
| 33 |
# Cache directory and file paths
|
| 34 |
+
CACHE_DIR = "./cache"
|
| 35 |
+
WORKING_DIR = "./sample"
|
| 36 |
EXAMPLE_CACHE_FILE = os.path.join(CACHE_DIR, "first_example_cache.pkl")
|
| 37 |
+
GRAPHML_FILE = WORKING_DIR + "/graph_chunk_entity_relation.graphml"
|
| 38 |
|
| 39 |
# Create cache directory if it doesn't exist
|
| 40 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 41 |
+
os.makedirs(WORKING_DIR, exist_ok=True)
|
| 42 |
|
| 43 |
# Initialize the LLMGraph model
|
| 44 |
model = LLMGraph()
|
|
|
|
| 65 |
|
| 66 |
return " ".join(text.split())
|
| 67 |
|
| 68 |
+
def extract_kg(text="", model_name=MODEL_LIST[0]):
|
| 69 |
"""
|
| 70 |
Extract knowledge graph from text
|
| 71 |
"""
|
|
|
|
| 76 |
try:
|
| 77 |
result = model.extract(text, model_name)
|
| 78 |
|
| 79 |
+
if isinstance(result, dict):
|
| 80 |
+
return result
|
| 81 |
+
else: # convert string to dict
|
| 82 |
+
return rapidjson.loads(result)
|
| 83 |
except Exception as e:
|
| 84 |
raise gr.Error(f"❌ Extraction error: {str(e)}")
|
| 85 |
|
|
|
|
| 114 |
|
| 115 |
return result
|
| 116 |
|
| 117 |
+
def create_custom_entity_viz(data, full_text, type_col="type"):
|
| 118 |
"""
|
| 119 |
Create custom entity visualization using spaCy's displacy
|
| 120 |
"""
|
|
|
|
| 136 |
overlapping = any(s.start < end and start < s.end for s in spans)
|
| 137 |
|
| 138 |
if not overlapping:
|
| 139 |
+
node_type = node.get(type_col, "Entity")
|
| 140 |
span = Span(doc, start, end, label=node_type)
|
| 141 |
spans.append(span)
|
| 142 |
|
|
|
|
| 162 |
|
| 163 |
return styled_html
|
| 164 |
|
| 165 |
+
def create_graph(json_data, model_name=MODEL_LIST[0]):
|
| 166 |
"""
|
| 167 |
Create interactive knowledge graph using pyvis
|
| 168 |
"""
|
| 169 |
|
| 170 |
+
if model_name == MODEL_LIST[0]:
|
| 171 |
+
G = nx.Graph()
|
| 172 |
|
| 173 |
+
# Add nodes with tooltips and error handling for missing keys
|
| 174 |
+
for node in json_data['nodes']:
|
| 175 |
+
# Get node type with fallback
|
| 176 |
+
type = node.get("type", "Entity")
|
| 177 |
|
| 178 |
+
# Get detailed type with fallback
|
| 179 |
+
detailed_type = node.get("detailed_type", type)
|
| 180 |
+
|
| 181 |
+
# Use node ID and type info for the tooltip
|
| 182 |
+
G.add_node(node['id'], title=f"{type}: {detailed_type}")
|
| 183 |
+
|
| 184 |
+
# Add edges with labels
|
| 185 |
+
for edge in json_data['edges']:
|
| 186 |
+
# Check if the required keys exist
|
| 187 |
+
if 'from' in edge and 'to' in edge:
|
| 188 |
+
label = edge.get('label', 'related')
|
| 189 |
+
G.add_edge(edge['from'], edge['to'], title=label, label=label)
|
| 190 |
+
else:
|
| 191 |
+
G = nx.read_graphml(GRAPHML_FILE)
|
| 192 |
|
| 193 |
# Create network visualization
|
| 194 |
network = Network(
|
|
|
|
| 202 |
|
| 203 |
# Configure network display
|
| 204 |
network.from_nx(G)
|
| 205 |
+
network.barnes_hut(
|
| 206 |
+
gravity=-3000,
|
| 207 |
+
central_gravity=0.3,
|
| 208 |
+
spring_length=50,
|
| 209 |
+
spring_strength=0.001,
|
| 210 |
+
damping=0.09,
|
| 211 |
+
overlap=0,
|
| 212 |
+
)
|
| 213 |
|
| 214 |
# Customize node appearance
|
| 215 |
for node in network.nodes:
|
| 216 |
+
if "description" in node:
|
| 217 |
+
node["title"] = node["description"]
|
| 218 |
+
|
| 219 |
node['color'] = {'background': '#e0e7ff', 'border': '#6366f1', 'highlight': {'background': '#c7d2fe', 'border': '#4f46e5'}}
|
| 220 |
node['font'] = {'size': 14, 'color': '#1e293b'}
|
| 221 |
node['shape'] = 'dot'
|
|
|
|
| 223 |
|
| 224 |
# Customize edge appearance
|
| 225 |
for edge in network.edges:
|
| 226 |
+
if "description" in edge:
|
| 227 |
+
edge["title"] = edge["description"]
|
| 228 |
+
|
| 229 |
edge['width'] = 4
|
| 230 |
# edge['arrows'] = {'to': {'enabled': False, 'type': 'arrow'}}
|
| 231 |
edge['color'] = {'color': '#6366f1', 'highlight': '#4f46e5'}
|
|
|
|
| 251 |
|
| 252 |
# Check if we're processing the first example for caching
|
| 253 |
is_first_example = text == EXAMPLES[0][0]
|
| 254 |
+
|
| 255 |
+
# Ensure RAG is initialized
|
| 256 |
+
asyncio.run(model.initialize_rag())
|
| 257 |
|
| 258 |
# Try to load from cache if it's the first example
|
| 259 |
+
if is_first_example and model_name == MODEL_LIST[0] and os.path.exists(EXAMPLE_CACHE_FILE):
|
| 260 |
try:
|
| 261 |
progress(0.3, desc="Loading from cache...")
|
| 262 |
with open(EXAMPLE_CACHE_FILE, 'rb') as f:
|
| 263 |
+
cached_data = pickle.load(f)
|
| 264 |
|
| 265 |
progress(1.0, desc="Loaded from cache!")
|
| 266 |
+
return cached_data["graph_html"], cached_data["entities_viz"], cached_data["json_data"], cached_data["stats"]
|
| 267 |
except Exception as e:
|
|
|
|
| 268 |
logging.error(f"Cache loading error: {str(e)}")
|
| 269 |
|
| 270 |
# Continue with normal processing if cache fails
|
|
|
|
| 272 |
json_data = extract_kg(text, model_name)
|
| 273 |
|
| 274 |
progress(0.5, desc="Creating entity visualization...")
|
| 275 |
+
if model_name == MODEL_LIST[0]:
|
| 276 |
+
entities_viz = create_custom_entity_viz(json_data, text, type_col="type")
|
| 277 |
+
else:
|
| 278 |
+
entities_viz = create_custom_entity_viz(json_data, text, type_col="entity_type")
|
| 279 |
|
| 280 |
progress(0.8, desc="Building knowledge graph...")
|
| 281 |
+
graph_html = create_graph(json_data, model_name)
|
| 282 |
|
| 283 |
node_count = len(json_data["nodes"])
|
| 284 |
edge_count = len(json_data["edges"])
|
| 285 |
stats = f"📊 Extracted {node_count} entities and {edge_count} relationships"
|
| 286 |
|
| 287 |
# Save to cache if it's the first example
|
| 288 |
+
if is_first_example and model_name == MODEL_LIST[0]:
|
| 289 |
try:
|
| 290 |
+
cached_data = {
|
| 291 |
"graph_html": graph_html,
|
| 292 |
"entities_viz": entities_viz,
|
| 293 |
"json_data": json_data,
|
| 294 |
"stats": stats
|
| 295 |
}
|
| 296 |
with open(EXAMPLE_CACHE_FILE, 'wb') as f:
|
| 297 |
+
pickle.dump(cached_data, f)
|
| 298 |
except Exception as e:
|
|
|
|
| 299 |
logging.error(f"Cache saving error: {str(e)}")
|
| 300 |
|
| 301 |
progress(1.0, desc="Complete!")
|
|
|
|
| 329 |
"""
|
| 330 |
|
| 331 |
if not os.path.exists(EXAMPLE_CACHE_FILE):
|
|
|
|
| 332 |
logging.info("Generating cache for first example...")
|
| 333 |
|
| 334 |
try:
|
|
|
|
| 354 |
|
| 355 |
with open(EXAMPLE_CACHE_FILE, 'wb') as f:
|
| 356 |
pickle.dump(cached_data, f)
|
|
|
|
| 357 |
logging.info("First example cache generated successfully")
|
| 358 |
|
| 359 |
return cached_data
|
| 360 |
except Exception as e:
|
|
|
|
| 361 |
logging.error(f"Error generating first example cache: {str(e)}")
|
| 362 |
else:
|
|
|
|
| 363 |
logging.info("First example cache already exists")
|
| 364 |
|
| 365 |
# Load existing cache
|
|
|
|
| 367 |
with open(EXAMPLE_CACHE_FILE, 'rb') as f:
|
| 368 |
return pickle.load(f)
|
| 369 |
except Exception as e:
|
|
|
|
| 370 |
logging.error(f"Error loading existing cache: {str(e)}")
|
| 371 |
|
| 372 |
return None
|
knowledge_graph.html
CHANGED
|
@@ -88,8 +88,8 @@
|
|
| 88 |
|
| 89 |
|
| 90 |
// parsing and collecting nodes and edges from the python
|
| 91 |
-
nodes = new vis.DataSet([{"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at":
|
| 92 |
-
edges = new vis.DataSet([{"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at":
|
| 93 |
|
| 94 |
nodeColors = {};
|
| 95 |
allNodes = nodes.get({ returnType: "Object" });
|
|
|
|
| 88 |
|
| 89 |
|
| 90 |
// parsing and collecting nodes and edges from the python
|
| 91 |
+
nodes = new vis.DataSet([{"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at": 1756651432, "description": "Aerosmith is a legendary rock band that has announced their retirement from touring after 54 years.", "entity_id": "Aerosmith", "entity_type": "organization", "file_path": "unknown_source", "font": {"color": "#1e293b", "size": 14}, "id": "Aerosmith", "label": "Aerosmith", "shape": "dot", "size": 20, "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Aerosmith is a legendary rock band that has announced their retirement from touring after 54 years."}, {"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at": 1756651432, "description": "Steven Tyler is the lead singer of Aerosmith who suffered an unrecoverable vocal cord injury leading to the band\u0027s retirement from touring.", "entity_id": "Steven Tyler", "entity_type": "person", "file_path": "unknown_source", "font": {"color": "#1e293b", "size": 14}, "id": "Steven Tyler", "label": "Steven Tyler", "shape": "dot", "size": 20, "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Steven Tyler is the lead singer of Aerosmith who suffered an unrecoverable vocal cord injury leading to the band\u0027s retirement from touring."}, {"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at": 1756651432, "description": "Vocal cord injury refers to the unrecoverable injury suffered by Steven Tyler that caused Aerosmith to retire from touring.", "entity_id": "Vocal Cord Injury", "entity_type": "category", "file_path": "unknown_source", "font": {"color": "#1e293b", "size": 14}, "id": "Vocal Cord Injury", "label": "Vocal Cord Injury", "shape": "dot", "size": 20, "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Vocal cord injury refers to the unrecoverable injury suffered by Steven Tyler that caused Aerosmith to retire from touring."}, {"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at": 1756651432, "description": "Touring refers to the activity of performing live concerts in various locations, which Aerosmith has retired from after 54 years.", "entity_id": "Touring", "entity_type": "category", "file_path": "unknown_source", "font": {"color": "#1e293b", "size": 14}, "id": "Touring", "label": "Touring", "shape": "dot", "size": 20, "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Touring refers to the activity of performing live concerts in various locations, which Aerosmith has retired from after 54 years."}, {"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at": 1756651432, "description": "Fractured larynx is the specific injury Steven Tyler suffered in September 2023, which was unsuccessfully treated.", "entity_id": "Fractured Larynx", "entity_type": "category", "file_path": "unknown_source", "font": {"color": "#1e293b", "size": 14}, "id": "Fractured Larynx", "label": "Fractured Larynx", "shape": "dot", "size": 20, "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Fractured larynx is the specific injury Steven Tyler suffered in September 2023, which was unsuccessfully treated."}, {"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at": 1756651432, "description": "Unsuccessful treatment refers to the medical efforts to heal Steven Tyler\u0027s fractured larynx that did not result in recovery.", "entity_id": "Unsuccessful Treatment", "entity_type": "category", "file_path": "unknown_source", "font": {"color": "#1e293b", "size": 14}, "id": "Unsuccessful Treatment", "label": "Unsuccessful Treatment", "shape": "dot", "size": 20, "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Unsuccessful treatment refers to the medical efforts to heal Steven Tyler\u0027s fractured larynx that did not result in recovery."}, {"color": {"background": "#e0e7ff", "border": "#6366f1", "highlight": {"background": "#c7d2fe", "border": "#4f46e5"}}, "created_at": 1756651432, "description": "September 2023 is the time when Steven Tyler suffered a fractured larynx.", "entity_id": "September 2023", "entity_type": "event", "file_path": "unknown_source", "font": {"color": "#1e293b", "size": 14}, "id": "September 2023", "label": "September 2023", "shape": "dot", "size": 20, "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "September 2023 is the time when Steven Tyler suffered a fractured larynx."}]);
|
| 92 |
+
edges = new vis.DataSet([{"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651440, "description": "Steven Tyler is the lead singer of Aerosmith, whose vocal injury led to the band\u0027s retirement from touring.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "Aerosmith", "keywords": "band membership,cause of retirement", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Steven Tyler is the lead singer of Aerosmith, whose vocal injury led to the band\u0027s retirement from touring.", "to": "Steven Tyler", "width": 4}, {"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651442, "description": "The vocal cord injury to Steven Tyler led to Aerosmith\u0027s retirement from touring.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "Aerosmith", "keywords": "band decision,cause and effect", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "The vocal cord injury to Steven Tyler led to Aerosmith\u0027s retirement from touring.", "to": "Vocal Cord Injury", "width": 4}, {"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651444, "description": "Aerosmith has officially retired from touring after 54 years.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "Aerosmith", "keywords": "career activity,retirement", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Aerosmith has officially retired from touring after 54 years.", "to": "Touring", "width": 4}, {"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651441, "description": "Steven Tyler suffered an unrecoverable vocal cord injury that affected his ability to perform.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "Steven Tyler", "keywords": "career impact,health issue", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Steven Tyler suffered an unrecoverable vocal cord injury that affected his ability to perform.", "to": "Vocal Cord Injury", "width": 4}, {"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651442, "description": "Steven Tyler\u0027s fractured larynx in September 2023 was the cause of his vocal cord injury.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "Steven Tyler", "keywords": "health event,injury cause", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Steven Tyler\u0027s fractured larynx in September 2023 was the cause of his vocal cord injury.", "to": "Fractured Larynx", "width": 4}, {"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651444, "description": "Steven Tyler underwent unsuccessful treatment for his fractured larynx.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "Steven Tyler", "keywords": "health outcome,medical treatment", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "Steven Tyler underwent unsuccessful treatment for his fractured larynx.", "to": "Unsuccessful Treatment", "width": 4}, {"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651444, "description": "The fractured larynx injury occurred in September 2023.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "September 2023", "keywords": "injury timing,temporal occurrence", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "The fractured larynx injury occurred in September 2023.", "to": "Fractured Larynx", "width": 4}, {"color": {"color": "#6366f1", "highlight": "#4f46e5"}, "created_at": 1756651447, "description": "The unsuccessful treatment was aimed at healing the fractured larynx.", "file_path": "unknown_source", "font": {"color": "#4b5563", "face": "Arial", "size": 12}, "from": "Fractured Larynx", "keywords": "injury management,medical intervention", "source_id": "chunk-150cfba3862e116efcee671d872955be", "title": "The unsuccessful treatment was aimed at healing the fractured larynx.", "to": "Unsuccessful Treatment", "width": 4}]);
|
| 93 |
|
| 94 |
nodeColors = {};
|
| 95 |
allNodes = nodes.get({ returnType: "Object" });
|
llm_graph.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
|
|
|
| 3 |
import numpy as np
|
|
|
|
| 4 |
|
| 5 |
from textwrap import dedent
|
| 6 |
from dotenv import load_dotenv
|
|
@@ -25,7 +27,8 @@ AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
|
|
| 25 |
AZURE_EMBEDDING_DEPLOYMENT = os.environ["AZURE_EMBEDDING_DEPLOYMENT"]
|
| 26 |
AZURE_EMBEDDING_API_VERSION = os.environ["AZURE_EMBEDDING_API_VERSION"]
|
| 27 |
|
| 28 |
-
WORKING_DIR = "./
|
|
|
|
| 29 |
|
| 30 |
MODEL_LIST = [
|
| 31 |
"EmergentMethods/Phi-3-mini-128k-instruct-graph",
|
|
@@ -52,9 +55,9 @@ class LLMGraph:
|
|
| 52 |
func=self._embedding_func,
|
| 53 |
),
|
| 54 |
)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
|
| 59 |
# async def test_responses(self):
|
| 60 |
# """
|
|
@@ -151,24 +154,37 @@ class LLMGraph:
|
|
| 151 |
|
| 152 |
return messages
|
| 153 |
|
| 154 |
-
def extract(self, text, model_name=MODEL_LIST[0])
|
| 155 |
"""
|
| 156 |
-
Extract knowledge graph from text
|
| 157 |
"""
|
| 158 |
-
|
| 159 |
-
generated_text = "This is a placeholder response."
|
| 160 |
|
| 161 |
if model_name == MODEL_LIST[0]:
|
| 162 |
# Use Hugging Face Inference API with Phi-3-mini-128k-instruct-graph
|
| 163 |
messages = self._get_messages(text)
|
| 164 |
-
|
|
|
|
|
|
|
| 165 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
# Use LightRAG with Azure OpenAI
|
|
|
|
| 167 |
self.rag.insert(text) # Insert the text into the RAG storage
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
-
|
|
|
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
async def _llm_model_func(self, prompt, system_prompt=None, history_messages=[], **kwargs) -> str:
|
| 173 |
"""
|
| 174 |
Call the Azure OpenAI chat completion endpoint with the given prompt and optional system prompt and history messages.
|
|
|
|
| 1 |
import os
|
| 2 |
+
import time
|
| 3 |
+
import shutil
|
| 4 |
import numpy as np
|
| 5 |
+
import networkx as nx
|
| 6 |
|
| 7 |
from textwrap import dedent
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
| 27 |
AZURE_EMBEDDING_DEPLOYMENT = os.environ["AZURE_EMBEDDING_DEPLOYMENT"]
|
| 28 |
AZURE_EMBEDDING_API_VERSION = os.environ["AZURE_EMBEDDING_API_VERSION"]
|
| 29 |
|
| 30 |
+
WORKING_DIR = "./sample"
|
| 31 |
+
GRAPHML_FILE = WORKING_DIR + "/graph_chunk_entity_relation.graphml"
|
| 32 |
|
| 33 |
MODEL_LIST = [
|
| 34 |
"EmergentMethods/Phi-3-mini-128k-instruct-graph",
|
|
|
|
| 55 |
func=self._embedding_func,
|
| 56 |
),
|
| 57 |
)
|
| 58 |
+
# TODO: Check if this works as expected
|
| 59 |
+
await self.rag.initialize_storages()
|
| 60 |
+
await initialize_pipeline_status()
|
| 61 |
|
| 62 |
# async def test_responses(self):
|
| 63 |
# """
|
|
|
|
| 154 |
|
| 155 |
return messages
|
| 156 |
|
| 157 |
+
def extract(self, text, model_name=MODEL_LIST[0]):
|
| 158 |
"""
|
| 159 |
+
Extract knowledge graph in structured format from text.
|
| 160 |
"""
|
|
|
|
|
|
|
| 161 |
|
| 162 |
if model_name == MODEL_LIST[0]:
|
| 163 |
# Use Hugging Face Inference API with Phi-3-mini-128k-instruct-graph
|
| 164 |
messages = self._get_messages(text)
|
| 165 |
+
|
| 166 |
+
json_graph = self._generate(messages)
|
| 167 |
+
return json_graph
|
| 168 |
else:
|
| 169 |
+
if os.path.exists(WORKING_DIR):
|
| 170 |
+
shutil.rmtree(WORKING_DIR)
|
| 171 |
+
os.makedirs(WORKING_DIR, exist_ok=True)
|
| 172 |
+
|
| 173 |
# Use LightRAG with Azure OpenAI
|
| 174 |
+
# TODO: Clear all the previous inserted texts first
|
| 175 |
self.rag.insert(text) # Insert the text into the RAG storage
|
| 176 |
+
|
| 177 |
+
# Wait for GRAPHML_FILE to be created
|
| 178 |
+
while not os.path.exists(GRAPHML_FILE):
|
| 179 |
+
time.sleep(0.1) # Sleep for 100ms before checking again
|
| 180 |
|
| 181 |
+
# Extract dict format of the knowledge graph
|
| 182 |
+
G = nx.read_graphml(GRAPHML_FILE)
|
| 183 |
|
| 184 |
+
# Convert the graph to node-link data format
|
| 185 |
+
dict_graph = nx.node_link_data(G, edges="edges")
|
| 186 |
+
return dict_graph
|
| 187 |
+
|
| 188 |
async def _llm_model_func(self, prompt, system_prompt=None, history_messages=[], **kwargs) -> str:
|
| 189 |
"""
|
| 190 |
Call the Azure OpenAI chat completion endpoint with the given prompt and optional system prompt and history messages.
|
sample/kv_store_doc_status.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"doc-605403c35618c5288c57c562f8eca566": {
|
| 3 |
+
"status": "processed",
|
| 4 |
+
"chunks_count": 1,
|
| 5 |
+
"chunks_list": [
|
| 6 |
+
"chunk-605403c35618c5288c57c562f8eca566"
|
| 7 |
+
],
|
| 8 |
+
"content_summary": "The family of Azerbaijan President Ilham Aliyev leads a charmed, glamorous life, thanks in part to financial interests in almost every sector of the economy. His wife, Mehriban, comes from the privileged and powerful Pashayev family that owns banks, ...",
|
| 9 |
+
"content_length": 1074,
|
| 10 |
+
"created_at": "2025-08-31T15:50:59.506391+00:00",
|
| 11 |
+
"updated_at": "2025-08-31T15:52:26.018288+00:00",
|
| 12 |
+
"file_path": "unknown_source",
|
| 13 |
+
"track_id": "insert_20250831_235059_6946ff78",
|
| 14 |
+
"metadata": {
|
| 15 |
+
"processing_start_time": 1756655459,
|
| 16 |
+
"processing_end_time": 1756655546
|
| 17 |
+
}
|
| 18 |
+
},
|
| 19 |
+
"doc-eea199eb7feea197ebb82e9333a2d2f2": {
|
| 20 |
+
"status": "processing",
|
| 21 |
+
"chunks_count": 1,
|
| 22 |
+
"chunks_list": [
|
| 23 |
+
"chunk-eea199eb7feea197ebb82e9333a2d2f2"
|
| 24 |
+
],
|
| 25 |
+
"content_summary": "Les jardins du Luxembourg, situés au cœur du sixième arrondissement de Paris, offrent un véritable havre de paix aux citadins pressés. Créés au début du dix-septième siècle sur l'initiative de Marie de Médicis, ces jardins à la française s'étendent s...",
|
| 26 |
+
"content_length": 697,
|
| 27 |
+
"created_at": "2025-08-31T15:54:38.060638+00:00",
|
| 28 |
+
"updated_at": "2025-08-31T15:54:38.068349+00:00",
|
| 29 |
+
"file_path": "unknown_source",
|
| 30 |
+
"track_id": "insert_20250831_235438_22d326d7",
|
| 31 |
+
"metadata": {
|
| 32 |
+
"processing_start_time": 1756655678
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
visualize.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import networkx as nx
|
| 2 |
-
import rapidjson
|
| 3 |
import warnings
|
| 4 |
import os
|
| 5 |
|
|
@@ -13,98 +12,41 @@ file_path = "./cache/graph_chunk_entity_relation.graphml"
|
|
| 13 |
assert os.path.exists(file_path), f"File {file_path} does not exist."
|
| 14 |
G = nx.read_graphml(file_path)
|
| 15 |
|
| 16 |
-
def create_graph(json_data):
|
| 17 |
-
"""
|
| 18 |
-
Create interactive knowledge graph using pyvis
|
| 19 |
-
"""
|
| 20 |
-
|
| 21 |
-
G = nx.Graph()
|
| 22 |
-
|
| 23 |
-
# Add nodes with tooltips and error handling for missing keys
|
| 24 |
-
for node in json_data['nodes']:
|
| 25 |
-
# Get node type with fallback
|
| 26 |
-
type = node.get("type", "Entity")
|
| 27 |
-
|
| 28 |
-
# Get detailed type with fallback
|
| 29 |
-
detailed_type = node.get("detailed_type", type)
|
| 30 |
-
|
| 31 |
-
# Use node ID and type info for the tooltip
|
| 32 |
-
G.add_node(node['id'], title=f"{type}: {detailed_type}")
|
| 33 |
-
|
| 34 |
-
# Add edges with labels
|
| 35 |
-
for edge in json_data['edges']:
|
| 36 |
-
# Check if the required keys exist
|
| 37 |
-
if 'from' in edge and 'to' in edge:
|
| 38 |
-
label = edge.get('label', 'related')
|
| 39 |
-
G.add_edge(edge['from'], edge['to'], title=label, label=label)
|
| 40 |
-
|
| 41 |
-
# Create network visualization
|
| 42 |
-
network = Network(
|
| 43 |
-
width="100%",
|
| 44 |
-
height="100vh",
|
| 45 |
-
notebook=False,
|
| 46 |
-
bgcolor="#f8fafc",
|
| 47 |
-
font_color="#1e293b"
|
| 48 |
-
)
|
| 49 |
-
|
| 50 |
-
# Configure network display
|
| 51 |
-
network.from_nx(G)
|
| 52 |
-
|
| 53 |
-
# Customize node appearance
|
| 54 |
-
for node in network.nodes:
|
| 55 |
-
node['color'] = {'background': '#e0e7ff', 'border': '#6366f1', 'highlight': {'background': '#c7d2fe', 'border': '#4f46e5'}}
|
| 56 |
-
node['font'] = {'size': 14, 'color': '#1e293b'}
|
| 57 |
-
node['shape'] = 'dot'
|
| 58 |
-
node['size'] = 20
|
| 59 |
-
|
| 60 |
-
# Customize edge appearance
|
| 61 |
-
for edge in network.edges:
|
| 62 |
-
edge['width'] = 4
|
| 63 |
-
edge['color'] = {'color': '#6366f1', 'highlight': '#4f46e5'}
|
| 64 |
-
edge['font'] = {'size': 12, 'color': '#4b5563', 'face': 'Arial'}
|
| 65 |
-
|
| 66 |
-
# Save and display the network
|
| 67 |
-
filename_out = "knowledge_graph.html"
|
| 68 |
-
network.show(filename_out)
|
| 69 |
-
print(f"Knowledge graph saved to {filename_out}")
|
| 70 |
-
|
| 71 |
# Convert the graph to node-link data format
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
#
|
| 82 |
-
|
| 83 |
-
#
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
#
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
#
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
#
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
# network.show(filename_out)
|
| 110 |
-
# print(f"Knowledge graph saved to {filename_out}")
|
|
|
|
| 1 |
import networkx as nx
|
|
|
|
| 2 |
import warnings
|
| 3 |
import os
|
| 4 |
|
|
|
|
| 12 |
assert os.path.exists(file_path), f"File {file_path} does not exist."
|
| 13 |
G = nx.read_graphml(file_path)
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
# Convert the graph to node-link data format
|
| 16 |
+
dict_graph = nx.node_link_data(G)
|
| 17 |
+
print("Number of nodes:", len(dict_graph['nodes']))
|
| 18 |
+
print("Number of edges:", len(dict_graph['links']))
|
| 19 |
+
|
| 20 |
+
# Create a Pyvis network
|
| 21 |
+
network = Network(width="100%",
|
| 22 |
+
height="100vh",
|
| 23 |
+
notebook=True,
|
| 24 |
+
bgcolor="#f8fafc",
|
| 25 |
+
font_color="#1e293b")
|
| 26 |
+
|
| 27 |
+
# Convert NetworkX graph to Pyvis network
|
| 28 |
+
network.from_nx(G)
|
| 29 |
+
|
| 30 |
+
# Add colors and title to nodes
|
| 31 |
+
for node in network.nodes:
|
| 32 |
+
if "description" in node:
|
| 33 |
+
node["title"] = node["description"]
|
| 34 |
+
|
| 35 |
+
node['color'] = {'background': '#e0e7ff', 'border': '#6366f1', 'highlight': {'background': '#c7d2fe', 'border': '#4f46e5'}}
|
| 36 |
+
node['font'] = {'size': 14, 'color': '#1e293b'}
|
| 37 |
+
node['shape'] = 'dot'
|
| 38 |
+
node['size'] = 20
|
| 39 |
+
|
| 40 |
+
# Add title to edges
|
| 41 |
+
for edge in network.edges:
|
| 42 |
+
if "description" in edge:
|
| 43 |
+
edge["title"] = edge["description"]
|
| 44 |
+
|
| 45 |
+
edge['width'] = 4
|
| 46 |
+
edge['color'] = {'color': '#6366f1', 'highlight': '#4f46e5'}
|
| 47 |
+
edge['font'] = {'size': 12, 'color': '#4b5563', 'face': 'Arial'}
|
| 48 |
+
|
| 49 |
+
# Save and display the network
|
| 50 |
+
filename_out = "knowledge_graph.html"
|
| 51 |
+
network.show(filename_out)
|
| 52 |
+
print(f"Knowledge graph saved to {filename_out}")
|
|
|
|
|
|